In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the tab-separated (.tsv) file into a DataFrame.
# The path is relative: press Tab inside the quotes to check you are in the
# correct folder and to browse to the file.
# sep="\t" tells read_csv that the fields are separated by tabs, not commas.
dataframe = pd.read_csv("SMSSpamCollection.tsv", sep="\t")

In [3]:
# Character count of every message, in row order.
# Iterating the "message" column directly avoids DataFrame.iterrows(),
# which constructs a whole Series for each row just to read one field.
message_length_col = [len(message_text) for message_text in dataframe["message"]]

In [5]:
# Add the list of message lengths to the dataframe as a new column
# called "length" (a new column is appended after the existing ones).
# See https://www.geeksforgeeks.org/adding-new-column-to-existing-dataframe-in-pandas/
dataframe['length'] = message_length_col

In [6]:
# Split the messages into ham and spam with a boolean mask instead of
# looping over rows: the subsets keep each column's dtype and the original
# index, and no per-row Series has to be built and re-assembled.
is_ham = dataframe["label"] == "ham"

# Anything not labelled "ham" is treated as spam, exactly as before.
# .copy() makes each subset an independent frame, so later edits to a subset
# cannot affect (or raise chained-assignment warnings about) `dataframe`.
ham_dataframe = dataframe[is_ham].copy()
spam_dataframe = dataframe[~is_ham].copy()

In [7]:
import spacy
# Load spaCy's small English pipeline. The punctuation count in the next
# cell reads each token's part-of-speech tag (`pos_`) from it.
# The model has to be installed beforehand - presumably with
# `python -m spacy download en_core_web_sm`; confirm for your spaCy version.
nlp = spacy.load('en_core_web_sm')

In [8]:
# Count, for every message, the tokens whose part-of-speech tag is PUNCT.
# nlp.pipe() streams the texts through spaCy in batches, which is faster
# than calling nlp() once per row inside DataFrame.iterrows().
# Docs are yielded in input order, so the list lines up with the rows.
punct_length_col = [
    sum(1 for token in doc_object if token.pos_ == 'PUNCT')
    for doc_object in nlp.pipe(dataframe['message'])
]

In [9]:
# Add the per-message punctuation counts as a new "punct" column.
# The list must hold exactly one value per row, in row order.
dataframe['punct'] = punct_length_col

In [10]:
# Preview the first rows to confirm the new "length" and "punct"
# columns are present (head() shows 5 rows by default).
dataframe.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,4
1,ham,Ok lar... Joking wif u oni...,29,2
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,1
3,ham,U dun say so early hor... U c already then say...,49,2
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,1


## ML

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Feature matrix: the two engineered numeric columns. Selecting with a
# list of names (hence the double brackets) returns a 2-D DataFrame.
feature_columns = ['length', 'punct']
X = dataframe[feature_columns]

# Target vector: the ham/spam label. Selecting a single name returns
# a 1-D Series, so one pair of brackets is enough.
y = dataframe['label']

# Hold out a fraction of 0.3 (30%) of the rows for testing.
# Fixing random_state makes the shuffle, and therefore the split,
# reproducible from run to run. (SHIFT + TAB shows all the options.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [13]:
# Both feature splits keep the 2 feature columns; only the row count differs
for split_name, split_features in (('train', X_train), ('test', X_test)):
    print('X', split_name, 'data shape', split_features.shape)

X train data shape (3900, 2)
X test data shape (1672, 2)


In [14]:
# The label splits are 1-D: one label per row of the matching feature split
for split_name, split_labels in (('train', y_train), ('test', y_test)):
    print('y', split_name, 'data shape', split_labels.shape)

y train data shape (3900,)
y test data shape (1672,)


In [15]:
# Display the training labels. The index on the left is the original
# dataframe index, so each label lines up with the X_train row that
# carries the same index value.
y_train

4393     ham
216      ham
4471     ham
3889     ham
5030    spam
2000     ham
4135     ham
1055     ham
3646    spam
4774     ham
1758     ham
4796     ham
4857     ham
5311     ham
2879    spam
1416     ham
3680     ham
460      ham
3431     ham
1784     ham
4570     ham
1135     ham
267      ham
1892     ham
1926     ham
3853     ham
3663     ham
1521    spam
3225     ham
4583     ham
        ... 
1031     ham
1110     ham
1888    spam
3550     ham
1527     ham
753      ham
3049     ham
2628     ham
562      ham
4764     ham
3562    spam
252      ham
2516     ham
2962     ham
4453     ham
5374     ham
5396     ham
1202     ham
3462     ham
2797     ham
4225     ham
144      ham
5056     ham
2895     ham
2763     ham
905      ham
5192     ham
3980     ham
235     spam
5157     ham
Name: label, Length: 3900, dtype: object

In [16]:
# Show the held-out feature rows (message length and punctuation count).
# NOTE(review): print() renders the frame as plain text; ending the cell with
# `X_test.head()` would give a shorter, richly formatted preview.
print(X_test)

      length  punct
1078      28      1
4028      45      3
958       26      0
4642       7      1
4674     107      5
5461      51      3
4210      74      5
4216      26      1
1603      25      2
1504      31      2
1783      53      2
3465       8      0
5534      28      0
4267      85      1
2498      51      2
4259      29      1
147      159      4
141       33      1
4517     161     10
3053      49      2
5392      59      0
2346      41      2
1242      59      2
3224      33      0
4872      35      2
3044      42      1
1660      28      2
3214      14      1
501      149      6
1827     332     11
...      ...    ...
1673     157      4
1433      38      1
616      145      2
3416      29      0
4035      54      3
1646      31      1
1395      36      1
630      148      2
955       41      0
194      111      2
4392      85      1
909       31      0
5540     158      5
1006      37      0
5080      93      3
4548     124      6
5345      16      1
4545      35      1


In [17]:
# Expected labels for the test rows, indexed the same way as X_test.
y_test

1078     ham
4028     ham
958      ham
4642     ham
4674     ham
5461     ham
4210     ham
4216     ham
1603     ham
1504     ham
1783     ham
3465     ham
5534     ham
4267     ham
2498     ham
4259     ham
147     spam
141      ham
4517    spam
3053     ham
5392     ham
2346     ham
1242     ham
3224     ham
4872     ham
3044     ham
1660     ham
3214     ham
501      ham
1827     ham
        ... 
1673    spam
1433     ham
616      ham
3416     ham
4035     ham
1646     ham
1395     ham
630     spam
955     spam
194      ham
4392     ham
909      ham
5540    spam
1006     ham
5080     ham
4548     ham
5345     ham
4545     ham
368     spam
3677     ham
4692     ham
3531     ham
3409    spam
4964     ham
2332     ham
3954    spam
619      ham
1987     ham
2358     ham
3594     ham
Name: label, Length: 1672, dtype: object

### Logistic regression

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
# NOTE: despite the `lin_reg_` prefix, this is a logistic regression
# classifier, not a linear regression.
lin_reg_model = LogisticRegression(solver='lbfgs')
# fit() trains the estimator on the training split and returns the estimator
# itself; as the last expression in the cell, its repr is what gets displayed.
# It does not have to share a cell with the constructor call, as long as the
# cell that creates the model has been run first.
lin_reg_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [20]:
from sklearn import metrics

# Predict a label for every row of the held-out test features
# (message length and punctuation count). The model was fitted on
# X_train only; the expected labels for these rows are in y_test,
# which the following cells compare against.
lin_reg_model_predictions = lin_reg_model.predict(X_test)

In [21]:
# Predicted label for each row of X_test, in the same row order
# (NumPy shortens long arrays with "..." when displaying them).
lin_reg_model_predictions

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [22]:
# Compare the model's predictions with the expected labels.
# In the printed confusion matrix each row is a true class and each column
# a predicted class; with these labels the order is ham, then spam.
print(metrics.confusion_matrix(y_test,lin_reg_model_predictions))

[[1391   51]
 [ 220   10]]


In [23]:
# Label the confusion matrix so it is less confusing to read:
# rows are the true classes, columns are the predicted classes.
# Passing `labels` fixes the row/column order explicitly, so the names
# attached below cannot drift from the order scikit-learn would infer.
class_names = ['ham', 'spam']
dataframe_labels = pd.DataFrame(
    metrics.confusion_matrix(y_test, lin_reg_model_predictions, labels=class_names),
    index=['correct ' + name for name in class_names],
    columns=['predicted ' + name for name in class_names])
dataframe_labels

Unnamed: 0,predicted ham,predicted spam
correct ham,1391,51
correct spam,220,10


In [24]:
# Print a classification report: precision, recall, F1-score and
# support (number of true samples) for each class, plus averages.
print(metrics.classification_report(y_test,lin_reg_model_predictions))

              precision    recall  f1-score   support

         ham       0.86      0.96      0.91      1442
        spam       0.16      0.04      0.07       230

   micro avg       0.84      0.84      0.84      1672
   macro avg       0.51      0.50      0.49      1672
weighted avg       0.77      0.84      0.80      1672



### Naive-Bayes classifier

In [25]:
# First import the model we want to use
from sklearn.naive_bayes import MultinomialNB

# Create a multinomial Naive Bayes classifier with default settings.
# NOTE(review): this model is usually applied to word-count features; here it
# only sees the two numeric columns (length, punct) - confirm that is intended.
nb_model = MultinomialNB()

# Fit model to training data
nb_model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
# Predict labels for the rows of the X_test dataset
# (message length and punctuation count)
nb_model_predictions = nb_model.predict(X_test)

# Show results in a confusion matrix (rows: true class, columns: predicted).
# In the saved output the second column is all zeros, i.e. no test message
# was predicted as spam.
print(metrics.confusion_matrix(y_test,nb_model_predictions))

[[1442    0]
 [ 230    0]]


In [28]:
# Label the confusion matrix so it is less confusing to read:
# rows are the true classes, columns are the predicted classes.
# Passing `labels` fixes the row/column order explicitly, so the names
# attached below cannot drift from the order scikit-learn would infer.
class_names = ['ham', 'spam']
dataframe_labels = pd.DataFrame(
    metrics.confusion_matrix(y_test, nb_model_predictions, labels=class_names),
    index=['correct ' + name for name in class_names],
    columns=['predicted ' + name for name in class_names])
dataframe_labels

Unnamed: 0,predicted ham,predicted spam
correct ham,1442,0
correct spam,230,0


In [27]:
# Classification report for the Naive Bayes predictions.
# The spam scores in the saved output are 0.00 because the model predicted
# no spam at all; the stray text after the table looks like the tail of a
# scikit-learn warning about that - TODO confirm on re-run.
print(metrics.classification_report(y_test,nb_model_predictions))

              precision    recall  f1-score   support

         ham       0.86      1.00      0.93      1442
        spam       0.00      0.00      0.00       230

   micro avg       0.86      0.86      0.86      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.74      0.86      0.80      1672



  'precision', 'predicted', average, warn_for)


In [30]:
# Print the overall accuracy (fraction of test labels predicted correctly).
# NOTE: the saved value equals the share of ham in the test set (1442 / 1672),
# which is what predicting "ham" for every message scores - read it
# together with the per-class report, not on its own.
print(metrics.accuracy_score(y_test,nb_model_predictions))

0.8624401913875598


### Random forest

In [46]:
# First import the model we want to use
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier (an ensemble of decision trees).
# random_state is fixed so the fitted forest is the same on every run.
rf_model = RandomForestClassifier(random_state=1)

# Fit model to training data
rf_model.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [47]:
# Predict labels for the rows of the X_test dataset
# (message length and punctuation count)
rf_model_predictions = rf_model.predict(X_test)

# Show results in a confusion matrix (rows: true class, columns: predicted)
print(metrics.confusion_matrix(y_test,rf_model_predictions))

[[1359   83]
 [ 126  104]]


In [34]:
# Label the confusion matrix so it is less confusing to read:
# rows are the true classes, columns are the predicted classes.
# Passing `labels` fixes the row/column order explicitly, so the names
# attached below cannot drift from the order scikit-learn would infer.
# NOTE(review): the saved output of this cell (execution count 34) predates
# the model fitted in cell 46 and differs from the matrix printed by
# cell 47 - re-run this cell to refresh it.
class_names = ['ham', 'spam']
dataframe_labels = pd.DataFrame(
    metrics.confusion_matrix(y_test, rf_model_predictions, labels=class_names),
    index=['correct ' + name for name in class_names],
    columns=['predicted ' + name for name in class_names])
dataframe_labels

Unnamed: 0,predicted ham,predicted spam
correct ham,1364,78
correct spam,111,119


In [48]:
# Per-class precision, recall and F1-score for the random forest predictions
print(metrics.classification_report(y_test,rf_model_predictions))

              precision    recall  f1-score   support

         ham       0.92      0.94      0.93      1442
        spam       0.56      0.45      0.50       230

   micro avg       0.88      0.88      0.88      1672
   macro avg       0.74      0.70      0.71      1672
weighted avg       0.87      0.88      0.87      1672



In [49]:
# Overall accuracy of the random forest on the test set
print(metrics.accuracy_score(y_test,rf_model_predictions))

0.875


### SVM