In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the tab-separated file `smsspamcollection.tsv` into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv('smsspamcollection.tsv', sep='\t')

In [3]:
# Preview the first five rows to check which columns were loaded.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
# Count missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [5]:
# Number of rows in the dataset.
df.shape[0]

5572

In [6]:
# List the distinct values of the target column.
df['label'].unique()

array(['ham', 'spam'], dtype=object)

In [7]:
# Rows per class. The recorded run shows far more 'ham' than 'spam', so plain
# accuracy will look good even for a model that rarely predicts 'spam'.
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Features: the two numeric columns `length` and `punct`.
feature_columns = ['length', 'punct']
X = df[feature_columns]
# Target: the `label` column.
y = df['label']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
# Logistic-regression classifier; the solver is named explicitly.
lr_model = LogisticRegression(solver='lbfgs')

In [12]:
# Fit on the training split only; the test split is kept for evaluation.
lr_model.fit(X_train, y_train)

In [13]:
from sklearn import metrics

In [14]:
# Predict a label for every row of the held-out test split.
predictions = lr_model.predict(X_test)

In [15]:
# Echo the predicted labels (NumPy abbreviates long arrays with `...`).
predictions

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [16]:
# Confusion matrix for the logistic-regression predictions. In scikit-learn's
# convention rows are the true labels and columns are the predicted labels.
print(metrics.confusion_matrix(y_test, predictions))

[[1404   44]
 [ 219    5]]


In [17]:
df = pd.DataFrame(metrics.confusion_matrix(y_test, predictions), index=
                  ['ham','spam'], columns=['ham','spam'])

In [18]:
df.head()

Unnamed: 0,ham,spam
ham,1404,44
spam,219,5


In [27]:
from sklearn.naive_bayes import MultinomialNB
# Sanity check that the imported name resolved to something callable.
callable(MultinomialNB)

True

In [43]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes trained on the same training split as the other
# models; fit arguments are passed positionally, as in the other fit calls.
nb_model = MultinomialNB(force_alpha=True)
nb_model.fit(X_train, y_train)

In [44]:
# Predict on the test split with the naive Bayes model.
# NOTE: this rebinds `predictions`, which until now held the
# logistic-regression output; re-running the earlier evaluation cells after
# this one would score the naive Bayes predictions instead.
predictions = nb_model.predict(X_test)

In [45]:
# Score the naive Bayes predictions against the held-out labels: first the
# raw confusion matrix, then per-class precision / recall / F1.
nb_confusion = metrics.confusion_matrix(y_test, predictions)
nb_report = metrics.classification_report(y_test, predictions)
print(nb_confusion)
print(nb_report)

[[1438   10]
 [ 224    0]]
              precision    recall  f1-score   support

         ham       0.87      0.99      0.92      1448
        spam       0.00      0.00      0.00       224

    accuracy                           0.86      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.86      0.80      1672



In [46]:
from sklearn.svm import SVC

In [47]:
# Support vector classifier trained on the same split. `fit` returns the
# estimator itself, so construction and training can be chained.
svc_model = SVC(gamma='auto').fit(X_train, y_train)
predictions = svc_model.predict(X_test)

# Print the confusion matrix followed by the per-class classification report.
for summarize in (metrics.confusion_matrix, metrics.classification_report):
    print(summarize(y_test, predictions))

[[1373   75]
 [ 121  103]]
              precision    recall  f1-score   support

         ham       0.92      0.95      0.93      1448
        spam       0.58      0.46      0.51       224

    accuracy                           0.88      1672
   macro avg       0.75      0.70      0.72      1672
weighted avg       0.87      0.88      0.88      1672

