### Model Exercises

Do your work for this exercise in a file named `model`.

Take the work we did in the lessons further:

- What other types of models (i.e. different classifcation algorithms) could you use?
- How do the models compare when trained on term frequency data alone, instead of TF-IDF values?

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import prepare

from prepare import basic_clean, lemmatize
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from env import user, password, host

In [66]:
def get_db_url(database, host=host, user=user, password=password):
    return f'mysql+pymysql://{user}:{password}@{host}/{database}'

url = get_db_url("spam_db")
sql = "SELECT * FROM spam"

df = pd.read_sql(sql, url, index_col="id")
df.head()

Unnamed: 0_level_0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Logistic Regression - TFIDF

In [5]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)

In [8]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 97.37%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3857   115
spam          2   483
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      3859
        spam       1.00      0.81      0.89       598

    accuracy                           0.97      4457
   macro avg       0.98      0.90      0.94      4457
weighted avg       0.97      0.97      0.97      4457



In [9]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 96.86%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        965    34
spam         1   115
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       0.99      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



#### Logistic Regression - TF

In [71]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X_bag_of_words = cv.fit_transform(df.text)
y = df.label

In [73]:
X_bag_of_words.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [76]:
cv.get_feature_names()

['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '084

In [74]:
cv.vocabulary_

{'go': 3550,
 'until': 8030,
 'jurong': 4350,
 'point': 5920,
 'crazy': 2327,
 'available': 1303,
 'only': 5537,
 'in': 4087,
 'bugis': 1751,
 'great': 3634,
 'world': 8489,
 'la': 4476,
 'buffet': 1749,
 'cine': 2048,
 'there': 7645,
 'got': 3594,
 'amore': 1069,
 'wat': 8267,
 'ok': 5504,
 'lar': 4512,
 'joking': 4318,
 'wif': 8392,
 'oni': 5533,
 'free': 3358,
 'entry': 2949,
 'wkly': 8447,
 'comp': 2165,
 'to': 7756,
 'win': 8405,
 'fa': 3087,
 'cup': 2386,
 'final': 3207,
 'tkts': 7743,
 '21st': 411,
 'may': 4930,
 '2005': 402,
 'text': 7595,
 '87121': 784,
 'receive': 6297,
 'question': 6190,
 'std': 7230,
 'txt': 7933,
 'rate': 6242,
 'apply': 1156,
 '08452810075over18': 77,
 'dun': 2802,
 'say': 6633,
 'so': 7024,
 'early': 2823,
 'hor': 3927,
 'already': 1042,
 'then': 7640,
 'nah': 5238,
 'don': 2712,
 'think': 7660,
 'he': 3781,
 'goes': 3558,
 'usf': 8075,
 'lives': 4665,
 'around': 1207,
 'here': 3831,
 'though': 7680,
 'freemsg': 3365,
 'hey': 3841,
 'darling': 2443,
 'it

In [75]:
bow = pd.DataFrame(X_bag_of_words.todense(), columns = cv.get_feature_names())
bow

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,ó_,û_,û_thanks,ûªm,ûªt,ûªve,ûï,ûïharry,ûò,ûówell
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
X_train, X_test, y_train, y_test = train_test_split(X_bag_of_words, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)

In [81]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 99.73%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3859    12
spam          0   586
---
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3859
        spam       1.00      0.98      0.99       598

    accuracy                           1.00      4457
   macro avg       1.00      0.99      0.99      4457
weighted avg       1.00      1.00      1.00      4457



In [82]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 98.39%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        964    16
spam         2   133
---
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.99      0.89      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



#### Decision Tree - TFIDF

In [41]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

dtc = DecisionTreeClassifier(max_depth = 2).fit(X_train, y_train)

train['predicted'] = dtc.predict(X_train)
test['predicted'] = dtc.predict(X_test)

In [42]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 93.25%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3829   271
spam         30   327
---
              precision    recall  f1-score   support

         ham       0.93      0.99      0.96      3859
        spam       0.92      0.55      0.68       598

    accuracy                           0.93      4457
   macro avg       0.92      0.77      0.82      4457
weighted avg       0.93      0.93      0.92      4457



In [43]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 93.27%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        960    69
spam         6    80
---
              precision    recall  f1-score   support

         ham       0.93      0.99      0.96       966
        spam       0.93      0.54      0.68       149

    accuracy                           0.93      1115
   macro avg       0.93      0.77      0.82      1115
weighted avg       0.93      0.93      0.92      1115



In [50]:
for i in range(3,11):
    dtc = DecisionTreeClassifier(max_depth = i).fit(X_train, y_train)

    train['predicted'] = dtc.predict(X_train)
    test['predicted'] = dtc.predict(X_test)
    
    print('Decision Tree Depth =', i)
    print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
    print('---')
    print('Train Data - Confusion Matrix')
    print(pd.crosstab(train.predicted, train.actual))
    print('---')
    print(classification_report(train.actual, train.predicted))
    
    print('Decision Tree Depth =', i)
    print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
    print('---')
    print('Test Data - Confusion Matrix')
    print(pd.crosstab(test.predicted, test.actual))
    print('---')
    print(classification_report(test.actual, test.predicted))
    print('------------------------------------------------------')

Decision Tree Depth = 3
Accuracy: 94.50%
---
Train Data - Confusion Matrix
actual      ham  spam
predicted            
ham        3847   233
spam         12   365
---
              precision    recall  f1-score   support

         ham       0.94      1.00      0.97      3859
        spam       0.97      0.61      0.75       598

    accuracy                           0.95      4457
   macro avg       0.96      0.80      0.86      4457
weighted avg       0.95      0.95      0.94      4457

Decision Tree Depth = 3
Accuracy: 93.90%
---
Test Data - Confusion Matrix
actual     ham  spam
predicted           
ham        964    66
spam         2    83
---
              precision    recall  f1-score   support

         ham       0.94      1.00      0.97       966
        spam       0.98      0.56      0.71       149

    accuracy                           0.94      1115
   macro avg       0.96      0.78      0.84      1115
weighted avg       0.94      0.94      0.93      1115

-----------------

#### Decision Tree - TF

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X_bag_of_words, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

dtc = DecisionTreeClassifier(max_depth = 2).fit(X_train, y_train)

train['predicted'] = dtc.predict(X_train)
test['predicted'] = dtc.predict(X_test)

In [78]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 99.80%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3859     9
spam          0   589
---
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3859
        spam       1.00      0.98      0.99       598

    accuracy                           1.00      4457
   macro avg       1.00      0.99      1.00      4457
weighted avg       1.00      1.00      1.00      4457



In [79]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 97.76%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        965    24
spam         1   125
---
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.99      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [83]:
for i in range(3,11):
    dtc = DecisionTreeClassifier(max_depth = i).fit(X_train, y_train)

    train['predicted'] = dtc.predict(X_train)
    test['predicted'] = dtc.predict(X_test)
    
    print('Decision Tree Depth =', i)
    print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
    print('---')
    print('Train Data - Confusion Matrix')
    print(pd.crosstab(train.predicted, train.actual))
    print('---')
    print(classification_report(train.actual, train.predicted))
    
    print('Decision Tree Depth =', i)
    print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
    print('---')
    print('Test Data - Confusion Matrix')
    print(pd.crosstab(test.predicted, test.actual))
    print('---')
    print(classification_report(test.actual, test.predicted))
    print('------------------------------------------------------')

Decision Tree Depth = 3
Accuracy: 94.23%
---
Train Data - Confusion Matrix
actual      ham  spam
predicted            
ham        3790   188
spam         69   410
---
              precision    recall  f1-score   support

         ham       0.95      0.98      0.97      3859
        spam       0.86      0.69      0.76       598

    accuracy                           0.94      4457
   macro avg       0.90      0.83      0.86      4457
weighted avg       0.94      0.94      0.94      4457

Decision Tree Depth = 3
Accuracy: 93.99%
---
Test Data - Confusion Matrix
actual     ham  spam
predicted           
ham        947    48
spam        19   101
---
              precision    recall  f1-score   support

         ham       0.95      0.98      0.97       966
        spam       0.84      0.68      0.75       149

    accuracy                           0.94      1115
   macro avg       0.90      0.83      0.86      1115
weighted avg       0.94      0.94      0.94      1115

-----------------

#### Random Forest - TFIDF

In [53]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123).fit(X_train, y_train)

train['predicted'] = rf.predict(X_train)
test['predicted'] = rf.predict(X_test)

In [54]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 86.58%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3859   598
---
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93      3859
        spam       0.00      0.00      0.00       598

    accuracy                           0.87      4457
   macro avg       0.43      0.50      0.46      4457
weighted avg       0.75      0.87      0.80      4457



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 86.64%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        966   149
---
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93       966
        spam       0.00      0.00      0.00       149

    accuracy                           0.87      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.75      0.87      0.80      1115



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Random Forest - TF

In [84]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X_bag_of_words, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123).fit(X_train, y_train)

train['predicted'] = rf.predict(X_train)
test['predicted'] = rf.predict(X_test)

In [85]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 86.58%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3859   598
---
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93      3859
        spam       0.00      0.00      0.00       598

    accuracy                           0.87      4457
   macro avg       0.43      0.50      0.46      4457
weighted avg       0.75      0.87      0.80      4457



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [86]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 86.64%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        966   149
---
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93       966
        spam       0.00      0.00      0.00       149

    accuracy                           0.87      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.75      0.87      0.80      1115



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### KNN - TFIDF

In [57]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

knn = KNeighborsClassifier(n_neighbors = 2).fit(X_train, y_train)

train['predicted'] = dtc.predict(X_train)
test['predicted'] = dtc.predict(X_test)

In [58]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 97.44%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3850   105
spam          9   493
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      3859
        spam       0.98      0.82      0.90       598

    accuracy                           0.97      4457
   macro avg       0.98      0.91      0.94      4457
weighted avg       0.97      0.97      0.97      4457



In [59]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 97.58%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        960    21
spam         6   128
---
              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       966
        spam       0.96      0.86      0.90       149

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [60]:
for i in range(3,11):
    knn = KNeighborsClassifier(n_neighbors = i).fit(X_train, y_train)

    train['predicted'] = dtc.predict(X_train)
    test['predicted'] = dtc.predict(X_test)
    
    print('Value of K =', i)
    print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
    print('---')
    print('Train Data - Confusion Matrix')
    print(pd.crosstab(train.predicted, train.actual))
    print('---')
    print(classification_report(train.actual, train.predicted))
    
    print('Value of K =', i)
    print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
    print('---')
    print('Test Data - Confusion Matrix')
    print(pd.crosstab(test.predicted, test.actual))
    print('---')
    print(classification_report(test.actual, test.predicted))
    print('------------------------------------------------------')

Value of K = 3
Accuracy: 97.44%
---
Train Data - Confusion Matrix
actual      ham  spam
predicted            
ham        3850   105
spam          9   493
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      3859
        spam       0.98      0.82      0.90       598

    accuracy                           0.97      4457
   macro avg       0.98      0.91      0.94      4457
weighted avg       0.97      0.97      0.97      4457

Value of K = 3
Accuracy: 97.58%
---
Test Data - Confusion Matrix
actual     ham  spam
predicted           
ham        960    21
spam         6   128
---
              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       966
        spam       0.96      0.86      0.90       149

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

-----------------------------------

#### KNN - TF

In [87]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X_bag_of_words, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

knn = KNeighborsClassifier(n_neighbors = 2).fit(X_train, y_train)

train['predicted'] = dtc.predict(X_train)
test['predicted'] = dtc.predict(X_test)

In [88]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 97.31%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3817    78
spam         42   520
---
              precision    recall  f1-score   support

         ham       0.98      0.99      0.98      3859
        spam       0.93      0.87      0.90       598

    accuracy                           0.97      4457
   macro avg       0.95      0.93      0.94      4457
weighted avg       0.97      0.97      0.97      4457



In [89]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 97.31%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        961    25
spam         5   124
---
              precision    recall  f1-score   support

         ham       0.97      0.99      0.98       966
        spam       0.96      0.83      0.89       149

    accuracy                           0.97      1115
   macro avg       0.97      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [90]:
for i in range(3,11):
    knn = KNeighborsClassifier(n_neighbors = i).fit(X_train, y_train)

    train['predicted'] = dtc.predict(X_train)
    test['predicted'] = dtc.predict(X_test)
    
    print('Value of K =', i)
    print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
    print('---')
    print('Train Data - Confusion Matrix')
    print(pd.crosstab(train.predicted, train.actual))
    print('---')
    print(classification_report(train.actual, train.predicted))
    
    print('Value of K =', i)
    print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
    print('---')
    print('Test Data - Confusion Matrix')
    print(pd.crosstab(test.predicted, test.actual))
    print('---')
    print(classification_report(test.actual, test.predicted))
    print('------------------------------------------------------')

Value of K = 3
Accuracy: 97.31%
---
Train Data - Confusion Matrix
actual      ham  spam
predicted            
ham        3817    78
spam         42   520
---
              precision    recall  f1-score   support

         ham       0.98      0.99      0.98      3859
        spam       0.93      0.87      0.90       598

    accuracy                           0.97      4457
   macro avg       0.95      0.93      0.94      4457
weighted avg       0.97      0.97      0.97      4457

Value of K = 3
Accuracy: 97.31%
---
Test Data - Confusion Matrix
actual     ham  spam
predicted           
ham        961    25
spam         5   124
---
              precision    recall  f1-score   support

         ham       0.97      0.99      0.98       966
        spam       0.96      0.83      0.89       149

    accuracy                           0.97      1115
   macro avg       0.97      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115

-----------------------------------