In [1]:
from pprint import pprint
import pandas as pd
import nltk
import re

def clean(text: str) -> list:
    'A simple function to cleanup text data'
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = set(nltk.corpus.stopwords.words('english'))
    text = (text.encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split() # tokenization
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load data and clean it"
df = pd.read_csv('spam.csv')
df['clean_text'] = df.text.apply(clean).apply(' '.join)

In [3]:
# check the data
df

Unnamed: 0,id,label,text,clean_text
0,0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though
...,...,...,...,...
5567,5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u u 750 pound prize 2...
5568,5568,ham,Will Ì_ b going to esplanade fr home?,_ b going esplanade fr home
5569,5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood soany suggestion
5570,5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like id interested buying s...


In [4]:
# Split data into X and y (features and target)
X = df.clean_text
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=12)

In [5]:
# transform X into a Series of vectors
X_train = pd.Series(X_train)
pprint(X_train)

1562               dude saw parked car sunroof popped sux
3362                                              im free
3686                       great shoot big load get ready
2457                                     kkhow sister kid
353     yo guy ever figure much need alcohol jay tryin...
                              ...                        
3916    eh ur laptop got stock lei say mon muz come ta...
1283                                   yes thought thanks
3714    lateso call tomorrow morningtake care sweet dr...
3325         huh fast dat mean u havent finished painting
1414                    wats ur opinion abt abt character
Name: clean_text, Length: 4457, dtype: object


Term Frequency

In [6]:
# From the Series we can extract the value_counts, which is our raw count
# for term frequency. Once we have the raw counts, we can calculate the
# other measures.
(pd.DataFrame({'raw_count': X_train.value_counts()})
 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))

Unnamed: 0,raw_count,frequency,augmented_frequency
sorry ill call later,22,0.004936,1.000000
ok,15,0.003365,0.681818
cant pick phone right pls send message,10,0.002244,0.454545
okie,5,0.001122,0.227273
please call customer service representative freephone 0808 145 4742 9am11pm guaranteed 1000 cash 5000 prize,4,0.000897,0.181818
...,...,...,...
think rule tamilnaduthen tough people,1,0.000224,0.045455
babe go day sip cappuccino think love send kiss across sea,1,0.000224,0.045455
pete please ring meive hardly gotany credit,1,0.000224,0.045455
there ring come guy costume gift future yowifes hint hint,1,0.000224,0.045455


Bag of Words

In [7]:
# convert X_train into a list of lists
X_train = X_train.tolist()

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
# create a CountVectorizer object
cv = CountVectorizer()
bag_of_words = cv.fit_transform(X_train)

In [11]:
bag_of_words.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [14]:
# Look at the feature names
cv.get_feature_names()

['008704050406',
 '0089my',
 '0121',
 '01223585334',
 '0125698789',
 '02',
 '020603',
 '0207',
 '02070836089',
 '02072069400',
 '02073162414',
 '02085076972',
 '020903',
 '021',
 '050703',
 '060505',
 '061104',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '071104',
 '07123456789',
 '07734396839',
 '07742676969',
 '07753741225',
 '07786200117',
 '077xxx',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '0784987',
 '0789xxxxxxx',
 '0794674629107880867867',
 '07973788240',
 '07xxxxxxxxx',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081560665',
 '0825',
 '0844',
 '08448350055',
 '0845',
 '08452810073',
 '08452810075over18s',
 '0870',
 '08700621170150p',
 '08701237397',
 '08701417012',
 '08701417012150p',
 '0870141701216',
 '087016248',
 '08701752560',
 '087018728737',
 '0870241182716',
 '0870

In [16]:
# Look at the vocabulary and the counts
cv.vocabulary_

{'dude': 2475,
 'saw': 5877,
 'parked': 5087,
 'car': 1663,
 'sunroof': 6529,
 'popped': 5298,
 'sux': 6560,
 'im': 3598,
 'free': 2953,
 'great': 3208,
 'shoot': 6048,
 'big': 1360,
 'load': 4154,
 'get': 3085,
 'ready': 5570,
 'kkhow': 3938,
 'sister': 6122,
 'kid': 3912,
 'yo': 7656,
 'guy': 3261,
 'ever': 2652,
 'figure': 2813,
 'much': 4608,
 'need': 4702,
 'alcohol': 942,
 'jay': 3779,
 'trying': 6966,
 'safely': 5828,
 'spend': 6313,
 'weed': 7338,
 'messageit': 4441,
 'da': 2139,
 'aight': 923,
 'tomorrow': 6867,
 'around': 1094,
 'ltgt': 4240,
 'time': 6807,
 'week': 7339,
 'ryan': 5814,
 'sitting': 6128,
 'mu': 4607,
 'waiting': 7259,
 'everyone': 2659,
 'suite': 6515,
 'take': 6607,
 'shower': 6068,
 'havent': 3331,
 'eaten': 2511,
 'day': 2182,
 'staring': 6381,
 'juicy': 3840,
 'pizza': 5213,
 'cant': 1654,
 'eat': 2510,
 'med': 4395,
 'ruining': 5802,
 'life': 4092,
 'miss': 4500,
 'vday': 7162,
 'parachute': 5073,
 'double': 2422,
 'coin': 1893,
 'must': 4634,
 'know': 3

In [18]:
pprint(X_train)
# Taking a look at the bag of words transformation for education and diagnostics.
# In practice this is not necesssary and the resulting data might be to big to be reasonably helpful.
bow = pd.DataFrame(bag_of_words.todense(), columns=cv.get_feature_names())

['dude saw parked car sunroof popped sux',
 'im free',
 'great shoot big load get ready',
 'kkhow sister kid',
 'yo guy ever figure much need alcohol jay trying figure much safely spend '
 'weed',
 'saw messageit k da',
 'aight tomorrow around ltgt',
 'time week ryan',
 'sitting mu waiting everyone get suite take shower',
 'havent eaten day im sitting staring juicy pizza cant eat med ruining life',
 'miss vday parachute double coin u must know well',
 'pls help tell sura im expecting battery hont pls send message download movie '
 'thanks',
 'please call customer service representative 0800 169 6031 10am9pm guaranteed '
 '1000 cash 5000 prize',
 'bishan lei tot _ say lavender',
 '7 wonder world 7th 6th ur style 5th ur smile 4th ur personality 3rd ur '
 'nature 2nd ur sm 1st ur lovely friendship good morning dear',
 'rock yr chik get 100 filthy film xxx pic yr phone rply filth 69669 saristar '
 'ltd e14 9yt 08701752560 450p per 5 day stop2 cancel',
 'pls confirm time collect cheque',
 '

In [21]:
# bow

In [20]:
bow.apply(lambda row: row / row.sum(), axis=1)

Unnamed: 0,008704050406,0089my,0121,01223585334,0125698789,02,020603,0207,02070836089,02072069400,...,zealand,zebra,zed,zero,zhong,zindgi,zoe,zogtorius,zouk,zyada
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### TF-IDF

- term frequency - inverse document frequency
- $\text{tf} \times \text{idf} = \frac{\text{tf}}{\text{df}}$
- a measure that helps identify how important a word is in a document
- combination of how often a word appears in a document (**tf**) and how unqiue the word
  is among documents (**idf**)
- used by search engines
- naturally helps filter out stopwords
- tf is for a single document, idf is for a corpus

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
bag_of_words = tfidf.fit_transform(X_train)

pprint(X_train)
pd.DataFrame(bag_of_words.todense(), columns=tfidf.get_feature_names())

['dude saw parked car sunroof popped sux',
 'im free',
 'great shoot big load get ready',
 'kkhow sister kid',
 'yo guy ever figure much need alcohol jay trying figure much safely spend '
 'weed',
 'saw messageit k da',
 'aight tomorrow around ltgt',
 'time week ryan',
 'sitting mu waiting everyone get suite take shower',
 'havent eaten day im sitting staring juicy pizza cant eat med ruining life',
 'miss vday parachute double coin u must know well',
 'pls help tell sura im expecting battery hont pls send message download movie '
 'thanks',
 'please call customer service representative 0800 169 6031 10am9pm guaranteed '
 '1000 cash 5000 prize',
 'bishan lei tot _ say lavender',
 '7 wonder world 7th 6th ur style 5th ur smile 4th ur personality 3rd ur '
 'nature 2nd ur sm 1st ur lovely friendship good morning dear',
 'rock yr chik get 100 filthy film xxx pic yr phone rply filth 69669 saristar '
 'ltd e14 9yt 08701752560 450p per 5 day stop2 cancel',
 'pls confirm time collect cheque',
 '

Unnamed: 0,008704050406,0089my,0121,01223585334,0125698789,02,020603,0207,02070836089,02072069400,...,zealand,zebra,zed,zero,zhong,zindgi,zoe,zogtorius,zouk,zyada
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


To get the idf score for each word (these aren't terribly usefule themselves):

In [24]:
pd.Series(dict(zip(tfidf.get_feature_names(), tfidf.idf_))).sort_values()

call        3.288773
im          3.535987
get         3.742973
ur          3.873026
ok          4.004293
              ...   
hangin      8.709308
hanger      8.709308
hang        8.709308
happiest    8.709308
zyada       8.709308
Length: 7709, dtype: float64

### Bag Of Ngrams

For either `CountVectorizer` or `TfidfVectorizer`, you can set the `ngram_range`
parameter.

In [25]:
cv = CountVectorizer(ngram_range=(2, 2))
bag_of_words = cv.fit_transform(X_train)

pprint(X_train)
pd.DataFrame(bag_of_words.todense(), columns=cv.get_feature_names())

['dude saw parked car sunroof popped sux',
 'im free',
 'great shoot big load get ready',
 'kkhow sister kid',
 'yo guy ever figure much need alcohol jay trying figure much safely spend '
 'weed',
 'saw messageit k da',
 'aight tomorrow around ltgt',
 'time week ryan',
 'sitting mu waiting everyone get suite take shower',
 'havent eaten day im sitting staring juicy pizza cant eat med ruining life',
 'miss vday parachute double coin u must know well',
 'pls help tell sura im expecting battery hont pls send message download movie '
 'thanks',
 'please call customer service representative 0800 169 6031 10am9pm guaranteed '
 '1000 cash 5000 prize',
 'bishan lei tot _ say lavender',
 '7 wonder world 7th 6th ur style 5th ur smile 4th ur personality 3rd ur '
 'nature 2nd ur sm 1st ur lovely friendship good morning dear',
 'rock yr chik get 100 filthy film xxx pic yr phone rply filth 69669 saristar '
 'ltd e14 9yt 08701752560 450p per 5 day stop2 cancel',
 'pls confirm time collect cheque',
 '

Unnamed: 0,008704050406 sp,0089my last,0121 2025050,01223585334 cum,0125698789 ring,02 user,020603 2nd,0207 153,02072069400 bx,02073162414 cost,...,zed 08701417012150p,zed pobox,zero saving,zhong se,zindgi wo,zoe 18,zoe hit,zogtorius ive,zouk nichols,zyada kisi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


MODELING

Changed the hyperparameters of the model to see how it performs. max_depth=5 to max_depth=10.

In [36]:
cv = CountVectorizer()
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=10)
tree.fit(X_bow, y_train)

tree.score(X_bow, y_train)

0.9652232443347544

In [37]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_train)
tree.fit(X_tfidf, y_train)
tree.score(X_tfidf, y_train)

0.9793583127664348

Max_depth=5: cv_score: 0.931, tfidf_score: 0.953

Max_depth=10: cv_score: 0.965, tfidf_score: 0.979

I will try a random_forest model.

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)


In [52]:
# fit the model to the data
rf.fit(X_bow, y_train)
rf.fit(X_tfidf, y_train)

RandomForestClassifier(max_depth=3, min_samples_leaf=3, random_state=123)

Feature Importance

Evaluate importance, or weight, of each feature.

In [53]:
pprint(rf.feature_importances_)

array([0., 0., 0., ..., 0., 0., 0.])



Make Predictions for both CountVectorizer and TfidfVectorizer

In [46]:
y_pred_cv = rf.predict(X_bow)

In [47]:
y_pred_tf = rf.predict(X_tfidf)

Estimate Probability

In [50]:
y_pred_proba_cv = rf.predict_proba(X_bow)
y_pred_proba_tf = rf.predict_proba(X_tfidf)

Evaluate Model

Compute the Accuracy

In [54]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_bow, y_train)))

Accuracy of random forest classifier on training set: 0.87


In [55]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_tfidf, y_train)))

Accuracy of random forest classifier on training set: 0.87


Create a confusion matrix

In [56]:
print(confusion_matrix(y_train, y_pred_cv))

[[3860    0]
 [ 597    0]]


In [57]:
print(confusion_matrix(y_train, y_pred_tf))

[[3860    0]
 [ 597    0]]


Classification Report

In [58]:
print(classification_report(y_train, y_pred_cv))

              precision    recall  f1-score   support

         ham       0.87      1.00      0.93      3860
        spam       0.00      0.00      0.00       597

    accuracy                           0.87      4457
   macro avg       0.43      0.50      0.46      4457
weighted avg       0.75      0.87      0.80      4457



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [59]:
print(classification_report(y_train, y_pred_tf))

              precision    recall  f1-score   support

         ham       0.87      1.00      0.93      3860
        spam       0.00      0.00      0.00       597

    accuracy                           0.87      4457
   macro avg       0.43      0.50      0.46      4457
weighted avg       0.75      0.87      0.80      4457



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validate Model

Evaluate on Out-of-Sample data

Compute the accuracy of the model when run on the validate dataset.

In [61]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))


ValueError: could not convert string to float: 'wat make people dearer de happiness dat u feel u meet de pain u feel u miss dem'