In [68]:
import pandas as pd, numpy as np
import os, re

# Reading in the csv using pandas

In [69]:
mail_data = pd.read_csv('spam.csv',encoding="ISO-8859-1")
mail_data.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [70]:
mail_data.v1.value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: v1, dtype: float64

In [71]:
mail_data.v2.sample().values[0]

'Carry on not disturbing both of you'

# Get the mails into a list for easy test cleaning and manipulation

In [72]:
mail = mail_data.v2.values

In [73]:
len(mail)

5572

In [74]:
mail[:5]

array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       'U dun say so early hor... U c already then say...',
       "Nah I don't think he goes to usf, he lives around here though"],
      dtype=object)

# Normalizing case

In [75]:
mail_lower = [mails.lower() for mails in mail]

In [76]:
mail_lower[:5]

['go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...',
 'ok lar... joking wif u oni...',
 "free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005. text fa to 87121 to receive entry question(std txt rate)t&c's apply 08452810075over18's",
 'u dun say so early hor... u c already then say...',
 "nah i don't think he goes to usf, he lives around here though"]

# Remove @

In [77]:
import re

In [78]:
mail_nouser = [re.sub("@\w+","", mails) for mails in mail_lower]

In [79]:
mail_nouser[:5]

['go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...',
 'ok lar... joking wif u oni...',
 "free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005. text fa to 87121 to receive entry question(std txt rate)t&c's apply 08452810075over18's",
 'u dun say so early hor... u c already then say...',
 "nah i don't think he goes to usf, he lives around here though"]

# Token using Tweet Tokenizer form NLTK

In [80]:
from nltk.tokenize import TweetTokenizer

In [81]:
mails = TweetTokenizer()

In [82]:
print(mails.tokenize(mail_nouser[0]))

['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amore', 'wat', '...']


In [83]:
mails_token = [mails.tokenize(sent) for sent in mail_nouser]
print(mails_token[0])

['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amore', 'wat', '...']


# Remove punctuations and stop words

In [84]:
from nltk.corpus import stopwords
from string import punctuation

In [85]:
stop_nltk = stopwords.words("english")
stop_punct = list(punctuation)

In [86]:
stop_punct.extend(['...','``',"''","..","?","/","#"])

In [87]:
stop_final = stop_nltk + stop_punct 

In [88]:
def del_stop(sent):
    return [re.sub("#","",term) for term in sent if ((term not in stop_final) & (len(term)>1))]

In [89]:
del_stop(mails_token[4])

['nah', 'think', 'goes', 'usf', 'lives', 'around', 'though']

In [90]:
mail_clean = [del_stop(mail) for mail in mails_token]

# Checkout the top terms in the mails

In [91]:
from collections import Counter

In [92]:
term_list = []
for mail in mail_clean:
    term_list.extend(mail)

In [93]:
res = Counter(term_list)
res.most_common(10)

[('call', 592),
 ("i'm", 394),
 ('get', 388),
 ('ur', 385),
 ('ok', 284),
 ('free', 280),
 ('go', 279),
 ('<>', 276),
 ('know', 261),
 (':)', 251)]

# Join the tokens back into strings

In [94]:
mail_clean[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [95]:
mail_clean = [" ".join(mail) for mail in mail_clean]

In [96]:
mail_clean[0]

'go jurong point crazy available bugis great world la buffet cine got amore wat'

# Seperate x and y and perform train test split 

In [97]:
len(mail_clean)

5572

In [98]:
len(mail_data.v1)

5572

In [99]:
X = mail_clean
y = mail_data.v1.values

In [100]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)

# Create a document term metric using count vectorizer

In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [102]:
vectorizer = TfidfVectorizer(max_features = 5566)

In [103]:
len(X_train), len(X_test)

(3900, 1672)

In [104]:
X_train_bow = vectorizer.fit_transform(X_train)

X_test_bow = vectorizer.transform(X_test)

In [105]:
X_train_bow.shape, X_test_bow.shape

((3900, 5566), (1672, 5566))

# Using a simple logistic regression

In [106]:
from sklearn.linear_model import LogisticRegression

In [107]:
logreg = LogisticRegression()

In [108]:
logreg.fit(X_train_bow, y_train)

LogisticRegression()

In [109]:
y_train_pred = logreg.predict(X_train_bow)
y_test_pred = logreg.predict(X_test_bow)

In [110]:
from sklearn.metrics import accuracy_score, classification_report

In [111]:
accuracy_score(y_train, y_train_pred)

0.9705128205128205

In [112]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      3372
        spam       0.99      0.79      0.88       528

    accuracy                           0.97      3900
   macro avg       0.98      0.90      0.93      3900
weighted avg       0.97      0.97      0.97      3900



# Adjusting for class imbalance

# Logistic Regression

In [113]:
logreg = LogisticRegression(class_weight="balanced")

In [114]:
logreg.fit(X_train_bow, y_train)

LogisticRegression(class_weight='balanced')

In [115]:
y_train_pred = logreg.predict(X_train_bow)
y_test_pred = logreg.predict(X_test_bow)

In [116]:
accuracy_score(y_train, y_train_pred)

0.9884615384615385

In [117]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

         ham       1.00      0.99      0.99      3372
        spam       0.93      0.99      0.96       528

    accuracy                           0.99      3900
   macro avg       0.96      0.99      0.98      3900
weighted avg       0.99      0.99      0.99      3900



# Naive Bayes

In [118]:
from sklearn.naive_bayes import MultinomialNB

In [119]:
Multinom = MultinomialNB()

In [120]:
Multinom.fit(X_train_bow, y_train)

MultinomialNB()

In [121]:
y_train_pred = Multinom.predict(X_train_bow)
y_test_pred = Multinom.predict(X_test_bow)

In [122]:
accuracy_score(y_train, y_train_pred)

0.9853846153846154

In [123]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      3372
        spam       1.00      0.89      0.94       528

    accuracy                           0.99      3900
   macro avg       0.99      0.95      0.97      3900
weighted avg       0.99      0.99      0.99      3900



# Random Forest

In [124]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [125]:
rf.fit(X_train_bow, y_train)

RandomForestClassifier()

In [126]:
y_train_pred = rf.predict(X_train_bow)
y_test_pred = rf.predict(X_test_bow)

In [127]:
accuracy_score(y_train, y_train_pred)

1.0

In [128]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3372
        spam       1.00      1.00      1.00       528

    accuracy                           1.00      3900
   macro avg       1.00      1.00      1.00      3900
weighted avg       1.00      1.00      1.00      3900



# KNN

In [129]:
from sklearn.neighbors import NearestCentroid

In [130]:
knn = NearestCentroid()

In [131]:
knn.fit(X_train_bow, y_train)

NearestCentroid()

In [132]:
y_train_pred = Multinom.predict(X_train_bow)
y_test_pred = Multinom.predict(X_test_bow)

In [133]:
accuracy_score(y_train, y_train_pred)

0.9853846153846154

In [134]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      3372
        spam       1.00      0.89      0.94       528

    accuracy                           0.99      3900
   macro avg       0.99      0.95      0.97      3900
weighted avg       0.99      0.99      0.99      3900

