In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score

In [2]:
# Load the SMS dataset, then build a copy in which every missing cell is
# replaced by an empty string so downstream cells see '' instead of NaN.
df1 = pd.read_csv("spamham.csv")
df = df1.where(df1.notnull(), '')

In [4]:
# Display the null-filled frame to check the loaded columns.
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [6]:
# Encode the labels numerically: ham (not spam) -> 1, spam -> 0.
# Rows whose Category is neither value are left untouched.
for label_name, label_code in (('ham', 1), ('spam', 0)):
    df.loc[df["Category"] == label_name, "Category"] = label_code

In [7]:
# Preview the first rows to confirm Category now holds the 0/1 codes.
df.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Separate the predictor (message text) from the target (numeric label);
# the classifiers are trained to predict the label from the text.
df_x, df_y = df['Message'], df['Category']

In [9]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the partition repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    train_size=0.8,
    test_size=0.2,
    random_state=4,
)

In [10]:
# Feature extraction with a TF-IDF vectorizer: text is converted to lower
# case and English stop words are removed. The vocabulary and IDF weights
# are fitted on the training messages only and then applied to the test
# messages.
tfvec = TfidfVectorizer(stop_words='english', lowercase=True, min_df=1)
x_trainFeat = tfvec.fit_transform(x_train)
x_testFeat = tfvec.transform(x_test)

In [13]:
# Model 1: linear support-vector classifier trained on the TF-IDF features.
# Cast the labels to int so scikit-learn sees a plain binary target.
y_trainSvm = y_train.astype('int')
# LinearSVC's dual coordinate-descent solver shuffles the data with a random
# number generator; seed it (same seed as the train/test split) so that a
# fresh Restart & Run All reproduces the same fitted model.
classifierModel = LinearSVC(random_state=4)
classifierModel.fit(x_trainFeat, y_trainSvm)
# Predicted labels (0 = spam, 1 = ham) for the held-out messages.
predResult = classifierModel.predict(x_testFeat)

In [14]:
# Show the SVM predictions for the test split.
predResult

array([1, 1, 1, ..., 1, 1, 0])

In [16]:
# Show the raw test messages that the predictions correspond to.
x_test

4004    somewhere out there beneath the pale moon ligh...
2276           Is that on the telly? No its Brdget Jones!
4498                                                   Ok
3755    Bloomberg -Message center +447797706009 Why wa...
111              What is the plural of the noun research?
3662                  Well then you have a great weekend!
4282    Wn u r hurt by d prsn who s close 2 u, do figh...
1991    HI DARLIN IVE JUST GOT BACK AND I HAD A REALLY...
2298                            Draw va?i dont think so:)
2438    For ur chance to win £250 cash every wk TXT: P...
4164                  I told that am coming on wednesday.
5336    Sounds better than my evening im just doing my...
4919    Sitting in mu waiting for everyone to get out ...
387                       Customer place i will call you.
3968    YOU HAVE WON! As a valued Vodafone customer ou...
4844                I need details about that online job.
260     I‘m parked next to a MINI!!!! When are you com...
3484    Hello,

In [17]:
# Model 2: Multinomial Naive Bayes (MNB) trained on the same TF-IDF features.
# The labels are cast to int, as for the SVM.
y_trainGnb = y_train.astype('int')
classifierModel2 = MultinomialNB().fit(x_trainFeat, y_trainGnb)
predResult2 = classifierModel2.predict(x_testFeat)

In [18]:
# Show the Multinomial Naive Bayes predictions for the test split.
predResult2

array([1, 1, 1, ..., 1, 1, 0])

In [19]:
# Prepare the ground-truth labels for the metric functions. Casting to int
# avoids the scikit-learn error "can't handle mix of unknown and binary"
# that is raised when the true labels are not a numeric dtype.
y_test = y_test.astype('int')
# Series.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0;
# .values returns the same NumPy array and works on every pandas version.
actual_Y = y_test.values

  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
# Show the ground-truth test labels as an array.
actual_Y

array([1, 1, 1, ..., 1, 1, 0])

In [23]:
def _report_metrics(model_name, y_true, y_pred):
    """Print accuracy, macro F-score and the confusion matrix for one model.

    Parameters
    ----------
    model_name : str
        Short label used in the printed headings, e.g. "SVM".
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model, aligned with ``y_true``.

    Returns
    -------
    The matrix returned by ``confusion_matrix(y_true, y_pred)``.
    """
    print("~~~~~~~~~~{0} RESULTS~~~~~~~~~~".format(model_name))
    # Both scores are fractions; multiply by 100 to report percentages.
    accuracy_pct = accuracy_score(y_true, y_pred) * 100
    print("Accuracy Score using {0}: {1:.4f}".format(model_name, accuracy_pct))
    # Macro averaging: unweighted mean of the per-class F-scores.
    f_score_pct = f1_score(y_true, y_pred, average='macro') * 100
    print("F Score using {0}: {1:.4f}".format(model_name, f_score_pct))
    matrix = confusion_matrix(y_true, y_pred)
    # Rows are true labels and columns are predicted labels, in sorted label
    # order; with this notebook's encoding the "positive" class is 1 (ham).
    print("[True negative  False Positive\nFalse Negative True Positive]")
    print("Confusion matrix using {0}:".format(model_name))
    print(matrix)
    return matrix


# Same report for both classifiers; the matrices are kept for later use.
cmSVM = _report_metrics("SVM", actual_Y, predResult)
cmMNb = _report_metrics("MNB", actual_Y, predResult2)

~~~~~~~~~~SVM RESULTS~~~~~~~~~~
Accuracy Score using SVM: 98.4753
F Score using SVM:  96.9068
[True negative  False Positive
False Negative True Positive]
Confusion matrix using SVM:
[[152  16]
 [  1 946]]
~~~~~~~~~~MNB RESULTS~~~~~~~~~~
Accuracy Score using MNB: 95.6951
F Score using MNB: 90.4870
[True negative  False Positive
False Negative True Positive]
Confusion matrix using MNB:
[[121  47]
 [  1 946]]
