In [23]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import string

from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from xgboost import XGBClassifier

In [2]:
# Read the SMS dataset from disk; the file is decoded as cp1252.
s = pd.read_csv(
    filepath_or_buffer='spam.csv',
    encoding='cp1252',
)

In [3]:
# Preview the first five rows of the raw frame.
s.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Discard the three unnamed trailing columns.
s = s.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [5]:
# Give the two remaining columns descriptive names: v1 -> type, v2 -> text.
s = s.rename(columns=dict(v1='type', v2='text'))

In [6]:
# Store the length of each message, computed with the built-in len, in a new 'len' column.
s['len'] = s['text'].map(len)

Next, I added a new feature column, 'len', which holds the length (in characters) of each text.

In [7]:
# Preview the first five rows again, now including the 'len' column.
s.head(n=5)

Unnamed: 0,type,text,len
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


The first step is some preprocessing: stripping punctuation, removing the stop words, converting all the words to a single case (here, lower case), and stemming each word. The function below does all of that.

In [8]:
# Built once when this cell runs instead of once per word: the previous version
# fetched the NLTK stop-word list (and scanned it linearly) for every token and
# constructed a new stemmer for every token as well.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = SnowballStemmer("english")


def pre_process(text):
    """Strip punctuation, drop English stop words, and stem the remaining words.

    Parameters
    ----------
    text : str
        A single raw message.

    Returns
    -------
    str
        The stemmed tokens, each followed by one space. A non-empty result
        therefore ends with a trailing space; if no token survives, the
        result is the empty string.
    """
    # Delete every character listed in string.punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokens are lower-cased only for the stop-word comparison; the stemmer
    # receives the token as it appeared in the text.
    kept = [word for word in text.split() if word.lower() not in _STOP_WORDS]
    return "".join(_STEMMER.stem(word) + " " for word in kept)

In [9]:
# Copy the text column so that nothing happens to the original data.
textFeatures = s['text'].copy()

# Apply the pre_process function to every message.
textFeatures = textFeatures.apply(pre_process)

# Use a TF-IDF vectoriser to provide useful numerical values related to the data.
# TF-IDF (term frequency - inverse document frequency) is a statistical method to tell
# how important a word is to a particular document: the value increases with each
# occurrence in that document but decreases relative to the number of occurrences
# in the entire corpus.
#
# NOTE: the first positional parameter of TfidfVectorizer is `input`, not `stop_words`,
# so the former TfidfVectorizer("english") never enabled stop-word filtering, and
# current scikit-learn releases reject positional arguments altogether. Stop words are
# already removed by pre_process, so the default settings are used here.
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(textFeatures)


In [39]:
# Hold out 30% of the messages for testing.
# random_state pins the shuffle so the split, and any score computed from it, is
# reproducible across runs. stratify keeps the ham/spam ratio the same in both
# subsets, which matters because the classes are imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    s['type'],
    test_size=0.3,
    random_state=42,
    stratify=s['type'],
)

In [17]:
# Number of non-null labels in the 'type' column.
s['type'].count()

5572

In [18]:
# How many messages fall into each class.
s['type'].value_counts()

ham     4825
spam     747
Name: type, dtype: int64

In [21]:
# Share of ham messages (ham count / total count): the accuracy obtained by
# always predicting 'ham', used as the baseline for the models.
4825 / 5572

0.8659368269921034

#### First basic model

In [29]:
# Fit a support vector classifier with default settings, then score it on the held-out split.
svc = SVC().fit(x_train, y_train)
prediction = svc.predict(x_test)
accuracy_score(y_true=y_test, y_pred=prediction)

0.8696172248803827

#### Second basic model

In [30]:
# Fit a multinomial naive Bayes classifier with default settings, then score it on the held-out split.
mnb = MultinomialNB().fit(x_train, y_train)
prediction = mnb.predict(x_test)
accuracy_score(y_true=y_test, y_pred=prediction)

0.9617224880382775

In [31]:
# Newer XGBoost releases expect class labels encoded as integers 0..n_classes-1
# and reject string targets such as 'ham'/'spam'. Encode the labels before
# fitting and decode the predictions afterwards, so `pred` still holds the
# original label values.
classes, y_train_codes = np.unique(y_train, return_inverse=True)

model = XGBClassifier()
model.fit(x_train, y_train_codes)
# predict() returns integer class codes; index into `classes` to map them back.
pred = classes[model.predict(x_test)]
accuracy_score(y_test, pred)

  if diff:


0.9730861244019139