In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import string

from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
sms_df = pd.read_csv('spam.csv',  encoding='cp1252')

In [3]:
sms_df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
sms_df = sms_df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)

In [5]:
sms_df = sms_df.rename(columns={'v1' : 'type', 'v2' : 'text'})

In [6]:
sms_df['len'] = sms_df['text'].apply(len)

Next thing I did was to add a new feature column 'len' which tells us the length of each text.

In [8]:
sms_df.head()

Unnamed: 0,type,text,len
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


First step is to do some preprocessing such as removing the stop words, convert all the words to a single case (in this case I converted them all into lower case), and also stem each word. The function below does all of that.

In [9]:
def pre_process(text):
    
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]
    words = ""
    for i in text:
            stemmer = SnowballStemmer("english")
            words += (stemmer.stem(i))+" "
    return words

In [10]:
# =This line copies the text column of the data so nothing happens to original data.
textFeatures = sms_df['text'].copy()

# here the pre_process funstion is applied to the text
textFeatures = textFeatures.apply(pre_process)

'''
ses a TFIDF vectoriser to provide useful numerical values related to the data. 
TFIDF (term frequency - inverse document frequency) is a statistical method to tell how important 
a word is to a particular document by increasing the numerical value for an occurrence in the 
specific document but decreasing relative to number of occurrences in the entire corpus. 
'''

vectorizer = TfidfVectorizer("english")

#here we fit and transform the vectorizer before the train test split
features = vectorizer.fit_transform(textFeatures)


In [12]:
#here I randomly split the data into train and test data sets. I used to keep 30% of the data as test but can be any number of your choice as long as you have enough data.
x_train, x_test, y_train, y_test = train_test_split(features, sms_df['type'], test_size=0.3)


Here we see that there are a total of 5572 texts and out of which 757 are spam. So, about 86.59% of texts are not spam.

In [13]:
print('total texts: ',sms_df.type.count())
print(sms_df.type.value_counts())
print('% not spam: ', 4825/5572)

total texts:  5572
ham     4825
spam     747
Name: type, dtype: int64
% not spam:  0.8659368269921034


#### First basic model (MVP)

In [14]:
svc = SVC()
svc.fit(x_train, y_train)
pred = svc.predict(x_test)
accuracy_score(y_test,pred)

0.8690191387559809


The first model or my MVP predicted the text correctly 87% of the time so my goal is to make a model that has a prediction rate of higher than 87%


#### Second basic model

I tried some other models just to see what kind of results they give me. I decided to use Naive Bayes model as it is a simple but a powerful classifier based on probabilistic model derived from Bayes theorem. This model assumes that each feature is indipendent of the rest. Naive Bayes should be used whenever we trying a NLP problem. And as you can see this already provides much better results in comparison to our first model.

In [15]:
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
prediction = mnb.predict(x_test)
accuracy_score(y_test,prediction)

0.9581339712918661

I also decided to do a Random Forest classifier model just to compare the results and this one also gives us very good results with any model tuning.

In [16]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
prediction = model.predict(x_test)
accuracy_score(y_test,prediction)

0.9635167464114832

Next step is to do some basic grid search to imporve our model. Since the models are already giving pretty good results, I will not do a in depth grid search and will do it for all the models.

#### Grid Search

In [17]:
svc = SVC(kernel='linear', gamma=1)
svc.fit(x_train, y_train)
prediction = svc.predict(x_test)
accuracy_score(y_test,prediction)

0.9754784688995215

In [18]:
mnb = MultinomialNB(alpha=.1)
mnb.fit(x_train, y_train)
prediction = mnb.predict(x_test)
accuracy_score(y_test,prediction)

0.979066985645933

In [19]:
model = RandomForestClassifier(max_depth=120, min_samples_split=7, n_estimators=80)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy_score(y_test,y_pred)

0.972488038277512

After doing some basic grid search we can see that all are models are predictions have increased in accuarcy and Naive Bayes model gives us the best results so we should choose this model. I can try another model or can do a in depth grid search for these models for a better result but I think the answer we got now is more than sufficent for thsi project,