In [1]:
import pandas as pd
import nltk
import re

In [3]:
# Load the tab-separated SMS file. No header row is read from the file;
# the two columns are named here instead.
# NOTE(review): read_csv's default quote handling can merge rows when a
# message contains an unbalanced double quote — verify the row count;
# quoting=csv.QUOTE_NONE may be needed.
column_names = ["label", "text"]
data = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=column_names,
)
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [18]:
# One shared instance of each normalizer, reused for every message below.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [21]:
# Stemming and lemmatization.
# Build the stop-word lookup once. stopwords.words("english") returns a list,
# so calling it inside the comprehension repeated the call for every token and
# made each membership test a linear scan; a set gives the same filtering with
# constant-time lookups.
english_stopwords = set(stopwords.words("english"))

corpus_stemmed = []
corpus_lemmatized = []
for message in data["text"]:
    # Replace every non-letter with a space, lowercase, and split on whitespace.
    tokens = re.sub("[^a-zA-Z]", " ", message).lower().split()
    # Filter stop words once and share the result between both normalizers.
    kept_words = [word for word in tokens if word not in english_stopwords]
    corpus_stemmed.append(" ".join(stemmer.stem(word) for word in kept_words))
    corpus_lemmatized.append(
        " ".join(lemmatizer.lemmatize(word) for word in kept_words)
    )

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [55]:
# Bag-of-words counts, keeping at most 5000 vocabulary terms.
cv = CountVectorizer(max_features=5000)
# TF-IDF weighting with the library defaults.
# NOTE(review): only the count vectorizer caps its vocabulary, so the two
# encodings are not compared on an equal vocabulary size — confirm intended.
tfidf = TfidfVectorizer()

In [33]:
# Encode each corpus with both vectorizers, as dense arrays.
# fit_transform refits the shared vectorizer on every call, so after this cell
# `cv` and `tfidf` hold the vocabulary learned from the lemmatized corpus.
# NOTE(review): the vectorizers are fitted on the full corpus before f() splits
# it, so test-set vocabulary/IDF statistics reach the features — confirm this
# is acceptable for the comparison.
X_stem_cv, X_stem_tfidf = (
    vectorizer.fit_transform(corpus_stemmed).toarray()
    for vectorizer in (cv, tfidf)
)
X_lemma_cv, X_lemma_tfidf = (
    vectorizer.fit_transform(corpus_lemmatized).toarray()
    for vectorizer in (cv, tfidf)
)

In [36]:
# One-hot encode the first column (the label) and keep the second indicator
# column as the target. With the two labels shown in the preview (ham, spam)
# that is presumably the spam indicator — verify if the label set changes.
label_indicators = pd.get_dummies(data.iloc[:, 0])
Y = label_indicators.iloc[:, 1]

In [62]:
def f(X, confusion = 0):
    """Train and score a multinomial naive Bayes classifier on one feature matrix.

    The rows of ``X`` are split 80/20 (``random_state=1``) together with the
    module-level target ``Y``; the model is fitted on the training part and
    evaluated on the held-out part.

    Parameters
    ----------
    X : feature matrix with one row per entry of the global ``Y``.
    confusion : pass ``1`` to get the confusion matrix; any other value
        (default ``0``) returns the accuracy.

    Returns
    -------
    The result of ``confusion_matrix(Y_test, pred)`` when ``confusion == 1``,
    otherwise the result of ``accuracy_score(Y_test, pred)``.
    """
    splits = train_test_split(X, Y, test_size=0.20, random_state=1)
    X_train, X_test, Y_train, Y_test = splits

    classifier = MultinomialNB()
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(X_test)

    # Pick the metric first, then evaluate it once on the held-out split.
    metric = confusion_matrix if confusion == 1 else accuracy_score
    return metric(Y_test, predictions)

In [63]:
# Confusion matrix: stemmed text, bag-of-words counts.
f(X_stem_cv, confusion=1)

array([[952,  16],
       [  4, 143]], dtype=int64)

In [64]:
# Confusion matrix: stemmed text, TF-IDF weights.
f(X_stem_tfidf, confusion=1)

array([[967,   1],
       [ 28, 119]], dtype=int64)

In [65]:
# Confusion matrix: lemmatized text, bag-of-words counts.
f(X_lemma_cv, confusion=1)

array([[949,  19],
       [  6, 141]], dtype=int64)

In [66]:
# Confusion matrix: lemmatized text, TF-IDF weights.
f(X_lemma_tfidf, confusion=1)

array([[967,   1],
       [ 27, 120]], dtype=int64)

In [73]:
# TF-IDF may be the better encoding here: we do not want to risk marking
# important (non-spam) messages as spam.
# Entry [0, 1] of the confusion matrix is row 0 (true class 0), column 1
# (predicted class 1) — the messages wrongly flagged as spam.
stem_false_positives = f(X_stem_tfidf, confusion=1)[0, 1]
lemma_false_positives = f(X_lemma_tfidf, confusion=1)[0, 1]
# Author's note: prefer stemming, since it gives the same number of these
# errors and takes less time.
print(stem_false_positives)
print(lemma_false_positives)

1
1


In [76]:
# Hold-out accuracy for the two TF-IDF variants.
stem_accuracy = f(X_stem_tfidf)
lemma_accuracy = f(X_lemma_tfidf)
# Author's note: the accuracies are similar as well, so go with stemming.
print(stem_accuracy)
print(lemma_accuracy)

0.9739910313901345
0.9748878923766816
