In [162]:
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [163]:
# Import data. Shapes like tilde will crash  ctf-8 so we use latin1
spamDf = pd.read_csv('./Data/spam.csv', encoding='latin1')

In [164]:
# Drop empty columns
spamDf.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
# Rename remaining columns
spamDf.columns = ['label', 'text']

# Flip columns so text is first column
columnsTitles=["text","label"]
spamDf = spamDf.reindex(columns = columnsTitles)
spamDf.head()

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [165]:
import re
# Find all instances of nonstandard punctuation. ie space before a comma but not after
# match on , . ' ! # / @ % ^ ; "
def freqOfNonstardardPunc(message):
    count = 0
    regexList = ['\s,\S', '\s\.\S', '\s\'\S', '\s!\S', '\s#\S', '\s/\S', '\s@\S', '\s%\S', '\s\^\S', '\s;\S', 
                 '\s\?\S', '\s\"\S']
    for regex in regexList:
        prog = re.compile(regex)
        result = prog.findall(message)
        count += len(result)
    return count

In [166]:
# Find number of known spam words
with open('./Data/blacklist.txt') as f:
    content = f.readlines()
# You may also want to remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content] 

def spamCount(message):
    count = 0
    for word in content:
        count += message.count(word)
    return count

In [167]:
# Find ratio of capitalized to lowercase letters
def capRatio(message):
    upperCase = sum(1 for c in message if c.isupper())
    lowerCase = sum(1 for c in message if c.islower())
    if lowerCase == 0:
        lowerCase = 1
    return upperCase / lowerCase

In [168]:
# Set of lemmatized words

from nltk.stem.wordnet import WordNetLemmatizer
lemm = WordNetLemmatizer()
def lemSet(wordList):
    wordSet = set()
    for word in wordList:
        wordSet.add(lemm.lemmatize(word))
    return list(wordSet)

In [169]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
 
def stemSet(wordList):
    wordSet = set()
    for word in wordList:
        wordSet.add(ps.stem(word))
    return list(wordSet)

In [170]:
# Add column for number of nonstandard punctuations
spamDf['puncCount'] = spamDf.apply(lambda row: freqOfNonstardardPunc(row['text']), axis = 1)

# Add column for number of known spam words
spamDf['spamCount'] = spamDf.apply(lambda row: spamCount(row['text'].lower()), axis = 1)

# Add column for ratio of upper case to lower case words
spamDf['caseRatio'] = spamDf.apply(lambda row: capRatio(row['text']), axis = 1)

# Let's remove all punctuations and stop words from the 'text' column

# Remove all punctuation using regular expression
charTokenizer = RegexpTokenizer(r'\w+')

# Find set of stop words 
stopWords = set(stopwords.words('english'))

# Clean out text column and turn it to lowercase
spamDf['text'] = spamDf.apply(lambda row: [word for word in charTokenizer.tokenize(row['text'].lower()) if word not in stopWords], axis=1)


# Create column of lemmatized words
spamDf['lemWordSet'] = spamDf.apply(lambda row: lemSet(row['text']), axis=1)

#Create column of stemmed words
spamDf['stemWordSet'] = spamDf.apply(lambda row: stemSet(row['text']), axis=1)
spamDf.head()

Unnamed: 0,text,label,puncCount,spamCount,caseRatio,lemWordSet,stemWordSet
0,"[go, jurong, point, crazy, available, bugis, n...",ham,0,0,0.0375,"[available, cine, n, wat, la, point, e, amore,...","[crazi, avail, amor, cine, n, wat, la, point, ..."
1,"[ok, lar, joking, wif, u, oni]",ham,0,0,0.125,"[ok, oni, wif, u, joking, lar]","[ok, oni, joke, wif, u, lar]"
2,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",spam,0,0,0.114943,"[win, cup, apply, 08452810075over18, 2005, fa,...","[win, cup, wkli, 08452810075over18, 2005, fa, ..."
3,"[u, dun, say, early, hor, u, c, already, say]",ham,0,0,0.064516,"[say, early, u, hor, already, c, dun]","[say, u, hor, earli, alreadi, c, dun]"
4,"[nah, think, goes, usf, lives, around, though]",ham,0,0,0.044444,"[usf, think, life, nah, around, though, go]","[goe, live, usf, think, nah, around, though]"


In [171]:
# Turn our emails into strings so the sklearn vectorizer
# can handle them
spamDf["text"] = [" ".join(email) for email in spamDf["text"].values]
X_train, X_test, y_train, y_test = train_test_split(spamDf["text"], spamDf["label"], test_size=0.33, random_state=42)

In [172]:
spamDf

Unnamed: 0,text,label,puncCount,spamCount,caseRatio,lemWordSet,stemWordSet
0,go jurong point crazy available bugis n great ...,ham,0,0,0.037500,"[available, cine, n, wat, la, point, e, amore,...","[crazi, avail, amor, cine, n, wat, la, point, ..."
1,ok lar joking wif u oni,ham,0,0,0.125000,"[ok, oni, wif, u, joking, lar]","[ok, oni, joke, wif, u, lar]"
2,free entry 2 wkly comp win fa cup final tkts 2...,spam,0,0,0.114943,"[win, cup, apply, 08452810075over18, 2005, fa,...","[win, cup, wkli, 08452810075over18, 2005, fa, ..."
3,u dun say early hor u c already say,ham,0,0,0.064516,"[say, early, u, hor, already, c, dun]","[say, u, hor, earli, alreadi, c, dun]"
4,nah think goes usf lives around though,ham,0,0,0.044444,"[usf, think, life, nah, around, though, go]","[goe, live, usf, think, nah, around, though]"
5,freemsg hey darling 3 week word back like fun ...,spam,0,0,0.072165,"[ok, tb, std, like, send, xxx, freemsg, week, ...","[ok, tb, std, like, send, xxx, chg, freemsg, w..."
6,even brother like speak treat like aids patent,ham,0,0,0.034483,"[speak, like, treat, brother, aid, even, patent]","[speak, like, treat, brother, aid, even, patent]"
7,per request melle melle oru minnaminunginte nu...,ham,1,0,0.084746,"[press, copy, set, callertune, caller, 9, per,...","[mell, press, set, friend, minnaminungint, cal..."
8,winner valued network customer selected receiv...,spam,0,0,0.126316,"[selected, valued, network, kl341, winner, rec...","[å, custom, network, kl341, winner, receivea, ..."
9,mobile 11 months u r entitled update latest co...,spam,0,0,0.144330,"[camera, colour, latest, entitled, free, 08002...","[camera, colour, mobil, latest, entitl, free, ..."


In [173]:
# Approach from: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
text_clf = Pipeline([("vect", CountVectorizer()), ("tfidf", TfidfTransformer()), ("clf", MultinomialNB()), ])
parameters = {"vect__ngram_range": [(1, 1), (1, 2)], "tfidf__use_idf": (True, False), "clf__alpha": (1e-2, 1e-3), }
gs_clf = GridSearchCV(text_clf, parameters, n_jobs= -1)
gs_clf = gs_clf.fit(X_train, y_train)
preds = gs_clf.predict(X_test)
np.mean(preds == y_test)

0.98096791734638389