In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

In [2]:
# Import the data. Characters such as a tilde make decoding this file as
# UTF-8 fail, so it is read with the latin1 encoding instead.
spamDf = pd.read_csv('./Data/spam.csv', encoding='latin1')

In [3]:
# Discard the three unnamed (empty) columns that came along with the CSV.
spamDf = spamDf.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
# The two columns that are left get descriptive names.
spamDf.columns = ['label', 'text']

# Put the message text in the first column and its label in the second.
columnsTitles = ["text", "label"]
spamDf = spamDf.reindex(columns=columnsTitles)
spamDf.head()

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [4]:
import re
# Find all instances of nonstandard punctuation, i.e. a punctuation mark that has
# whitespace before it but none after it (such as a space before a comma but not after).
# Matches on , . ' ! # / @ % ^ ; ? "
# One pre-compiled pattern: a whitespace character, one of the tracked
# punctuation marks , . ' ! # / @ % ^ ; ? " and then a non-whitespace
# character. Written as a raw string so that \s and \S reach the regex
# engine without triggering Python's invalid-escape-sequence warning.
_NONSTANDARD_PUNC_RE = re.compile(r'''\s[,.'!#/@%^;?"]\S''')


def freqOfNonstardardPunc(message):
    """Count nonstandard punctuation occurrences in a message.

    An occurrence is a punctuation mark (one of , . ' ! # / @ % ^ ; ? ")
    that is directly preceded by whitespace and directly followed by a
    non-whitespace character, e.g. the comma in "hello ,world".

    Parameters
    ----------
    message : str
        Text to scan.

    Returns
    -------
    int
        Number of occurrences found; 0 for an empty string.
    """
    # Two occurrences can never overlap (a match ends on a non-whitespace
    # character but must start on a whitespace one), so a single scan with
    # the combined character class finds exactly the matches that scanning
    # once per punctuation mark would.
    return len(_NONSTANDARD_PUNC_RE.findall(message))

In [5]:
# find number of known spam words
# Load the blacklist of known spam words, one entry per line.
with open('./Data/blacklist.txt') as f:
    # Strip surrounding whitespace (including the trailing `\n`) from each
    # line and skip lines that end up empty: str.count('') returns
    # len(message) + 1, so a blank entry would inflate every spam count.
    # NOTE(review): entries are kept verbatim; the notebook lowercases the
    # message before counting, so entries containing upper-case letters would
    # never match -- confirm the file is all lower case.
    content = [entry for entry in (line.strip() for line in f) if entry]

def spamCount(message):
    """Return how many times the blacklist entries occur in ``message``.

    Every entry of the module-level ``content`` list is counted as a
    non-overlapping substring of ``message`` (via ``str.count``) and the
    per-entry counts are added together. Matching is case-sensitive.
    """
    return sum(message.count(entry) for entry in content)

In [6]:
# Find ratio of capitalized to lowercase letters
def capRatio(message):
    """Return the ratio of upper-case to lower-case characters in ``message``.

    When the message contains no lower-case characters the divisor is taken
    to be 1, so the result is simply the number of upper-case characters.
    """
    upperTotal = 0
    lowerTotal = 0
    for char in message:
        if char.isupper():
            upperTotal += 1
        if char.islower():
            lowerTotal += 1
    # `or 1` swaps a zero lower-case count for 1 to avoid dividing by zero.
    return upperTotal / (lowerTotal or 1)

In [7]:
# set of lemmatized words

from nltk.stem.wordnet import WordNetLemmatizer
lemm = WordNetLemmatizer()
def lemSet(wordList):
    """Return the distinct lemmatized forms of the words in ``wordList``.

    Each word is passed through ``lemm.lemmatize``; duplicates are removed
    by collecting the results in a set, which is returned as a list (so the
    order of the result is that of the set, not of the input).
    """
    return list({lemm.lemmatize(token) for token in wordList})

In [8]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
 
def stemSet(wordList):
    """Return the distinct stems of the words in ``wordList``.

    Every word is stemmed with ``ps.stem``; the stems are de-duplicated
    through a set and handed back as a list (in the set's order, not the
    input order).
    """
    uniqueStems = set(map(ps.stem, wordList))
    return list(uniqueStems)

In [9]:
# Feature engineering on the message text.
# Every feature depends only on the 'text' column, so each function is applied
# to that column directly (Series.apply) instead of row-by-row over the whole
# frame; this avoids building a Series object for every row.

# Add column for number of nonstandard punctuations
spamDf['puncCount'] = spamDf['text'].apply(freqOfNonstardardPunc)

# Add column for number of known spam words (counted on the lowercased message)
spamDf['spamCount'] = spamDf['text'].apply(lambda text: spamCount(text.lower()))

# Add column for ratio of upper case to lower case letters
spamDf['caseRatio'] = spamDf['text'].apply(capRatio)

# let's remove all punctuations and stop words from the 'text' column

# Tokenizer that keeps only runs of word characters, which drops punctuation
charTokenizer = RegexpTokenizer(r'\w+')

# Find set of stop words
stopWords = set(stopwords.words('english'))

# Clean out text column and turn it to lowercase.
# NOTE: this overwrites 'text' with a list of tokens, so it must run after the
# features above (they need the raw string), and re-running this cell without
# reloading the data fails because a list has no .lower().
spamDf['text'] = spamDf['text'].apply(
    lambda text: [word for word in charTokenizer.tokenize(text.lower()) if word not in stopWords]
)

# Create column of lemmatized words
spamDf['lemWordSet'] = spamDf['text'].apply(lemSet)

# Create column of stemmed words
spamDf['stemWordSet'] = spamDf['text'].apply(stemSet)
spamDf.head()

Unnamed: 0,text,label,puncCount,spamCount,caseRatio,lemWordSet,stemWordSet
0,"[go, jurong, point, crazy, available, bugis, n...",ham,0,0,0.0375,"[amore, wat, point, available, got, bugis, la,...","[e, avail, wat, point, got, la, n, cine, amor,..."
1,"[ok, lar, joking, wif, u, oni]",ham,0,0,0.125,"[wif, ok, oni, lar, joking, u]","[wif, ok, joke, oni, lar, u]"
2,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",spam,0,0,0.114943,"[receive, 2005, c, 08452810075over18, final, r...","[entri, 2005, c, 08452810075over18, final, rat..."
3,"[u, dun, say, early, hor, u, c, already, say]",ham,0,0,0.064516,"[already, dun, say, hor, c, u, early]","[earli, dun, say, hor, alreadi, c, u]"
4,"[nah, think, goes, usf, lives, around, though]",ham,0,0,0.044444,"[around, think, usf, though, life, go, nah]","[around, think, usf, goe, though, live, nah]"
