In [35]:
import pandas as pd

In [36]:
# Show up to 200 characters per column so long message texts are not cut off in the output.
pd.set_option('display.max_colwidth', 200)
# Read the SMS spam collection from the CSV file.
data = pd.read_csv('data/SMSSpamCollection.csv')
# Keep only the two columns used in the rest of the notebook.
data = data[['label', 'text']]
data.head()

Unnamed: 0,label,text
0,0,Go until jurong point
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s
3,0,U dun say so early hor... U c already then say...
4,0,Nah I dont think he goes to usf


## Text Cleaning Pipeline

### Removing Punctuation

In [37]:
import string

In [38]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [39]:
# Convert every entry of the text column with the built-in str().
# NOTE(review): presumably guards the per-character processing below against
# non-string values (e.g. missing entries) - confirm against the raw CSV.
data['text'] = data['text'].apply(str)

In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 2 columns):
label    5574 non-null int64
text     5574 non-null object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [41]:
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    Every other character (letters, digits, whitespace, ...) is kept, in its
    original order.
    """
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

# Strip punctuation from every message; the function is passed directly, no wrapper needed.
data['body_text_clean'] = data['text'].apply(remove_punctuation)

data.head()

Unnamed: 0,label,text,body_text_clean
0,0,Go until jurong point,Go until jurong point
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,0,Nah I dont think he goes to usf,Nah I dont think he goes to usf


## Tokenization

In [42]:
import re

In [43]:
def tokenize(text):
    """Split ``text`` into tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        The string to split.

    Returns
    -------
    list of str
        The pieces between separators. If ``text`` starts or ends with a
        non-word character (or is empty), ``re.split`` yields empty-string
        tokens at that position; they are returned as-is.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence, which
    # Python reports as a DeprecationWarning (SyntaxWarning from 3.12).
    return re.split(r'\W+', text)

# Lower-case each cleaned message first, then split it into tokens.
data['body_text_tokenized'] = data['body_text_clean'].str.lower().apply(tokenize)

In [44]:
data.head()

Unnamed: 0,label,text,body_text_clean,body_text_tokenized
0,0,Go until jurong point,Go until jurong point,"[go, until, jurong, point]"
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to, 87121, to, receive, entry, questionstd, txt, ratetcs, apply, 08452810075over18s]"
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]"
4,0,Nah I dont think he goes to usf,Nah I dont think he goes to usf,"[nah, i, dont, think, he, goes, to, usf]"


## Remove Stopwords

In [45]:
import nltk

In [46]:
stopword = nltk.corpus.stopwords.words('english')

In [47]:
def remove_stopwords(tokenized_list):
    """Return the tokens of ``tokenized_list`` that are not in ``stopword``.

    ``stopword`` is the module-level stop-word collection; the relative order
    of the remaining tokens is preserved.
    """
    kept_tokens = []
    for token in tokenized_list:
        if token not in stopword:
            kept_tokens.append(token)
    return kept_tokens

# Drop stop words from every token list; the function is passed directly.
data['body_text_no_stop'] = data['body_text_tokenized'].apply(remove_stopwords)

In [48]:
data.head()

Unnamed: 0,label,text,body_text_clean,body_text_tokenized,body_text_no_stop
0,0,Go until jurong point,Go until jurong point,"[go, until, jurong, point]","[go, jurong, point]"
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to, 87121, to, receive, entry, questionstd, txt, ratetcs, apply, 08452810075over18s]","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receive, entry, questionstd, txt, ratetcs, apply, 08452810075over18s]"
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]"
4,0,Nah I dont think he goes to usf,Nah I dont think he goes to usf,"[nah, i, dont, think, he, goes, to, usf]","[nah, dont, think, goes, usf]"


## Stemming

### Porter Stemmer

In [49]:
ps = nltk.PorterStemmer()

In [50]:
# dir(ps)

In [51]:
#eg 
ps.stem('grows')

'grow'

In [52]:
ps.stem('growing')

'grow'

In [53]:
ps.stem('grow')

'grow'

In [54]:
# stem our text
def stemming(tokenized_text):
    """Stem each token with the module-level Porter stemmer ``ps``.

    Returns a new list holding one stem per input token, in the same order.
    """
    stems = []
    for token in tokenized_text:
        stems.append(ps.stem(token))
    return stems

# Stem the stop-word-free tokens of every message.
data['body_text_stem'] = data['body_text_no_stop'].apply(stemming)

data.head()

Unnamed: 0,label,text,body_text_clean,body_text_tokenized,body_text_no_stop,body_text_stem
0,0,Go until jurong point,Go until jurong point,"[go, until, jurong, point]","[go, jurong, point]","[go, jurong, point]"
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to, 87121, to, receive, entry, questionstd, txt, ratetcs, apply, 08452810075over18s]","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receive, entry, questionstd, txt, ratetcs, apply, 08452810075over18s]","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv, entri, questionstd, txt, ratetc, appli, 08452810075over18]"
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,0,Nah I dont think he goes to usf,Nah I dont think he goes to usf,"[nah, i, dont, think, he, goes, to, usf]","[nah, dont, think, goes, usf]","[nah, dont, think, goe, usf]"


## Lemmatization

#### WordNet Lemmatizer

In [55]:
# WordNet lemmatizer used for the comparisons and the dataset column below.
wn = nltk.WordNetLemmatizer()
# Rebinds `ps`, which was already created in the Porter Stemmer section above,
# to a new PorterStemmer instance for the side-by-side comparison.
ps = nltk.PorterStemmer()

In [56]:
# Porter stemmer: both words are reduced to the same stem
print(ps.stem('meanness'))
print(ps.stem('meaning'))

mean
mean


In [57]:
print(wn.lemmatize('meanness'))
print(wn.lemmatize('meaning'))

meanness
meaning


In [58]:
print(ps.stem('goose'))
print(ps.stem('geese'))

goos
gees


In [59]:
print(wn.lemmatize('goose'))
print(wn.lemmatize('geese'))

goose
goose


In [60]:
### lemmatize text in dataset
def lemmatizing(tokenized_text):
    """Lemmatize each token with the module-level WordNet lemmatizer ``wn``.

    Returns a new list holding one lemma per input token, in the same order.
    """
    lemmas = []
    for token in tokenized_text:
        lemmas.append(wn.lemmatize(token))
    return lemmas

# Lemmatize the stop-word-free tokens of every message.
data['body_text_lemmatized'] = data['body_text_no_stop'].apply(lemmatizing)
data.head()

Unnamed: 0,label,text,body_text_clean,body_text_tokenized,body_text_no_stop,body_text_stem,body_text_lemmatized
0,0,Go until jurong point,Go until jurong point,"[go, until, jurong, point]","[go, jurong, point]","[go, jurong, point]","[go, jurong, point]"
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to, 87121, to, receive, entry, questionstd, txt, ratetcs, apply, 08452810075over18s]","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receive, entry, questionstd, txt, ratetcs, apply, 08452810075over18s]","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv, entri, questionstd, txt, ratetc, appli, 08452810075over18]","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receive, entry, questionstd, txt, ratetcs, apply, 08452810075over18s]"
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]"
4,0,Nah I dont think he goes to usf,Nah I dont think he goes to usf,"[nah, i, dont, think, he, goes, to, usf]","[nah, dont, think, goes, usf]","[nah, dont, think, goe, usf]","[nah, dont, think, go, usf]"


In [67]:
def clean_text(text):
    """Turn a raw message into a list of stemmed tokens.

    Steps, in order:
      1. drop every character found in ``string.punctuation`` and lower-case
         the remaining characters;
      2. split the result on runs of non-word characters;
      3. discard tokens present in ``stopword`` and stem the rest with ``ps``.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    list of str
        The stemmed tokens. Empty-string tokens produced by the split (when
        the cleaned text starts or ends with a non-word character, or is
        empty) are not filtered out.
    """
    # Iterating over a str yields single characters, so this filters the
    # message character by character.
    no_punct = "".join([char.lower() for char in text if char not in string.punctuation])
    # Raw string: '\W' in a plain literal is an invalid escape sequence, which
    # Python reports as a DeprecationWarning (SyntaxWarning from 3.12).
    tokens = re.split(r'\W+', no_punct)
    return [ps.stem(token) for token in tokens if token not in stopword]

## Vectorization

#### Count Vectorization

In [68]:
# Create a document-term matrix where each cell holds the number of times that word occurred in that document.

In [69]:
from sklearn.feature_extraction.text import CountVectorizer

In [70]:
# Pass clean_text as the analyzer so each document is converted to tokens by our own cleaning function.
count_vect = CountVectorizer(analyzer=clean_text)

In [71]:
X_counts = count_vect.fit_transform(data['text'])

In [76]:
print(X_counts.shape)
# print(count_vect.get_feature_names())


(5574, 6868)


In [77]:
# apply countVectorizer to smaller sample

In [78]:
data_sample = data.head(20)

In [80]:
# data_sample

In [81]:
# Fit a separate vectorizer on the 20-row sample only, with the same clean_text analyzer,
# so the resulting vocabulary is small enough to inspect.
count_vect_sample = CountVectorizer(analyzer=clean_text)
X_count_sample = count_vect_sample.fit_transform(data_sample['text'])

In [82]:
print(X_count_sample.shape)

(20, 170)


In [83]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Use get_feature_names_out() when it exists and fall back to
# the old method on older releases. list() keeps the printed form a plain list.
if hasattr(count_vect_sample, 'get_feature_names_out'):
    sample_feature_names = list(count_vect_sample.get_feature_names_out())
else:
    sample_feature_names = count_vect_sample.get_feature_names()
print(sample_feature_names)

['08002986030', '08452810075over18', '09061701461', '1', '100', '11', '12', '2', '20', '2005', '21st', '3', '87077', '87121', '9', '900', 'aid', 'alreadi', 'anymor', 'appli', 'b', 'back', 'bless', 'breather', 'brother', 'c', 'call', 'caller', 'callertun', 'camera', 'cash', 'chanc', 'chg', 'claim', 'co', 'code', 'colour', 'comp', 'copi', 'credit', 'cup', 'custom', 'darl', 'date', 'dont', 'dun', 'earli', 'eg', 'eh', 'england', 'entitl', 'entri', 'even', 'fa', 'feel', 'final', 'fine', 'free', 'freemsg', 'friend', 'fulfil', 'fun', 'go', 'goalsteam', 'goe', 'gonna', 'gota', 'grant', 'help', 'hey', 'home', 'hor', 'hour', 'id', 'im', 'ive', 'joke', 'jurong', 'kim', 'kl341', 'lar', 'latest', 'like', 'macedonia', 'make', 'may', 'mell', 'membership', 'minnaminungint', 'miss', 'mobil', 'month', 'nah', 'name', 'nation', 'naughti', 'network', 'news', 'nurungu', 'oh', 'ok', 'oni', 'oru', 'patent', 'per', 'point', 'press', 'prize', 'promis', 'questionstd', 'r', 'ratetc', 'receiv', 'receivea', 'rememb