# Read raw data

In [2]:
import pandas as pd
import re
import string
import nltk
pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

# Read in the raw text (file downloaded from kaggle and renamed https://www.kaggle.com/uciml/sms-spam-collection-dataset)
rawData = open("SMSSpamCollection.csv").read()
parsedData = rawData.replace("ham,", "ham\t")
parsedData = parsedData.replace("spam,", "spam\t")
parsedData = parsedData.replace("\t", "\n").split("\n")
parsedData = parsedData[1:]
labelList = parsedData[0::2]
textList = parsedData[1::2]
data = pd.DataFrame({'label': labelList,
                     'body_text': textList})
data.head()

Unnamed: 0,label,body_text
0,ham,"""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there ..."
1,ham,"Ok lar... Joking wif u oni...,,,"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,"U dun say so early hor... U c already then say...,,,"
4,ham,"""Nah I don't think he goes to usf, he lives around here though"",,,"


In [3]:
def clean_text(text):
    text = "".join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

Unnamed: 0,label,body_text,body_text_nonstop
0,ham,"""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there ...","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]"
1,ham,"Ok lar... Joking wif u oni...,,,","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
3,ham,"U dun say so early hor... U c already then say...,,,","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"""Nah I don't think he goes to usf, he lives around here though"",,,","[nah, dont, think, goe, usf, live, around, though]"


## Apply Count Vectorization

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['body_text'])
print(X_counts.shape)
print(count_vect.get_feature_names())

(5573, 5796)
['', '0', '008704050406', '0121', '01223585236', '01223585334', '02', '02070836089', '02073162414', '02085076972', '020903', '021', '06', '060505', '061104', '07008009200', '07090201529', '07090298926', '07099833605', '07123456789', '07732584351', '07734396839', '0776xxxxxxx', '07786200117', '078', '07808247860', '07808726822', '07821230901', '0784987', '0789xxxxxxx', '0796xxxxxx', '07973788240', '07xxxxxxxxx', '0800', '08000776320', '08000839402', '08000930705', '08001950382', '08002888812', '08002986030', '08002986906', '08006344447', '0808', '08081263000', '08081560665', '0825', '0844', '08448350055', '08448714184', '0845', '08452810073', '08452810075over18', '0870', '08700621170150p', '08701213186', '08701417012', '08701417012150p', '08701752560', '08702840625comuk', '08704439680', '08707500020', '08707509020', '08708034412', '08709222922', '08709501522', '0870k', '087104711148', '08712101358', '08712103738', '08712300220', '08712317606', '08712400200', '08712400603', 

## Apply CountVectorizer on smaller sample

In [8]:
data_sample = data[:20]
count_vect_sample = CountVectorizer(analyzer=clean_text)
X_counts_sample = count_vect_sample.fit_transform(data_sample['body_text'])
print(X_counts_sample.shape)
print(count_vect_sample.get_feature_names())

(20, 240)
['08002986030', '08452810075over18', '09061701461', '1', '100', '100000', '11', '12', '150', '150pday', '16', '2', '20000', '2005', '21st', '3', '4', '4403ldnw1a7rw18', '4txtì¼120', '6day', '81010', '87077', '87121', '87575', '9', '900', 'A', 'As', 'Co', 'Eh', 'FA', 'Go', 'HL', 'He', 'I', 'Id', 'Im', 'No', 'ON', 'Oh', 'Ok', 'R', 'TC', 'Tb', 'To', 'U', 'aid', 'alreadi', 'amor', 'anymor', 'appli', 'around', 'avail', 'b', 'back', 'bless', 'breather', 'brother', 'buffet', 'bugi', 'c', 'call', 'caller', 'callertun', 'camera', 'cash', 'chanc', 'chg', 'cine', 'claim', 'click', 'code', 'colour', 'comp', 'copi', 'cost', 'crazi', 'credit', 'cri', 'csh11', 'cup', 'custom', 'darl', 'date', 'dont', 'dun', 'e', 'earli', 'eg', 'england', 'enough', 'entitl', 'entri', 'even', 'feel', 'final', 'fine', 'free', 'freemsg', 'friend', 'from', 'fulfil', 'fun', 'goalsteam', 'goe', 'gonna', 'got', 'gota', 'grant', 'great', 'had', 'have', 'help', 'hey', 'home', 'hor', 'hour', 'httpwap', 'info', 'ive', 

In [9]:
X_counts_sample

<20x240 sparse matrix of type '<class 'numpy.int64'>'
	with 272 stored elements in Compressed Sparse Row format>

In [10]:
Xcounts_df = pd.DataFrame(X_counts_sample.toarray())

In [12]:
Xcounts_df.columns = count_vect_sample.get_feature_names()
Xcounts_df

Unnamed: 0,08002986030,08452810075over18,09061701461,1,100,100000,11,12,150,150pday,...,wont,word,world,wwwdbuknet,xxx,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl,ye,you,å
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
9,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# N-Grams

In [13]:
def clean_text(text):
    text = "".join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = " ".join([ps.stem(word) for word in tokens if word not in stopwords])
    return text

data['cleaned_text'] = data['body_text'].apply(lambda x: clean_text(x))
data.head()

Unnamed: 0,label,body_text,body_text_nonstop,cleaned_text
0,ham,"""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there ...","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]",Go jurong point crazi avail bugi n great world la e buffet cine got amor wat
1,ham,"Ok lar... Joking wif u oni...,,,","[ok, lar, joke, wif, u, oni]",Ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...",free entri 2 wkli comp win FA cup final tkt 21st may 2005 text FA 87121 receiv entri questionstd...
3,ham,"U dun say so early hor... U c already then say...,,,","[u, dun, say, earli, hor, u, c, alreadi, say]",U dun say earli hor U c alreadi say
4,ham,"""Nah I don't think he goes to usf, he lives around here though"",,,","[nah, dont, think, goe, usf, live, around, though]",nah I dont think goe usf live around though


In [19]:
from sklearn.feature_extraction.text import CountVectorizer

ngram_vect = CountVectorizer(ngram_range=(2,2)) # 2,2 for only bigrams, 1,2 for unigrams and bigrams
Xcounts = ngram_vect.fit_transform(data['cleaned_text'])
print(Xcounts.shape)
print(ngram_vect.get_feature_names())

(5573, 19765)
['008704050406 sp', '0121 2025050', '01223585236 xx', '01223585334 cum', '02 user', '02073162414 cost', '02085076972 repli', '020903 thi', '021 3680', '06 good', '07090298926 reschedul', '07099833605 reschedul', '07123456789 87077', '07732584351 rodger', '07734396839 ibh', '0776xxxxxxx uve', '07808247860 show', '07808726822 award', '0784987 show', '0789xxxxxxx today', '0796xxxxxx today', '07973788240 show', '07xxxxxxxxx show', '0800 169', '0800 18', '0800 195', '0800 1956669', '0800 505060', '0800 542', '08000776320 repli', '08000839402 2stoptxt', '08000839402 call2optout4qf2', '08000839402 call2optouthf8', '08000839402 call2optoutj5q', '08000839402 call2optoutlf56', '08000839402 call2optoutn9dx', '08000839402 call2optoutyhl', '08000839402 now', '08000930705 del', '08000930705 deliveri', '08000930705 free', '08000930705 now', '08001950382 call2optout674', '08002888812 repli', '08006344447 claim', '0808 145', '08081263000 charg', '08081560665 speak', '0844 861', '084483500

### smaller sample

In [25]:
data_sample = data[:20]

ngram_vect_sample = CountVectorizer(ngram_range=(2, 2))
Xcounts_sample = ngram_vect_sample.fit_transform(data_sample['cleaned_text'])
print(Xcounts_sample.shape)
print(ngram_vect_sample.get_feature_names())
Xcounts_sample

(20, 242)
['09061701461 claim', '100 20000', '100000 prize', '11 month', '12 hour', '150 rcv', '150pday 6day', '16 tsandc', '20000 pound', '2005 text', '21st may', '4txtì¼120 poboxox36504w45wq', '6day 16', '81010 tc', '87077 eg', '87077 trywal', '87121 receiv', '87575 cost', '900 prize', 'aid patent', 'alreadi say', 'amor wat', 'anymor tonight', 'appli 08452810075over18', 'appli repli', 'around though', 'as per', 'as valu', 'avail bugi', 'back id', 'bless time', 'breather promis', 'brother like', 'buffet cine', 'bugi great', 'call 09061701461', 'call the', 'caller press', 'callertun caller', 'camera free', 'cash from', 'chanc win', 'chg send', 'cine got', 'claim call', 'claim code', 'claim no', 'click httpwap', 'click wap', 'co free', 'code kl341', 'colour mobil', 'comp win', 'copi friend', 'cost 150pday', 'crazi avail', 'credit click', 'cri enough', 'csh11 send', 'cup final', 'custom select', 'darl week', 'date on', 'dont miss', 'dont think', 'dont want', 'dun say', 'earli hor', 'eg e

<20x242 sparse matrix of type '<class 'numpy.int64'>'
	with 242 stored elements in Compressed Sparse Row format>

In [24]:
Xcount_df = pd.DataFrame(Xcounts_sample.toarray())
Xcount_df.columns = ngram_vect_sample.get_feature_names()
Xcount_df

Unnamed: 0,09061701461 claim,100 20000,100000 prize,11 month,12 hour,150 rcv,150pday 6day,16 tsandc,20000 pound,2005 text,...,word back,word claim,word thank,world la,wwwdbuknet lccltd,xxx std,xxxmobilemovieclub to,ye he,you week,you wonder
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TF-IDF Term Frequency - Inverse Document Frequency

In [27]:
def clean_text(text):
    text = "".join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
print(X_tfidf.shape)
print(tfidf_vect.get_feature_names())

(5573, 5796)
['', '0', '008704050406', '0121', '01223585236', '01223585334', '02', '02070836089', '02073162414', '02085076972', '020903', '021', '06', '060505', '061104', '07008009200', '07090201529', '07090298926', '07099833605', '07123456789', '07732584351', '07734396839', '0776xxxxxxx', '07786200117', '078', '07808247860', '07808726822', '07821230901', '0784987', '0789xxxxxxx', '0796xxxxxx', '07973788240', '07xxxxxxxxx', '0800', '08000776320', '08000839402', '08000930705', '08001950382', '08002888812', '08002986030', '08002986906', '08006344447', '0808', '08081263000', '08081560665', '0825', '0844', '08448350055', '08448714184', '0845', '08452810073', '08452810075over18', '0870', '08700621170150p', '08701213186', '08701417012', '08701417012150p', '08701752560', '08702840625comuk', '08704439680', '08707500020', '08707509020', '08708034412', '08709222922', '08709501522', '0870k', '087104711148', '08712101358', '08712103738', '08712300220', '08712317606', '08712400200', '08712400603', 

## Samller sample

In [30]:
data_sample = data[:20]

tfidf_vect_sample = TfidfVectorizer(analyzer=clean_text)
X_tfidf_sample = tfidf_vect_sample.fit_transform(data_sample['body_text'])
print(X_tfidf_sample.shape)
print(tfidf_vect_sample.get_feature_names())

(20, 240)
['08002986030', '08452810075over18', '09061701461', '1', '100', '100000', '11', '12', '150', '150pday', '16', '2', '20000', '2005', '21st', '3', '4', '4403ldnw1a7rw18', '4txtì¼120', '6day', '81010', '87077', '87121', '87575', '9', '900', 'A', 'As', 'Co', 'Eh', 'FA', 'Go', 'HL', 'He', 'I', 'Id', 'Im', 'No', 'ON', 'Oh', 'Ok', 'R', 'TC', 'Tb', 'To', 'U', 'aid', 'alreadi', 'amor', 'anymor', 'appli', 'around', 'avail', 'b', 'back', 'bless', 'breather', 'brother', 'buffet', 'bugi', 'c', 'call', 'caller', 'callertun', 'camera', 'cash', 'chanc', 'chg', 'cine', 'claim', 'click', 'code', 'colour', 'comp', 'copi', 'cost', 'crazi', 'credit', 'cri', 'csh11', 'cup', 'custom', 'darl', 'date', 'dont', 'dun', 'e', 'earli', 'eg', 'england', 'enough', 'entitl', 'entri', 'even', 'feel', 'final', 'fine', 'free', 'freemsg', 'friend', 'from', 'fulfil', 'fun', 'goalsteam', 'goe', 'gonna', 'got', 'gota', 'grant', 'great', 'had', 'have', 'help', 'hey', 'home', 'hor', 'hour', 'httpwap', 'info', 'ive', 

## Vectorizing output sparse matrices

In [32]:
X_tfidf_sample

<20x240 sparse matrix of type '<class 'numpy.float64'>'
	with 272 stored elements in Compressed Sparse Row format>

In [36]:
Xtfidf_df = pd.DataFrame(X_tfidf_sample.toarray())
Xtfidf_df.columns=tfidf_vect_sample.get_feature_names()
Xtfidf_df

Unnamed: 0,08002986030,08452810075over18,09061701461,1,100,100000,11,12,150,150pday,...,wont,word,world,wwwdbuknet,xxx,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl,ye,you,å
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.198423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.232018,0.0,...,0.0,0.184031,0.0,0.0,0.232018,0.0,0.0,0.0,0.0,0.184031
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.219209,0.0,0.0,0.0,0.0,0.219209,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173871
9,0.188774,0.0,0.0,0.0,0.0,0.0,0.188774,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
