# Read raw data

In [2]:
import pandas as pd
import re
import string
import nltk
pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

# Read in the raw text (file downloaded from kaggle and renamed https://www.kaggle.com/uciml/sms-spam-collection-dataset)
rawData = open("SMSSpamCollection.csv").read()
parsedData = rawData.replace("ham,", "ham\t")
parsedData = parsedData.replace("spam,", "spam\t")
parsedData = parsedData.replace("\t", "\n").split("\n")
parsedData = parsedData[1:]
labelList = parsedData[0::2]
textList = parsedData[1::2]
data = pd.DataFrame({'label': labelList,
                     'body_text': textList})
data.head()

Unnamed: 0,label,body_text
0,ham,"""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there ..."
1,ham,"Ok lar... Joking wif u oni...,,,"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,"U dun say so early hor... U c already then say...,,,"
4,ham,"""Nah I don't think he goes to usf, he lives around here though"",,,"


In [3]:
stopwords = nltk.corpus.stopwords.words('english')


def clean_text(text):
    text = "".join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

data['body_text_nonstop'] = data['body_text'].apply(lambda x: clean_text(x.lower()))
data.head()

Unnamed: 0,label,body_text,body_text_nonstop
0,ham,"""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there ...","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]"
1,ham,"Ok lar... Joking wif u oni...,,,","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
3,ham,"U dun say so early hor... U c already then say...,,,","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"""Nah I don't think he goes to usf, he lives around here though"",,,","[nah, dont, think, goe, usf, live, around, though]"


## Apply Count Vectorization

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['body_text'])
print(X_counts.shape)
print(count_vect.get_feature_names())

(5573, 5796)
['', '0', '008704050406', '0121', '01223585236', '01223585334', '02', '02070836089', '02073162414', '02085076972', '020903', '021', '06', '060505', '061104', '07008009200', '07090201529', '07090298926', '07099833605', '07123456789', '07732584351', '07734396839', '0776xxxxxxx', '07786200117', '078', '07808247860', '07808726822', '07821230901', '0784987', '0789xxxxxxx', '0796xxxxxx', '07973788240', '07xxxxxxxxx', '0800', '08000776320', '08000839402', '08000930705', '08001950382', '08002888812', '08002986030', '08002986906', '08006344447', '0808', '08081263000', '08081560665', '0825', '0844', '08448350055', '08448714184', '0845', '08452810073', '08452810075over18', '0870', '08700621170150p', '08701213186', '08701417012', '08701417012150p', '08701752560', '08702840625comuk', '08704439680', '08707500020', '08707509020', '08708034412', '08709222922', '08709501522', '0870k', '087104711148', '08712101358', '08712103738', '08712300220', '08712317606', '08712400200', '08712400603', 

## Apply CountVectorizer on smaller sample

In [8]:
data_sample = data[:20]
count_vect_sample = CountVectorizer(analyzer=clean_text)
X_counts_sample = count_vect_sample.fit_transform(data_sample['body_text'])
print(X_counts_sample.shape)
print(count_vect_sample.get_feature_names())

(20, 240)
['08002986030', '08452810075over18', '09061701461', '1', '100', '100000', '11', '12', '150', '150pday', '16', '2', '20000', '2005', '21st', '3', '4', '4403ldnw1a7rw18', '4txtì¼120', '6day', '81010', '87077', '87121', '87575', '9', '900', 'A', 'As', 'Co', 'Eh', 'FA', 'Go', 'HL', 'He', 'I', 'Id', 'Im', 'No', 'ON', 'Oh', 'Ok', 'R', 'TC', 'Tb', 'To', 'U', 'aid', 'alreadi', 'amor', 'anymor', 'appli', 'around', 'avail', 'b', 'back', 'bless', 'breather', 'brother', 'buffet', 'bugi', 'c', 'call', 'caller', 'callertun', 'camera', 'cash', 'chanc', 'chg', 'cine', 'claim', 'click', 'code', 'colour', 'comp', 'copi', 'cost', 'crazi', 'credit', 'cri', 'csh11', 'cup', 'custom', 'darl', 'date', 'dont', 'dun', 'e', 'earli', 'eg', 'england', 'enough', 'entitl', 'entri', 'even', 'feel', 'final', 'fine', 'free', 'freemsg', 'friend', 'from', 'fulfil', 'fun', 'goalsteam', 'goe', 'gonna', 'got', 'gota', 'grant', 'great', 'had', 'have', 'help', 'hey', 'home', 'hor', 'hour', 'httpwap', 'info', 'ive', 

In [9]:
X_counts_sample

<20x240 sparse matrix of type '<class 'numpy.int64'>'
	with 272 stored elements in Compressed Sparse Row format>

In [10]:
Xcounts_df = pd.DataFrame(X_counts_sample.toarray())

In [12]:
Xcounts_df.columns = count_vect_sample.get_feature_names()
Xcounts_df

Unnamed: 0,08002986030,08452810075over18,09061701461,1,100,100000,11,12,150,150pday,...,wont,word,world,wwwdbuknet,xxx,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl,ye,you,å
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
9,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# N-Grams