# Part 0:Importing modules and pre-wrangling

In [1]:
# Importing modules

import pandas as pd
import csv
import nltk
import json
import re
import os

In [2]:
# Specify data directory

data_dir = os.path.join(os.path.dirname(os.getcwd()),'Data')

In [3]:
# Specify data path

data_path = os.path.join(data_dir,'SMSSpamCollection.txt')

In [4]:
# Importing raw text

df_raw = pd.read_csv(data_path, delimiter = '\t', header = None)
df_raw.columns = ['label', 'text']
df_raw.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Derive the label array
from sklearn.preprocessing import LabelEncoder

# convert class labels to binary values, 0 = ham and 1 = spam
encoder = LabelEncoder()
y = encoder.fit_transform(df_raw.label)

print(y[:10])

[0 0 1 0 0 1 0 0 1 1]


# Part 1: Processing various text features

In [6]:
def regex_preprocess(df):
    """Replace email addresses, urls, money and phone numbers with placeholders 
    """
    for index, row in df.iterrows():
            # Replace email addresses with 'EmAd'
            row['text'] = re.sub(r'[^\s]+@.[^\s]+', '{EmAd}', row['text'])

            # Replace URLs with 'Url'
            row['text'] = re.sub(r'http[^\s]+', '{Url}', row['text'])

            # Replace money symbols with 'MoSy'
            row['text'] = re.sub(r'£|\$', '{MoSy}', row['text'])

            # Replace 10 or 11 digit phone numbers
            row['text'] = re.sub(r'0?(\d{10,}?)','{PhNu}', row['text'])
    return df

In [7]:
df_processed = regex_preprocess(df_raw)

In [8]:
df_processed['label'] = y

In [9]:
# Write the dataframe

df_processed.to_json(os.path.join(data_dir,'sms_processed.json'))

# Part 2: Tokenizing text

In [10]:
# Tokenization of text

df_processed['token'] = df_raw.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
df_tokens = df_processed.drop('text', axis = 1)
df_tokens.head()

Unnamed: 0,label,token
0,0,"[Go, until, jurong, point, ,, crazy.., Availab..."
1,0,"[Ok, lar, ..., Joking, wif, u, oni, ...]"
2,1,"[Free, entry, in, 2, a, wkly, comp, to, win, F..."
3,0,"[U, dun, say, so, early, hor, ..., U, c, alrea..."
4,0,"[Nah, I, do, n't, think, he, goes, to, usf, ,,..."


In [11]:
# Removing punctuations and numerics

df_tokens['token'] = df_tokens.apply(lambda row: [word for word in row['token'] if word.isalpha()], axis = 1)
df_tokens.head()

Unnamed: 0,label,token
0,0,"[Go, until, jurong, point, Available, only, in..."
1,0,"[Ok, lar, Joking, wif, u, oni]"
2,1,"[Free, entry, in, a, wkly, comp, to, win, FA, ..."
3,0,"[U, dun, say, so, early, hor, U, c, already, t..."
4,0,"[Nah, I, do, think, he, goes, to, usf, he, liv..."


In [12]:
# Converting to lowercase

df_tokens['token'] = df_tokens.apply(lambda row: [word.lower() for word in row['token']], axis = 1)
df_tokens.head()

Unnamed: 0,label,token
0,0,"[go, until, jurong, point, available, only, in..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,0,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,"[nah, i, do, think, he, goes, to, usf, he, liv..."


In [13]:
# Removing stopwords

from nltk.corpus import stopwords
df_tokens['token'] = df_tokens.apply(lambda row: [word for word in row['token'] if not word in stopwords.words('english')], axis = 1)
df_tokens.head()

Unnamed: 0,label,token
0,0,"[go, jurong, point, available, bugis, n, great..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"[nah, think, goes, usf, lives, around, though]"


In [14]:
# Lemmatizing words

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
df_tokens['token'] = df_tokens.apply(lambda row: [porter.stem(word) for word in row['token']], axis = 1)
df_tokens.head()

Unnamed: 0,label,token
0,0,"[go, jurong, point, avail, bugi, n, great, wor..."
1,0,"[ok, lar, joke, wif, u, oni]"
2,1,"[free, entri, wkli, comp, win, fa, cup, final,..."
3,0,"[u, dun, say, earli, hor, u, c, alreadi, say]"
4,0,"[nah, think, goe, usf, live, around, though]"


In [15]:
# Filtering out the infrequent words

# Create a document with all the tokens

all_text = []
for row in df_processed['token']:
    all_text += row

    
# Create an infrequent word list

all_fdist = nltk.FreqDist(all_text)
uncommon_list = [word for word in all_text if all_fdist[word] <20]


# Filter the dataframe of the infrequent words

df_final = df_tokens
df_final['token'] = df_final.apply(lambda row: [word for word in row['token'] if not word in uncommon_list], axis = 1)
df_final.head()

Unnamed: 0,label,token
0,0,"[go, avail, bugi, n, great, world, e, got, amo..."
1,0,"[ok, lar, wif, u]"
2,1,"[free, entri, wkli, win, fa, tkt, may, text, f..."
3,0,"[u, dun, say, earli, u, c, alreadi, say]"
4,0,"[nah, think, goe, live, around, though]"


In [16]:
# Write the dataframe

df_final.to_json(os.path.join(data_dir,'sms_final.json'))