In [4]:
import nltk

In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
nltk.download_shell()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> stopwords


    Downloading package stopwords to
        C:\Users\chandra.dhiraj\AppData\Roaming\nltk_data...
      Package stopwords is already up-to-date!



---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


### Reading and exploring dataset

In [5]:
rawData = open('SMSSpamCollection').read()

In [6]:
import pandas as pd

In [7]:
data = pd.read_csv('SMSSpamCollection', sep='\t',names=['label', 'bodytext'])
data.head()

Unnamed: 0,label,bodytext
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
print(f'Input data has {len(data)} rows and {len(data.columns)} columns')

Input data has 5572 rows and 2 columns


In [9]:
print(f"Out of {len(data)} rows there are {len(data[data['label']=='spam'])} spam and {len(data[data['label']=='ham'])} ham")

Out of 5572 rows there are 747 spam and 4825 ham


In [10]:
print(f"Number of null values in label is {data['label'].isnull().sum()}")
print(f"Number of null values in Text is {data['bodytext'].isnull().sum()}")

Number of null values in label is 0
Number of null values in Text is 0


### Preprocessing dataset

Importing libraries

In [11]:
import string #for punctuation removal
import re #for splitting sentence into words
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.stem import WordNetLemmatizer
wn = WordNetLemmatizer()

Remove Punctuation

In [12]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
def remove_punct(text):
    no_punct = "".join([char for char in text if char not in string.punctuation])
    return no_punct

In [15]:
data['bodytext_without_punct'] = data['bodytext'].apply(lambda x: remove_punct(x))

In [16]:
data.head()

Unnamed: 0,label,bodytext,bodytext_without_punct
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...


Tokenization or splitting

In [17]:
def tokenize(text):
    tokens = re.split('\W+',text)
    return tokens

In [18]:
data['tokenize_bodytext'] = data['bodytext_without_punct'].apply(lambda x: tokenize(x.lower()))

In [19]:
data.head()

Unnamed: 0,label,bodytext,bodytext_without_punct,tokenize_bodytext
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


Remove stopwords

In [20]:
stopword = nltk.corpus.stopwords.words('english')

In [21]:
def remove_stopwords(tokenize_list):
    text = [word for word in tokenize_list if word not in stopword]
    return text

In [22]:
data['nostop_bodytext'] = data['tokenize_bodytext'].apply(lambda x: remove_stopwords(x))

In [23]:
data.head()

Unnamed: 0,label,bodytext,bodytext_without_punct,tokenize_bodytext,nostop_bodytext
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t..."


stemming

In [24]:
def stemming(tokenize_text):
    text = [ps.stem(word) for word in tokenize_text]
    return text

In [25]:
data['Stemmed_text'] = data['nostop_bodytext'].apply(lambda x: stemming(x))

In [26]:
data.head()

Unnamed: 0,label,bodytext,bodytext_without_punct,tokenize_bodytext,nostop_bodytext,Stemmed_text
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho..."


Lemmatizer

In [27]:
def lemmatizing(tokenize_text):
    text = [wn.lemmatize(word) for word in tokenize_text]
    return text

In [28]:
data['lemmatized_text'] = data['nostop_bodytext'].apply(lambda x: lemmatizing(x))

In [29]:
data.head()

Unnamed: 0,label,bodytext,bodytext_without_punct,tokenize_bodytext,nostop_bodytext,Stemmed_text,lemmatized_text
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, go, usf, life, around, though]"


Preprocessing data function

In [34]:
def content_process(text):
    no_punct = "".join([char for char in text if char not in string.punctuation])
    tokens = re.split('\W+',no_punct)
    no_stopwords = [word for word in tokens if word not in stopword]
    #stemmed = [ps.stem(word) for word in no_stopwords]
    lemmat = [wn.lemmatize(word) for word in no_stopwords]
    return lemmat

In [35]:
sample = 'This is a sample message! Please note: it has punctuation.'
content_process(sample)

['This', 'sample', 'message', 'Please', 'note', 'punctuation']

Apply CountVectorizer

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
bow = CountVectorizer(analyzer=content_process).fit(data['bodytext'])

In [39]:
len(bow.vocabulary_)

11045

In [40]:
data_bow = bow.transform(data['bodytext'])

In [41]:
print('Shape of sparse matrix: ',data_bow.shape)
print('Amount of non-zero occurances: ', data_bow.nnz)

Shape of sparse matrix:  (5572, 11045)
Amount of non-zero occurances:  56326


Term weighting and Normalization with TF-IDF

In [42]:
from sklearn.feature_extraction.text import TfidfTransformer

In [48]:
tfidf = TfidfTransformer().fit(data_bow)

In [49]:
data_tfidf = tfidf.transform(data_bow)

In [50]:
data_tfidf.shape

(5572, 11045)

Splitting data into TrainTest

In [51]:
from sklearn.model_selection import train_test_split

In [53]:
X = data_tfidf
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Training the Model

In [54]:
from sklearn.ensemble import RandomForestClassifier

In [55]:
rfc = RandomForestClassifier().fit(X_train,y_train)

In [56]:
prediction = rfc.predict(X_test)

Evaluation of the Model

In [57]:
from sklearn.metrics import classification_report

In [59]:
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1448
        spam       1.00      0.83      0.91       224

    accuracy                           0.98      1672
   macro avg       0.99      0.92      0.95      1672
weighted avg       0.98      0.98      0.98      1672

