# NLP with NLTK

## Data Cleansing

In this notebook we will process the dataset, train the models, and save the best model. Link to download the dataset from kaggle.
https://www.kaggle.com/chandramoulinaidu/spam-classification-for-basic-nlp

In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, nltk

In [2]:
data = pd.read_csv('dataset/spam_email_raw_text.csv')
display(data.head())

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6


In [3]:
data.drop(columns = ['FILE_NAME'], inplace = True)

In [4]:
print(data.shape)

(5796, 2)


In [5]:
print(data.CATEGORY.value_counts())

0    3900
1    1896
Name: CATEGORY, dtype: int64


with `1` means spam and `0` means not spam, we have an unbalanced dataset

In [6]:
# example of a spam email
print(data.MESSAGE[0])

Dear Homeowner,

 

Interest Rates are at their lowest point in 40 years!



We help you find the best rate for your situation by

matching your needs with hundreds of lenders!



Home Improvement, Refinance, Second Mortgage,

Home Equity Loans, and More! Even with less than

perfect credit!



This service is 100% FREE to home owners and new

home buyers without any obligation. 



Just fill out a quick, simple form and jump-start

your future plans today!





Visit http://61.145.116.186/user0201/index.asp?Afft=QM10













To unsubscribe, please visit:



http://61.145.116.186/light/watch.asp







In [7]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
test_message = data.MESSAGE[0]
test_message_tokenized = tokenizer.tokenize(test_message)
print(test_message_tokenized)

['Dear', 'Homeowner', 'Interest', 'Rates', 'are', 'at', 'their', 'lowest', 'point', 'in', '40', 'years', 'We', 'help', 'you', 'find', 'the', 'best', 'rate', 'for', 'your', 'situation', 'by', 'matching', 'your', 'needs', 'with', 'hundreds', 'of', 'lenders', 'Home', 'Improvement', 'Refinance', 'Second', 'Mortgage', 'Home', 'Equity', 'Loans', 'and', 'More', 'Even', 'with', 'less', 'than', 'perfect', 'credit', 'This', 'service', 'is', '100', 'FREE', 'to', 'home', 'owners', 'and', 'new', 'home', 'buyers', 'without', 'any', 'obligation', 'Just', 'fill', 'out', 'a', 'quick', 'simple', 'form', 'and', 'jump', 'start', 'your', 'future', 'plans', 'today', 'Visit', 'http', '61', '145', '116', '186', 'user0201', 'index', 'asp', 'Afft', 'QM10', 'To', 'unsubscribe', 'please', 'visit', 'http', '61', '145', '116', '186', 'light', 'watch', 'asp']


In [8]:
test_message_lower = [string.lower() for string in test_message_tokenized]
print(test_message_lower)

['dear', 'homeowner', 'interest', 'rates', 'are', 'at', 'their', 'lowest', 'point', 'in', '40', 'years', 'we', 'help', 'you', 'find', 'the', 'best', 'rate', 'for', 'your', 'situation', 'by', 'matching', 'your', 'needs', 'with', 'hundreds', 'of', 'lenders', 'home', 'improvement', 'refinance', 'second', 'mortgage', 'home', 'equity', 'loans', 'and', 'more', 'even', 'with', 'less', 'than', 'perfect', 'credit', 'this', 'service', 'is', '100', 'free', 'to', 'home', 'owners', 'and', 'new', 'home', 'buyers', 'without', 'any', 'obligation', 'just', 'fill', 'out', 'a', 'quick', 'simple', 'form', 'and', 'jump', 'start', 'your', 'future', 'plans', 'today', 'visit', 'http', '61', '145', '116', '186', 'user0201', 'index', 'asp', 'afft', 'qm10', 'to', 'unsubscribe', 'please', 'visit', 'http', '61', '145', '116', '186', 'light', 'watch', 'asp']


In [9]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
test_message_lemmatized = [lemmatizer.lemmatize(string) for string in test_message_lower]
print(test_message_lemmatized)

['dear', 'homeowner', 'interest', 'rate', 'are', 'at', 'their', 'lowest', 'point', 'in', '40', 'year', 'we', 'help', 'you', 'find', 'the', 'best', 'rate', 'for', 'your', 'situation', 'by', 'matching', 'your', 'need', 'with', 'hundred', 'of', 'lender', 'home', 'improvement', 'refinance', 'second', 'mortgage', 'home', 'equity', 'loan', 'and', 'more', 'even', 'with', 'le', 'than', 'perfect', 'credit', 'this', 'service', 'is', '100', 'free', 'to', 'home', 'owner', 'and', 'new', 'home', 'buyer', 'without', 'any', 'obligation', 'just', 'fill', 'out', 'a', 'quick', 'simple', 'form', 'and', 'jump', 'start', 'your', 'future', 'plan', 'today', 'visit', 'http', '61', '145', '116', '186', 'user0201', 'index', 'asp', 'afft', 'qm10', 'to', 'unsubscribe', 'please', 'visit', 'http', '61', '145', '116', '186', 'light', 'watch', 'asp']


In [10]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
test_message_important = [string for string in test_message_lemmatized if string not in stop_words]
print(test_message_important)

['dear', 'homeowner', 'interest', 'rate', 'lowest', 'point', '40', 'year', 'help', 'find', 'best', 'rate', 'situation', 'matching', 'need', 'hundred', 'lender', 'home', 'improvement', 'refinance', 'second', 'mortgage', 'home', 'equity', 'loan', 'even', 'le', 'perfect', 'credit', 'service', '100', 'free', 'home', 'owner', 'new', 'home', 'buyer', 'without', 'obligation', 'fill', 'quick', 'simple', 'form', 'jump', 'start', 'future', 'plan', 'today', 'visit', 'http', '61', '145', '116', '186', 'user0201', 'index', 'asp', 'afft', 'qm10', 'unsubscribe', 'please', 'visit', 'http', '61', '145', '116', '186', 'light', 'watch', 'asp']


In [11]:
def text_to_tokens(message):
    message_tokenized = tokenizer.tokenize(message)
    message_lower = [string.lower() for string in message_tokenized]
    message_lemmatized = [lemmatizer.lemmatize(string) for string in message_lower]
    message_important = [string for string in message_lemmatized if string not in stop_words]
    return message_important

In [12]:
print(text_to_tokens(data.MESSAGE[0]))

['dear', 'homeowner', 'interest', 'rate', 'lowest', 'point', '40', 'year', 'help', 'find', 'best', 'rate', 'situation', 'matching', 'need', 'hundred', 'lender', 'home', 'improvement', 'refinance', 'second', 'mortgage', 'home', 'equity', 'loan', 'even', 'le', 'perfect', 'credit', 'service', '100', 'free', 'home', 'owner', 'new', 'home', 'buyer', 'without', 'obligation', 'fill', 'quick', 'simple', 'form', 'jump', 'start', 'future', 'plan', 'today', 'visit', 'http', '61', '145', '116', '186', 'user0201', 'index', 'asp', 'afft', 'qm10', 'unsubscribe', 'please', 'visit', 'http', '61', '145', '116', '186', 'light', 'watch', 'asp']


In [13]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, random_state = 42, stratify = data.CATEGORY)
train.reset_index(inplace = True, drop = True)
test.reset_index(inplace = True, drop = True)

In [14]:
train.CATEGORY.value_counts(normalize = True)

0    0.672878
1    0.327122
Name: CATEGORY, dtype: float64

In [15]:
test.CATEGORY.value_counts(normalize = True)

0    0.672878
1    0.327122
Name: CATEGORY, dtype: float64

In [16]:
token_counter = {}

for message in train.MESSAGE:
    tokens = text_to_tokens(message)
    for token in tokens:
        if token in token_counter:
            token_counter[token] += 1
        else:
            token_counter[token] = 1

print(len(token_counter))

81624


In [17]:
# function to access the most frequent tokens based on a threshold
def keep_token(token, threshold):
    if token not in token_counter:
        return False
    else:
        return token_counter[token] > threshold

print(keep_token('http', 10000))

True


In [18]:
# getting tokens repeated more than 1000 times
features = []

for token in token_counter:
    if keep_token(token, 1000):
        features.append(token)

print(features)

['30', '2002', 'wrote', '10', '8', 'go', 'linux', 'border', 'year', 'know', 'ie', 'web', 'http', 'user', 'group', 'ilug', 'www', 'mailman', 'listinfo', 'information', 'list', 'would', 'could', 'u', 'message', 'mailto', 'net', '6', '22', 'c', 'way', 'com', 'click', '3', '2', 'spamassassin', 'org', 'subject', 'b', 'like', 'people', '1', 'want', 'work', 'ha', 'system', 'wa', 'right', 'server', 'service', 'program', 'time', 'need', '4', 'day', 'even', 'one', 'may', 'see', 'get', 'use', 'also', '50', 'mail', 'please', 'world', 'content', 'type', 'text', '5', '00', '0', '20', 'change', 'e', 'form', 'name', 'state', 'mailing', 'html', 'head', 'title', '3d', 'body', 'bgcolor', 'ffffff', '000000', 'table', 'width', 'cellpadding', 'cellspacing', 'align', 'center', 'tr', 'td', '100', 'img', 'src', 'image', 'height', 'gif', 'br', 'font', 'size', 'face', 'arial', 'helvetica', 'sans', 'serif', 'div', 'nbsp', 'href', 'colspan', 'color', 'input', 'email', 'valign', 'value', 'p', 'make', 'free', 'link'

In [19]:
def text_to_count_vector(message):
    base_dict = {word:0 for word in features}
    tokens = text_to_tokens(message)
    for token in tokens:
        if token in features:
            base_dict[token] += 1
    count_vector = np.array(list(base_dict.values()))
    return count_vector

print(text_to_count_vector(data.MESSAGE[0]))

[0 0 0 0 0 0 0 0 1 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0
 0 0 0 0 0 0 0 0]


In [20]:
def df_to_X_y(df):
    y = df.CATEGORY.to_numpy().astype(int)
    messages = df.MESSAGE
    count_vectors = []
    for message in messages:
        count_vector = text_to_count_vector(message)
        count_vectors.append(count_vector)
    X = np.array(count_vectors).astype(int)
    return X, y

In [21]:
X_train, y_train = df_to_X_y(train)
X_test, y_test = df_to_X_y(test)

In [22]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4347, 156), (4347,), (1449, 156), (1449,))

## Model Training

In [23]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier().fit(X_train, y_train)

In [24]:
from sklearn.metrics import classification_report
print(classification_report(y_test, rfc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       975
           1       0.97      0.95      0.96       474

    accuracy                           0.98      1449
   macro avg       0.97      0.97      0.97      1449
weighted avg       0.98      0.98      0.98      1449



## Saving model and tokenizer

In [25]:
import json, joblib
joblib.dump(rfc, 'nltk_model.joblib')
fid = open('nltk_features.json', 'w')
json.dump(features, fid)
fid.close()