In [4]:
import pandas as pd
import nltk

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/galogonzalvo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/galogonzalvo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df = pd.read_csv("Spam Email raw text for NLP.csv")
df.head()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6


**category** is the label, 1 is spam, 0 is good mail

**message** is the body of the message

**file_name** is an unique identifier of every file

In [3]:
df['CATEGORY'].value_counts()

0    3900
1    1896
Name: CATEGORY, dtype: int64

In [5]:
tokenizer = nltk.RegexpTokenizer(r'\w+')
test_message = 'Hey,, GGggGG feet it going? <HTML><bads> bads "randoms" badly'

test_message_tokenized = tokenizer.tokenize(test_message) #keeps the important part of the message
test_message_tokenized

['Hey',
 'GGggGG',
 'feet',
 'it',
 'going',
 'HTML',
 'bads',
 'bads',
 'randoms',
 'badly']

In [7]:
test_message_lowercased = [t.lower() for t in test_message_tokenized] #make lowercase
test_message_lowercased

['hey',
 'gggggg',
 'feet',
 'it',
 'going',
 'html',
 'bads',
 'bads',
 'randoms',
 'badly']

In [8]:
from nltk.stem import WordNetLemmatizer

In [10]:
#This is very similar. What it does is to extract meaning of the previous form
lemmatizer = WordNetLemmatizer()
test_message_lemmatized_tokens = [lemmatizer.lemmatize(t) for t in test_message_lowercased]
test_message_lemmatized_tokens

['hey',
 'gggggg',
 'foot',
 'it',
 'going',
 'html',
 'bad',
 'bad',
 'randoms',
 'badly']

In [11]:
from nltk.corpus import stopwords

stopwords = stopwords.words('english') #Eliminates stop words (useless words)

test_message_useful_tokens = [t for t in test_message_lemmatized_tokens if t not in stopwords]
test_message_useful_tokens

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [12]:
def message_to_token_list(s):
    tokens = tokenizer.tokenize(s)
    lowercased_tokens = [t.lower() for t in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lowercased_tokens]
    useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]
    
    return useful_tokens

message_to_token_list(test_message)

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [13]:
#Train and test sample
df = df.sample(frac=1,random_state=1) # Mix the dataset
df = df.reset_index(drop=True) #If drop is set to false (default) the previous index is added as a new column
df.head()


Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"\n\n<HTML><FONT BACK=""#ffffff"" style=""BACKGRO...",00118.141d803810acd9d4fc23db103dddfcd9
1,1,"<html><body bgColor=""#CCCCCC"" topmargin=1 onMo...",00463.0bc4e08af0529dd773d9f10f922547db
2,0,Quoting Paul Linehan (plinehan@yahoo.com):\n\n...,00358.87ee38040ac1f42320c7b89628b1850a
3,0,<a href=http://www.aaronsw.com/weblog/>\n\nAar...,01274.0d083a2d3b30061efdc2cc73ee9e76e3
4,0,"Oh yeah, the link for more info:\n\n\n\nhttp:/...",00756.2b2ec73ad20a4e0bdf31632ac019233b


In [15]:
split_index = int(len(df)*0.8) 
train_df, test_df = df[:split_index], df[split_index:]

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_df, test_df

(      CATEGORY                                            MESSAGE  \
 0            1  \n\n<HTML><FONT  BACK="#ffffff" style="BACKGRO...   
 1            1  <html><body bgColor="#CCCCCC" topmargin=1 onMo...   
 2            0  Quoting Paul Linehan (plinehan@yahoo.com):\n\n...   
 3            0  <a href=http://www.aaronsw.com/weblog/>\n\nAar...   
 4            0  Oh yeah, the link for more info:\n\n\n\nhttp:/...   
 ...        ...                                                ...   
 4631         0  Gregory Alan Bolcer:\n\n>I'm not sure since I ...   
 4632         1  New Account For: zzzz@spamassassin.taint.org\n...   
 4633         0  >>>>> "O" == Owen Byrne <owen@permafrost.net> ...   
 4634         0  This is an automated response to a message you...   
 4635         0  http://www.ouchytheclown.com/welcome.html\n\n\...   
 
                                    FILE_NAME  
 0     00118.141d803810acd9d4fc23db103dddfcd9  
 1     00463.0bc4e08af0529dd773d9f10f922547db  
 2     00358.8

In [16]:
token_counter = {}

for message in train_df['MESSAGE']:
    message_as_token_lst = message_to_token_list(message)
    
    for token in message_as_token_lst:
        if token in token_counter:
            token_counter[token] +=1
        else:
            token_counter[token] =1
len(token_counter) #length of unique words in messages

86415

In [17]:
#Just keep words which appear many times

def keep_token(processed_token,threshold):
    if processed_token not in token_counter:
        return False
    else:
        return token_counter[processed_token] > threshold
    
features = set()
for token in token_counter:
    if keep_token(token,10000):
        features.add(token)
        
#This are the most popular words. Assume we can use them to distinguish fake and true mails
features

{'3d', 'b', 'br', 'com', 'font', 'http', 'p', 'size', 'td', 'tr'}

In [18]:
features = list(features)

token_to_index_mapping = {t:i for t,i in zip(features,range(len(features)))}
token_to_index_mapping

{'com': 0,
 'tr': 1,
 'br': 2,
 'td': 3,
 'b': 4,
 'size': 5,
 'font': 6,
 'http': 7,
 'p': 8,
 '3d': 9}

In [19]:
features

['com', 'tr', 'br', 'td', 'b', 'size', 'font', 'http', 'p', '3d']

In [20]:
#Produce sparse vector: bag of words, based on our main words
import numpy as np

def message_to_count_vector(message):
    count_vector = np.zeros(len(features))
    
    processed_list_of_tokens = message_to_token_list(message)
    
    for token in processed_list_of_tokens:
        if token not in features:
            continue
        index = token_to_index_mapping[token]
        count_vector[index] += 1
    return count_vector

message_to_count_vector('3d b <br> .com bad font font com randoms')

array([2., 0., 1., 0., 1., 0., 2., 0., 0., 1.])

In [21]:
message_to_count_vector(train_df['MESSAGE'].iloc[0])

array([ 9.,  0., 33.,  0.,  2.,  2.,  4.,  6.,  0.,  0.])

In [22]:
train_df.iloc[0]

CATEGORY                                                     1
MESSAGE      \n\n<HTML><FONT  BACK="#ffffff" style="BACKGRO...
FILE_NAME               00118.141d803810acd9d4fc23db103dddfcd9
Name: 0, dtype: object

In [23]:
def df_to_X_y(dff):
    y = dff['CATEGORY'].to_numpy().astype(int)
    
    message_col = dff['MESSAGE']
    count_vectors = []
    
    for message in message_col:
        count_vector = message_to_count_vector(message)
        count_vectors.append(count_vector)
        
    X = np.array(count_vectors).astype(int)
    
    return X, y

In [24]:
X_train, y_train = df_to_X_y(train_df)

X_test, y_test = df_to_X_y(test_df)

In [25]:
#Transform input array so that all numbers are much closer to 0

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

#Use logistic regression model and build classification report
lr = LogisticRegression().fit(X_train,y_train)
print(classification_report(y_test,lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.76      1.00      0.86       788
           1       0.99      0.32      0.48       372

    accuracy                           0.78      1160
   macro avg       0.87      0.66      0.67      1160
weighted avg       0.83      0.78      0.74      1160



In [29]:
# Compare logistic regression to random forest

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier().fit(X_train,y_train)
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.83      0.97      0.89       788
           1       0.91      0.57      0.70       372

    accuracy                           0.84      1160
   macro avg       0.87      0.77      0.80      1160
weighted avg       0.85      0.84      0.83      1160

