In [75]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [84]:
# dataset comes from Kaggle https://www.kaggle.com/nitishabharathi/email-spam-dataset?select=lingSpam.csv

In [53]:
df = pd.read_csv('lingSpam.csv')

In [54]:
df.head()

Unnamed: 0.1,Unnamed: 0,Body,Label
0,0,Subject: great part-time or summer job !\n \n ...,1
1,1,Subject: auto insurance rates too high ?\n \n ...,1
2,2,Subject: do want the best and economical hunti...,1
3,3,Subject: email 57 million people for $ 99\n \n...,1
4,4,Subject: do n't miss these !\n \n attention ! ...,1


In [55]:
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,Body,Label
0,Subject: great part-time or summer job !\n \n ...,1
1,Subject: auto insurance rates too high ?\n \n ...,1
2,Subject: do want the best and economical hunti...,1
3,Subject: email 57 million people for $ 99\n \n...,1
4,Subject: do n't miss these !\n \n attention ! ...,1


In [56]:
df.Label.value_counts()

0    2172
1     433
Name: Label, dtype: int64

## Preprocessing

In [57]:
def preprocess(text):
    # remove punctuation
    puncts = string.punctuation
    for punct in puncts:
        text = text.replace(punct, '')
    # lowercase
    text = text.lower()
    # remove numbers
    text = ''.join(word for word in text if not word.isdigit())
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    text = [w for w in word_tokens if not w in stop_words]
    #text = ' '.join(text)
    return text

df['Body_clean'] = df['Body'].apply(preprocess)
df

Unnamed: 0,Body,Label,Body_clean
0,Subject: great part-time or summer job !\n \n ...,1,"[subject, great, parttime, summer, job, displa..."
1,Subject: auto insurance rates too high ?\n \n ...,1,"[subject, auto, insurance, rates, high, dear, ..."
2,Subject: do want the best and economical hunti...,1,"[subject, want, best, economical, hunting, vac..."
3,Subject: email 57 million people for $ 99\n \n...,1,"[subject, email, million, people, million, ema..."
4,Subject: do n't miss these !\n \n attention ! ...,1,"[subject, nt, miss, attention, warning, adults..."
...,...,...,...
2600,Subject: computationally - intensive methods i...,0,"[subject, computationally, intensive, methods,..."
2601,Subject: books : a survey of american linguist...,0,"[subject, books, survey, american, linguistics..."
2602,Subject: wecol ' 98 - - western conference on ...,0,"[subject, wecol, western, conference, linguist..."
2603,Subject: euralex ' 98 - revised programme\n \n...,0,"[subject, euralex, revised, programme, euralex..."


In [58]:
# lemmatize

def lemmat(text):
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text                                
                                    
df['Body_clean'] = df['Body_clean'].apply(lemmat)
df                                      

Unnamed: 0,Body,Label,Body_clean
0,Subject: great part-time or summer job !\n \n ...,1,subject great parttime summer job display box ...
1,Subject: auto insurance rates too high ?\n \n ...,1,subject auto insurance rate high dear nlpeople...
2,Subject: do want the best and economical hunti...,1,subject want best economical hunting vacation ...
3,Subject: email 57 million people for $ 99\n \n...,1,subject email million people million email add...
4,Subject: do n't miss these !\n \n attention ! ...,1,subject nt miss attention warning adult warnin...
...,...,...,...
2600,Subject: computationally - intensive methods i...,0,subject computationally intensive method quant...
2601,Subject: books : a survey of american linguist...,0,subject book survey american linguistics publi...
2602,Subject: wecol ' 98 - - western conference on ...,0,subject wecol western conference linguistics a...
2603,Subject: euralex ' 98 - revised programme\n \n...,0,subject euralex revised programme euralex lieg...


In [62]:
# vectorize to a bag-of-words representation

vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(df['Body_clean'])
X_vec = X_vec.toarray()

In [63]:
X_vec

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [50, 98, 40, ..., 10,  2,  2]], dtype=int64)

## MultinomialNB model

In [74]:
X = vectorizer.fit_transform(df.Body_clean)
y = df.Label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

nb_model = MultinomialNB()
nb_model.fit(X_train,y_train)
nb_model.score(X_test,y_test)

0.9923224568138196

## Prediction 

In [76]:
# testing against testing set 
y_pred = nb_model.predict(X_test) 
print(confusion_matrix(y_test, y_pred))

[[435   1]
 [  3  82]]


In [77]:
y_pred

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [82]:
def predicting(new_text):
    new_text = vectorizer.transform([new_text])
    prediction = nb_model.predict(new_text)
    return prediction[0]