In [1]:
import pandas as pd
import numpy as nm

#for Label Encoding
from sklearn.preprocessing import LabelEncoder

#for Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#for Data splitting
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv(r'C:\Users\Acer\Desktop\Machine Learning Models\Dataset\email_dataset.csv')
df.shape

(5695, 2)

In [3]:
df

Unnamed: 0,text,spam
0,Subject: logistics for the sycamore meeting in...,0
1,Subject: ll visa - anshuman shrivastava anshu...,0
2,"Subject: re : for your approval erica , yes ...",0
3,Subject: new medz how t cornel o save on your...,1
4,"Subject: tiger team info vince , here is the...",0
...,...,...
5690,"Subject: benchmarking study sally , i gave y...",0
5691,Subject: energy book vl . 0 vince : i have r...,0
5692,Subject: re : friday brown bag lunch on option...,0
5693,Subject: interim report to gary hickerson for ...,0


In [4]:
df.shape

(5695, 2)

In [5]:
df['spam'].value_counts()

0    4327
1    1368
Name: spam, dtype: int64

# DATA PRE-PROCESSING

In [6]:
df=df.drop_duplicates(keep='first')

In [7]:
df

Unnamed: 0,text,spam
0,Subject: logistics for the sycamore meeting in...,0
1,Subject: ll visa - anshuman shrivastava anshu...,0
2,"Subject: re : for your approval erica , yes ...",0
3,Subject: new medz how t cornel o save on your...,1
4,"Subject: tiger team info vince , here is the...",0
...,...,...
5690,"Subject: benchmarking study sally , i gave y...",0
5691,Subject: energy book vl . 0 vince : i have r...,0
5692,Subject: re : friday brown bag lunch on option...,0
5693,Subject: interim report to gary hickerson for ...,0


#### Word Count

In [8]:
df['word_count'] = df['text'].apply(lambda x: len(str(x).split(" ")))
df[['text','word_count']].head()

Unnamed: 0,text,word_count
0,Subject: logistics for the sycamore meeting in...,755
1,Subject: ll visa - anshuman shrivastava anshu...,416
2,"Subject: re : for your approval erica , yes ...",341
3,Subject: new medz how t cornel o save on your...,107
4,"Subject: tiger team info vince , here is the...",315


In [9]:
df.word_count.describe()

count    5695.000000
mean      358.137313
std       452.047939
min         3.000000
25%       112.500000
50%       232.000000
75%       439.000000
max      8984.000000
Name: word_count, dtype: float64

# DATA CLEANING

In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer("english")
porter_stemmer = PorterStemmer()

from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

stop_words=stopwords.words("english")

### Removing Punctuation Marks

In [11]:
import string
def dataClean(text):
    message = re.sub('[^a-zA-Z]',' ', text)
    message = message.split()
    return " ".join(message)

df['text'] = df['text'].apply(dataClean)
df.head(n = 10)

Unnamed: 0,text,spam,word_count
0,Subject logistics for the sycamore meeting in ...,0,755
1,Subject ll visa anshuman shrivastava anshuman ...,0,416
2,Subject re for your approval erica yes no prob...,0,341
3,Subject new medz how t cornel o save on your m...,1,107
4,Subject tiger team info vince here is the info...,0,315
5,Subject professor bambos visit shirley profess...,0,83
6,Subject re mscf speaker series recruitment tha...,0,162
7,Subject re your visit to enron joe fyi please ...,0,1166
8,Subject re from larry roberts thanks for the r...,0,87
9,Subject fw resume for vince kaminski we just r...,0,310


### Converting to Lowercase

In [12]:
import string
def lowercase(text):
    message = text.lower()
    return message

df['text'] = df['text'].apply(lowercase)
df.head(n = 10)

Unnamed: 0,text,spam,word_count
0,subject logistics for the sycamore meeting in ...,0,755
1,subject ll visa anshuman shrivastava anshuman ...,0,416
2,subject re for your approval erica yes no prob...,0,341
3,subject new medz how t cornel o save on your m...,1,107
4,subject tiger team info vince here is the info...,0,315
5,subject professor bambos visit shirley profess...,0,83
6,subject re mscf speaker series recruitment tha...,0,162
7,subject re your visit to enron joe fyi please ...,0,1166
8,subject re from larry roberts thanks for the r...,0,87
9,subject fw resume for vince kaminski we just r...,0,310


### Stemming & Lemmatisation

In [13]:
import string
def stemming_lemmatisation(text):
    message = text.split()
    words = [snowball_stemmer.stem(word) for word in message if word not in stop_words]
    lem = WordNetLemmatizer()
    words = [lem.lemmatize(word) for word in message if not word in  stop_words]
    return " ".join(words)

df["text"] = df["text"].apply(stemming_lemmatisation)
df.head(n = 10)

Unnamed: 0,text,spam,word_count
0,subject logistics sycamore meeting chelmsford ...,0,755
1,subject visa anshuman shrivastava anshuman ple...,0,416
2,subject approval erica yes problem vince infor...,0,341
3,subject new medz cornel save medlcations pharm...,1,107
4,subject tiger team info vince info tiger team ...,0,315
5,subject professor bambos visit shirley profess...,0,83
6,subject mscf speaker series recruitment thanks...,0,162
7,subject visit enron joe fyi please plan attend...,0,1166
8,subject larry robert thanks referral matter wa...,0,87
9,subject fw resume vince kaminski received resu...,0,310


# FEATURE EXTRACTION AND VECTORIZATION

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(df['text'],df['spam'],test_size=0.2,random_state=57)

In [16]:
X_train

1568    subject phone time dear dr kaminski thanks arr...
4920    subject christmas basket kevin please add copy...
4618    subject electricity conference update apex con...
4809    subject congratulation vince congratulation pr...
1553    subject fma european conference john book arri...
                              ...                        
3151    subject charles shen molly would tanya vince e...
4502    subject price cap forwarded vladimir gorny hou...
3077    subject yaron resume kevin would greatly appre...
3798    subject fma european conference fine want chan...
3023    subject works good want know save overrode pii...
Name: text, Length: 4556, dtype: object

In [17]:
def features_transform(mail_text):
    #form bag of words using countvectorizer
    transformer = CountVectorizer(max_features=27000).fit(X_train)
    bag_of_words = transformer.transform(mail_text)
     
    #apply the TF-IDF transform to the output of BOW
    tfidf_transformer = TfidfTransformer().fit(bag_of_words)
    tfidf_texts = tfidf_transformer.transform(bag_of_words)
    
    #return result of transforms
    return tfidf_texts

In [None]:
cv=CountVectorizer(max_features=27000).fit(X_train)

In [19]:
X_train_features=features_transform(X_train)
X_test_features=features_transform(X_test)

In [20]:
import pickle
pickle.dump(transformer, open('transform.pkl', 'wb'))

NameError: name 'transformer' is not defined

In [None]:
X_train_features.shape

In [None]:
df['text'][8]

In [None]:
print(X_train_features.shape[5][:])

# MODEL TRAINING

In [None]:
def generate_accuracy_and_heatmap(model, x, y): # This is a multi label classification problem
    confusion_m = confusion_matrix(y,model.predict(x))
    sns.heatmap(confusion_m,annot=True,fmt="d")
    ac="{:.2f}".format(accuracy_score(y,model.predict(x))*100)
    print('Accuracy : ', ac,'%')
    print("Y axis : Actual Value.")
    print("X axis : Predicted Value.")
    return 1

## Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
clf_mnb = MultinomialNB()
clf_mnb.fit(X_train_features, Y_train)

In [None]:
generate_accuracy_and_heatmap(clf_mnb, X_test_features, Y_test)

## Complement Naive Bayes

In [None]:
from sklearn.naive_bayes import ComplementNB
clf_cnb = ComplementNB()
clf_cnb.fit(X_train_features, Y_train)

In [None]:
generate_accuracy_and_heatmap(clf_cnb, X_test_features, Y_test)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
clf_DT=DecisionTreeClassifier(random_state=0 , max_depth=18)
clf_DT.fit(X_train_features, Y_train)

In [None]:
generate_accuracy_and_heatmap(clf_DT, X_test_features, Y_test)

## XGBoost Classifier

In [None]:
import xgboost as xgb
xgb_xgb = xgb.XGBClassifier()
xgb_xgb.fit(X_train_features, Y_train)

In [None]:
generate_accuracy_and_heatmap(xgb_xgb, X_test_features, Y_test)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(criterion='entropy', max_depth= 9, max_features= 'sqrt', n_estimators= 200, random_state=34)
clf_rf.fit(X_train_features, Y_train)

In [None]:
generate_accuracy_and_heatmap(clf_rf, X_test_features, Y_test)

## Light Gradient Boosting Machine

In [None]:
import lightgbm as lgb
clf_lgbm = lgb.LGBMClassifier()
clf_lgbm.fit(X_train_features, Y_train)

In [None]:
generate_accuracy_and_heatmap(clf_lgbm, X_test_features, Y_test)

# Result Visualisation

In [None]:
import matplotlib.pyplot as plt

accuracy_dict = {'MNB':87.27,'CNB':94.29,'DT':96.14,'XGB': 97.45,'RF':79.19, 'LGBM':98.42}
models = list(accuracy_dict.keys())
accuracies = list(accuracy_dict.values())
fig = plt.figure(figsize = (5, 5))
#  Bar plot
plt.bar(models, accuracies, color ='coral',width = 0.5)
plt.xlabel("Models")
plt.ylabel("Accuracy")
plt.title("Accuracy  Comparison")
plt.show()

# Transforming into Pickle file

In [None]:
import pickle
pickle.dump(clf_lgbm,open('SpamEmailDetection.pkl','wb'))