In [1]:
import numpy as np 
import pandas as pd 
import nltk
from nltk.corpus import stopwords
import string
import os
import re
import matplotlib.pyplot as plt

In [2]:
# Load the e-mail corpus into a DataFrame; read_csv's default comma delimiter is used.
df = pd.read_csv('emails.txt') # file is parsed as CSV despite the .txt extension

In [5]:
df.tail(100)

Unnamed: 0,text,spam
5628,Subject: retail markets conference i would li...,0
5629,Subject: re : friday morning meeting ? vince ...,0
5630,Subject: membership mixer tomorrow - paesanos ...,0
5631,Subject: re : your comments on metals var mode...,0
5632,Subject: term project : this is the list of p...,0
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [6]:
df.shape

(5728, 2)

In [7]:
df.spam.value_counts()

0    4360
1    1368
Name: spam, dtype: int64

In [8]:
df.columns

Index(['text', 'spam'], dtype='object')

In [9]:
# Remove rows that are exact duplicates, mutating df in place. The index is not
# reset here, so row labels keep gaps where duplicates were dropped.
df.drop_duplicates(inplace = True)

In [10]:
df.shape

(5695, 2)

In [11]:
df.spam.value_counts()

0    4327
1    1368
Name: spam, dtype: int64

In [12]:
df.isnull().sum()

text    0
spam    0
dtype: int64

In [13]:
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer

In [14]:
sent1 = "Subject: why is this"

In [15]:
sent2 = re.sub("Subject:","",sent1)
sent2

' why is this'

In [16]:
tokens = word_tokenize(sent1.lower())
tokens

['subject', ':', 'why', 'is', 'this']

In [18]:
# Patterns are raw strings (so "\s" / "\w" are not treated as invalid string
# escapes) and are compiled once instead of on every call.
_WHITESPACE_RUN = re.compile(r"\s+")
_NOT_WORD_OR_SPACE = re.compile(r"[^\w\s]")

# Words dropped in addition to NLTK's English stop-word list.
_EXTRA_STOPWORDS = ("would", "could", "told")

# Lazily-built stop-word set shared by all clean_txt calls.
_stopword_cache = None


def _get_stopwords():
    """Return the stop-word set, reading the NLTK corpus only on first use."""
    global _stopword_cache
    if _stopword_cache is None:
        _stopword_cache = frozenset(stopwords.words("english")).union(_EXTRA_STOPWORDS)
    return _stopword_cache


def clean_txt(sent):
    """Normalise one raw e-mail string into a space-separated string of tokens.

    Steps, in order:
      1. strip leading/trailing whitespace and collapse whitespace runs to one space;
      2. delete every literal, case-sensitive occurrence of "Subject:";
      3. delete every character that is neither a word character (letter,
         digit or underscore) nor whitespace;
      4. lower-case the text and tokenise it with NLTK's ``word_tokenize``;
      5. drop English stop words, the extra words "would"/"could"/"told",
         and any token of two characters or fewer.

    Parameters
    ----------
    sent : str
        Raw message text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string if none).
    """
    result = _WHITESPACE_RUN.sub(" ", sent.strip())
    result = re.sub("Subject:", "", result)
    result = _NOT_WORD_OR_SPACE.sub("", result)

    tokens = word_tokenize(result.lower())
    # Set membership is O(1); the original rebuilt a list and scanned it per token.
    stop_updated = _get_stopwords()
    kept = [term for term in tokens if term not in stop_updated and len(term) > 2]
    return " ".join(kept)

In [80]:
df.iloc[5011]['text']

'Subject: bios of mit participants  vince : bios from mit participants did arrive - - - see below . amy  donald r . lessard  deputy dean ; epoch foundation professor of international management  office e 52 - 474  tel 617 - 253 - 6688  ? ? ? ? ? ? ? ? ? ? ?  lessard \' s current research is focused on the shaping and managing of risks in  large engineering projects , the globalization of financial services , and  knowledge development within multinational firms . as deputy dean , lessard  coordinates sloan \' s research centers and provides faculty leadership for its  international initiatives , institutional partnerships and executive  education . the international initiatives include joint programs with tsinghua  and fudan universities and lingnan ( university ) college in china , as well as  programs in taiwan and singapore . lessard is also the faculty director for  the mit - wide partnership between merrill lynch and mit .  general expertise international corporate strategy and fi

In [21]:
df.iloc[5000]['spam']

0

In [22]:
# Add a 'ctext' column holding the clean_txt() result for every message.
df['ctext'] = df.text.apply(clean_txt)

In [23]:
df

Unnamed: 0,text,spam,ctext
0,Subject: naturally irresistible your corporate...,1,naturally irresistible corporate identity real...
1,Subject: the stock trading gunslinger fanny i...,1,stock trading gunslinger fanny merrill muzo co...
2,Subject: unbelievable new homes made easy im ...,1,unbelievable new homes made easy wanting show ...
3,Subject: 4 color printing special request add...,1,color printing special request additional info...
4,"Subject: do not have money , get software cds ...",1,money get software cds software compatibility ...
...,...,...,...
5723,Subject: re : research and development charges...,0,research development charges gpg forwarded shi...
5724,"Subject: re : receipts from visit jim , than...",0,receipts visit jim thanks invitation visit lsu...
5725,Subject: re : enron case study update wow ! a...,0,enron case study update wow day super thank mu...
5726,"Subject: re : interest david , please , call...",0,interest david please call shirley crenshaw as...


In [24]:
df.head(10)

Unnamed: 0,text,spam,ctext
0,Subject: naturally irresistible your corporate...,1,naturally irresistible corporate identity real...
1,Subject: the stock trading gunslinger fanny i...,1,stock trading gunslinger fanny merrill muzo co...
2,Subject: unbelievable new homes made easy im ...,1,unbelievable new homes made easy wanting show ...
3,Subject: 4 color printing special request add...,1,color printing special request additional info...
4,"Subject: do not have money , get software cds ...",1,money get software cds software compatibility ...
5,"Subject: great nnews hello , welcome to medzo...",1,great nnews hello welcome medzonline groundsel...
6,Subject: here ' s a hot play in motion homela...,1,hot play motion homeland security investments ...
7,Subject: save your money buy getting this thin...,1,save money buy getting thing tried cialls yet ...
8,Subject: undeliverable : home based business f...,1,undeliverable home based business grownups mes...
9,Subject: save your money buy getting this thin...,1,save money buy getting thing tried cialls yet ...


In [25]:
# Separate the cleaned text column and the labels into X and y
# (.values yields the underlying arrays rather than pandas Series).
X_text = df.ctext.values
y = df.spam.values

In [26]:
X_text

array(['naturally irresistible corporate identity really hard recollect company market full suqgestions information isoverwhelminq good catchy logo stylish statlonery outstanding website make task much easier promise havinq ordered iogo company automaticaily become world ieader isguite ciear without good products effective business organization practicable aim hotat nowadays market promise marketing efforts become much effective list clear benefits creativeness hand made original logos specially done reflect distinctive company image convenience logo stationery provided formats easy use content management system letsyou change website content even structure promptness see logo drafts within three business days affordability marketing break make gaps budget 100 satisfaction guaranteed provide unlimited amount changes extra fees surethat love result collaboration look portfolio interested',
       'stock trading gunslinger fanny merrill muzo colza attainder penultimate like esmark perspi

In [None]:
X_text[4000]

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the messages for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_text,
    y,
    test_size=0.20,
    random_state=3110,
)

# Unfitted model with scikit-learn's default hyper-parameters.
classifier = LogisticRegression()


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Limit the vocabulary to at most 1000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
# Fit on the training split only and return its TF-IDF matrix.
X_trainmat = tfidf_vectorizer.fit_transform(X_train)

In [29]:
# LogisticRegression accepts sparse input, so train directly on the TF-IDF
# matrix instead of materialising a dense copy with .toarray().
classifier.fit(X_trainmat, y_train)

LogisticRegression()

In [30]:
X_testmat = tfidf_vectorizer.transform(X_test)
X_testmat

<1139x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 54504 stored elements in Compressed Sparse Row format>

In [31]:
# Predict on the sparse TF-IDF matrix directly; a dense copy via .toarray() is unnecessary.
y_pred = classifier.predict(X_testmat)

In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score
accuracy_score(y_test,y_pred)

0.9789288849868305

In [33]:
confusion_matrix(y_test,y_pred)

array([[860,   9],
       [ 15, 255]], dtype=int64)

In [34]:
y_predtrain = classifier.predict(X_trainmat)

In [35]:
accuracy_score(y_train,y_predtrain)

0.9892449517120281

In [36]:
np.round(classifier.predict_proba(X_testmat),2)

array([[0.37, 0.63],
       [0.97, 0.03],
       [0.98, 0.02],
       ...,
       [0.99, 0.01],
       [0.99, 0.01],
       [0.16, 0.84]])

In [37]:
probabtest = np.round(classifier.predict_proba(X_testmat),2)

In [40]:
probdf = pd.DataFrame(probabtest)
probdf.loc[10]

0    1.0
1    0.0
Name: 10, dtype: float64

In [41]:
probdf[1].sort_values(ascending = False)

881     0.99
1119    0.98
489     0.98
1038    0.98
237     0.98
        ... 
187     0.00
188     0.00
852     0.00
394     0.00
367     0.00
Name: 1, Length: 1139, dtype: float64

In [67]:
# Take the second predict_proba column (the probability of label 1) and mark a
# message as 1 only when that probability is at least 0.6.
positive_class_proba = classifier.predict_proba(X_testmat)[:, 1]
y_pred_new_threshold = (positive_class_proba >= 0.6).astype(int)

In [68]:
classifier.predict_proba(X_testmat)[:,1]

array([0.6258109 , 0.0280243 , 0.02352709, ..., 0.01264034, 0.00841225,
       0.8439583 ])

In [69]:
(classifier.predict_proba(X_testmat)[:,1]>=0.8).astype(int)

array([0, 0, 0, ..., 0, 0, 1])

In [70]:
classifier.predict_proba(X_testmat)[:,1]>=0.8

array([False, False, False, ..., False, False,  True])

In [71]:
(classifier.predict_proba(X_testmat)[:,1]>=0.6).astype(int)

array([1, 0, 0, ..., 0, 0, 1])

In [72]:
(np.array([True,False,False,True])).astype(int)

array([1, 0, 0, 1])

In [73]:
confusion_matrix(y_test,y_pred_new_threshold)

array([[864,   5],
       [ 32, 238]], dtype=int64)

In [61]:
accuracy_score(y_test,y_pred_new_threshold)

0.9236172080772608

In [62]:
probabtest = np.round(classifier.predict_proba(X_testmat),2)

In [63]:
probabtest

array([[0.37, 0.63],
       [0.97, 0.03],
       [0.98, 0.02],
       ...,
       [0.99, 0.01],
       [0.99, 0.01],
       [0.16, 0.84]])

In [74]:
probdf = pd.DataFrame(probabtest)

In [75]:
probdf[1].sort_values(ascending = False)

881     0.99
1119    0.98
489     0.98
1038    0.98
237     0.98
        ... 
187     0.00
188     0.00
852     0.00
394     0.00
367     0.00
Name: 1, Length: 1139, dtype: float64

In [76]:
import pickle

In [77]:
# Serialise the fitted classifier to 'classifier.pickle' in the working directory.
with open('classifier.pickle','wb') as f:
    pickle.dump(classifier,f)

In [78]:
# Serialise the fitted TF-IDF vectorizer to 'tfidfmodel.pickle' in the working directory.
with open('tfidfmodel.pickle','wb') as f:
    pickle.dump(tfidf_vectorizer,f)