In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
email_df = pd.read_csv('data/emails.csv')
email_df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [3]:
email_df.isnull().sum()

text    0
spam    0
dtype: int64

## Text Preprocessing and cleaning

In [4]:
#define a function to clean the text
import string
def clean_text(text):
    text = ''.join([i for i in text if not i.isdigit()])
    text = text.strip()
    text = text.lower()
    
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    
    return text


In [5]:
email_df['cleaned_text'] = email_df['text'].apply(clean_text)

In [6]:
email_df

Unnamed: 0,text,spam,cleaned_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trading gunslinger fanny is...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im w...
3,Subject: 4 color printing special request add...,1,subject color printing special request addit...
4,"Subject: do not have money , get software cds ...",1,subject do not have money get software cds fr...
...,...,...,...
5723,Subject: re : research and development charges...,0,subject re research and development charges t...
5724,"Subject: re : receipts from visit jim , than...",0,subject re receipts from visit jim thanks ...
5725,Subject: re : enron case study update wow ! a...,0,subject re enron case study update wow all ...
5726,"Subject: re : interest david , please , call...",0,subject re interest david please call shi...


In [7]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

## Removing stopwords

In [8]:
from nltk import word_tokenize
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    tokens = word_tokenize(text)
    without_stopwords = [word for word in tokens if not word in stop_words]
    return without_stopwords
    
email_df['text'] = email_df['text'].apply(remove_stop_words)

In [9]:
email_df

Unnamed: 0,text,spam,cleaned_text
0,"[Subject, :, naturally, irresistible, corporat...",1,subject naturally irresistible your corporate ...
1,"[Subject, :, stock, trading, gunslinger, fanny...",1,subject the stock trading gunslinger fanny is...
2,"[Subject, :, unbelievable, new, homes, made, e...",1,subject unbelievable new homes made easy im w...
3,"[Subject, :, 4, color, printing, special, requ...",1,subject color printing special request addit...
4,"[Subject, :, money, ,, get, software, cds, !, ...",1,subject do not have money get software cds fr...
...,...,...,...
5723,"[Subject, :, :, research, development, charges...",0,subject re research and development charges t...
5724,"[Subject, :, :, receipts, visit, jim, ,, thank...",0,subject re receipts from visit jim thanks ...
5725,"[Subject, :, :, enron, case, study, update, wo...",0,subject re enron case study update wow all ...
5726,"[Subject, :, :, interest, david, ,, please, ,,...",0,subject re interest david please call shi...


## Lemmatizing the words


In [12]:
from nltk.stem import WordNetLemmatizer

def lemma(text):
    lemmatizer = WordNetLemmatizer() # Instantiate lemmatizer
    lemmatized = [lemmatizer.lemmatize(word) for word in text] # Lemmatize
    lemmatized_string = "".join(lemmatized)
    return lemmatized_string

email_df['clean_text'] = email_df.cleaned_text.apply(lemma)

email_df.head()

Unnamed: 0,text,spam,cleaned_text,clean_text
0,"[Subject, :, naturally, irresistible, corporat...",1,subject naturally irresistible your corporate ...,subject naturally irresistible your corporate ...
1,"[Subject, :, stock, trading, gunslinger, fanny...",1,subject the stock trading gunslinger fanny is...,subject the stock trading gunslinger fanny is...
2,"[Subject, :, unbelievable, new, homes, made, e...",1,subject unbelievable new homes made easy im w...,subject unbelievable new homes made easy im w...
3,"[Subject, :, 4, color, printing, special, requ...",1,subject color printing special request addit...,subject color printing special request addit...
4,"[Subject, :, money, ,, get, software, cds, !, ...",1,subject do not have money get software cds fr...,subject do not have money get software cds fr...


## converting textual data to numbers using countvectorizer. Bag of words modelling


In [36]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(email_df.clean_text)
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [37]:
y = email_df['spam'].values
X

<5728x33715 sparse matrix of type '<class 'numpy.int64'>'
	with 663785 stored elements in Compressed Sparse Row format>

## Model Training using logistic regression

In [38]:
# split data to train and test set
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2, stratify=y)


In [39]:
y_train

array([0, 1, 1, ..., 1, 1, 0])

In [40]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [41]:
x_train_preddict = model.predict(X_train)

In [44]:
accuracy = accuracy_score(x_train_preddict, y_train)
print('the accuracy of the training data is: ', accuracy)

the accuracy of the training data is:  1.0


In [45]:
X_test_predict = model.predict(X_test)
X_test_accuracy = accuracy_score(X_test_predict, y_test)

In [46]:
X_test_accuracy

0.9921465968586387

In [47]:
X_new = X_test[120]

prediction = model.predict(X_new)
print(prediction)
if(prediction[0]==0):
    print('the email is real')
else:
    print('the email is spam')

[0]
the email is real


In [52]:
y_test[120]

0