In [1]:
import re
import pandas as pd
import numpy as np
import csv
import nltk
import random
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

### Loading data

In [2]:
data = pd.read_csv('C:/Users/eric the cool/Desktop/9665/project/spam_ham_dataset.csv', encoding = 'latin-1')
data = data[['text', 'label_num']]
data = data.rename(columns={'label_num': 'label'})

### Train test splitting

In [5]:
percent = 0.8

In [6]:
random.seed(10)
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=0)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

### Preprocessing email text for training and test set

In [7]:
def preprocessing (text):
    
    # Normalization and cleaning
    text = text.lower()
    text = re.sub("(http|https|www)(:|\.)\S+.com"," ",text)
    text = re.sub("[^\w\d]"," ",text)
    text = re.sub("\d+"," ",text)
    
    # Tokenization and Stemming
    token_text = []
    for word in nltk.word_tokenize(text):
        if word not in nltk.corpus.stopwords.words("english"):
            token_text.append(lemmatizer.lemmatize(word))
        
    return token_text

#### Adding tokenized_text column to the trainning matrix

In [12]:
processed_text_train = []
for i in range(train.shape[0]):
    processed_text_train.append(preprocessing(train["text"][i]))  
train['processed_text'] = processed_text_train

In [13]:
processed_text_test = []
for i in range(test.shape[0]):
    processed_text_test.append(preprocessing(test['text'][i]))
test['processed_text'] = processed_text_test

In [14]:
test.head(10)

Unnamed: 0,text,label,processed_text
0,Subject: ship channel hub co\r\nplease review ...,0,"[subject, ship, channel, hub, co, please, revi..."
1,Subject: feb 2000 intercompany accrual varianc...,0,"[subject, feb, intercompany, accrual, variance..."
2,"Subject: meter 981318\r\ndaren ,\r\nthe above ...",0,"[subject, meter, daren, meter, recorded, flow,..."
3,Subject: via - ggra is lousy mizar\r\nanti\r\n...,1,"[subject, via, ggra, lousy, mizar, anti, curb,..."
4,Subject: potential list - feb . 2001\r\ndaren ...,0,"[subject, potential, list, feb, daren, aware, ..."
5,Subject: retroactive adjustments\r\nthe follow...,0,"[subject, retroactive, adjustment, following, ..."
6,Subject: re : texas general land office it tra...,0,"[subject, texas, general, land, office, transp..."
7,Subject: fw : epgt\r\ndaren - can you please l...,0,"[subject, fw, epgt, daren, please, let, know, ..."
8,"Subject: new pictures\r\nfor faster viewing , ...",0,"[subject, new, picture, faster, viewing, would..."
9,Subject: underpriced issue with high return on...,1,"[subject, underpriced, issue, high, return, eq..."


### feature selection

In [21]:
emailCorpus = []
for text in train['processed_text']:
    emailCorpus += text

In [22]:
spamCorpus = []
for i in range(len(train['processed_text'])):
    if train['label'][i] ==1:
        spamCorpus += train['processed_text'][i]

In [42]:
len(spamCorpus)

146316

In [23]:
nonspamCorpus = []
for i in range(len(train['processed_text'])):
    if train['label'][i] ==0:
        nonspamCorpus += train['processed_text'][i]

In [43]:
len(nonspamCorpus)

257149

In [24]:
fd_email=nltk.FreqDist(emailCorpus)
fd_spam=nltk.FreqDist(spamCorpus)
fd_nonspam=nltk.FreqDist(nonspamCorpus)

In [25]:
wl_email = [w for (w,_) in fd_email.most_common()]
wl_spam = [w for (w,_) in fd_spam.most_common()]
wl_nonspam = [w for (w,_) in fd_nonspam.most_common()]

In [26]:
set_spam = set(wl_spam[:int(len(wl_spam)*percent)])
set_nonspam = set(wl_nonspam[:int(len(wl_spam)*percent)])

common_token = set(wl_email[:int(len(wl_email)*0.2)])

set_spam = set_spam.difference(common_token)
set_nonspam = set_nonspam.difference(common_token)

In [27]:
featureSet = list(set_spam.union(set_nonspam))

In [28]:
featureList = list(featureSet)

In [29]:
print(len(wl_spam))
print(len(wl_nonspam))
print(len(featureList))
print(len(common_token))
len(train['processed_text'])
len(test['processed_text'])

30730
12433
23897
7379


1035

In [30]:
X_train = np.zeros((len(train['processed_text']),len(featureList)))
X_test = np.zeros((len(test['processed_text']),len(featureList)))
# X_train = [[0 for _ in range(len(featureList))] for _ in range(len(train['tokenized_text']))]
# X_test = [[0 for _ in range(len(featureList))] for _ in range(len(test['tokenized_text']))]

In [31]:
for obs in range(len(train['processed_text'])):
    for word in train['processed_text'][obs]:
        if word in featureList:
            X_train[obs][featureList.index(word)] = 1
            
for obs in range(len(test['processed_text'])):
    for word in test['processed_text'][obs]:
        if word in featureList:
            X_test[obs][featureList.index(word)] = 1

In [32]:
import pickle
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(cv=5,
                           solver = 'liblinear',
                           scoring ='accuracy',
                           penalty = 'l1',
                           random_state = 0,
                           n_jobs = -1,
                           verbose =3,
                           max_iter= 300).fit(X_train,train['label'])
saved_model = open('save_model.sav','wb')
pickle.dump(clf,saved_model)
saved_model.close()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   15.5s remaining:   23.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   39.7s finished


[LibLinear]

In [34]:
filename ='save_model.sav'
saved_clf = pickle.load(open(filename,'rb'))

In [36]:
print("Accuracy for the Logistic Regression is :",saved_clf.score(X_train,train['label']))

Accuracy for the Logistic Regression is : 0.9634912959381045


In [37]:
print("Accuracy for the Logistic Regression is :",saved_clf.score(X_test,test['label']))

Accuracy for the Logistic Regression is : 0.8492753623188406


### Confusion Matrix and scores

In [39]:
from sklearn.metrics import confusion_matrix

confusion_matrix(test['label'], clf.predict(X_test), labels=None, sample_weight=None, normalize=None)

array([[715,  17],
       [139, 164]], dtype=int64)

In [41]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(test['label'], clf.predict(X_test),
                                average = 'binary')

(0.9060773480662984, 0.5412541254125413, 0.6776859504132231, None)

### Precision = 0.9061
### Recall = 0.5413
### F-score = 0.6777