### Libraries

In [1]:
import re
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.corpus import stopwords

### Loading in Data

In [2]:
# import os
# for dirname, _, filenames in os.walk('C:/Users/eric the cool/Desktop/9665/project'):
#     print(filenames)

In [2]:
# Selects which corpus the loading cell reads:
#   1 -> the pre-split SMS_train.csv / SMS_test.csv pair
#   2 -> spam_ham_dataset.csv, which is split 80/20 into train and test
dataset = 2

In [6]:
# Load the corpus selected by `dataset` into two frames, `train` and `test`,
# each exposing a 'Message_body' text column and a numeric 'Label' column
# (the SMS files use Non-Spam -> 0, Spam -> 1).
from sklearn.model_selection import train_test_split

# Single place to repoint the notebook at another copy of the data files.
DATA_DIR = 'C:/Users/eric the cool/Desktop/9665/project'
LABEL_CODES = {"Non-Spam": 0, "Spam": 1}

if dataset == 1:
    # The SMS corpus ships already split; its labels are strings and need encoding.
    train = pd.read_csv(f'{DATA_DIR}/SMS_train.csv', encoding='latin-1')
    test = pd.read_csv(f'{DATA_DIR}/SMS_test.csv', encoding='latin-1')
    train['Label'] = train['Label'].map(LABEL_CODES)
    test['Label'] = test['Label'].map(LABEL_CODES)
elif dataset == 2:
    # The spam/ham corpus is one file: align its column names with dataset 1,
    # then hold out 20% of the rows with a fixed seed so the split is repeatable.
    data = pd.read_csv(f'{DATA_DIR}/spam_ham_dataset.csv', encoding='latin-1')
    data = data.rename(columns={'label_num': 'Label', 'text': 'Message_body'})

    train, test = train_test_split(data, test_size=0.2, random_state=0)
    # Later cells address rows by position, so restore a 0..n-1 index.
    train = train[['Message_body', 'Label']].reset_index(drop=True)
    test = test[['Message_body', 'Label']].reset_index(drop=True)
else:
    # Fail here rather than with a confusing NameError on `train` further down.
    raise ValueError(f"Unknown dataset id {dataset!r}; expected 1 or 2")

### Preprocessing

In [7]:
# Stopword sets keyed by language, filled on first use so the NLTK corpus file
# is read once per kernel instead of once per token.
_STOPWORD_CACHE = {}


def _stopword_set(language="english"):
    """Return the NLTK stopword list for ``language`` as a frozenset, loading it once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(nltk.corpus.stopwords.words(language))
    return _STOPWORD_CACHE[language]


def preprocessing(text):
    """Normalise one raw message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. blank out ``http:``/``https:``/``www.`` links that end in ``.com``;
      3. replace every run of non-letters (digits, punctuation, newlines) with a space;
      4. drop English stopwords and Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw message body. ``None`` and NaN (how pandas represents an empty CSV
        cell) are treated as an empty message; other non-string values are
        converted with ``str()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing survives).
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    ps = nltk.stem.PorterStemmer()
    stop_words = _stopword_set("english")

    text = str(text).lower()
    # Raw strings avoid invalid-escape warnings; the dot before "com" is escaped
    # so that words merely containing "com" (e.g. "www.welcome") are not eaten.
    text = re.sub(r"(?:https?|www)[:.]\S+\.com", " ", text)
    # After lower-casing only a-z can be part of a token, so one substitution
    # covers punctuation, digits and whitespace; split() handles the spacing.
    text = re.sub(r"[^a-z]+", " ", text)

    return " ".join(ps.stem(token) for token in text.split() if token not in stop_words)

In [4]:
# Number of rows in the training frame.
train.shape[0]

4136

In [9]:
# Look at one raw training message (row 6) before any cleaning.
train['Message_body'][6]

'Subject: and the final numbers for may are . . .\r\niferc 1 , 240 , 000 ( last volume was 72084 on day 18 )\r\nenron 930 , 000 ( last volume was 21667 on day 26 )\r\ngas daily 1 , 033 , 416 ( last volume was 80000 on day 31 )\r\nplease advise ,\r\nami'

In [7]:
# Raw text of the first training message, for comparison with its cleaned form.
train['Message_body'][0]

"Subject: meter 986296\r\nscherlyn , per our conversation here is the meter 986296 , and the months of\r\nfeb and march this meter needs to go to the 012 - 41500 - 02 - 015 contract .\r\nthank you sherlyn . let me know and i ' ll enter the new track id ' s . tom ."

In [6]:
# Spot-check the cleaner on training row 3.
preprocessing(train['Message_body'][3])

'subject daren firm trade waha book set intern counterparti desk desk trade enrononlin follow user id password give access live price web site http www enrononlin com user id adm password welcom note case sensit pleas keep user id password secur allow transact enrononlin contact helpdesk x question problem gain access id thank stephani x'

In [6]:
# Spot-check the cleaner on the first training message.
preprocessing(train['Message_body'][0])

'subject meter scherlyn per convers meter month feb march meter need go contract thank sherlyn let know enter new track id tom'

In [7]:
# Clean every training message. Iterating the column directly avoids building a
# full row Series per message (as train.loc[i] does) and does not depend on the
# index being exactly 0..n-1.
processed_text = [preprocessing(message) for message in train['Message_body']]

train['processed_text'] = processed_text

In [8]:
# Preview a handful of cleaned messages rather than dumping the whole list.
processed_text[:5]

['subject meter scherlyn per convers meter month feb march meter need go contract thank sherlyn let know enter new track id tom',
 'subject natur ga nomin enron methanol nomin follow natur ga requir methanol plant august mmbtu per day egpfc nomin follow natur ga requir mtbe plant morgan point august mmbtu per day',
 'subject cleburn outag gentlemen want clarifi length outag start today cleburn site earliest plant line wednesday morn anyth found beyond crack transit piec clog fuel nozzl may extend outag day soon hear someth let know michael',
 'subject daren firm trade waha book set intern counterparti desk desk trade enrononlin follow user id password give access live price web site http www enrononlin com user id adm password welcom note case sensit pleas keep user id password secur allow transact enrononlin contact helpdesk x question problem gain access id thank stephani x',
 'subject mon feb page load imag show view messag discon jyz pbb tugjf rqw blmgn qysmk kthj wmwahq qwpjol pzx

In [9]:
# Clean the held-out messages with the same function used for the training set.
# Iterating the column is positional, so it does not rely on the index labels
# being exactly 0..n-1 the way test['Message_body'][i] does.
processed_text2 = [preprocessing(message) for message in test['Message_body']]

test['processed_text'] = processed_text2

In [10]:
# Preview the training frame, now carrying the processed_text column.
train.head()

Unnamed: 0,Message_body,Label,processed_text
0,"Subject: meter 986296\r\nscherlyn , per our co...",0,subject meter scherlyn per convers meter month...
1,Subject: natural gas nomination for 08 / 00\r\...,0,subject natur ga nomin enron methanol nomin fo...
2,"Subject: cleburne outage\r\ngentlemen ,\r\ni w...",0,subject cleburn outag gentlemen want clarifi l...
3,"Subject: daren ,\r\nthe firm trading waha book...",0,subject daren firm trade waha book set intern ...
4,"Subject: re : mon , 2 feb 2004 03 : 16 : 16 - ...",1,subject mon feb page load imag show view messa...


In [11]:
# Preview the test frame, now carrying the processed_text column.
test.head()

Unnamed: 0,Message_body,Label,processed_text
0,Subject: ship channel hub co\r\nplease review ...,0,subject ship channel hub co pleas review comme...
1,Subject: feb 2000 intercompany accrual varianc...,0,subject feb intercompani accrual varianc list ...
2,"Subject: meter 981318\r\ndaren ,\r\nthe above ...",0,subject meter daren meter record flow jan year...
3,Subject: via - ggra is lousy mizar\r\nanti\r\n...,1,subject via ggra lousi mizar anti curb hemisph...
4,Subject: potential list - feb . 2001\r\ndaren ...,0,subject potenti list feb daren awar potenti co...


In [12]:
# Model inputs are the cleaned messages; targets are the numeric labels.
X_train, Y_train = train['processed_text'], train['Label']
X_test, Y_test = test['processed_text'], test['Label']

# Sanity check: the training features and labels must have the same length.
for training_part in (X_train, Y_train):
    print(training_part.shape)

(4136,)
(4136,)


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF bag-of-words features: min_df=10 drops terms that occur in fewer than
# 10 messages, and max_features=1000 caps the vocabulary at the 1000 most
# frequent remaining terms.
text_vec = TfidfVectorizer(min_df=10, max_features=1000)

In [14]:
# Learn the vocabulary and IDF weights from the training messages only, and
# vectorise them in the same pass (fit_transform avoids tokenising the training
# corpus a second time).
train_text = text_vec.fit_transform(X_train.values)

# The test messages are projected onto the training vocabulary; they must not
# influence the fit.
test_text = text_vec.transform(X_test.values)

In [15]:
# Both matrices should have one column per vocabulary term.
for feature_matrix in (train_text, test_text):
    print(feature_matrix.shape)

(4136, 1000)
(1035, 1000)


In [16]:
import pickle
from sklearn.linear_model import LogisticRegressionCV

# Cross-validated logistic regression: 5 folds scored on accuracy, a fixed seed,
# all CPU cores, progress logging, and a raised iteration cap.
cv_settings = dict(
    cv=5,
    scoring='accuracy',
    random_state=0,
    n_jobs=-1,
    verbose=3,
    max_iter=300,
)
clf = LogisticRegressionCV(**cv_settings).fit(train_text, Y_train)

# Optional: persist the fitted model to disk.
# saved_model = open('save_model.sav','wb')
# pickle.dump(clf,saved_model)
# saved_model.close()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.3s remaining:    9.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.8s finished


In [None]:
# filename ='save_model.sav'
# saved_clf = pickle.load(open(filename,'rb'))

In [17]:
# Accuracy on the messages the model was fitted to (an optimistic figure).
train_accuracy = clf.score(train_text, Y_train)
print("Accuracy for the Logistic Regression is :", train_accuracy)

Accuracy for the Logistic Regression is : 0.988394584139265


In [18]:
# Accuracy on the held-out test messages.
test_accuracy = clf.score(test_text, Y_test)
print("Accuracy for the Logistic Regression is :", test_accuracy)

Accuracy for the Logistic Regression is : 0.9710144927536232


### Confusion Matrix

In [24]:
# Predicted class labels for the held-out test messages.
clf.predict(test_text)

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [26]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns the predicted labels, as raw counts.
predicted_labels = clf.predict(test_text)
confusion_matrix(Y_test, predicted_labels)

array([[714,  18],
       [ 12, 291]], dtype=int64)

In [None]:
### Precision, Recall, F-score

In [28]:
from sklearn.metrics import precision_recall_fscore_support

# average='binary' reports (precision, recall, F-score, support) for the
# positive class only - presumably label 1 = spam; confirm for dataset 2.
held_out_predictions = clf.predict(test_text)
precision_recall_fscore_support(Y_test, held_out_predictions, average='binary')


(0.941747572815534, 0.9603960396039604, 0.9509803921568628, None)