In [1]:
import re
import pandas as pd
import numpy as np
import csv
import nltk
import random
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

### Loading data

In [2]:
data = pd.read_csv('C:/Users/eric the cool/Desktop/9665/project/spam_ham_dataset.csv', encoding = 'latin-1')
data = data[['text', 'label_num']]
data = data.rename(columns={'label_num': 'label'})

### Train test splitting

In [3]:
random.seed(10)
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=0)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

### Preprocessing email text for training and test set


In [4]:
def preprocessing (text):
    
    # Normalization and cleaning
    text = text.lower()
    text = re.sub("(http|https|www)(:|\.)\S+.com"," ",text)
    text = re.sub("[^a-zA-Z0-9\n]", " ", text)
    text = re.sub("[^\w\d]"," ",text)
    text = re.sub("\d+"," ",text)
    text = re.sub("\s+"," ", text)
    return text

In [5]:
processed_text_train = []
for i in range(train.shape[0]):
    processed_text_train.append(preprocessing(train["text"][i]))  
train['processed_text'] = processed_text_train

In [6]:
processed_text_test = []
for i in range(test.shape[0]):
    processed_text_test.append(preprocessing(test['text'][i]))
test['processed_text'] = processed_text_test

### Feature extraction

- loading googleNews300 model

In [7]:
import gensim
w2v = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [21]:
def sent_vectorize(post):
    vec_size = w2v.vector_size
    sent_vec = np.zeros(vec_size)
    vec_count = 1
    for word in nltk.word_tokenize(post):
        if lemmatizer.lemmatize(word) in w2v:
            vec_count += 1
            sent_vec += w2v[lemmatizer.lemmatize(word)]
    sent_vec = sent_vec/vec_count          
    return sent_vec

- Creating w2v feature matries

In [23]:
X_train = np.array([sent_vectorize(sent) for sent in train["text"]])
X_test = np.array([sent_vectorize(sent) for sent in test["text"]])
y_train = train["label"]
y_test = test["label"]

### Training logistic regression classifier

In [40]:
import pickle
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(cv=5,
                           scoring ='accuracy',
                           random_state = 0,
                           n_jobs = -1,
                           verbose =3,
                           max_iter= 300).fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   30.1s remaining:   45.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   35.7s finished


### Training and test accuracy

In [41]:
print("Accuracy for the Logistic Regression is :",clf.score(X_train,y_train))

Accuracy for the Logistic Regression is : 0.9912959381044487


In [43]:
print("Accuracy for the Logistic Regression is :",clf.score(X_test,y_test))

Accuracy for the Logistic Regression is : 0.9671497584541063


In [46]:
clf.predict(X_test)

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

### Confusion matrix, precision, recall and F-screo

In [49]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, clf.predict(X_test), labels=None, sample_weight=None, normalize=None)

array([[713,  19],
       [ 15, 288]], dtype=int64)

In [51]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test, clf.predict(X_test),
                                average = 'binary')


(0.9381107491856677, 0.9504950495049505, 0.9442622950819671, None)

### Precision = 0.9381
### Recall = 0.9505
### F-score = 0.9443

In [None]:
# def extract_features(vector):
#     features = {}
#     for i in range(300):
#         features['contains({})'.format(i)] = vector[i]
#     return features

# featureset_train = [(extract_features(X_train[i]), train['label'][i])
#               for i in range(len(X_train))]

# featureset_test = [(extract_features(X_test[i]), test['label'][i])
#               for i in range(len(X_test))]

# tree_classifier = nltk.DecisionTreeClassifier.train(featureset_train, binary=True)

# print(nltk.classify.accuracy(tree_classifier, featureset_test))