In [1]:
import pandas as pd
import numpy as np 
from sklearn.datasets import load_files


import time

# Preprocessing and text cleaning
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Model generation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import math
from sklearn.preprocessing import StandardScaler

In [2]:
X, y = [], []
#dataset loading
email = load_files("enron")
X = np.append(X, email.data)
y = np.append(y, email.target)    

In [3]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
corpus = []
for i in range(0, len(X)):
    # Backtrack and skipline symbols deletion
    review = re.sub(r'\\r\\n', ' ', str(X[i]))
    # Symbols deletion
    review = re.sub('[^a-zA-Z]', ' ', review)
    # Replacing big blankspaces gaps for only one blankspace
    review = re.sub(r'\s+', ' ', review)                    
    # 'b' character at the beginning of every email deletion
    review = re.sub(r'^b\s+', '', review)       
    review = review.lower()
    review = review.split()
    review = [stemmer.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [4]:
# Wordset generation
cv = CountVectorizer()
X = cv.fit_transform(corpus)

In [5]:
#Splitting dataset in two subsets (training and test). 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y,  random_state=9, test_size=0.2)

In [6]:
X_train_scaled = StandardScaler(with_mean=False).fit_transform(X_train)
X_test_scaled = StandardScaler(with_mean=False).fit_transform(X_test)

In [7]:
#Evaluating the model on the training set
classifier = MultinomialNB().fit(X_train_scaled, y_train)
pred = classifier.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, pred))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, pred))
print('---------------[Classification report]---------------')
print(classification_report(y_train, pred))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9793034382997663
-----------------[Confusion matrix]------------------
[[13272    20]
 [  538 13131]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98     13292
         1.0       1.00      0.96      0.98     13669

    accuracy                           0.98     26961
   macro avg       0.98      0.98      0.98     26961
weighted avg       0.98      0.98      0.98     26961

-----------------------------------------------------


In [8]:
#Results of executing cross-validation on training set
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(classifier, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.9173002  0.91654303 0.91876855 0.91394659 0.91598665]
-------------[Cross-validation mean score on training set]-------------
0.9165090022179537
-----------------------------------------------------------------------


In [9]:
#Evaluating the model on the testing set
pred = classifier.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, pred))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, pred))
print('---------------[Classification report]---------------')
print(classification_report(y_test, pred))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9368046283934134
-----------------[Confusion matrix]------------------
[[3224   29]
 [ 397 3091]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.89      0.99      0.94      3253
         1.0       0.99      0.89      0.94      3488

    accuracy                           0.94      6741
   macro avg       0.94      0.94      0.94      6741
weighted avg       0.94      0.94      0.94      6741

-----------------------------------------------------


In [10]:
#Results of executing cross-validation on testing set
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(classifier, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.89696071 0.89391691 0.90207715 0.89762611 0.88575668]
-------------[Cross-validation mean score on test set]-------------
0.8952675132475315
-------------------------------------------------------------------


In [11]:
#KNN Classifier instantiation
knn_model = KNeighborsClassifier(n_neighbors=int(math.sqrt(X_train.shape[0]))).fit(X_train_scaled, y_train)
knn_pred_train = knn_model.predict(X_train_scaled)
#Evaluating the model on the testing set
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, knn_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, knn_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, knn_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.5637402173509884
-----------------[Confusion matrix]------------------
[[ 1566 11726]
 [   36 13633]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.98      0.12      0.21     13292
         1.0       0.54      1.00      0.70     13669

    accuracy                           0.56     26961
   macro avg       0.76      0.56      0.45     26961
weighted avg       0.75      0.56      0.46     26961

-----------------------------------------------------


In [12]:
#Results of executing cross-validation on training set
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(knn_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.56128314 0.55433976 0.5476632  0.56602374 0.55396884]
-------------[Cross-validation mean score on training set]-------------
0.556655738755756
-----------------------------------------------------------------------


In [13]:
knn_pred_test = knn_model.predict(X_test_scaled)
#Evaluating the model on test training
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, knn_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, knn_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, knn_pred_test))
print('-----------------------------------------------------')


--------------------[Accuracy]-----------------------
0.5781041388518025
-----------------[Confusion matrix]------------------
[[ 424 2829]
 [  15 3473]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.97      0.13      0.23      3253
         1.0       0.55      1.00      0.71      3488

    accuracy                           0.58      6741
   macro avg       0.76      0.56      0.47      6741
weighted avg       0.75      0.58      0.48      6741

-----------------------------------------------------


In [14]:
# Results of executing cross-validation on data test
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(knn_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.48257969 0.48219585 0.48219585 0.48293769 0.48293769]
-------------[Cross-validation mean score on test set]-------------
0.48256935019456104
-------------------------------------------------------------------


In [15]:
svm_model = svm.LinearSVC(random_state = 1, max_iter=90000).fit(X_train_scaled, y_train)
svm_pred_train = svm_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, svm_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, svm_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, svm_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
1.0
-----------------[Confusion matrix]------------------
[[13292     0]
 [    0 13669]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     13292
         1.0       1.00      1.00      1.00     13669

    accuracy                           1.00     26961
   macro avg       1.00      1.00      1.00     26961
weighted avg       1.00      1.00      1.00     26961

-----------------------------------------------------


In [16]:
# Results of executing cross-validation on training set
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(svm_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.92582978 0.93212166 0.91580119 0.92470326 0.92841246]
-------------[Cross-validation mean score on training set]-------------
0.92537367100225
-----------------------------------------------------------------------


In [17]:
svm_pred_test = svm_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, svm_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, svm_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, svm_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9182613855511051
-----------------[Confusion matrix]------------------
[[3159   94]
 [ 457 3031]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92      3253
         1.0       0.97      0.87      0.92      3488

    accuracy                           0.92      6741
   macro avg       0.92      0.92      0.92      6741
weighted avg       0.92      0.92      0.92      6741

-----------------------------------------------------


In [18]:
# Results of executing cross-validation on test set
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(svm_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.89621942 0.89910979 0.87908012 0.89317507 0.90652819]
-------------[Cross-validation mean score on test set]-------------
0.894822519373621
-------------------------------------------------------------------


In [19]:
logistic_regression_model = LogisticRegression(random_state = 1, max_iter = 1000).fit(X_train_scaled, y_train)
logistic_regression_pred_train = logistic_regression_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, logistic_regression_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, logistic_regression_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, logistic_regression_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.999925818775268
-----------------[Confusion matrix]------------------
[[13290     2]
 [    0 13669]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     13292
         1.0       1.00      1.00      1.00     13669

    accuracy                           1.00     26961
   macro avg       1.00      1.00      1.00     26961
weighted avg       1.00      1.00      1.00     26961

-----------------------------------------------------


In [20]:
# Results of executing cross-validation on training set
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(logistic_regression_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.96996106 0.97032641 0.96123887 0.97014095 0.96865727]
-------------[Cross-validation mean score on training set]-------------
0.9680649124235672
-----------------------------------------------------------------------


In [21]:
logistic_regression_pred_test = logistic_regression_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, logistic_regression_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, logistic_regression_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, logistic_regression_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.969737427681353
-----------------[Confusion matrix]------------------
[[3207   46]
 [ 158 3330]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97      3253
         1.0       0.99      0.95      0.97      3488

    accuracy                           0.97      6741
   macro avg       0.97      0.97      0.97      6741
weighted avg       0.97      0.97      0.97      6741

-----------------------------------------------------


In [22]:
# Results of executing cross-validation on test set
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(logistic_regression_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.96812454 0.96439169 0.95845697 0.95994065 0.97551929]
-------------[Cross-validation mean score on test set]-------------
0.9652866284070187
-------------------------------------------------------------------


In [23]:
decision_tree_model = tree.DecisionTreeClassifier(random_state=1, max_depth=2).fit(X_train_scaled, y_train)
decision_tree_pred_train = decision_tree_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, decision_tree_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, decision_tree_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, decision_tree_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7741552613033641
-----------------[Confusion matrix]------------------
[[ 7300  5992]
 [   97 13572]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.99      0.55      0.71     13292
         1.0       0.69      0.99      0.82     13669

    accuracy                           0.77     26961
   macro avg       0.84      0.77      0.76     26961
weighted avg       0.84      0.77      0.76     26961

-----------------------------------------------------


In [24]:
# Results of executing cross-validation on training set
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(decision_tree_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.77192657 0.77911721 0.76353858 0.77466617 0.78097181]
-------------[Cross-validation mean score on training set]-------------
0.774044068005509
-----------------------------------------------------------------------


In [25]:
decision_tree_pred_test = decision_tree_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, decision_tree_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, decision_tree_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, decision_tree_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7819314641744548
-----------------[Confusion matrix]------------------
[[1807 1446]
 [  24 3464]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.99      0.56      0.71      3253
         1.0       0.71      0.99      0.82      3488

    accuracy                           0.78      6741
   macro avg       0.85      0.77      0.77      6741
weighted avg       0.84      0.78      0.77      6741

-----------------------------------------------------


In [26]:
# Results of executing cross-validation on test set
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(decision_tree_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.78576723 0.79154303 0.77670623 0.76928783 0.78635015]
-------------[Cross-validation mean score on test set]-------------
0.7819308950689928
-------------------------------------------------------------------


In [27]:
random_forest_model = RandomForestClassifier(max_depth=3, random_state=1).fit(X_train_scaled, y_train)
random_forest_pred_train = random_forest_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, random_forest_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, random_forest_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, random_forest_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.8483735766477505
-----------------[Confusion matrix]------------------
[[ 9274  4018]
 [   70 13599]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.99      0.70      0.82     13292
         1.0       0.77      0.99      0.87     13669

    accuracy                           0.85     26961
   macro avg       0.88      0.85      0.84     26961
weighted avg       0.88      0.85      0.84     26961

-----------------------------------------------------


In [28]:
## Results of executing cross-validation on training set
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(random_forest_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.86259967 0.86220326 0.85979228 0.84977745 0.84699555]
-------------[Cross-validation mean score on training set]-------------
0.8562736424456144
-----------------------------------------------------------------------


In [29]:
random_forest_pred_test = random_forest_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, random_forest_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, random_forest_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, random_forest_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.857736240913811
-----------------[Confusion matrix]------------------
[[2321  932]
 [  27 3461]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.99      0.71      0.83      3253
         1.0       0.79      0.99      0.88      3488

    accuracy                           0.86      6741
   macro avg       0.89      0.85      0.85      6741
weighted avg       0.88      0.86      0.85      6741

-----------------------------------------------------


In [30]:
# # Results of executing cross-validation on test set
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(random_forest_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.78576723 0.79896142 0.78783383 0.79673591 0.78857567]
-------------[Cross-validation mean score on test set]-------------
0.7915748119829393
-------------------------------------------------------------------
