In [1]:
import pandas as pd
import numpy as np 
from sklearn.datasets import load_files

import time

# Preprocessing and text cleaning
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

#Filesystem files management
import os
import shutil

#Model generation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import math
from sklearn.preprocessing import StandardScaler


In [2]:
#Dirname where the dataset is stored
dataset_directory = 'pucorpora'
#Direname where the dataset formatted will be loaded
format_dataset_directory = 'format_'+dataset_directory
#Path and dirname where the spam emails will be stored
spam_directory = format_dataset_directory+'/spam'
#Path and dirname where the spam free emails will be loaded
legit_directory = format_dataset_directory+'/ham'
#The directory to store the formatted emails will be created in case this doesn't exist
shutil.rmtree(format_dataset_directory, ignore_errors=True)
#In case the formatted directory doesn't exist, this will be created tied to spam and ham subdirectories
if format_dataset_directory not in os.listdir():
    os.mkdir(format_dataset_directory)
    os.mkdir(spam_directory)
    os.mkdir(legit_directory)
#The directory dataset to be loaded is listed
list_directories = os.listdir(dataset_directory)
#every emails is cocpied in its respective directory (spam or ham)
for directory in list_directories:
    source = dataset_directory+'/'+directory 
    for file in os.listdir(dataset_directory+'/'+directory):
        if 'spm' in file:
            shutil.copyfile(source+'/'+file, spam_directory+'/'+file)
        else:
            shutil.copyfile(source+'/'+file, legit_directory+'/'+file)


In [3]:
#Loading the dataset
X, y = [], []
email = load_files(format_dataset_directory)
X = np.append(X, email.data)
y = np.append(y, email.target)

In [4]:
#Stemmer and lematizer instantiation
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Corpus creation
corpus = []
for i in range(0, len(X)):
    # Deleting every character newline character in every email
    review = re.sub(r'\\n', ' ', str(X[i]))
    # Deleting the string "b'subject: which is at the beginning of every email
    review = re.sub(r"\bb'Subject:", ' ', review)
    # Replacing big blankspaces gaps by only one blankspace
    review = re.sub(r'\s+', ' ', review)
    # Deleting all of the blankspaces followed by a single quotation mark at the end of every email
    review = re.sub(r"\s\'", '', review)       
    #Turning the text to its lowercase version
    review = review.lower()
    #Turning the email words into a list of words
    review = review.split()
    #Replacing every word in the list to its base version
    review = [stemmer.stem(word) for word in review if word not in stopwords.words('english')]
    #Concattening the wordlist in an only string
    review = ' '.join(review)
    #The words (their base version) are added to the corpus 
    Añadiendo la nueva cadena de palabras (con las palabras en su versión base) al corpus
    corpus.append(review)

In [5]:
# Token vector creation
cv = CountVectorizer()
X = cv.fit_transform(corpus)

In [6]:
# Splitting the dataset in two subsets (training and test) 80% for training and 20% for testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,  random_state=9, test_size=0.2)

In [7]:
X_train_scaled = StandardScaler(with_mean=False).fit_transform(X_train)
X_test_scaled = StandardScaler(with_mean=False).fit_transform(X_test)

In [8]:
#Evaluatin the model on training set
classifier = MultinomialNB().fit(X_train_scaled, y_train)
pred = classifier.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, pred))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, pred))
print('---------------[Classification report]---------------')
print(classification_report(y_train, pred))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9793066855323664
-----------------[Confusion matrix]------------------
[[3181   47]
 [  70 2356]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.98      0.99      0.98      3228
         1.0       0.98      0.97      0.98      2426

    accuracy                           0.98      5654
   macro avg       0.98      0.98      0.98      5654
weighted avg       0.98      0.98      0.98      5654

-----------------------------------------------------


In [9]:
#Results of executing cross-validation on training set
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(classifier, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.8938992  0.88240495 0.91335102 0.89655172 0.88230088]
-------------[Cross-validation mean score on training set]-------------
0.8937015563014953
-----------------------------------------------------------------------


In [10]:
#Evaluatin the model on testing set
pred = classifier.predict(X_test)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, pred))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, pred))
print('---------------[Classification report]---------------')
print(classification_report(y_test, pred))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.8585572842998586
-----------------[Confusion matrix]------------------
[[669 161]
 [ 39 545]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.94      0.81      0.87       830
         1.0       0.77      0.93      0.84       584

    accuracy                           0.86      1414
   macro avg       0.86      0.87      0.86      1414
weighted avg       0.87      0.86      0.86      1414

-----------------------------------------------------


In [11]:
#Results of executing cross-validation on testing set
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(classifier, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.90106007 0.90812721 0.93286219 0.90459364 0.91134752]
-------------[Cross-validation mean score on test set]-------------
0.9115981254542266
-------------------------------------------------------------------


In [12]:
# knn_model = KNeighborsClassifier(n_neighbors=int(math.sqrt(X_train.shape[0]))).fit(X_train, y_train)
knn_model = KNeighborsClassifier(n_neighbors=2).fit(X_train_scaled, y_train)
knn_pred_train = knn_model.predict(X_train_scaled)
#Evaluatin the model on training set
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, knn_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, knn_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, knn_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7509727626459144
-----------------[Confusion matrix]------------------
[[3228    0]
 [1408 1018]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.70      1.00      0.82      3228
         1.0       1.00      0.42      0.59      2426

    accuracy                           0.75      5654
   macro avg       0.85      0.71      0.71      5654
weighted avg       0.83      0.75      0.72      5654

-----------------------------------------------------


In [13]:
#Results of executing cross-validation on training set
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(knn_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.63041556 0.63837312 0.63925729 0.70733864 0.64247788]
-------------[Cross-validation mean score on training set]-------------
0.651572498298162
-----------------------------------------------------------------------


In [14]:
knn_pred_test = knn_model.predict(X_test_scaled)
#Evaluating the model on testing set
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, knn_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, knn_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, knn_pred_test))
print('-----------------------------------------------------')


--------------------[Accuracy]-----------------------
0.6541725601131542
-----------------[Confusion matrix]------------------
[[826   4]
 [485  99]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       830
         1.0       0.96      0.17      0.29       584

    accuracy                           0.65      1414
   macro avg       0.80      0.58      0.53      1414
weighted avg       0.77      0.65      0.57      1414

-----------------------------------------------------


In [15]:
#Results of executing cross-validation on testing set
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(knn_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.63957597 0.65371025 0.49823322 0.64664311 0.60992908]
-------------[Cross-validation mean score on test set]-------------
0.6096183244367591
-------------------------------------------------------------------


In [16]:
svm_model = svm.LinearSVC(random_state = 1, max_iter=4000).fit(X_train_scaled, y_train)
svm_pred_train = svm_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, svm_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, svm_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, svm_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
1.0
-----------------[Confusion matrix]------------------
[[3228    0]
 [   0 2426]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3228
         1.0       1.00      1.00      1.00      2426

    accuracy                           1.00      5654
   macro avg       1.00      1.00      1.00      5654
weighted avg       1.00      1.00      1.00      5654

-----------------------------------------------------


In [17]:
#Results of executing cross-validation on training set
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(svm_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.933687   0.94076039 0.93280283 0.93987622 0.9380531 ]
-------------[Cross-validation mean score on training set]-------------
0.9370359068253485
-----------------------------------------------------------------------


In [18]:
svm_pred_test = svm_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, svm_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, svm_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, svm_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9427157001414427
-----------------[Confusion matrix]------------------
[[811  19]
 [ 62 522]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.93      0.98      0.95       830
         1.0       0.96      0.89      0.93       584

    accuracy                           0.94      1414
   macro avg       0.95      0.94      0.94      1414
weighted avg       0.94      0.94      0.94      1414

-----------------------------------------------------


In [19]:
#Results of executing cross-validation on testing set
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(svm_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.91519435 0.91166078 0.8975265  0.92932862 0.89716312]
-------------[Cross-validation mean score on test set]-------------
0.9101746735834398
-------------------------------------------------------------------


In [20]:
logistic_regression_model = LogisticRegression(random_state = 1, max_iter=150).fit(X_train_scaled, y_train)
logistic_regression_pred_train = logistic_regression_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, logistic_regression_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, logistic_regression_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, logistic_regression_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
1.0
-----------------[Confusion matrix]------------------
[[3228    0]
 [   0 2426]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3228
         1.0       1.00      1.00      1.00      2426

    accuracy                           1.00      5654
   macro avg       1.00      1.00      1.00      5654
weighted avg       1.00      1.00      1.00      5654

-----------------------------------------------------


In [21]:
#Results of executing cross-validation on training set
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(logistic_regression_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.92130858 0.93633952 0.93633952 0.94252874 0.94690265]
-------------[Cross-validation mean score on training set]-------------
0.9366838024146539
-----------------------------------------------------------------------


In [22]:
logistic_regression_pred_test = logistic_regression_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, logistic_regression_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, logistic_regression_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, logistic_regression_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9391796322489392
-----------------[Confusion matrix]------------------
[[815  15]
 [ 71 513]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95       830
         1.0       0.97      0.88      0.92       584

    accuracy                           0.94      1414
   macro avg       0.95      0.93      0.94      1414
weighted avg       0.94      0.94      0.94      1414

-----------------------------------------------------


In [23]:
#Results of executing cross-validation on testing set
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(logistic_regression_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.92579505 0.93639576 0.92932862 0.90459364 0.86524823]
-------------[Cross-validation mean score on test set]-------------
0.9122722602310602
-------------------------------------------------------------------


In [24]:
decision_tree_model = tree.DecisionTreeClassifier(random_state=1, max_depth=2).fit(X_train_scaled, y_train)
decision_tree_pred_train = decision_tree_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, decision_tree_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, decision_tree_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, decision_tree_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7426600636717369
-----------------[Confusion matrix]------------------
[[3135   93]
 [1362 1064]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.70      0.97      0.81      3228
         1.0       0.92      0.44      0.59      2426

    accuracy                           0.74      5654
   macro avg       0.81      0.70      0.70      5654
weighted avg       0.79      0.74      0.72      5654

-----------------------------------------------------


In [25]:
#Results of executing cross-validation on training set
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(decision_tree_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.69761273 0.74093722 0.74801061 0.75331565 0.72743363]
-------------[Cross-validation mean score on training set]-------------
0.7334619688113737
-----------------------------------------------------------------------


In [26]:
decision_tree_pred_test = decision_tree_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, decision_tree_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, decision_tree_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, decision_tree_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7340876944837341
-----------------[Confusion matrix]------------------
[[722 108]
 [268 316]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.73      0.87      0.79       830
         1.0       0.75      0.54      0.63       584

    accuracy                           0.73      1414
   macro avg       0.74      0.71      0.71      1414
weighted avg       0.74      0.73      0.72      1414

-----------------------------------------------------


In [27]:
#Results of executing cross-validation on testing set
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(decision_tree_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.74558304 0.74204947 0.75265018 0.74558304 0.72340426]
-------------[Cross-validation mean score on test set]-------------
0.7418539959401549
-------------------------------------------------------------------


In [28]:
random_forest_model = RandomForestClassifier(max_depth=3, random_state=1).fit(X_train_scaled, y_train)
random_forest_pred_train = random_forest_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, random_forest_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, random_forest_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, random_forest_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.6979129819596745
-----------------[Confusion matrix]------------------
[[3222    6]
 [1702  724]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79      3228
         1.0       0.99      0.30      0.46      2426

    accuracy                           0.70      5654
   macro avg       0.82      0.65      0.62      5654
weighted avg       0.80      0.70      0.65      5654

-----------------------------------------------------


In [29]:
#Results of executing cross-validation on training set
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(random_forest_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.67727675 0.67020336 0.68788683 0.69230769 0.68849558]
-------------[Cross-validation mean score on training set]-------------
0.6832340398895175
-----------------------------------------------------------------------


In [30]:
random_forest_pred_test = random_forest_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, random_forest_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, random_forest_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, random_forest_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7043847241867044
-----------------[Confusion matrix]------------------
[[826   4]
 [414 170]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

         0.0       0.67      1.00      0.80       830
         1.0       0.98      0.29      0.45       584

    accuracy                           0.70      1414
   macro avg       0.82      0.64      0.62      1414
weighted avg       0.79      0.70      0.65      1414

-----------------------------------------------------


In [31]:
#Results of executing cross-validation on testing set
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(random_forest_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.65371025 0.68551237 0.66077739 0.6819788  0.64893617]
-------------[Cross-validation mean score on test set]-------------
0.6661829937598677
-------------------------------------------------------------------
