In [1]:
# libraries import
import numpy as py
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import math
from sklearn.preprocessing import StandardScaler


In [2]:
# Loading the dataset
df = pd.read_csv('spam_ham_dataset.csv')

In [3]:
# Duplicates deletion
df.drop_duplicates(inplace = True)

In [4]:
#Checking number of null entries
df.isnull().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [5]:
# Downloading stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eduar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
"""
Function that will process the text and it'll proceed as follow
1) punctuation signs deletion
2) stopwords deletion
3) return a clean listwords
"""
def process_text(text):
    #1
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    #2
    clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    #3
    return clean_words 


In [7]:
# Turning the listwords into a token matrix
messages_bow = CountVectorizer(analyzer = process_text).fit_transform(df['text'])

In [8]:
# Spliting the dataset into two datasets (training and test) 80% for training and 80% for testing
X_train, X_test, y_train, y_test = train_test_split(messages_bow, df['label_num'], test_size = 0.20, random_state = 1)

In [9]:
X_train_scaled = StandardScaler(with_mean=False).fit_transform(X_train)
X_test_scaled = StandardScaler(with_mean=False).fit_transform(X_test)

In [10]:
#Evaluating the model on training dataset
classifier = MultinomialNB().fit(X_train_scaled, y_train)
pred = classifier.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, pred))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, pred))
print('---------------[Classification report]---------------')
print(classification_report(y_train, pred))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9908123791102514
-----------------[Confusion matrix]------------------
[[2937    4]
 [  34 1161]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2941
           1       1.00      0.97      0.98      1195

    accuracy                           0.99      4136
   macro avg       0.99      0.99      0.99      4136
weighted avg       0.99      0.99      0.99      4136

-----------------------------------------------------


In [11]:
#Results of executing cross-validation on training dataset
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(classifier, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.93719807 0.94679565 0.93833132 0.93107618 0.95525998]
-------------[Cross-validation mean score on training set]-------------
0.941732237468529
-----------------------------------------------------------------------


In [12]:
#Evaluating the model on test set
pred = classifier.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, pred))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, pred))
print('---------------[Classification report]---------------')
print(classification_report(y_test, pred))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9623188405797102
-----------------[Confusion matrix]------------------
[[726   5]
 [ 34 270]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       731
           1       0.98      0.89      0.93       304

    accuracy                           0.96      1035
   macro avg       0.97      0.94      0.95      1035
weighted avg       0.96      0.96      0.96      1035

-----------------------------------------------------


In [13]:
#Results of executing cross-validation on testing dataset
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(classifier, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.93236715 0.96618357 0.91304348 0.94202899 0.92753623]
-------------[Cross-validation mean score on test set]-------------
0.9362318840579709
-------------------------------------------------------------------


In [14]:
knn_model = KNeighborsClassifier(n_neighbors=int(math.sqrt(X_train_scaled.shape[0]))).fit(X_train_scaled, y_train)
knn_pred_train = knn_model.predict(X_train_scaled)
# Evaluating the model on training set
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, knn_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, knn_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, knn_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7147001934235977
-----------------[Confusion matrix]------------------
[[2941    0]
 [1180   15]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.71      1.00      0.83      2941
           1       1.00      0.01      0.02      1195

    accuracy                           0.71      4136
   macro avg       0.86      0.51      0.43      4136
weighted avg       0.80      0.71      0.60      4136

-----------------------------------------------------


In [15]:
#Results of executing cross-validation on training dataset
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(knn_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.71135266 0.71342201 0.71100363 0.71100363 0.71100363]
-------------[Cross-validation mean score on training set]-------------
0.7115571093937112
-----------------------------------------------------------------------


In [16]:
# Evaluating the model on training set
knn_pred_test = knn_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, knn_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, knn_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, knn_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7101449275362319
-----------------[Confusion matrix]------------------
[[731   0]
 [300   4]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.71      1.00      0.83       731
           1       1.00      0.01      0.03       304

    accuracy                           0.71      1035
   macro avg       0.85      0.51      0.43      1035
weighted avg       0.79      0.71      0.59      1035

-----------------------------------------------------


In [17]:
#Results of executing cross-validation on testing dataset
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(knn_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.71014493 0.70531401 0.70531401 0.70531401 0.70531401]
-------------[Cross-validation mean score on test set]-------------
0.706280193236715
-------------------------------------------------------------------


In [18]:
svm_model = svm.LinearSVC(random_state = 1, max_iter=6000).fit(X_train_scaled, y_train)
svm_pred_train = svm_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, svm_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, svm_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, svm_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
1.0
-----------------[Confusion matrix]------------------
[[2941    0]
 [   0 1195]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2941
           1       1.00      1.00      1.00      1195

    accuracy                           1.00      4136
   macro avg       1.00      1.00      1.00      4136
weighted avg       1.00      1.00      1.00      4136

-----------------------------------------------------


In [19]:
#Resultados de ejecutar cross-validation sobre los datos de entrenamiento
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(svm_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.88164251 0.90084643 0.90084643 0.89480048 0.90326481]
-------------[Cross-validation mean score on training set]-------------
0.8962801348217468
-----------------------------------------------------------------------


In [20]:
svm_pred_test = svm_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, svm_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, svm_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, svm_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9004830917874396
-----------------[Confusion matrix]------------------
[[685  46]
 [ 57 247]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       731
           1       0.84      0.81      0.83       304

    accuracy                           0.90      1035
   macro avg       0.88      0.87      0.88      1035
weighted avg       0.90      0.90      0.90      1035

-----------------------------------------------------


In [21]:
#Results of executing cross-validation on testing dataset
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(svm_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.82125604 0.88888889 0.85507246 0.84541063 0.86956522]
-------------[Cross-validation mean score on test set]-------------
0.8560386473429953
-------------------------------------------------------------------


In [22]:
logistic_regression_model = LogisticRegression(random_state = 1).fit(X_train_scaled, y_train)
logistic_regression_pred_train = logistic_regression_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, logistic_regression_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, logistic_regression_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, logistic_regression_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
1.0
-----------------[Confusion matrix]------------------
[[2941    0]
 [   0 1195]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2941
           1       1.00      1.00      1.00      1195

    accuracy                           1.00      4136
   macro avg       1.00      1.00      1.00      4136
weighted avg       1.00      1.00      1.00      4136

-----------------------------------------------------


In [23]:
#Results of executing cross-validation on training dataset
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(logistic_regression_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.93719807 0.94679565 0.95042322 0.95042322 0.96009674]
-------------[Cross-validation mean score on training set]-------------
0.9489873765253609
-----------------------------------------------------------------------


In [24]:
logistic_regression_pred_test = logistic_regression_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, logistic_regression_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, logistic_regression_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, logistic_regression_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9565217391304348
-----------------[Confusion matrix]------------------
[[714  17]
 [ 28 276]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       731
           1       0.94      0.91      0.92       304

    accuracy                           0.96      1035
   macro avg       0.95      0.94      0.95      1035
weighted avg       0.96      0.96      0.96      1035

-----------------------------------------------------


In [25]:
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(logistic_regression_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.94202899 0.9468599  0.9178744  0.92753623 0.9468599 ]
-------------[Cross-validation mean score on test set]-------------
0.9362318840579711
-------------------------------------------------------------------


In [26]:
decision_tree_model = tree.DecisionTreeClassifier(random_state=1, max_depth=2).fit(X_train_scaled, y_train)
decision_tree_pred_train = decision_tree_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, decision_tree_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, decision_tree_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, decision_tree_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7884429400386848
-----------------[Confusion matrix]------------------
[[2881   60]
 [ 815  380]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.78      0.98      0.87      2941
           1       0.86      0.32      0.46      1195

    accuracy                           0.79      4136
   macro avg       0.82      0.65      0.67      4136
weighted avg       0.80      0.79      0.75      4136

-----------------------------------------------------


In [27]:
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(decision_tree_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.79468599 0.78839178 0.78718259 0.79443773 0.77750907]
-------------[Cross-validation mean score on training set]-------------
0.7884414302320827
-----------------------------------------------------------------------


In [28]:
decision_tree_pred_test = decision_tree_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, decision_tree_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, decision_tree_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, decision_tree_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7855072463768116
-----------------[Confusion matrix]------------------
[[718  13]
 [209  95]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.77      0.98      0.87       731
           1       0.88      0.31      0.46       304

    accuracy                           0.79      1035
   macro avg       0.83      0.65      0.66      1035
weighted avg       0.81      0.79      0.75      1035

-----------------------------------------------------


In [29]:
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(decision_tree_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.8115942  0.80676329 0.77294686 0.76811594 0.71497585]
-------------[Cross-validation mean score on test set]-------------
0.77487922705314
-------------------------------------------------------------------


In [30]:
random_forest_model = RandomForestClassifier(max_depth=3, random_state=1).fit(X_train_scaled, y_train)
random_forest_pred_train = random_forest_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, random_forest_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, random_forest_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, random_forest_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7246131528046421
-----------------[Confusion matrix]------------------
[[2941    0]
 [1139   56]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.72      1.00      0.84      2941
           1       1.00      0.05      0.09      1195

    accuracy                           0.72      4136
   macro avg       0.86      0.52      0.46      4136
weighted avg       0.80      0.72      0.62      4136

-----------------------------------------------------


In [31]:
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(random_forest_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.71980676 0.73276904 0.71584039 0.72551391 0.72309553]
-------------[Cross-validation mean score on training set]-------------
0.7234051253293143
-----------------------------------------------------------------------


In [32]:
random_forest_pred_test = random_forest_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, random_forest_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, random_forest_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, random_forest_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7217391304347827
-----------------[Confusion matrix]------------------
[[731   0]
 [288  16]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.72      1.00      0.84       731
           1       1.00      0.05      0.10       304

    accuracy                           0.72      1035
   macro avg       0.86      0.53      0.47      1035
weighted avg       0.80      0.72      0.62      1035

-----------------------------------------------------


In [33]:
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(random_forest_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.72463768 0.71980676 0.71980676 0.71497585 0.72463768]
-------------[Cross-validation mean score on test set]-------------
0.7207729468599033
-------------------------------------------------------------------
