In [1]:

# Loading libraries
import os
import re
from os import listdir
from os.path import isfile, join

from nltk.stem.snowball import SnowballStemmer
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
import collections

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
import snips as snp
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import math
from sklearn.preprocessing import StandardScaler

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eduar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
"""
Loading spam/non-spam emails on the variables 
spamfiles and hamfiles, respectively
"""
spampath = join(os.getcwd(), "spam")
spamfiles = [join(spampath, fname) for fname in listdir(spampath)]

hampath = join(os.getcwd(), "ham")
hamfiles = [join(hampath, fname) for fname in listdir(hampath)]

In [3]:
"""
    Function to extract mail body associated to fpath
    params:
        fpath: Path to the email whoose body it's gonna be extracted
    output:
        Mail body since it's first newline character on
"""


def get_body(fpath):    
    with open(fpath, "r") as myfile:
        try: 
            lines = myfile.readlines()
            #First newline chacaracter is identified
            idx = lines.index("\n")
            #mail body, since first newline chacaracter on, is returned
            return "".join(lines[idx:])
        except: 
            print("Couldn't decode file %s" %(fpath))

In [4]:
"""
    Function to get a word salad from mail body which is being processed
    params:
        body: mail body which is being processed
    output:
        just processed mail word salad
"""
def word_salad(body):
    #parser instantiation, using internal python parser (html.parser)
    soup = BeautifulSoup(body, 'html.parser')
    body = soup.get_text()

    # Getting total number of html elements
    nhtml = len(soup.find_all())
    # Getting total number of html element which label is <a>
    nlinks = len(soup.find_all("a"))
    # Concattening to mail body, the words htmltag and linktag
    # as many times as nhtml and nlinks, respectively. 
    body = body + nhtml*" htmltag " + nlinks*" linktag "
    #Turning to lowercase mail content
    body = body.lower()
    
    # Replacing URL's by the string httpaddr
    regx = re.compile(r"(http|https)://[^\s]*")
    body, nhttps = regx.subn(repl=" httpaddr ", string=body)

    # Replacing email addresses by the word emailaddr
    regx = re.compile(r"\b[^\s]+@[^\s]+[.][^\s]+\b")
    body, nemails = regx.subn(repl=" emailaddr ", string=body)
    
    # Replacing number by the word number
    regx = re.compile(r"\b[\d.]+\b")
    body = regx.sub(repl=" number ", string=body)

    # Replacing the characters $, ! and ? by dollar, exclamark and questmark, respectively
    regx = re.compile(r"[$]")
    body = regx.sub(repl=" dollar ", string=body)
    regx = re.compile(r"[!]")
    body = regx.sub(repl=" exclammark ", string=body)
    regx = re.compile(r"[?]")
    body = regx.sub(repl=" questmark ", string=body)

    #Replacing punctuation marks by blankspaces
    regx = re.compile(r"([^\w\s]+)|([_-]+)")  
    body = regx.sub(repl=" ", string=body)
    # Replacing newline characters by the string newline and blankline by the string blankline
    regx = re.compile(r"\n")
    body = regx.sub(repl=" newline ", string=body)
    regx = re.compile(r"\n\n")
    body = regx.sub(repl=" blankline ", string=body)

    # Replacing big gaps of blankspaces by just one blankspace
    regx = re.compile(r"\s+")
    body = regx.sub(repl=" ", string=body)

    # Replacing blankspaces at the beginning and at the end 
    body = body.strip(" ")

    # Getting rid of "useless" words
    bodywords = body.split(" ")
    keepwords = [word for word in bodywords if word not in stopwords.words('english')]

    # Replacing words by their base version
    stemmer = SnowballStemmer("english")
    stemwords = [stemmer.stem(wd) for wd in keepwords]
    body = " ".join(stemwords)

    # Returning processing results
    return body

In [5]:
# Saving enough memory space for storing all of the elements. This way is quicker than adding them as we go
emails_raw =  ["email"]*len(hamfiles + spamfiles)  
emails_processed =  ["email"]*len(hamfiles + spamfiles)
# Target vector
y = [0]*len(hamfiles) + [1]*len(spamfiles)  
for idx, fpath in enumerate(hamfiles + spamfiles):
    # Extracting just mail body
    body = get_body(fpath)  
    if body:
        emails_raw[idx] = body
        # Executing all of the pre-processing tasks
        processed = word_salad(body)  
        emails_processed[idx] = processed    

Couldn't decode file c:\Users\eduar\My Drive\TFM\PEC4 - Memoria final\Datasets\spamassasin\ham\cmds
Couldn't decode file c:\Users\eduar\My Drive\TFM\PEC4 - Memoria final\Datasets\spamassasin\spam\0000.7b1b73cf36cf9dbc3d64e3f2ee2b91f1
Couldn't decode file c:\Users\eduar\My Drive\TFM\PEC4 - Memoria final\Datasets\spamassasin\spam\00116.29e39a0064e2714681726ac28ff3fdef
Couldn't decode file c:\Users\eduar\My Drive\TFM\PEC4 - Memoria final\Datasets\spamassasin\spam\00263.13fc73e09ae15e0023bdb13d0a010f2d
Couldn't decode file c:\Users\eduar\My Drive\TFM\PEC4 - Memoria final\Datasets\spamassasin\spam\00320.20dcbb5b047b8e2f212ee78267ee27ad
Couldn't decode file c:\Users\eduar\My Drive\TFM\PEC4 - Memoria final\Datasets\spamassasin\spam\00323.9e36bf05304c99f2133a4c03c49533a9
Couldn't decode file c:\Users\eduar\My Drive\TFM\PEC4 - Memoria final\Datasets\spamassasin\spam\00324.6f320a8c6b5f8e4bc47d475b3d4e86ef
Couldn't decode file c:\Users\eduar\My Drive\TFM\PEC4 - Memoria final\Datasets\spamassasin\

In [6]:
# Plain list of all of thw words of all of the emails
flatlist = [word for email in emails_processed for word in email.split(" ")]  

In [7]:
# Counting number of occurrences of every word and storing this info as a dictionary
counts_dict = collections.Counter(flatlist)

In [8]:
# word list which all of the words that appear more than 100 times
Listado de palabras que aparecen más de 100 veces
vocablist = [key for key in counts_dict if counts_dict[key] > 100]

In [9]:
# Generating token matrix
vectorizer = CountVectorizer(vocabulary=vocablist)  
X = vectorizer.transform(emails_processed)

In [10]:
#Splitting the dataset in training (80%) and test (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1)

In [11]:
X_train_scaled = StandardScaler(with_mean=False).fit_transform(X_train)
X_test_scaled = StandardScaler(with_mean=False).fit_transform(X_test)

In [12]:
#Evaluating the model on training set
classifier = MultinomialNB().fit(X_train_scaled, y_train)
pred = classifier.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, pred))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, pred))
print('---------------[Classification report]---------------')
print(classification_report(y_train, pred))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9727272727272728
-----------------[Confusion matrix]------------------
[[5452  111]
 [  93 1824]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5563
           1       0.94      0.95      0.95      1917

    accuracy                           0.97      7480
   macro avg       0.96      0.97      0.96      7480
weighted avg       0.97      0.97      0.97      7480

-----------------------------------------------------


In [13]:
#Results of executing cross-validation on training set
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(classifier, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.97326203 0.96724599 0.96724599 0.97459893 0.96724599]
-------------[Cross-validation mean score on training set]-------------
0.9699197860962567
-----------------------------------------------------------------------


In [14]:
#Evaluating model on testing set
pred = classifier.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, pred))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, pred))
print('---------------[Classification report]---------------')
print(classification_report(y_test, pred))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9620523784072689
-----------------[Confusion matrix]------------------
[[1349   40]
 [  31  451]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      1389
           1       0.92      0.94      0.93       482

    accuracy                           0.96      1871
   macro avg       0.95      0.95      0.95      1871
weighted avg       0.96      0.96      0.96      1871

-----------------------------------------------------


In [15]:
# Results of executing cross-validation on testing set
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(classifier, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.98133333 0.95721925 0.95989305 0.95989305 0.96791444]
-------------[Cross-validation mean score on test set]-------------
0.965250623885918
-------------------------------------------------------------------


In [16]:
knn_model = KNeighborsClassifier(n_neighbors=int(math.sqrt(X_train.shape[0]))).fit(X_train, y_train)
knn_pred_train = knn_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, knn_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, knn_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, knn_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7450534759358288
-----------------[Confusion matrix]------------------
[[5552   11]
 [1896   21]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.75      1.00      0.85      5563
           1       0.66      0.01      0.02      1917

    accuracy                           0.75      7480
   macro avg       0.70      0.50      0.44      7480
weighted avg       0.72      0.75      0.64      7480

-----------------------------------------------------


In [17]:
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(knn_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.89304813 0.88836898 0.86497326 0.86229947 0.86029412]
-------------[Cross-validation mean score on training set]-------------
0.8737967914438503
-----------------------------------------------------------------------


In [18]:
knn_pred_test = knn_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, knn_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, knn_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, knn_pred_test))
print('-----------------------------------------------------')


--------------------[Accuracy]-----------------------
0.743452699091395
-----------------[Confusion matrix]------------------
[[1388    1]
 [ 479    3]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.74      1.00      0.85      1389
           1       0.75      0.01      0.01       482

    accuracy                           0.74      1871
   macro avg       0.75      0.50      0.43      1871
weighted avg       0.75      0.74      0.64      1871

-----------------------------------------------------


In [19]:
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(knn_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.74666667 0.76737968 0.7513369  0.75668449 0.76203209]
-------------[Cross-validation mean score on test set]-------------
0.7568199643493763
-------------------------------------------------------------------


In [20]:
svm_model = svm.LinearSVC(random_state = 1, max_iter=150000).fit(X_train_scaled, y_train)
svm_pred_train = svm_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, svm_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, svm_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, svm_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9995989304812835
-----------------[Confusion matrix]------------------
[[5560    3]
 [   0 1917]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5563
           1       1.00      1.00      1.00      1917

    accuracy                           1.00      7480
   macro avg       1.00      1.00      1.00      7480
weighted avg       1.00      1.00      1.00      7480

-----------------------------------------------------


In [21]:
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(svm_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.97794118 0.97994652 0.97794118 0.98328877 0.97860963]
-------------[Cross-validation mean score on training set]-------------
0.9795454545454547
-----------------------------------------------------------------------


In [22]:
svm_pred_test = svm_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, svm_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, svm_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, svm_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9754142169962586
-----------------[Confusion matrix]------------------
[[1356   33]
 [  13  469]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1389
           1       0.93      0.97      0.95       482

    accuracy                           0.98      1871
   macro avg       0.96      0.97      0.97      1871
weighted avg       0.98      0.98      0.98      1871

-----------------------------------------------------


In [23]:
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(svm_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.96       0.94919786 0.94117647 0.95721925 0.93582888]
-------------[Cross-validation mean score on test set]-------------
0.9486844919786096
-------------------------------------------------------------------


In [24]:
logistic_regression_model = LogisticRegression(random_state = 1).fit(X_train_scaled, y_train)
logistic_regression_pred_train = logistic_regression_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, logistic_regression_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, logistic_regression_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, logistic_regression_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9995989304812835
-----------------[Confusion matrix]------------------
[[5560    3]
 [   0 1917]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5563
           1       1.00      1.00      1.00      1917

    accuracy                           1.00      7480
   macro avg       1.00      1.00      1.00      7480
weighted avg       1.00      1.00      1.00      7480

-----------------------------------------------------


In [25]:
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(logistic_regression_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.98529412 0.98395722 0.98663102 0.98997326 0.98729947]
-------------[Cross-validation mean score on training set]-------------
0.9866310160427808
-----------------------------------------------------------------------


In [26]:
logistic_regression_pred_test = logistic_regression_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, logistic_regression_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, logistic_regression_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, logistic_regression_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9893105291288081
-----------------[Confusion matrix]------------------
[[1376   13]
 [   7  475]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1389
           1       0.97      0.99      0.98       482

    accuracy                           0.99      1871
   macro avg       0.98      0.99      0.99      1871
weighted avg       0.99      0.99      0.99      1871

-----------------------------------------------------


In [27]:
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(logistic_regression_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.976      0.97593583 0.96524064 0.96524064 0.96256684]
-------------[Cross-validation mean score on test set]-------------
0.9689967914438503
-------------------------------------------------------------------


In [28]:
decision_tree_model = tree.DecisionTreeClassifier(random_state=1, max_depth=2).fit(X_train_scaled, y_train)
decision_tree_pred_train = decision_tree_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, decision_tree_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, decision_tree_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, decision_tree_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.8578877005347594
-----------------[Confusion matrix]------------------
[[4964  599]
 [ 464 1453]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      5563
           1       0.71      0.76      0.73      1917

    accuracy                           0.86      7480
   macro avg       0.81      0.83      0.82      7480
weighted avg       0.86      0.86      0.86      7480

-----------------------------------------------------


In [29]:
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(decision_tree_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.88034759 0.84959893 0.84625668 0.8776738  0.87299465]
-------------[Cross-validation mean score on training set]-------------
0.8653743315508022
-----------------------------------------------------------------------


In [30]:
decision_tree_pred_test = decision_tree_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, decision_tree_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, decision_tree_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, decision_tree_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.8637092463923036
-----------------[Confusion matrix]------------------
[[1258  131]
 [ 124  358]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      1389
           1       0.73      0.74      0.74       482

    accuracy                           0.86      1871
   macro avg       0.82      0.82      0.82      1871
weighted avg       0.86      0.86      0.86      1871

-----------------------------------------------------


In [31]:
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(decision_tree_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.86933333 0.88502674 0.89037433 0.87700535 0.89037433]
-------------[Cross-validation mean score on test set]-------------
0.8824228163992871
-------------------------------------------------------------------


In [32]:
random_forest_model = RandomForestClassifier(max_depth=3, random_state=1).fit(X_train_scaled, y_train)
random_forest_pred_train = random_forest_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, random_forest_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, random_forest_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, random_forest_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7897058823529411
-----------------[Confusion matrix]------------------
[[5563    0]
 [1573  344]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.78      1.00      0.88      5563
           1       1.00      0.18      0.30      1917

    accuracy                           0.79      7480
   macro avg       0.89      0.59      0.59      7480
weighted avg       0.84      0.79      0.73      7480

-----------------------------------------------------


In [33]:
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(random_forest_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.79278075 0.79278075 0.79612299 0.77941176 0.78141711]
-------------[Cross-validation mean score on training set]-------------
0.7885026737967914
-----------------------------------------------------------------------


In [34]:
random_forest_pred_test = random_forest_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, random_forest_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, random_forest_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, random_forest_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.7915553180117584
-----------------[Confusion matrix]------------------
[[1387    2]
 [ 388   94]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.78      1.00      0.88      1389
           1       0.98      0.20      0.33       482

    accuracy                           0.79      1871
   macro avg       0.88      0.60      0.60      1871
weighted avg       0.83      0.79      0.73      1871

-----------------------------------------------------


In [35]:
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(random_forest_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.784      0.80481283 0.79144385 0.80481283 0.79679144]
-------------[Cross-validation mean score on test set]-------------
0.796372192513369
-------------------------------------------------------------------
