In [1]:
# import libraries
import numpy as py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import math
from sklearn.preprocessing import StandardScaler
import math

In [2]:
#Loading dataset
colnames = [
    'word_freq_make','word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet', 'word_freq_order',
    'word_freq_mail', 'word_freq_receive', 'word_freq_will', 'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free', 'word_freq_business',
    'word_freq_email', 'word_freq_you', 'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp', 'word_freq_hpl',
    'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85',   
    'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct', 'word_freq_cs', 'word_freq_meeting', 'word_freq_original',
    'word_freq_project', 'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference', 'char_freq_semicolon', 'char_freq_parentheses', 'char_freq_brackets', 
    'char_freq_exclamation', 'char_freq_dollar', 'char_freq_hash', 'capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total'
]

df = pd.read_csv('spambase.data', names=colnames)

In [3]:
# Splitting the dataset in 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 0:55] , df['capital_run_length_total'], test_size = 0.20, random_state = 1)

In [4]:
X_train_scaled = StandardScaler(with_mean=False).fit_transform(X_train)
X_test_scaled = StandardScaler(with_mean=False).fit_transform(X_test)

In [5]:
# Evaluating the model on training set
classifier = MultinomialNB().fit(X_train_scaled, y_train)
pred = classifier.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, pred))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, pred))
print('---------------[Classification report]---------------')
print(classification_report(y_train, pred))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.8804347826086957
-----------------[Confusion matrix]------------------
[[1867  357]
 [  83 1373]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.96      0.84      0.89      2224
           1       0.79      0.94      0.86      1456

    accuracy                           0.88      3680
   macro avg       0.88      0.89      0.88      3680
weighted avg       0.89      0.88      0.88      3680

-----------------------------------------------------


In [6]:
# Results of executing cross-validation on training set
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(classifier, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.87092391 0.87907609 0.88858696 0.88451087 0.87771739]
-------------[Cross-validation mean score on training set]-------------
0.8801630434782609
-----------------------------------------------------------------------


In [7]:
#Evaluating the model on testing set
pred = classifier.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, pred))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, pred))
print('---------------[Classification report]---------------')
print(classification_report(y_test, pred))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.8686210640608035
-----------------[Confusion matrix]------------------
[[475  89]
 [ 32 325]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.94      0.84      0.89       564
           1       0.79      0.91      0.84       357

    accuracy                           0.87       921
   macro avg       0.86      0.88      0.87       921
weighted avg       0.88      0.87      0.87       921

-----------------------------------------------------


In [8]:
# Results of executing cross-validation on testing set
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(classifier, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.83783784 0.85869565 0.89673913 0.89673913 0.86956522]
-------------[Cross-validation mean score on test set]-------------
0.8719153936545242
-------------------------------------------------------------------


In [9]:
knn_model = KNeighborsClassifier(n_neighbors=int(math.sqrt(X_train_scaled.shape[0]))).fit(X_train_scaled, y_train)
knn_pred_train = knn_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, knn_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, knn_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, knn_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.8842391304347826
-----------------[Confusion matrix]------------------
[[2079  145]
 [ 281 1175]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      2224
           1       0.89      0.81      0.85      1456

    accuracy                           0.88      3680
   macro avg       0.89      0.87      0.88      3680
weighted avg       0.88      0.88      0.88      3680

-----------------------------------------------------


In [10]:
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(knn_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.86956522 0.88586957 0.85869565 0.8763587  0.88858696]
-------------[Cross-validation mean score on training set]-------------
0.8758152173913043
-----------------------------------------------------------------------


In [11]:
knn_pred_test = knn_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, knn_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, knn_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, knn_pred_test))
print('-----------------------------------------------------')


--------------------[Accuracy]-----------------------
0.9011943539630836
-----------------[Confusion matrix]------------------
[[538  26]
 [ 65 292]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       564
           1       0.92      0.82      0.87       357

    accuracy                           0.90       921
   macro avg       0.91      0.89      0.89       921
weighted avg       0.90      0.90      0.90       921

-----------------------------------------------------


In [12]:
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(knn_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.88108108 0.78804348 0.83695652 0.7826087  0.79347826]
-------------[Cross-validation mean score on test set]-------------
0.8164336075205639
-------------------------------------------------------------------


In [13]:
svm_model = svm.LinearSVC(random_state = 1, max_iter=26500).fit(X_train_scaled, y_train)
svm_pred_train = svm_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, svm_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, svm_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, svm_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9307065217391305
-----------------[Confusion matrix]------------------
[[2126   98]
 [ 157 1299]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.93      0.96      0.94      2224
           1       0.93      0.89      0.91      1456

    accuracy                           0.93      3680
   macro avg       0.93      0.92      0.93      3680
weighted avg       0.93      0.93      0.93      3680

-----------------------------------------------------


In [14]:
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(svm_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')



---------------[Cross-validation scores on training set]---------------
[0.91711957 0.93070652 0.90896739 0.91983696 0.91711957]
-------------[Cross-validation mean score on training set]-------------
0.91875
-----------------------------------------------------------------------


In [15]:
svm_pred_test = svm_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, svm_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, svm_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, svm_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9218241042345277
-----------------[Confusion matrix]------------------
[[531  33]
 [ 39 318]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.93      0.94      0.94       564
           1       0.91      0.89      0.90       357

    accuracy                           0.92       921
   macro avg       0.92      0.92      0.92       921
weighted avg       0.92      0.92      0.92       921

-----------------------------------------------------


In [16]:
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(svm_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.91351351 0.93478261 0.91304348 0.91304348 0.90217391]
-------------[Cross-validation mean score on test set]-------------
0.9153113983548767
-------------------------------------------------------------------


In [17]:
logistic_regression_model = LogisticRegression(random_state = 1).fit(X_train_scaled, y_train)
logistic_regression_pred_train = logistic_regression_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, logistic_regression_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, logistic_regression_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, logistic_regression_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9285326086956521
-----------------[Confusion matrix]------------------
[[2120  104]
 [ 159 1297]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      2224
           1       0.93      0.89      0.91      1456

    accuracy                           0.93      3680
   macro avg       0.93      0.92      0.92      3680
weighted avg       0.93      0.93      0.93      3680

-----------------------------------------------------


In [18]:
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(logistic_regression_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.91847826 0.9388587  0.9076087  0.92798913 0.91983696]
-------------[Cross-validation mean score on training set]-------------
0.9225543478260869
-----------------------------------------------------------------------


In [19]:
logistic_regression_pred_test = logistic_regression_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, logistic_regression_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, logistic_regression_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, logistic_regression_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9229098805646037
-----------------[Confusion matrix]------------------
[[533  31]
 [ 40 317]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       564
           1       0.91      0.89      0.90       357

    accuracy                           0.92       921
   macro avg       0.92      0.92      0.92       921
weighted avg       0.92      0.92      0.92       921

-----------------------------------------------------


In [20]:
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(logistic_regression_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.92432432 0.93478261 0.91304348 0.9076087  0.90217391]
-------------[Cross-validation mean score on test set]-------------
0.9163866039952996
-------------------------------------------------------------------


In [21]:
decision_tree_model = tree.DecisionTreeClassifier(random_state=1, max_depth=2).fit(X_train_scaled, y_train)
decision_tree_pred_train = decision_tree_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, decision_tree_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, decision_tree_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, decision_tree_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.8611413043478261
-----------------[Confusion matrix]------------------
[[2139   85]
 [ 426 1030]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      2224
           1       0.92      0.71      0.80      1456

    accuracy                           0.86      3680
   macro avg       0.88      0.83      0.85      3680
weighted avg       0.87      0.86      0.86      3680

-----------------------------------------------------


In [22]:
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(decision_tree_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.85869565 0.82201087 0.86005435 0.85461957 0.8138587 ]
-------------[Cross-validation mean score on training set]-------------
0.8418478260869564
-----------------------------------------------------------------------


In [23]:
decision_tree_pred_test = decision_tree_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, decision_tree_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, decision_tree_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, decision_tree_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.8773072747014115
-----------------[Confusion matrix]------------------
[[549  15]
 [ 98 259]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.85      0.97      0.91       564
           1       0.95      0.73      0.82       357

    accuracy                           0.88       921
   macro avg       0.90      0.85      0.86       921
weighted avg       0.89      0.88      0.87       921

-----------------------------------------------------


In [24]:
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(decision_tree_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.88108108 0.85869565 0.80434783 0.8423913  0.84782609]
-------------[Cross-validation mean score on test set]-------------
0.8468683901292596
-------------------------------------------------------------------


In [25]:
random_forest_model = RandomForestClassifier(max_depth=3, random_state=1).fit(X_train_scaled, y_train)
random_forest_pred_train = random_forest_model.predict(X_train_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_train, random_forest_pred_train))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_train, random_forest_pred_train))
print('---------------[Classification report]---------------')
print(classification_report(y_train, random_forest_pred_train))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9114130434782609
-----------------[Confusion matrix]------------------
[[2177   47]
 [ 279 1177]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.89      0.98      0.93      2224
           1       0.96      0.81      0.88      1456

    accuracy                           0.91      3680
   macro avg       0.92      0.89      0.90      3680
weighted avg       0.92      0.91      0.91      3680

-----------------------------------------------------


In [26]:
shuffling = StratifiedKFold(5, shuffle=True, random_state=1)
scores_train  = cross_val_score(random_forest_model, X_train_scaled, y_train, cv=shuffling)
print('---------------[Cross-validation scores on training set]---------------')
print(scores_train)
print('-------------[Cross-validation mean score on training set]-------------')
print(scores_train.mean())
print('-----------------------------------------------------------------------')

---------------[Cross-validation scores on training set]---------------
[0.91032609 0.88722826 0.90081522 0.90625    0.91440217]
-------------[Cross-validation mean score on training set]-------------
0.9038043478260869
-----------------------------------------------------------------------


In [27]:
random_forest_pred_test = random_forest_model.predict(X_test_scaled)
print('--------------------[Accuracy]-----------------------')
print(accuracy_score(y_test, random_forest_pred_test))
print('-----------------[Confusion matrix]------------------')
print(confusion_matrix(y_test, random_forest_pred_test))
print('---------------[Classification report]---------------')
print(classification_report(y_test, random_forest_pred_test))
print('-----------------------------------------------------')

--------------------[Accuracy]-----------------------
0.9120521172638436
-----------------[Confusion matrix]------------------
[[554  10]
 [ 71 286]]
---------------[Classification report]---------------
              precision    recall  f1-score   support

           0       0.89      0.98      0.93       564
           1       0.97      0.80      0.88       357

    accuracy                           0.91       921
   macro avg       0.93      0.89      0.90       921
weighted avg       0.92      0.91      0.91       921

-----------------------------------------------------


In [28]:
shuffling = StratifiedKFold(5, shuffle = True, random_state = 1)
scores_test = cross_val_score(random_forest_model, X_test_scaled, y_test, cv=shuffling)
print('---------------[Cross-validation scores on test set]---------------')
print(scores_test)
print('-------------[Cross-validation mean score on test set]-------------')
print(scores_test.mean())
print('-------------------------------------------------------------------')

---------------[Cross-validation scores on test set]---------------
[0.92972973 0.92391304 0.9076087  0.9076087  0.89673913]
-------------[Cross-validation mean score on test set]-------------
0.9131198589894242
-------------------------------------------------------------------
