In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import time
import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Load the evaluation data: both CSV files are stacked into a single frame.
# NOTE(review): one input is named "...Train..." - if the pickled models were
# fitted on that file, the scores reported below are not hold-out scores; confirm.
df_test1 = pd.read_csv("Dataset2_1Train_new2.csv")
df_test2 = pd.read_csv("Dataset2_1Test_new2.csv")
df_test = pd.concat([df_test1, df_test2], axis=0, ignore_index=True)
# Shuffle the rows; the fixed seed keeps the row order identical on every rerun.
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
# Raw text and labels as NumPy arrays, row-aligned with each other.
X_test1 = df_test['cleaned_text'].values
y_test = df_test['label'].values

In [3]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)

# GloVe lookup object (used further down as a word -> vector mapping)
glove_vect_fit = _load_pickle("glove_vect_fit1.pkl")
# Pickled classifiers, one per algorithm
glove_mlp_best_model = _load_pickle("glove_mlp_model1.pkl")
glove_knn_best_model = _load_pickle("glove_knn_model1.pkl")
glove_rf_best_model = _load_pickle("glove_rf_model1.pkl")
glove_lr_best_model = _load_pickle("glove_lr_model1.pkl")
glove_svc_best_model = _load_pickle("glove_svc_model1.pkl")

In [4]:
# Function to generate document vectors using GloVe embeddings
def document_vector(glove_model, email):
    """Average the GloVe vectors of the known words in one document.

    Parameters
    ----------
    glove_model : mapping of str -> array-like
        Word-to-vector lookup; must support ``in``, ``[]`` and ``.values()``.
    email : str
        Whitespace-separated text. A value that is not text (for example
        ``None`` or a NaN coming from pandas) is treated as an empty document.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the words found in ``glove_model``.
        When no word is found, a zero vector with the same length as the
        model's vectors is returned.
    """
    # Non-text input has no words; callers normally fillna('') beforehand,
    # this keeps the function safe when they do not.
    words = email.split() if isinstance(email, (str, bytes)) else []
    vectors = [glove_model[word] for word in words if word in glove_model]
    if not vectors:
        # Take the embedding size from any stored vector.
        dim = len(next(iter(glove_model.values())))
        return np.zeros(dim)
    return np.mean(vectors, axis=0)
    
# Function to transform test data using GloVe embeddings
def transform_glove(X_test, glove_model):
    """Build an array holding one averaged GloVe vector per document.

    Every element of ``X_test`` is passed to ``document_vector`` together
    with ``glove_model``; the results are collected in input order and
    converted to a NumPy array.
    """
    doc_vectors = []
    for text in X_test:
        doc_vectors.append(document_vector(glove_model, text))
    return np.array(doc_vectors)

In [5]:
# Evaluate the MLP classifier on GloVe features of the test data
def check_best_model_MLP(X_test, y_test, glove_mlp_best_model, glove_vect_fit):
    """Score the MLP model on GloVe document vectors and print the results.

    Parameters
    ----------
    X_test : array-like of str
        Raw documents; missing entries are replaced with '' before embedding.
    y_test : array-like
        True labels. NOTE(review): the four-way unpacking of the confusion
        matrix only works for exactly two classes - confirm labels are binary.
    glove_mlp_best_model : estimator
        Fitted classifier exposing ``predict``.
    glove_vect_fit : mapping
        Word -> vector lookup handed to ``transform_glove``.

    Prints the classification report, confusion matrix, precision, recall,
    F-score, accuracy and the elapsed wall-clock time. Returns None.
    """
    start_time = time.time()
    X_test = pd.Series(X_test).fillna('')
    X_test_feature = transform_glove(X_test, glove_vect_fit)
    y_pred = glove_mlp_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once; reuse it for display and for the metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard the ratios: with no predicted or no actual positives the plain
    # division would be 0/0 (nan for NumPy integers).
    p = tp/(tp+fp) if (tp+fp) else 0.0
    r = tp/(tp+fn) if (tp+fn) else 0.0
    fscore = (2*p*r)/(p+r) if (p+r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_MLP(X_test1, y_test, glove_mlp_best_model, glove_vect_fit)

              precision    recall  f1-score   support

           0       0.94      0.98      0.96     22458
           1       0.88      0.71      0.79      4491

    accuracy                           0.94     26949
   macro avg       0.91      0.85      0.87     26949
weighted avg       0.93      0.94      0.93     26949

[[22010   448]
 [ 1298  3193]]
Precision: 0.876956879978028
Recall: 0.710977510576709
FScore: 0.7852926709296606
Testing Accuracy: 0.935210954024268
The total testing time is 2.297351598739624 seconds


In [6]:
# Evaluate the KNN classifier on GloVe features of the test data
def check_best_model_KNN(X_test, y_test, glove_knn_best_model, glove_vect_fit):
    """Score the KNN model on GloVe document vectors and print the results.

    Parameters
    ----------
    X_test : array-like of str
        Raw documents; missing entries are replaced with '' before embedding.
    y_test : array-like
        True labels. NOTE(review): the four-way unpacking of the confusion
        matrix only works for exactly two classes - confirm labels are binary.
    glove_knn_best_model : estimator
        Fitted classifier exposing ``predict``.
    glove_vect_fit : mapping
        Word -> vector lookup handed to ``transform_glove``.

    Prints the classification report, confusion matrix, precision, recall,
    F-score, accuracy and the elapsed wall-clock time. Returns None.
    """
    start_time = time.time()
    X_test = pd.Series(X_test).fillna('')
    X_test_feature = transform_glove(X_test, glove_vect_fit)
    y_pred = glove_knn_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once; reuse it for display and for the metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard the ratios: with no predicted or no actual positives the plain
    # division would be 0/0 (nan for NumPy integers).
    p = tp/(tp+fp) if (tp+fp) else 0.0
    r = tp/(tp+fn) if (tp+fn) else 0.0
    fscore = (2*p*r)/(p+r) if (p+r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_KNN(X_test1, y_test, glove_knn_best_model, glove_vect_fit)

              precision    recall  f1-score   support

           0       0.91      0.99      0.95     22458
           1       0.90      0.51      0.65      4491

    accuracy                           0.91     26949
   macro avg       0.91      0.75      0.80     26949
weighted avg       0.91      0.91      0.90     26949

[[22211   247]
 [ 2210  2281]]
Precision: 0.9022943037974683
Recall: 0.5079046982854598
FScore: 0.6499501353469156
Testing Accuracy: 0.9088277858176556
The total testing time is 4.786086082458496 seconds


In [7]:
# Evaluate the random-forest classifier on GloVe features of the test data
def check_best_model_RF(X_test, y_test, glove_rf_best_model, glove_vect_fit):
    """Score the RF model on GloVe document vectors and print the results.

    Parameters
    ----------
    X_test : array-like of str
        Raw documents; missing entries are replaced with '' before embedding.
    y_test : array-like
        True labels. NOTE(review): the four-way unpacking of the confusion
        matrix only works for exactly two classes - confirm labels are binary.
    glove_rf_best_model : estimator
        Fitted classifier exposing ``predict``.
    glove_vect_fit : mapping
        Word -> vector lookup handed to ``transform_glove``.

    Prints the classification report, confusion matrix, precision, recall,
    F-score, accuracy and the elapsed wall-clock time. Returns None.
    """
    start_time = time.time()
    X_test = pd.Series(X_test).fillna('')
    X_test_feature = transform_glove(X_test, glove_vect_fit)
    y_pred = glove_rf_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once; reuse it for display and for the metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard the ratios: with no predicted or no actual positives the plain
    # division would be 0/0 (nan for NumPy integers).
    p = tp/(tp+fp) if (tp+fp) else 0.0
    r = tp/(tp+fn) if (tp+fn) else 0.0
    fscore = (2*p*r)/(p+r) if (p+r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_RF(X_test1, y_test, glove_rf_best_model, glove_vect_fit)

              precision    recall  f1-score   support

           0       0.95      0.96      0.95     22458
           1       0.79      0.73      0.75      4491

    accuracy                           0.92     26949
   macro avg       0.87      0.84      0.85     26949
weighted avg       0.92      0.92      0.92     26949

[[21574   884]
 [ 1234  3257]]
Precision: 0.786524993962811
Recall: 0.7252282342462704
FScore: 0.754633920296571
Testing Accuracy: 0.9214071023043526
The total testing time is 2.4678077697753906 seconds


In [9]:
# Evaluate the logistic-regression classifier on GloVe features of the test data
def check_best_model_LR(X_test, y_test, glove_lr_best_model, glove_vect_fit):
    """Score the LR model on GloVe document vectors and print the results.

    Parameters
    ----------
    X_test : array-like of str
        Raw documents; missing entries are replaced with '' before embedding.
    y_test : array-like
        True labels. NOTE(review): the four-way unpacking of the confusion
        matrix only works for exactly two classes - confirm labels are binary.
    glove_lr_best_model : estimator
        Fitted classifier exposing ``predict``.
    glove_vect_fit : mapping
        Word -> vector lookup handed to ``transform_glove``.

    Prints the classification report, confusion matrix, precision, recall,
    F-score, accuracy and the elapsed wall-clock time. Returns None.
    """
    start_time = time.time()
    X_test = pd.Series(X_test).fillna('')
    X_test_feature = transform_glove(X_test, glove_vect_fit)
    y_pred = glove_lr_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once; reuse it for display and for the metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard the ratios: with no predicted or no actual positives the plain
    # division would be 0/0 (nan for NumPy integers).
    p = tp/(tp+fp) if (tp+fp) else 0.0
    r = tp/(tp+fn) if (tp+fn) else 0.0
    fscore = (2*p*r)/(p+r) if (p+r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_LR(X_test1, y_test, glove_lr_best_model, glove_vect_fit)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96     22458
           1       0.84      0.77      0.80      4491

    accuracy                           0.94     26949
   macro avg       0.90      0.87      0.88     26949
weighted avg       0.94      0.94      0.94     26949

[[21786   672]
 [ 1028  3463]]
Precision: 0.837484885126965
Recall: 0.7710977510576709
FScore: 0.8029214004173428
Testing Accuracy: 0.9369178819251178
The total testing time is 2.187619209289551 seconds


In [10]:
# Evaluate the SVC classifier on GloVe features of the test data
def check_best_model_SVC(X_test, y_test, glove_svc_best_model, glove_vect_fit):
    """Score the SVC model on GloVe document vectors and print the results.

    Parameters
    ----------
    X_test : array-like of str
        Raw documents; missing entries are replaced with '' before embedding.
    y_test : array-like
        True labels. NOTE(review): the four-way unpacking of the confusion
        matrix only works for exactly two classes - confirm labels are binary.
    glove_svc_best_model : estimator
        Fitted classifier exposing ``predict``.
    glove_vect_fit : mapping
        Word -> vector lookup handed to ``transform_glove``.

    Prints the classification report, confusion matrix, precision, recall,
    F-score, accuracy and the elapsed wall-clock time. Returns None.
    """
    start_time = time.time()
    X_test = pd.Series(X_test).fillna('')
    X_test_feature = transform_glove(X_test, glove_vect_fit)
    y_pred = glove_svc_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once; reuse it for display and for the metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard the ratios: with no predicted or no actual positives the plain
    # division would be 0/0 (nan for NumPy integers).
    p = tp/(tp+fp) if (tp+fp) else 0.0
    r = tp/(tp+fn) if (tp+fn) else 0.0
    fscore = (2*p*r)/(p+r) if (p+r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_SVC(X_test1, y_test, glove_svc_best_model, glove_vect_fit)

              precision    recall  f1-score   support

           0       0.95      0.98      0.97     22458
           1       0.87      0.76      0.81      4491

    accuracy                           0.94     26949
   macro avg       0.91      0.87      0.89     26949
weighted avg       0.94      0.94      0.94     26949

[[21954   504]
 [ 1076  3415]]
Precision: 0.8713957642255677
Recall: 0.7604097083054999
FScore: 0.812128418549346
Testing Accuracy: 0.941370737318639
The total testing time is 11.730715274810791 seconds


In [11]:
# Print the parameter dictionary returned by get_params() for the loaded SVC model
all_params = glove_svc_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [13]:
# Print the parameter dictionary returned by get_params() for the loaded MLP model
all_params = glove_mlp_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 1000, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}


In [14]:
# Print the parameter dictionary returned by get_params() for the loaded KNN model
all_params = glove_knn_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'euclidean', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}


In [15]:
# Print the parameter dictionary returned by get_params() for the loaded LR model
all_params = glove_lr_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'C': 100, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'newton-cg', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [16]:
# Print the parameter dictionary returned by get_params() for the loaded RF model
all_params = glove_rf_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
