In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import time
import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [16]:
# Load the evaluation dataset.
# NOTE(review): both the "Train" and the "Test" CSV are concatenated into the
# evaluation frame; if the pickled models were fitted on the Train file, the
# metrics reported below include training rows -- confirm this is intended.
df_test1 = pd.read_csv("Dataset2_2Train_new2.csv")
df_test2 = pd.read_csv("Dataset2_2Test_new2.csv")
df_test = pd.concat([df_test1, df_test2], axis=0, ignore_index=True)
# Shuffle with a fixed seed so a fresh Restart & Run All yields the same row order.
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
# Raw text column and label column as NumPy arrays for the evaluation helpers.
X_test1 = df_test['final_cleaned_text'].values
y_test = df_test['label'].values

In [50]:
# Load the pickled GloVe lookup and the five fitted classifiers.
# NOTE: unpickling executes code embedded in the file, so these .pkl files
# must come from a trusted source.
def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load the tokenizer
glove_vect_fit = _load_pickle("glove_vect_fit2.pkl")
# Load the models
glove_mlp_best_model = _load_pickle("glove_mlp_model2.pkl")
glove_knn_best_model = _load_pickle("glove_knn_model2.pkl")
glove_rf_best_model = _load_pickle("glove_rf_model2.pkl")
glove_lr_best_model = _load_pickle("glove_lr_model2.pkl")
glove_svc_best_model = _load_pickle("glove_svc_model2.pkl")

In [52]:
# Function to generate document vectors using GloVe embeddings
def document_vector(glove_model, email):
    """Return the mean embedding of the in-vocabulary words of ``email``.

    Parameters
    ----------
    glove_model : mapping
        Word -> vector lookup. It must support ``in``, ``[]`` and ``.values()``.
    email : str, None or NaN
        Whitespace-separated text. A missing value (``None`` or a NaN float)
        is treated as an empty document instead of raising ``AttributeError``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all known words. When no word is
        known, a zero vector whose length matches the first vector stored in
        ``glove_model``.
    """
    # Missing texts would otherwise fail on ``.split()``.
    if email is None or (isinstance(email, float) and np.isnan(email)):
        email = ""
    known_vectors = [glove_model[word] for word in email.split() if word in glove_model]
    if not known_vectors:
        # No known word: fall back to zeros, taking the dimension from the model.
        dimension = len(next(iter(glove_model.values())))
        return np.zeros(dimension)
    return np.mean(known_vectors, axis=0)
    
# Function to transform test data using GloVe embeddings
def transform_glove(X_test, glove_model):
    """Embed each text of ``X_test`` and collect the results in one array.

    Every element of ``X_test`` is passed to ``document_vector`` together with
    ``glove_model``; the per-document vectors are returned as a NumPy array in
    the same order as the input.
    """
    doc_vectors = []
    for text in X_test:
        doc_vectors.append(document_vector(glove_model, text))
    return np.array(doc_vectors)

In [54]:
#Testing of Glove features of testing data using MLP
def check_best_model_MLP(X_test, y_test, glove_mlp_best_model, glove_vect_fit):
    """Evaluate the fitted MLP model on GloVe document vectors and print the metrics.

    Parameters
    ----------
    X_test : array-like of str
        Texts to classify; missing entries are replaced by '' before embedding.
    y_test : array-like
        True labels. The manual precision/recall computation unpacks a 2x2
        confusion matrix, so the problem must be binary.
    glove_mlp_best_model : estimator
        Fitted model exposing ``predict``.
    glove_vect_fit : mapping
        Word -> vector lookup handed to ``transform_glove``.

    Returns
    -------
    None
        The classification report, confusion matrix, precision, recall,
        F-score, accuracy and elapsed time are printed.
    """
    start_time = time.time()
    texts = pd.Series(X_test).fillna('')
    X_test_feature = transform_glove(texts, glove_vect_fit)
    y_pred = glove_mlp_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for display and metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard every ratio: a zero denominator would otherwise give NaN plus a RuntimeWarning.
    p = tp/(tp+fp) if (tp+fp) else 0.0
    r = tp/(tp+fn) if (tp+fn) else 0.0
    fscore = (2*p*r)/(p+r) if (p+r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_MLP(X_test1, y_test, glove_mlp_best_model, glove_vect_fit)

              precision    recall  f1-score   support

           0       0.97      0.95      0.96     22458
           1       0.79      0.85      0.82      4491

    accuracy                           0.94     26949
   macro avg       0.88      0.90      0.89     26949
weighted avg       0.94      0.94      0.94     26949

[[21413  1045]
 [  661  3830]]
Precision: 0.7856410256410257
Recall: 0.8528167446003118
FScore: 0.8178518043988896
Testing Accuracy: 0.9366952391554417
The total testing time is 2.4799370765686035 seconds


In [55]:
#Testing of Glove features of testing data using KNN
def check_best_model_KNN(X_test, y_test, glove_knn_best_model, glove_vect_fit):
    """Evaluate the fitted KNN model on GloVe document vectors and print the metrics.

    Parameters
    ----------
    X_test : array-like of str
        Texts to classify; missing entries are replaced by '' before embedding.
    y_test : array-like
        True labels. The manual precision/recall computation unpacks a 2x2
        confusion matrix, so the problem must be binary.
    glove_knn_best_model : estimator
        Fitted model exposing ``predict``.
    glove_vect_fit : mapping
        Word -> vector lookup handed to ``transform_glove``.

    Returns
    -------
    None
        The classification report, confusion matrix, precision, recall,
        F-score, accuracy and elapsed time are printed.
    """
    start_time = time.time()
    texts = pd.Series(X_test).fillna('')
    X_test_feature = transform_glove(texts, glove_vect_fit)
    y_pred = glove_knn_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for display and metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard every ratio: a zero denominator would otherwise give NaN plus a RuntimeWarning.
    p = tp/(tp+fp) if (tp+fp) else 0.0
    r = tp/(tp+fn) if (tp+fn) else 0.0
    fscore = (2*p*r)/(p+r) if (p+r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_KNN(X_test1, y_test, glove_knn_best_model, glove_vect_fit)

              precision    recall  f1-score   support

           0       0.94      0.96      0.95     22458
           1       0.78      0.69      0.73      4491

    accuracy                           0.92     26949
   macro avg       0.86      0.82      0.84     26949
weighted avg       0.91      0.92      0.91     26949

[[21584   874]
 [ 1399  3092]]
Precision: 0.7796268280383257
Recall: 0.6884880872856824
FScore: 0.7312285680501359
Testing Accuracy: 0.9156554974210546
The total testing time is 19.628166437149048 seconds


In [56]:
#Testing of Glove features of testing data using RF
def check_best_model_RF(X_test, y_test, glove_rf_best_model, glove_vect_fit):
    """Evaluate the fitted random-forest model on GloVe document vectors and print the metrics.

    Parameters
    ----------
    X_test : array-like of str
        Texts to classify; missing entries are replaced by '' before embedding.
    y_test : array-like
        True labels. The manual precision/recall computation unpacks a 2x2
        confusion matrix, so the problem must be binary.
    glove_rf_best_model : estimator
        Fitted model exposing ``predict``.
    glove_vect_fit : mapping
        Word -> vector lookup handed to ``transform_glove``.

    Returns
    -------
    None
        The classification report, confusion matrix, precision, recall,
        F-score, accuracy and elapsed time are printed.
    """
    start_time = time.time()
    texts = pd.Series(X_test).fillna('')
    X_test_feature = transform_glove(texts, glove_vect_fit)
    y_pred = glove_rf_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for display and metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard every ratio: a zero denominator would otherwise give NaN plus a RuntimeWarning.
    p = tp/(tp+fp) if (tp+fp) else 0.0
    r = tp/(tp+fn) if (tp+fn) else 0.0
    fscore = (2*p*r)/(p+r) if (p+r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_RF(X_test1, y_test, glove_rf_best_model, glove_vect_fit)

              precision    recall  f1-score   support

           0       0.96      0.93      0.95     22458
           1       0.70      0.80      0.75      4491

    accuracy                           0.91     26949
   macro avg       0.83      0.86      0.85     26949
weighted avg       0.92      0.91      0.91     26949

[[20948  1510]
 [  911  3580]]
Precision: 0.7033398821218074
Recall: 0.7971498552660877
FScore: 0.7473123891034338
Testing Accuracy: 0.9101636424357119
The total testing time is 2.647120237350464 seconds


In [57]:
#Testing of Glove features of testing data using LR
def check_best_model_LR(X_test, y_test, glove_lr_best_model, glove_vect_fit):
    """Evaluate the fitted logistic-regression model on GloVe document vectors and print the metrics.

    Parameters
    ----------
    X_test : array-like of str
        Texts to classify; missing entries are replaced by '' before embedding.
    y_test : array-like
        True labels. The manual precision/recall computation unpacks a 2x2
        confusion matrix, so the problem must be binary.
    glove_lr_best_model : estimator
        Fitted model exposing ``predict``.
    glove_vect_fit : mapping
        Word -> vector lookup handed to ``transform_glove``.

    Returns
    -------
    None
        The classification report, confusion matrix, precision, recall,
        F-score, accuracy and elapsed time are printed.
    """
    start_time = time.time()
    texts = pd.Series(X_test).fillna('')
    X_test_feature = transform_glove(texts, glove_vect_fit)
    y_pred = glove_lr_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for display and metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard every ratio: a zero denominator would otherwise give NaN plus a RuntimeWarning.
    p = tp/(tp+fp) if (tp+fp) else 0.0
    r = tp/(tp+fn) if (tp+fn) else 0.0
    fscore = (2*p*r)/(p+r) if (p+r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_LR(X_test1, y_test, glove_lr_best_model, glove_vect_fit)

              precision    recall  f1-score   support

           0       0.96      0.95      0.95     22458
           1       0.75      0.80      0.77      4491

    accuracy                           0.92     26949
   macro avg       0.86      0.87      0.86     26949
weighted avg       0.92      0.92      0.92     26949

[[21274  1184]
 [  913  3578]]
Precision: 0.751364972700546
Recall: 0.7967045201514139
FScore: 0.7733707986598941
Testing Accuracy: 0.9221863519982189
The total testing time is 2.4183874130249023 seconds


In [58]:
#Testing of Glove features of testing data using SVC
def check_best_model_SVC(X_test, y_test, glove_svc_best_model, glove_vect_fit):
    """Evaluate the fitted SVC model on GloVe document vectors and print the metrics.

    Parameters
    ----------
    X_test : array-like of str
        Texts to classify; missing entries are replaced by '' before embedding.
    y_test : array-like
        True labels. The manual precision/recall computation unpacks a 2x2
        confusion matrix, so the problem must be binary.
    glove_svc_best_model : estimator
        Fitted model exposing ``predict``.
    glove_vect_fit : mapping
        Word -> vector lookup handed to ``transform_glove``.

    Returns
    -------
    None
        The classification report, confusion matrix, precision, recall,
        F-score, accuracy and elapsed time are printed.
    """
    start_time = time.time()
    texts = pd.Series(X_test).fillna('')
    X_test_feature = transform_glove(texts, glove_vect_fit)
    y_pred = glove_svc_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for display and metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard every ratio: a zero denominator would otherwise give NaN plus a RuntimeWarning.
    p = tp/(tp+fp) if (tp+fp) else 0.0
    r = tp/(tp+fn) if (tp+fn) else 0.0
    fscore = (2*p*r)/(p+r) if (p+r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_SVC(X_test1, y_test, glove_svc_best_model, glove_vect_fit)

              precision    recall  f1-score   support

           0       0.97      0.95      0.96     22458
           1       0.79      0.85      0.82      4491

    accuracy                           0.94     26949
   macro avg       0.88      0.90      0.89     26949
weighted avg       0.94      0.94      0.94     26949

[[21409  1049]
 [  654  3837]]
Precision: 0.7853049529267294
Recall: 0.85437541750167
FScore: 0.8183854111122961
Testing Accuracy: 0.9368065605402798
The total testing time is 11.375045776367188 seconds


In [59]:
# Fetch the parameter dict of the loaded SVC model via get_params() and print it.
all_params = glove_svc_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [60]:
# Fetch the parameter dict of the loaded MLP model via get_params() and print it.
all_params = glove_mlp_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100, 100), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 1000, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}


In [61]:
# Fetch the parameter dict of the loaded KNN model via get_params() and print it.
all_params = glove_knn_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'manhattan', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}


In [62]:
# Fetch the parameter dict of the loaded LR model via get_params() and print it.
all_params = glove_lr_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'C': 100, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'newton-cg', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [63]:
# Fetch the parameter dict of the loaded RF model via get_params() and print it.
all_params = glove_rf_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
