In [2]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import time
import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Load the evaluation data from the two CSV files and merge them into one frame.
# NOTE(review): a file named "...Train..." and a file named "...Test..." are both
# used for evaluation here — confirm the pickled models loaded below were not
# fitted on either file, otherwise the reported scores are optimistic.
df_test1 = pd.read_csv("Dataset2_2Train_new2.csv")
df_test2 = pd.read_csv("Dataset2_2Test_new2.csv")
df_test = pd.concat([df_test1, df_test2], axis=0, ignore_index=True)
# Shuffle all rows; the fixed seed keeps the row order identical across re-runs.
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
# Text column fed to the feature extractor, and the matching label column.
X_test1 = df_test['final_cleaned_text'].values
y_test = df_test['label'].values

In [3]:
def _load_pickle(path):
    """Return the object unpickled from the binary file at `path`."""
    # Unpickling can run arbitrary code, so only load files from a trusted source.
    with open(path, "rb") as file:
        return pickle.load(file)


# Fitted embedding model used for feature extraction (it is accessed through
# its `.wv` attribute in document_vector).
w2v_vect_fit = _load_pickle("w2v_vect_fit2.pkl")

# Fitted classifiers, one pickle per model.
w2v_mlp_best_model = _load_pickle("w2v_mlp_model2.pkl")
w2v_knn_best_model = _load_pickle("w2v_knn_model2.pkl")
w2v_rf_best_model = _load_pickle("w2v_rf_model2.pkl")
w2v_lr_best_model = _load_pickle("w2v_lr_model2.pkl")
w2v_svc_best_model = _load_pickle("w2v_svc_model2.pkl")

In [4]:
# Feature extraction for the testing data
def document_vector(w2v_model, doc):
    """Return the mean embedding of the in-vocabulary words of `doc`.

    `doc` is split on whitespace and words missing from
    `w2v_model.wv.key_to_index` are dropped. If no word remains, a zero
    vector of length `w2v_model.vector_size` is returned instead.
    """
    vocabulary = w2v_model.wv.key_to_index
    known_words = [token for token in doc.split() if token in vocabulary]
    if not known_words:
        # Nothing to average: fall back to an all-zero vector.
        return np.zeros(w2v_model.vector_size)
    word_vectors = w2v_model.wv[known_words]
    return np.mean(word_vectors, axis=0)

def transform_w2v(X_test, w2v_model):
    """Convert each document in `X_test` to its document vector.

    Every element of `X_test` is passed to `document_vector` together with
    `w2v_model`; the per-document results are collected into one NumPy array.
    """
    doc_vectors = []
    for doc in X_test:
        doc_vectors.append(document_vector(w2v_model, doc))
    return np.array(doc_vectors)

In [5]:
# Testing of W2V features of the testing data using MLP
def check_best_model_MLP(X_test, y_test, w2v_mlp_best_model, w2v_vect_fit):
    """Predict with the MLP model on W2V document vectors and print the metrics.

    Parameters
    ----------
    X_test : array-like
        Documents to classify; missing entries are replaced with '' before
        they are vectorized.
    y_test : array-like
        True labels. The tn/fp/fn/tp unpacking below needs a 2x2 confusion
        matrix, i.e. exactly two classes.
    w2v_mlp_best_model : object
        Fitted classifier; only its ``predict`` method is used.
    w2v_vect_fit : object
        Fitted embedding model handed to ``transform_w2v``.

    Prints the classification report, the confusion matrix, precision,
    recall, F-score, accuracy and the elapsed time. Returns None.
    """
    # perf_counter is a monotonic clock, so the elapsed time cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    X_test = pd.Series(X_test).fillna('')
    X_test_feature = transform_w2v(X_test, w2v_vect_fit)
    y_pred = w2v_mlp_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for printing and the counts.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard the ratios: a zero denominator (no predicted or no actual positives)
    # would otherwise yield nan and a RuntimeWarning; report 0.0 instead.
    p = tp / (tp + fp) if (tp + fp) else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    fscore = (2 * p * r) / (p + r) if (p + r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.perf_counter()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_MLP(X_test1, y_test, w2v_mlp_best_model, w2v_vect_fit)

              precision    recall  f1-score   support

           0       0.99      0.93      0.96     22458
           1       0.74      0.93      0.82      4491

    accuracy                           0.93     26949
   macro avg       0.86      0.93      0.89     26949
weighted avg       0.94      0.93      0.94     26949

[[20969  1489]
 [  315  4176]]
Precision: 0.7371579876434246
Recall: 0.9298597194388778
FScore: 0.8223710122095313
Testing Accuracy: 0.9330587405840662
The total testing time is 8.628127336502075 seconds


In [6]:
# Testing of W2V features of the testing data using KNN
def check_best_model_KNN(X_test, y_test, w2v_knn_best_model, w2v_vect_fit):
    """Predict with the KNN model on W2V document vectors and print the metrics.

    Parameters
    ----------
    X_test : array-like
        Documents to classify; missing entries are replaced with '' before
        they are vectorized.
    y_test : array-like
        True labels. The tn/fp/fn/tp unpacking below needs a 2x2 confusion
        matrix, i.e. exactly two classes.
    w2v_knn_best_model : object
        Fitted classifier; only its ``predict`` method is used.
    w2v_vect_fit : object
        Fitted embedding model handed to ``transform_w2v``.

    Prints the classification report, the confusion matrix, precision,
    recall, F-score, accuracy and the elapsed time. Returns None.
    """
    # perf_counter is a monotonic clock, so the elapsed time cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    X_test = pd.Series(X_test).fillna('')
    X_test_feature = transform_w2v(X_test, w2v_vect_fit)
    y_pred = w2v_knn_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for printing and the counts.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard the ratios: a zero denominator (no predicted or no actual positives)
    # would otherwise yield nan and a RuntimeWarning; report 0.0 instead.
    p = tp / (tp + fp) if (tp + fp) else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    fscore = (2 * p * r) / (p + r) if (p + r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.perf_counter()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_KNN(X_test1, y_test, w2v_knn_best_model, w2v_vect_fit)

              precision    recall  f1-score   support

           0       0.94      0.99      0.96     22458
           1       0.93      0.66      0.77      4491

    accuracy                           0.94     26949
   macro avg       0.93      0.83      0.87     26949
weighted avg       0.94      0.94      0.93     26949

[[22230   228]
 [ 1507  2984]]
Precision: 0.9290161892901619
Recall: 0.6644399910932977
FScore: 0.7747630793197455
Testing Accuracy: 0.9356191324353408
The total testing time is 26.50739812850952 seconds


In [8]:
# Testing of W2V features of the testing data using RF
def check_best_model_RF(X_test, y_test, w2v_rf_best_model, w2v_vect_fit):
    """Predict with the RF model on W2V document vectors and print the metrics.

    Parameters
    ----------
    X_test : array-like
        Documents to classify; missing entries are replaced with '' before
        they are vectorized.
    y_test : array-like
        True labels. The tn/fp/fn/tp unpacking below needs a 2x2 confusion
        matrix, i.e. exactly two classes.
    w2v_rf_best_model : object
        Fitted classifier; only its ``predict`` method is used.
    w2v_vect_fit : object
        Fitted embedding model handed to ``transform_w2v``.

    Prints the classification report, the confusion matrix, precision,
    recall, F-score, accuracy and the elapsed time. Returns None.
    """
    # perf_counter is a monotonic clock, so the elapsed time cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    X_test = pd.Series(X_test).fillna('')
    X_test_feature = transform_w2v(X_test, w2v_vect_fit)
    y_pred = w2v_rf_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for printing and the counts.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard the ratios: a zero denominator (no predicted or no actual positives)
    # would otherwise yield nan and a RuntimeWarning; report 0.0 instead.
    p = tp / (tp + fp) if (tp + fp) else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    fscore = (2 * p * r) / (p + r) if (p + r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.perf_counter()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_RF(X_test1, y_test, w2v_rf_best_model, w2v_vect_fit)

              precision    recall  f1-score   support

           0       0.95      0.98      0.96     22458
           1       0.86      0.75      0.80      4491

    accuracy                           0.94     26949
   macro avg       0.91      0.86      0.88     26949
weighted avg       0.94      0.94      0.94     26949

[[21916   542]
 [ 1128  3363]]
Precision: 0.8612035851472472
Recall: 0.7488309953239813
FScore: 0.8010957598856598
Testing Accuracy: 0.9380310957734981
The total testing time is 7.89499831199646 seconds


In [9]:
# Testing of W2V features of the testing data using LR
def check_best_model_LR(X_test, y_test, w2v_lr_best_model, w2v_vect_fit):
    """Predict with the LR model on W2V document vectors and print the metrics.

    Parameters
    ----------
    X_test : array-like
        Documents to classify; missing entries are replaced with '' before
        they are vectorized.
    y_test : array-like
        True labels. The tn/fp/fn/tp unpacking below needs a 2x2 confusion
        matrix, i.e. exactly two classes.
    w2v_lr_best_model : object
        Fitted classifier; only its ``predict`` method is used.
    w2v_vect_fit : object
        Fitted embedding model handed to ``transform_w2v``.

    Prints the classification report, the confusion matrix, precision,
    recall, F-score, accuracy and the elapsed time. Returns None.
    """
    # perf_counter is a monotonic clock, so the elapsed time cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    X_test = pd.Series(X_test).fillna('')
    X_test_feature = transform_w2v(X_test, w2v_vect_fit)
    y_pred = w2v_lr_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for printing and the counts.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard the ratios: a zero denominator (no predicted or no actual positives)
    # would otherwise yield nan and a RuntimeWarning; report 0.0 instead.
    p = tp / (tp + fp) if (tp + fp) else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    fscore = (2 * p * r) / (p + r) if (p + r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.perf_counter()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_LR(X_test1, y_test, w2v_lr_best_model, w2v_vect_fit)

              precision    recall  f1-score   support

           0       0.97      0.95      0.96     22458
           1       0.78      0.87      0.82      4491

    accuracy                           0.94     26949
   macro avg       0.87      0.91      0.89     26949
weighted avg       0.94      0.94      0.94     26949

[[21337  1121]
 [  593  3898]]
Precision: 0.7766487348077307
Recall: 0.8679581384992207
FScore: 0.8197686645636173
Testing Accuracy: 0.936398382129207
The total testing time is 7.574327230453491 seconds


In [12]:
# Testing of W2V features of the testing data using SVC
def check_best_model_SVC(X_test, y_test, w2v_svc_best_model, w2v_vect_fit):
    """Predict with the SVC model on W2V document vectors and print the metrics.

    Parameters
    ----------
    X_test : array-like
        Documents to classify; missing entries are replaced with '' before
        they are vectorized.
    y_test : array-like
        True labels. The tn/fp/fn/tp unpacking below needs a 2x2 confusion
        matrix, i.e. exactly two classes.
    w2v_svc_best_model : object
        Fitted classifier; only its ``predict`` method is used.
    w2v_vect_fit : object
        Fitted embedding model handed to ``transform_w2v``.

    Prints the classification report, the confusion matrix, precision,
    recall, F-score, accuracy and the elapsed time. Returns None.
    """
    # perf_counter is a monotonic clock, so the elapsed time cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    X_test = pd.Series(X_test).fillna('')
    X_test_feature = transform_w2v(X_test, w2v_vect_fit)
    y_pred = w2v_svc_best_model.predict(X_test_feature)
    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for printing and the counts.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # Guard the ratios: a zero denominator (no predicted or no actual positives)
    # would otherwise yield nan and a RuntimeWarning; report 0.0 instead.
    p = tp / (tp + fp) if (tp + fp) else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    fscore = (2 * p * r) / (p + r) if (p + r) else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.perf_counter()
    print(f"The total testing time is {end_time-start_time} seconds")
check_best_model_SVC(X_test1, y_test, w2v_svc_best_model, w2v_vect_fit)

              precision    recall  f1-score   support

           0       0.98      0.95      0.97     22458
           1       0.79      0.92      0.85      4491

    accuracy                           0.94     26949
   macro avg       0.88      0.94      0.91     26949
weighted avg       0.95      0.94      0.95     26949

[[21330  1128]
 [  356  4135]]
Precision: 0.7856735702071063
Recall: 0.920730349588065
FScore: 0.8478572893172033
Testing Accuracy: 0.9449330216334558
The total testing time is 13.069381475448608 seconds


In [13]:
# Print the parameters of the loaded SVC model as returned by get_params().
# `all_params` is rebound by each of the parameter-inspection cells.
all_params = w2v_svc_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [14]:
# Print the parameters of the loaded MLP model as returned by get_params().
# `all_params` is rebound by each of the parameter-inspection cells.
all_params = w2v_mlp_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100, 100), 'learning_rate': 'adaptive', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 1000, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}


In [15]:
# Print the parameters of the loaded KNN model as returned by get_params().
# `all_params` is rebound by each of the parameter-inspection cells.
all_params = w2v_knn_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'manhattan', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}


In [16]:
# Print the parameters of the loaded LR model as returned by get_params().
# `all_params` is rebound by each of the parameter-inspection cells.
all_params = w2v_lr_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'C': 100, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'newton-cg', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [17]:
# Print the parameters of the loaded RF model as returned by get_params().
# `all_params` is rebound by each of the parameter-inspection cells.
all_params = w2v_rf_best_model.get_params()
print("All parameters:", all_params)

All parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
