In [2]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import time
import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Load both labelled CSV files and evaluate on their union.
# The shuffle is seeded so that the row order (and therefore any row-level
# inspection of predictions) is identical on every Restart & Run All.
SHUFFLE_SEED = 42
df_test1 = pd.read_csv("Dataset2_2Train_new2.csv")
df_test2 = pd.read_csv("Dataset2_2Test_new2.csv")
df_test = pd.concat([df_test1, df_test2], axis=0, ignore_index=True)
df_test = df_test.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
# Raw text and label arrays consumed by the evaluation cells below.
X_test1 = df_test['cleaned_text'].values
y_test = df_test['label'].values

In [3]:
# Deserialize the fitted Word2Vec model and the five trained classifiers.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    with open(path, "rb") as handle:
        return pickle.load(handle)

# Word2Vec model used to embed the documents
w2v_vect_fit = _load_pickle("w2v_vect_fit1.pkl")
# Classifiers, loaded in the same order as before
w2v_mlp_best_model = _load_pickle("w2v_mlp_model1.pkl")
w2v_knn_best_model = _load_pickle("w2v_knn_model1.pkl")
w2v_rf_best_model = _load_pickle("w2v_rf_model1.pkl")
w2v_lr_best_model = _load_pickle("w2v_lr_model1.pkl")
w2v_svc_best_model = _load_pickle("w2v_svc_model1.pkl")

In [4]:
#Feature Extraction for Testing data
def document_vector(w2v_model, doc):
    """Average the embedding vectors of the in-vocabulary tokens of one document.

    Parameters
    ----------
    w2v_model : object
        Must expose ``wv.key_to_index`` (vocabulary lookup), ``wv[list_of_words]``
        (vector lookup) and ``vector_size``. Presumably a trained gensim
        Word2Vec model -- confirm against the training notebook.
    doc : str
        Whitespace-separated text. Non-string values (e.g. ``None`` or NaN from
        a missing CSV cell) are treated as an empty document instead of raising.

    Returns
    -------
    numpy.ndarray
        ``np.mean(..., axis=0)`` over the vectors of the known tokens, or a
        zero vector of length ``w2v_model.vector_size`` when no token is known.
    """
    # Missing values have no .split(); treat them like an empty document.
    if not isinstance(doc, str):
        return np.zeros(w2v_model.vector_size)
    vocabulary = w2v_model.wv.key_to_index
    known_tokens = [token for token in doc.split() if token in vocabulary]
    if not known_tokens:
        return np.zeros(w2v_model.vector_size)
    return np.mean(w2v_model.wv[known_tokens], axis=0)

def transform_w2v(X_test, w2v_model):
    """Embed every document of ``X_test`` with ``document_vector``.

    Returns a NumPy array built from the per-document vectors, in input order.
    """
    doc_vectors = []
    for text in X_test:
        doc_vectors.append(document_vector(w2v_model, text))
    return np.array(doc_vectors)

In [6]:
#Testing of W2V features of testing data using MLP
def check_best_model_MLP(X_test, y_test, w2v_mlp_best_model, w2v_vect_fit):
    """Evaluate the MLP classifier on Word2Vec features and print the results.

    Missing texts are replaced by '', every text is embedded with
    ``transform_w2v``, and the classification report, confusion matrix,
    precision, recall, F-score, accuracy and elapsed wall-clock time are printed.

    Parameters
    ----------
    X_test : array-like of str
        Raw documents to classify.
    y_test : array-like
        True labels. Must be binary: the confusion matrix is unpacked as
        ``tn, fp, fn, tp``.
    w2v_mlp_best_model : object
        Fitted classifier exposing ``predict``.
    w2v_vect_fit : object
        Embedding model forwarded to ``transform_w2v``.

    Returns
    -------
    None
    """
    start_time = time.time()
    texts = pd.Series(X_test).fillna('')
    features = transform_w2v(texts, w2v_vect_fit)
    y_pred = w2v_mlp_best_model.predict(features)

    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for the printout and the metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()

    # Guard the ratios: with no predicted (or no actual) positives the plain
    # division yields nan plus a RuntimeWarning; report 0.0 instead.
    p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    fscore = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
# Run the MLP evaluation on the loaded data.
check_best_model_MLP(
    X_test=X_test1,
    y_test=y_test,
    w2v_mlp_best_model=w2v_mlp_best_model,
    w2v_vect_fit=w2v_vect_fit,
)

              precision    recall  f1-score   support

           0       0.98      0.96      0.97     22458
           1       0.83      0.91      0.87      4491

    accuracy                           0.95     26949
   macro avg       0.90      0.94      0.92     26949
weighted avg       0.96      0.95      0.95     26949

[[21600   858]
 [  391  4100]]
Precision: 0.826946349334409
Recall: 0.9129369850812736
FScore: 0.8678167001799132
Testing Accuracy: 0.9536531967791013
The total testing time is 6.792290449142456 seconds


In [7]:
#Testing of W2V features of testing data using KNN
def check_best_model_KNN(X_test, y_test, w2v_knn_best_model, w2v_vect_fit):
    """Evaluate the KNN classifier on Word2Vec features and print the results.

    Missing texts are replaced by '', every text is embedded with
    ``transform_w2v``, and the classification report, confusion matrix,
    precision, recall, F-score, accuracy and elapsed wall-clock time are printed.

    Parameters
    ----------
    X_test : array-like of str
        Raw documents to classify.
    y_test : array-like
        True labels. Must be binary: the confusion matrix is unpacked as
        ``tn, fp, fn, tp``.
    w2v_knn_best_model : object
        Fitted classifier exposing ``predict``.
    w2v_vect_fit : object
        Embedding model forwarded to ``transform_w2v``.

    Returns
    -------
    None
    """
    start_time = time.time()
    texts = pd.Series(X_test).fillna('')
    features = transform_w2v(texts, w2v_vect_fit)
    y_pred = w2v_knn_best_model.predict(features)

    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for the printout and the metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()

    # Guard the ratios: with no predicted (or no actual) positives the plain
    # division yields nan plus a RuntimeWarning; report 0.0 instead.
    p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    fscore = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
# Run the KNN evaluation on the loaded data.
check_best_model_KNN(
    X_test=X_test1,
    y_test=y_test,
    w2v_knn_best_model=w2v_knn_best_model,
    w2v_vect_fit=w2v_vect_fit,
)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96     22458
           1       0.81      0.81      0.81      4491

    accuracy                           0.94     26949
   macro avg       0.89      0.89      0.89     26949
weighted avg       0.94      0.94      0.94     26949

[[21624   834]
 [  851  3640]]
Precision: 0.813589628967367
Recall: 0.8105099087063015
FScore: 0.8120468488566648
Testing Accuracy: 0.937474488849308
The total testing time is 9.346926927566528 seconds


In [8]:
#Testing of W2V features of testing data using RF
def check_best_model_RF(X_test, y_test, w2v_rf_best_model, w2v_vect_fit):
    """Evaluate the random-forest classifier on Word2Vec features and print the results.

    Missing texts are replaced by '', every text is embedded with
    ``transform_w2v``, and the classification report, confusion matrix,
    precision, recall, F-score, accuracy and elapsed wall-clock time are printed.

    Parameters
    ----------
    X_test : array-like of str
        Raw documents to classify.
    y_test : array-like
        True labels. Must be binary: the confusion matrix is unpacked as
        ``tn, fp, fn, tp``.
    w2v_rf_best_model : object
        Fitted classifier exposing ``predict``.
    w2v_vect_fit : object
        Embedding model forwarded to ``transform_w2v``.

    Returns
    -------
    None
    """
    start_time = time.time()
    texts = pd.Series(X_test).fillna('')
    features = transform_w2v(texts, w2v_vect_fit)
    y_pred = w2v_rf_best_model.predict(features)

    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for the printout and the metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()

    # Guard the ratios: with no predicted (or no actual) positives the plain
    # division yields nan plus a RuntimeWarning; report 0.0 instead.
    p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    fscore = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
# Run the random-forest evaluation on the loaded data.
check_best_model_RF(
    X_test=X_test1,
    y_test=y_test,
    w2v_rf_best_model=w2v_rf_best_model,
    w2v_vect_fit=w2v_vect_fit,
)

              precision    recall  f1-score   support

           0       0.97      0.94      0.96     22458
           1       0.74      0.88      0.80      4491

    accuracy                           0.93     26949
   macro avg       0.86      0.91      0.88     26949
weighted avg       0.94      0.93      0.93     26949

[[21095  1363]
 [  556  3935]]
Precision: 0.742733106832767
Recall: 0.8761968381206858
FScore: 0.8039636326488916
Testing Accuracy: 0.9287914208319418
The total testing time is 11.637950897216797 seconds


In [9]:
#Testing of W2V features of testing data using LR
def check_best_model_LR(X_test, y_test, w2v_lr_best_model, w2v_vect_fit):
    """Evaluate the logistic-regression classifier on Word2Vec features and print the results.

    Missing texts are replaced by '', every text is embedded with
    ``transform_w2v``, and the classification report, confusion matrix,
    precision, recall, F-score, accuracy and elapsed wall-clock time are printed.

    Parameters
    ----------
    X_test : array-like of str
        Raw documents to classify.
    y_test : array-like
        True labels. Must be binary: the confusion matrix is unpacked as
        ``tn, fp, fn, tp``.
    w2v_lr_best_model : object
        Fitted classifier exposing ``predict``.
    w2v_vect_fit : object
        Embedding model forwarded to ``transform_w2v``.

    Returns
    -------
    None
    """
    start_time = time.time()
    texts = pd.Series(X_test).fillna('')
    features = transform_w2v(texts, w2v_vect_fit)
    y_pred = w2v_lr_best_model.predict(features)

    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for the printout and the metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()

    # Guard the ratios: with no predicted (or no actual) positives the plain
    # division yields nan plus a RuntimeWarning; report 0.0 instead.
    p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    fscore = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
# Run the logistic-regression evaluation on the loaded data.
check_best_model_LR(
    X_test=X_test1,
    y_test=y_test,
    w2v_lr_best_model=w2v_lr_best_model,
    w2v_vect_fit=w2v_vect_fit,
)

              precision    recall  f1-score   support

           0       0.97      0.97      0.97     22458
           1       0.84      0.87      0.85      4491

    accuracy                           0.95     26949
   macro avg       0.91      0.92      0.91     26949
weighted avg       0.95      0.95      0.95     26949

[[21726   732]
 [  605  3886]]
Precision: 0.8414898224339541
Recall: 0.8652861278111779
FScore: 0.8532220880447908
Testing Accuracy: 0.9503877694905192
The total testing time is 9.17873501777649 seconds


In [10]:
#Testing of W2V features of testing data using SVC
def check_best_model_SVC(X_test, y_test, w2v_svc_best_model, w2v_vect_fit):
    """Evaluate the SVC classifier on Word2Vec features and print the results.

    Missing texts are replaced by '', every text is embedded with
    ``transform_w2v``, and the classification report, confusion matrix,
    precision, recall, F-score, accuracy and elapsed wall-clock time are printed.

    Parameters
    ----------
    X_test : array-like of str
        Raw documents to classify.
    y_test : array-like
        True labels. Must be binary: the confusion matrix is unpacked as
        ``tn, fp, fn, tp``.
    w2v_svc_best_model : object
        Fitted classifier exposing ``predict``.
    w2v_vect_fit : object
        Embedding model forwarded to ``transform_w2v``.

    Returns
    -------
    None
    """
    start_time = time.time()
    texts = pd.Series(X_test).fillna('')
    features = transform_w2v(texts, w2v_vect_fit)
    y_pred = w2v_svc_best_model.predict(features)

    print(classification_report(y_test, y_pred))
    # Build the confusion matrix once and reuse it for the printout and the metrics.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()

    # Guard the ratios: with no predicted (or no actual) positives the plain
    # division yields nan plus a RuntimeWarning; report 0.0 instead.
    p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    fscore = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
    print("Precision:",p)
    print("Recall:",r)
    print("FScore:",fscore)
    accuracy = accuracy_score(y_test, y_pred)
    print("Testing Accuracy:",accuracy)
    end_time = time.time()
    print(f"The total testing time is {end_time-start_time} seconds")
# Run the SVC evaluation on the loaded data.
check_best_model_SVC(
    X_test=X_test1,
    y_test=y_test,
    w2v_svc_best_model=w2v_svc_best_model,
    w2v_vect_fit=w2v_vect_fit,
)

              precision    recall  f1-score   support

           0       0.98      0.96      0.97     22458
           1       0.83      0.88      0.85      4491

    accuracy                           0.95     26949
   macro avg       0.90      0.92      0.91     26949
weighted avg       0.95      0.95      0.95     26949

[[21637   821]
 [  551  3940]]
Precision: 0.8275572358748162
Recall: 0.8773101759073703
FScore: 0.8517077388672719
Testing Accuracy: 0.9490890200007421
The total testing time is 16.44593572616577 seconds


In [11]:
# Show the hyperparameters stored in the loaded SVC model.
# deep=True spells out scikit-learn's documented default for get_params.
all_params = w2v_svc_best_model.get_params(deep=True)
print("All parameters:", all_params)

All parameters: {'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [14]:
# Show the hyperparameters stored in the loaded MLP model.
# deep=True spells out scikit-learn's documented default for get_params.
all_params = w2v_mlp_best_model.get_params(deep=True)
print("All parameters:", all_params)

All parameters: {'activation': 'tanh', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 1000, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}


In [15]:
# Show the hyperparameters stored in the loaded KNN model.
# deep=True spells out scikit-learn's documented default for get_params.
all_params = w2v_knn_best_model.get_params(deep=True)
print("All parameters:", all_params)

All parameters: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'euclidean', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}


In [16]:
# Show the hyperparameters stored in the loaded logistic-regression model.
# deep=True spells out scikit-learn's documented default for get_params.
all_params = w2v_lr_best_model.get_params(deep=True)
print("All parameters:", all_params)

All parameters: {'C': 100, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [17]:
# Show the hyperparameters stored in the loaded random-forest model.
# deep=True spells out scikit-learn's documented default for get_params.
all_params = w2v_rf_best_model.get_params(deep=True)
print("All parameters:", all_params)

All parameters: {'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 2000, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
