In [158]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.preprocessing import StandardScaler

In [159]:
# Load the pre-computed feature arrays and labels from feature_extraction/.
# NOTE(review): allow_pickle=True lets numpy unpickle object arrays, and
# unpickling can execute arbitrary code — only load files from a trusted source.

# Training features: TF-IDF first, then count vectorization.
x_train_tf, x_train_count = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in (
        'feature_extraction/X_train_tf.npy',
        'feature_extraction/X_train_count.npy',
    )
)

# Test features, in the same TF-IDF / count order.
x_test_tf, x_test_count = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in (
        'feature_extraction/X_test_tf.npy',
        'feature_extraction/X_test_count.npy',
    )
)

# Target labels for the train and test splits.
y_train, y_test = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in (
        'feature_extraction/y_train.npy',
        'feature_extraction/y_test.npy',
    )
)

# Build and evaluate machine learning models

In [160]:
# Column-oriented accumulator for the evaluation results: one independent,
# initially empty list per column, filled one entry at a time per evaluation.
metrics_data = {
    column: []
    for column in (
        "Model",
        "Feature Extraction",
        "Accuracy",
        "Precision",
        "Recall",
        "F1 Score",
    )
}

In [161]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Each classifier is declared next to its short label so the two cannot drift
# out of step; zip() then transposes the pairs into the parallel lists
# `model_names` and `models` that the evaluation loop iterates over.
model_names, models = map(list, zip(
    ("RF", RandomForestClassifier(random_state=42)),
    ("LR", LogisticRegression(random_state=42, max_iter=1000)),
    ("DT", DecisionTreeClassifier(random_state=42)),
    ("NB", MultinomialNB()),
    ("SVC", SVC(random_state=42)),
    ("GB", GradientBoostingClassifier(random_state=42)),
    ("KNN", KNeighborsClassifier()),
))

# N-gram ranges (1, 1) through (1, 4); meant to line up, position by position,
# with the entries of the TF-IDF feature arrays.
ngram_ranges = [(1, upper) for upper in range(1, 5)]

def _evaluate_and_record(model, model_name, feature_label, heading, x_train, x_test):
    """Fit ``model`` on one feature set, score it on the test split and log the result.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; it is (re)fitted in place.
    model_name : short label stored in the "Model" column.
    feature_label : value stored in the "Feature Extraction" column.
    heading : line printed above the four metric values.
    x_train, x_test : feature matrices for the train and test splits.

    Returns
    -------
    dict
        The four scores keyed by their ``metrics_data`` column name.

    Reads the module-level ``y_train`` / ``y_test`` and appends one row to the
    module-level ``metrics_data``.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # precision/recall/F1 are called with default arguments, i.e. scikit-learn's
    # binary averaging — assumes the labels are binary; TODO confirm.
    scores = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
    }

    metrics_data["Model"].append(model_name)
    metrics_data["Feature Extraction"].append(feature_label)
    for metric_name, value in scores.items():
        metrics_data[metric_name].append(value)

    print(heading)
    for metric_name, value in scores.items():
        print(f"{metric_name}:", value)
    print()

    return scores


# Start from empty columns so that re-running this cell does not append
# duplicate rows to metrics_data (and, downstream, to metrics.csv).
for column_values in metrics_data.values():
    column_values.clear()

# zip() stops at the shortest input, so a length mismatch would silently drop
# or mislabel feature sets; fail loudly instead.
if not (len(x_train_tf) == len(x_test_tf) == len(ngram_ranges)):
    raise ValueError(
        "TF-IDF train/test feature sets and ngram_ranges must have the same length: "
        f"{len(x_train_tf)}, {len(x_test_tf)}, {len(ngram_ranges)}"
    )

# Evaluate every model on each TF-IDF n-gram variant, then on the count features.
for model, model_name in zip(models, model_names):
    for x_train, x_test, ngram_range in zip(x_train_tf, x_test_tf, ngram_ranges):
        _evaluate_and_record(
            model,
            model_name,
            feature_label="TF-IDF" + str(ngram_range),
            heading=f"Metrics for {model_name} with TF-IDF for {ngram_range} range gram:",
            x_train=x_train,
            x_test=x_test,
        )

    _evaluate_and_record(
        model,
        model_name,
        feature_label="Count Vectorization",
        heading=f"Metrics for {model_name} with Count Vectorization:",
        x_train=x_train_count,
        x_test=x_test_count,
    )


Metrics for RF with TF-IDF for (1, 1) range gram:
Accuracy: 0.913239267487241
Precision: 0.8891774891774892
Recall: 0.8644781144781145
F1 Score: 0.8766538625693555

Metrics for RF with TF-IDF for (1, 2) range gram:
Accuracy: 0.9156409486640649
Precision: 0.9031111111111111
Recall: 0.8552188552188552
F1 Score: 0.8785127539991353

Metrics for RF with TF-IDF for (1, 3) range gram:
Accuracy: 0.9168417892524767
Precision: 0.9020300088261254
Recall: 0.8602693602693603
F1 Score: 0.8806548901335631

Metrics for RF with TF-IDF for (1, 4) range gram:
Accuracy: 0.9159411588111678
Precision: 0.901060070671378
Recall: 0.8585858585858586
F1 Score: 0.8793103448275861

Metrics for RF with Count Vectorization:
Accuracy: 0.9084359051335935
Precision: 0.906163753449862
Recall: 0.8291245791245792
F1 Score: 0.8659340659340661

Metrics for LR with TF-IDF for (1, 1) range gram:
Accuracy: 0.9198438907235065
Precision: 0.8866498740554156
Recall: 0.8888888888888888
F1 Score: 0.8877679697351828

Metrics for LR w

In [162]:
# Turn the collected metric columns into a table and write it to metrics.csv
# in the working directory, leaving out the DataFrame's row index.
import pandas as pd

df_metrics = pd.DataFrame.from_dict(metrics_data)
df_metrics.to_csv(path_or_buf="metrics.csv", index=False)