## LOAD DATA

In [1]:
import pandas as pd
from pathlib import Path

# The CSV is resolved two directory levels above the current working directory.
data_root = Path().absolute().parent.parent
df = pd.read_csv(data_root / "data/text_class.csv")

In [2]:
import sys

# Put the directory two levels above the working directory at the front of
# sys.path so the `src` package imported below can be resolved.
# The guard keeps this cell idempotent: re-running it does not stack
# duplicate entries at the head of sys.path.
project_root = str(Path().absolute().parent.parent)
if sys.path[:1] != [project_root]:
    sys.path.insert(0, project_root)

from src.text_vectorization import hashing_texts

In [3]:
# Feature matrix: the "text" column run through the project's hashing_texts helper.
# NOTE(review): the second argument (2**15) is presumably the hash-space size — confirm
# against src.text_vectorization.
texts = df["text"]
X = hashing_texts(texts, 2 ** 15)

# Target: the "class" column of the same frame.
y = df["class"]

In [4]:
from collections import Counter

# Show how many samples fall into each target class.
class_counts = Counter(y)
print(class_counts)

Counter({0: 73, 1: 24})


## CROSS VALIDATION

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import confusion_matrix

def stacking_clf():
    """Build a fresh, unfitted stacking ensemble.

    The base learners are a random forest, a logistic regression and a
    gradient boosting classifier; a second gradient boosting classifier is
    used as the final estimator. The tree-based models are created with
    ``random_state=42``.

    Returns:
        StackingClassifier: a newly constructed estimator on every call.
    """
    base_learners = [
        ("Random Forest", RandomForestClassifier(random_state=42)),
        ("Logistic Regression", LogisticRegression()),
        ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ]
    meta_learner = GradientBoostingClassifier(random_state=42)
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner)

In [6]:
from sklearn.model_selection import KFold
from collections import Counter

from imblearn.over_sampling import SMOTE

# Names shared by the result containers and the evaluation loop below.
MODEL_NAMES = ["stacking_ensemble", "gradient_boosting", "logistic_regression"]
METRIC_NAMES = ["sensitivity", "specificity", "precision"]


def _empty_results(use_smote):
    """Return one record per model with an empty per-fold list for each metric.

    Args:
        use_smote: value stored under the "SMOTE" key of every record.

    Returns:
        dict mapping model name -> record with keys "model", "SMOTE" and one
        list per entry of METRIC_NAMES (in that order).
    """
    return {
        name: {"model": name, "SMOTE": use_smote, **{metric: [] for metric in METRIC_NAMES}}
        for name in MODEL_NAMES
    }


def _fold_metrics(y_true, y_pred):
    """Compute sensitivity, specificity and precision for a single fold.

    ``labels=[0, 1]`` forces a 2x2 confusion matrix even when a fold contains
    (or the model predicts) only one class; without it ``ravel()`` would yield
    fewer than four values and the unpacking would raise.
    A metric whose denominator is zero is reported as 0.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return {
        "sensitivity": tp / (fn + tp) if (fn + tp) else 0,
        "specificity": tn / (tn + fp) if (tn + fp) else 0,
        "precision": tp / (fp + tp) if (fp + tp) else 0,
    }


kf = KFold(n_splits=5, random_state=42, shuffle=True)
results_means = _empty_results(use_smote=False)
results_means_SMOTE = _empty_results(use_smote=True)

# NOTE(review): the classes are imbalanced (73 vs 24); a stratified splitter
# may be preferable, but the fold scheme is intentionally left unchanged here.
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    # kf.split yields positional indices, so index the Series positionally
    # instead of relying on its labels happening to be 0..n-1.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Oversample the training part only; the test fold stays untouched.
    # Seeded so that repeated runs produce the same synthetic samples.
    oversample = SMOTE(random_state=42)
    X_train_SMOTE, y_train_SMOTE = oversample.fit_resample(X_train, y_train)

    for name, clf in [
        ("stacking_ensemble", stacking_clf()),
        # Seeded for reproducibility, consistent with the models in stacking_clf().
        ("gradient_boosting", GradientBoostingClassifier(random_state=42)),
        ("logistic_regression", LogisticRegression()),
    ]:
        # Evaluate the same model twice on the same test fold: first trained
        # on the raw training data, then refitted on the SMOTE-resampled data.
        for results, X_fit, y_fit in (
            (results_means, X_train, y_train),
            (results_means_SMOTE, X_train_SMOTE, y_train_SMOTE),
        ):
            clf.fit(X_fit, y_fit)
            y_pred = clf.predict(X_test)
            for metric, value in _fold_metrics(y_test, y_pred).items():
                results[name][metric].append(value)

In [7]:
import numpy as np

# Collapse each model's per-fold metric lists into their mean (in place),
# then gather the records: plain-training results first, SMOTE results second.
rows = []
for summary in (results_means, results_means_SMOTE):
    for record in summary.values():
        for metric in ("sensitivity", "specificity", "precision"):
            record[metric] = np.mean(record[metric])
for summary in (results_means, results_means_SMOTE):
    rows.extend(summary.values())

In [8]:
print("Predictions")
# One row per (model, SMOTE flag) record; written to the working directory
# without the DataFrame index column.
df_results = pd.DataFrame(data=rows)
df_results.to_csv("results_cross_validation.csv", index=False)

Predictions
