## LOAD DATA

In [1]:
import pandas as pd
from pathlib import Path

df = pd.read_csv(Path().absolute().parent.parent / "data/text_class_teaching.csv")

In [2]:
import sys

sys.path.insert(0, str(Path().absolute().parent.parent))

from src.text_vectorization import hashing_texts

In [3]:
df.fillna(-1, inplace=True)
target_col = "Aor"
df = df[df[target_col] != -1]
df = df.sample(frac=0.5)
X = hashing_texts(df["text"], 2**15)
y = df[target_col].tolist()

In [4]:
from collections import Counter

print(Counter(y))

Counter({0: 3979, 1: 128})


## DATA AUGMENTATION WITH SMOTE

In [5]:
from collections import Counter

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print("Before data augmentation with SMOTE")
print(X_train.shape)
print(Counter(y_train))

oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

print("After data augmentation with SMOTE")
print(X_train.shape)
print(Counter(y_train))

Before data augmentation with SMOTE
(2751, 32768)
Counter({0: 2660, 1: 91})
After data augmentation with SMOTE
(5320, 32768)
Counter({0: 2660, 1: 2660})


## MODEL EVALUATION

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import confusion_matrix

def stacking_clf():
    estimators = [
        ("Random Forest", RandomForestClassifier(random_state=42)),
        ("Logistic Regression", LogisticRegression()),
        ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ]
    stacking_classifier = StackingClassifier(
        estimators=estimators, final_estimator=GradientBoostingClassifier(random_state=42)
    )
    return stacking_classifier

def predict_function(X, y, with_smote=True):
    rows = []
    for name, clf in [
#         ("stacking_ensemble", stacking_clf()),
        ("gradient_boosting", GradientBoostingClassifier()),
        ("logistic_regression", LogisticRegression()),
    ]:
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
        if with_smote:
            oversample = SMOTE()
            X_train, y_train = oversample.fit_resample(X_train, y_train)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        sensitivity = tp / (fn + tp) if (fn + tp) else 0
        specificity = tn / (tn + fp) if (tn + fp) else 0
        precision = tp / (fp + tp) if (fp + tp) else 0
        accuracy = (tp + tn)/ (fn + fp + tp + tn)
        rows.append({
            "model":name,
            "accuracy":accuracy,
            "sensitivity":sensitivity,
            "specificity":specificity,
            "precision":precision,
            "smote":with_smote
        })
    return rows

In [7]:
print("Predicting without smote")
df_without_smote = pd.DataFrame(predict_function(X, y, with_smote=False))
print(df_without_smote)
df_without_smote.to_excel("results_without_smote.xlsx", index=False)
print("Predicting with smote")
df_com_smote = pd.DataFrame(predict_function(X, y))
print(df_com_smote)
df_com_smote.to_excel("results_with_smote.xlsx", index=False)

Predicting without smote
                 model  accuracy  sensitivity  specificity  precision  smote
0    gradient_boosting  0.997079     0.965517     0.997996   0.933333  False
1  logistic_regression  0.986368     0.517241     1.000000   1.000000  False
Predicting with smote
                 model  accuracy  sensitivity  specificity  precision  smote
0    gradient_boosting  0.997079     0.965517     0.997996   0.933333   True
1  logistic_regression  0.997079     0.896552     1.000000   1.000000   True
