## LOAD DATA

In [1]:
import pandas as pd
from pathlib import Path

# Resolve the dataset relative to the parent of the current working directory,
# then read it into the frame used by the rest of the notebook.
csv_path = Path().absolute().parent / "3. Clustering/text_class.csv"
df = pd.read_csv(csv_path)

In [2]:
import sys

# Put the directory two levels above the working directory at the front of
# sys.path so the project-local `src` package can be imported below.
project_root = Path().absolute().parent.parent
sys.path.insert(0, str(project_root))

from src.text_vectorization import hashing_texts

# 2**15 == 32768; presumably the hashed feature dimension, since it matches the
# 32768 columns reported for X_train further down — TODO confirm in hashing_texts.
n_hash_features = 2**15
X = hashing_texts(df["text"], n_hash_features)
y = df["class"]

## DATA AUGMENTATION WITH SMOTE

In [3]:
from collections import Counter

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples; the split is seeded so the cell is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print("Before data augmentation with SMOTE")
print(X_train.shape)
print(Counter(y_train))

# Seed SMOTE as well: it draws random neighbours/interpolation points, so an
# unseeded instance yields different synthetic rows on every kernel restart.
# Only the training fold is oversampled; the test fold is left untouched.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

print("After data augmentation with SMOTE")
print(X_train.shape)
print(Counter(y_train))

Before data augmentation with SMOTE
(64, 32768)
Counter({0: 48, 1: 16})
After data augmentation with SMOTE
(96, 32768)
Counter({1: 48, 0: 48})


## MODEL EVALUATION

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import confusion_matrix

def stacking_clf():
    """Build a fresh, unfitted stacking ensemble.

    The base learners are a random forest, a logistic regression and a
    gradient-boosting classifier (the two tree ensembles are seeded with 42);
    their outputs are combined by a logistic-regression final estimator.

    Returns:
        StackingClassifier: a new, unfitted estimator on every call.
    """
    base_learners = [
        ("Random Forest", RandomForestClassifier(random_state=42)),
        ("Logistic Regression", LogisticRegression()),
        ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ]
    meta_learner = LogisticRegression()
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner)

def predict_function(X, y, with_smote=True, random_state=0):
    """Fit and score the candidate classifiers on one shared hold-out split.

    The data is split once with ``train_test_split`` (default test size) and,
    when requested, the training fold is oversampled once with SMOTE. Every
    model is then trained on exactly the same training data and scored on the
    same untouched test fold, so the rows are directly comparable.

    Args:
        X: Feature matrix accepted by ``train_test_split`` and the estimators.
        y: Binary class labels aligned with ``X``. The label that sorts last is
            treated as the positive class when unpacking the confusion matrix.
        with_smote: If True, oversample the training fold with SMOTE before
            fitting. The test fold is never resampled.
        random_state: Seed used for the split, for SMOTE and for the standalone
            gradient-boosting model so that repeated runs give the same metrics.

    Returns:
        list[dict]: One dict per model with the keys ``model``,
        ``sensitivity``, ``specificity``, ``precision`` and ``smote``. A metric
        whose denominator is zero is reported as 0.
    """
    # The split and the resampling do not depend on the model, so do them once
    # instead of once per classifier.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
    if with_smote:
        oversample = SMOTE(random_state=random_state)
        X_train, y_train = oversample.fit_resample(X_train, y_train)

    # Pin the label order so the confusion matrix keeps one row/column per class
    # in y even when a class is missing from both y_test and y_pred; otherwise
    # the four-way unpack below raises and the zero-denominator guards are
    # never reached.
    labels = sorted(set(y))

    rows = []
    for name, clf in [
        ("stacking_ensemble", stacking_clf()),
        ("gradient_boosting", GradientBoostingClassifier(random_state=random_state)),
        ("logistic_regression", LogisticRegression()),
    ]:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=labels).ravel()
        sensitivity = tp / (fn + tp) if (fn + tp) else 0
        specificity = tn / (tn + fp) if (tn + fp) else 0
        precision = tp / (fp + tp) if (fp + tp) else 0
        rows.append({
            "model": name,
            "sensitivity": sensitivity,
            "specificity": specificity,
            "precision": precision,
            "smote": with_smote,
        })
    return rows

In [5]:
# Baseline: train on the original, imbalanced training fold.
print("Predicting without smote")
rows_without_smote = predict_function(X, y, with_smote=False)
df_without_smote = pd.DataFrame(rows_without_smote)
print(df_without_smote)
df_without_smote.to_excel("results_without_smote.xlsx", index=False)

# Same models, but with the training fold oversampled by SMOTE.
# ("com" is Portuguese for "with"; the variable name is kept unchanged in case
# other cells refer to it.)
print("Predicting with smote")
rows_with_smote = predict_function(X, y, with_smote=True)
df_com_smote = pd.DataFrame(rows_with_smote)
print(df_com_smote)
df_com_smote.to_excel("results_with_smote.xlsx", index=False)

Predicting without smote
                 model  sensitivity  specificity  precision  smote
0    stacking_ensemble          0.6         0.75      0.375  False
1    gradient_boosting          0.6         0.75      0.375  False
2  logistic_regression          0.0         1.00      0.000  False
Predicting with smote
                 model  sensitivity  specificity  precision  smote
0    stacking_ensemble          1.0         0.85   0.625000   True
1    gradient_boosting          1.0         0.80   0.555556   True
2  logistic_regression          1.0         0.95   0.833333   True
