In [4]:
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score,
    recall_score, f1_score
)
from xgboost import XGBClassifier

### READ DATA

In [13]:
# Load the pre-split datasets from disk.
train = pd.read_csv("./data/processed/train.csv")
test = pd.read_csv("./data/processed/test.csv")

# The label column is "review_flagged"; every other column is a feature.
y_train = train["review_flagged"]
y_test = test["review_flagged"]

# Build the feature matrices in one chained pass per split:
# drop the label, turn +/-inf into NaN, then replace every NaN with 0.
X_train = (
    train.drop(columns=["review_flagged"])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
X_test = (
    test.drop(columns=["review_flagged"])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)


### Model Evaluation Helper Function

In [18]:
def evaluate_model(name, y_true, y_prob, threshold=0.5):
    """
    Evaluates a binary classifier using appropriate metrics.

    Parameters
    ----------
    name : str
        Label stored under the "Model" key of the returned dict.
    y_true : array-like
        Ground-truth binary labels.
    y_prob : array-like
        Predicted scores for the positive class. Lists, NumPy arrays and
        pandas Series are all accepted.
    threshold : float, default 0.5
        Scores greater than or equal to this value are predicted as class 1.

    Returns
    -------
    dict
        Keys "Model", "ROC-AUC", "Accuracy", "Precision", "Recall", "F1".
        ROC-AUC is computed from the raw scores; the other metrics from the
        thresholded predictions.
    """
    # Coerce to an ndarray so the comparison below also works for plain
    # Python lists (a list has no element-wise `>=` and no `.astype`).
    y_prob = np.asarray(y_prob, dtype=float)
    y_pred = (y_prob >= threshold).astype(int)

    results = {
        "Model": name,
        "ROC-AUC": roc_auc_score(y_true, y_prob),
        "Accuracy": accuracy_score(y_true, y_pred),
        # zero_division=0: report 0 when a ratio's denominator is zero.
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1": f1_score(y_true, y_pred, zero_division=0),
    }
    return results

### Model Training + Evaluation

In [19]:
# 3. BASELINE MODEL (predict prob = 0.5)
# Reference row: a constant score of 0.5 for every test sample.
y_prob_baseline = np.full(y_test.shape, 0.5, dtype=float)
baseline_result = evaluate_model("Baseline (0.5 prob)", y_test, y_prob_baseline)

# Start the shared results list with the baseline entry; the model cells
# below append their own rows to it.
all_results = [baseline_result]

In [20]:
# 4. LOGISTIC REGRESSION
# Stratified 5-fold splitter with a fixed seed; also reused by the later model cells.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Standardize features, then fit a class-weighted logistic regression.
logreg_clf = LogisticRegression(max_iter=2000, n_jobs=-1, class_weight="balanced")
logreg_pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("clf", logreg_clf)])

# Out-of-fold positive-class probabilities on the training data.
logreg_cv_proba = cross_val_predict(
    logreg_pipeline, X_train, y_train, cv=cv, method="predict_proba"
)
logreg_cv_pred = logreg_cv_proba[:, 1]

# Fit on the full training set and score the held-out test set.
logreg_pipeline.fit(X_train, y_train)
y_prob_logreg = logreg_pipeline.predict_proba(X_test)[:, 1]

# Test-set metrics plus the ROC-AUC of the out-of-fold predictions.
logreg_result = {
    **evaluate_model("Logistic Regression (CV)", y_test, y_prob_logreg),
    "CV ROC-AUC": roc_auc_score(y_train, logreg_cv_pred),
}
all_results.append(logreg_result)

In [None]:
# 5. XGBOOST
# Hyperparameters shared by every XGBoost model in this cell, so the CV model
# and the final model cannot drift apart.
xgb_params = dict(
    max_depth=6,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    n_jobs=-1,
)

# Model A (NO EARLY STOPPING) for CV
xgb_cv = XGBClassifier(n_estimators=500, **xgb_params)

# Generate cross-validation predictions (out-of-fold positive-class probabilities)
xgb_cv_pred = cross_val_predict(
    xgb_cv, X_train, y_train,
    cv=cv, method="predict_proba"
)[:, 1]


# Model B (WITH early stopping) for final test evaluation
# Early stopping must not look at the test set: choosing the number of trees
# on (X_test, y_test) leaks test information and inflates the reported test
# metrics. Instead, carve a stratified validation fold out of the training
# data (first split of the shared `cv` splitter) and early-stop on that.
fit_idx, val_idx = next(cv.split(X_train, y_train))
X_fit, y_fit = X_train.iloc[fit_idx], y_train.iloc[fit_idx]
X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

xgb_es = XGBClassifier(n_estimators=500, early_stopping_rounds=30, **xgb_params)
xgb_es.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# best_iteration is 0-based, so the selected number of trees is +1.
best_n_estimators = xgb_es.best_iteration + 1

# Refit on the full training set with the selected number of trees so the
# final model still benefits from every training row.
xgb = XGBClassifier(n_estimators=best_n_estimators, **xgb_params)
xgb.fit(X_train, y_train)

y_prob_xgb = xgb.predict_proba(X_test)[:, 1]

xgb_result = evaluate_model("XGBoost (CV + ES)", y_test, y_prob_xgb)
xgb_result["CV ROC-AUC"] = roc_auc_score(y_train, xgb_cv_pred)

all_results.append(xgb_result)

xgb_importance = xgb.feature_importances_

feat_imp_df = pd.DataFrame({
    "feature": X_train.columns,
    "importance": xgb_importance
}).sort_values("importance", ascending=False)

# Escape $ to avoid LaTeX rendering errors.
# regex=False makes the literal replacement explicit: as a regex, "$" is the
# end-of-string anchor, and the default of `regex` differs across pandas versions.
feat_imp_df["feature"] = feat_imp_df["feature"].str.replace("$", "\\$", regex=False)

top_features = feat_imp_df.head(20)

fig, ax = plt.subplots(figsize=(8, 10))
ax.barh(top_features["feature"], top_features["importance"])
ax.invert_yaxis()  # most important feature at the top
ax.set_xlabel("Importance")
ax.set_title("XGBoost Feature Importance (Top 20)")
plt.show()



In [None]:
# 6. NEURAL NETWORK (MLP) (finetune and get SHAP)
# Two hidden layers (128 and 64 units, ReLU) trained with Adam on standardized features.
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation="relu",
    solver="adam",
    learning_rate_init=0.001,
    max_iter=80,
    random_state=42,
)
mlp_pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("clf", mlp_clf)])

# Out-of-fold positive-class probabilities on the training data.
mlp_cv_proba = cross_val_predict(
    mlp_pipeline, X_train, y_train, cv=cv, method="predict_proba"
)
mlp_cv_pred = mlp_cv_proba[:, 1]

# Fit on the full training set and score the held-out test set.
mlp_pipeline.fit(X_train, y_train)
y_prob_mlp = mlp_pipeline.predict_proba(X_test)[:, 1]

# Test-set metrics plus the ROC-AUC of the out-of-fold predictions.
mlp_result = {
    **evaluate_model("Neural Network (MLP CV)", y_test, y_prob_mlp),
    "CV ROC-AUC": roc_auc_score(y_train, mlp_cv_pred),
}
all_results.append(mlp_result)



In [None]:
# ====================================================
# 7. RESULTS SUMMARY
# ====================================================
results_df = pd.DataFrame(all_results)

# The model cells append to `all_results`, so re-running one of them leaves an
# older row for the same model in the list. Keep only the most recent row per
# model so the table stays correct regardless of how often a cell was re-run.
if "Model" in results_df.columns:
    results_df = (
        results_df
        .drop_duplicates(subset="Model", keep="last")
        .reset_index(drop=True)
    )

results_df