In [0]:
# Databricks notebook source
from pyspark.sql.functions import col
import pandas as pd
import numpy as np

# Location of the Delta table read by this notebook.
delta_path = "/Volumes/workspace/default/santader_lab3(for4)"

# Read the Delta table into a Spark DataFrame.
santander_df = spark.read.load(delta_path, format="delta")

# Sanity check: total row count followed by a five-row preview.
row_count = santander_df.count()
print("Rows:", row_count)
display(santander_df.limit(5))


In [0]:
# Names of the label column and of the features column.
target_col = "target"
feature_col = "scaled_features"

# Keep only the rows whose label is present.
has_label = col(target_col).isNotNull()
santander_df = santander_df.where(has_label)

# Bring just the two needed columns to the driver as a pandas DataFrame.
pdf = santander_df.select(feature_col, target_col).toPandas()

# Stack the per-row feature values into one array; labels are taken as-is.
# NOTE(review): assumes every cell of `feature_col` holds a vector of the same length — confirm.
feature_rows = pdf[feature_col].values
X = np.vstack(feature_rows)
y = pdf[target_col].values

In [0]:
import numpy as np
# Class balance of the target. It is printed explicitly because a bare
# expression that is not the last statement of a cell is evaluated and then
# discarded, so the distribution would never be shown.
unique, counts = np.unique(y, return_counts=True)
# .tolist() converts NumPy scalars to plain Python values for a clean printout.
class_distribution = dict(zip(unique.tolist(), counts.tolist()))
print("Class distribution:", class_distribution)
from sklearn.model_selection import train_test_split

# Same seed for both cuts so the partition is repeatable.
split_seed = 42

# First cut: 70% for training, 30% held out, stratified on the label.
first_cut = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=split_seed
)
X_train, X_temp, y_train, y_temp = first_cut

# Second cut: split the held-out part in half into validation and test,
# again stratified on the label.
second_cut = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=split_seed
)
X_val, X_test, y_val, y_test = second_cut

In [0]:
# NOTE(review): nothing in the visible notebook reads this variable — the
# models below pass their class_weight values as string literals. Confirm
# whether it is still needed before relying on it.
class_weight = "balanced"

In [0]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

print("Запуск GridSearchCV з урахуванням дисбалансу...")

# Search space: 2 x 3 x 2 x 1 = 12 candidate forests.
param_grid_santander = {
    "n_estimators": [300, 500],
    "max_depth": [8, 12, 16],
    "min_samples_split": [2, 5],
    "class_weight": ["balanced_subsample"],
}

# Seeded base estimator that the search fills in with each candidate.
base_forest = RandomForestClassifier(random_state=42, n_jobs=-1)

# 3-fold cross-validated search, candidates ranked by ROC-AUC.
grid_santander = GridSearchCV(
    base_forest,
    param_grid_santander,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_santander.fit(X_train, y_train)

# Heading and parameters on separate lines, as two consecutive prints would give.
print("Найкращі параметри:", grid_santander.best_params_, sep="\n")

# Estimator for the winning parameter combination, as exposed by the search.
best_rf = grid_santander.best_estimator_


In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Collected (model name, accuracy, F1, ROC-AUC) tuples, all scored on the test split.
metrics = []

# =============================================
# 1. Logistic Regression (balanced)
# =============================================
# `n_jobs` is deliberately not passed: scikit-learn ignores it for the
# liblinear solver and emits a UserWarning when more than one job is requested.
lr = LogisticRegression(
    max_iter=500,
    class_weight="balanced",
    solver="liblinear"
)

lr.fit(X_train, y_train)

# Hard labels feed accuracy/F1; the second predict_proba column feeds ROC-AUC.
y_pred_lr = lr.predict(X_test)
y_prob_lr = lr.predict_proba(X_test)[:, 1]

metrics.append((
    "LogReg",
    accuracy_score(y_test, y_pred_lr),
    f1_score(y_test, y_pred_lr),
    roc_auc_score(y_test, y_prob_lr)
))


# =============================================
# 2. Decision Tree (balanced)
# =============================================
# Depth-limited, class-weighted tree; fit() returns the estimator itself.
dt = DecisionTreeClassifier(
    class_weight="balanced",
    max_depth=8,
    min_samples_leaf=5,
    random_state=42,
).fit(X_train, y_train)

# Hard labels for accuracy/F1, second predict_proba column for ROC-AUC.
y_pred_dt = dt.predict(X_test)
y_prob_dt = dt.predict_proba(X_test)[:, 1]

dt_scores = (
    "Decision Tree",
    accuracy_score(y_test, y_pred_dt),
    f1_score(y_test, y_pred_dt),
    roc_auc_score(y_test, y_prob_dt),
)
metrics.append(dt_scores)


# =============================================
# 3. Random Forest (tuned + class_weight )
# =============================================
# Reuse the estimator GridSearchCV already refit on the full training split.
# It was built from the same base settings (random_state=42, n_jobs=-1) with
# the winning n_estimators / max_depth / min_samples_split and
# class_weight="balanced_subsample", so constructing and fitting a second
# forest with those exact values would only repeat that work.
rf = best_rf

# Hard labels feed accuracy/F1; the second predict_proba column feeds ROC-AUC.
y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]

metrics.append((
    "Random Forest (tuned)",
    accuracy_score(y_test, y_pred_rf),
    f1_score(y_test, y_pred_rf),
    roc_auc_score(y_test, y_prob_rf)
))


# =============================================
# 4. Gradient Boosting 
# =============================================
# subsample < 1 and max_features="sqrt" both make fitting stochastic, so the
# seed is fixed to keep the reported scores reproducible between runs.
gbr = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    max_features="sqrt",
    random_state=42
)

gbr.fit(X_train, y_train)

# Hard labels feed accuracy/F1; the second predict_proba column feeds ROC-AUC.
y_pred_gbr = gbr.predict(X_test)
y_prob_gbr = gbr.predict_proba(X_test)[:, 1]

metrics.append((
    "Gradient Boosting",
    accuracy_score(y_test, y_pred_gbr),
    f1_score(y_test, y_pred_gbr),
    roc_auc_score(y_test, y_prob_gbr)
))


# =============================================
# Model comparison
# =============================================
# One row per entry appended to `metrics`, in insertion order.
result_columns = ["Model", "Accuracy", "F1-score", "ROC-AUC"]
results = pd.DataFrame(data=metrics, columns=result_columns)
display(results)





In [0]:
# =============================================
# Feature importance (RF)
# =============================================
importances = rf.feature_importances_

# Features are labelled by column position: feat_0, feat_1, ...
feature_labels = [f"feat_{i}" for i in range(X.shape[1])]

feat_importance_df = pd.DataFrame(
    {"feature": feature_labels, "importance": importances}
).sort_values("importance", ascending=False)

# Show the 15 highest-ranked features.
top_features = feat_importance_df.head(15)
display(top_features)


In [0]:
%pip install shap
# NOTE(review): on Databricks a %pip command that changes the environment is
# documented to reset the notebook's Python state — verify that variables from
# the cells above (rf, X_test, ...) survive this cell. Consider moving the
# install to the first cell of the notebook and pinning the shap version.


In [0]:
# =============================================
# SHAP interpretation
# =============================================

import shap
shap.initjs()

# Explain only the first 1500 rows of the test split.
X_sample = X_test[:1500]

explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_sample)

# Select the attributions for class index 1. Depending on the installed SHAP
# version a classifier's values come back either as a list with one
# (samples, features) array per class, or as a single array with a trailing
# class axis (samples, features, classes). Both must be reduced to 2-D before
# being handed to summary_plot.
if isinstance(shap_values, list):
    shap_values_1 = shap_values[1]
elif np.ndim(shap_values) == 3:
    shap_values_1 = shap_values[:, :, 1]
else:
    shap_values_1 = shap_values

shap.summary_plot(shap_values_1, X_sample)


In [0]:

# ============================================================
# FULL METRICS (Random Forest)
# ============================================================

from sklearn.metrics import precision_score, recall_score

banner_rule = "===================================================="
print(banner_rule)
print("      FULL METRICS FOR BEST MODEL (Random Forest)")
print(banner_rule)

# Test-split scores from the Random Forest predictions computed earlier.
acc = accuracy_score(y_test, y_pred_rf)
prec = precision_score(y_test, y_pred_rf)
rec = recall_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf)
roc = roc_auc_score(y_test, y_prob_rf)

# Print each score with three decimals, in a fixed order.
metric_report = (
    ("Accuracy", acc),
    ("Precision", prec),
    ("Recall", rec),
    ("F1-score", f1),
    ("ROC-AUC", roc),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value:.3f}")



In [0]:
# ============================================================
# CONFUSION MATRIX
# ============================================================

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Counts of actual vs. predicted labels on the test split.
cm = confusion_matrix(y_test, y_pred_rf)

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix — Random Forest")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [0]:

# ============================================================
# ROC CURVE
# ============================================================

from sklearn.metrics import roc_curve

# Curve points from the Random Forest test-split scores.
fpr, tpr, thresholds = roc_curve(y_test, y_prob_rf)

fig, ax = plt.subplots(figsize=(7, 6))
# `roc` is the ROC-AUC value computed in the full-metrics cell.
ax.plot(fpr, tpr, label=f"ROC-AUC = {roc:.3f}")
# Dashed diagonal as the chance-level reference.
ax.plot([0, 1], [0, 1], 'k--', label="Random guess")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve — Random Forest")
ax.legend()
ax.grid(True)
plt.show()


In [0]:
# ============================================================
# PRECISION–RECALL (PR) CURVE
# ============================================================

from sklearn.metrics import precision_recall_curve

# Precision/recall pairs across thresholds; the thresholds are not needed here.
precisions, recalls, _ = precision_recall_curve(y_test, y_prob_rf)

fig, ax = plt.subplots(figsize=(7, 6))
ax.plot(recalls, precisions)
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("PR Curve — Random Forest")
ax.grid(True)
plt.show()


In [0]:

# ============================================================
# CALIBRATION CURVE
# ============================================================

from sklearn.calibration import calibration_curve

# Observed positive fraction vs. mean predicted probability over 10 bins.
prob_true, prob_pred = calibration_curve(y_test, y_prob_rf, n_bins=10)

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(prob_pred, prob_true, marker='o')
# Dashed diagonal: where predicted probability equals observed frequency.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("Mean predicted probability")
ax.set_ylabel("True fraction of positives")
ax.set_title("Calibration Curve — Random Forest")
ax.grid(True)
plt.show()

In [0]:
# ============================================================
# FEATURE IMPORTANCE (bar plot)
# ============================================================

# One bar per feature, positioned by the feature's index in `importances`.
fig, ax = plt.subplots(figsize=(10, 5))
bar_positions = range(len(importances))
ax.bar(bar_positions, importances)
ax.set_title("Feature Importance — Random Forest")
ax.set_xlabel("Feature index")
ax.set_ylabel("Importance")
plt.show()



In [0]:
# ============================================================
#  METRICS FOR TRAIN / VALIDATION / TEST — Random Forest
# ============================================================

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def compute_metrics(model, X, y):
    """Score a fitted classifier on one data split.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X: Feature matrix passed to the model.
        y: True labels aligned with the rows of ``X``.

    Returns:
        Tuple ``(accuracy, f1, roc_auc)``. F1 is computed with
        ``zero_division=0``; ROC-AUC is computed from the second
        ``predict_proba`` column.
    """
    predicted_labels = model.predict(X)
    positive_scores = model.predict_proba(X)[:, 1]

    accuracy = accuracy_score(y, predicted_labels)
    f1_value = f1_score(y, predicted_labels, zero_division=0)
    auc_value = roc_auc_score(y, positive_scores)
    return (accuracy, f1_value, auc_value)

# Score the tuned forest on each split, in train -> validation -> test order.
split_scores = {
    "train": compute_metrics(best_rf, X_train, y_train),
    "val": compute_metrics(best_rf, X_val, y_val),
    "test": compute_metrics(best_rf, X_test, y_test),
}

# ---- TRAIN ----
train_acc, train_f1, train_auc = split_scores["train"]

# ---- VALIDATION ----
val_acc, val_f1, val_auc = split_scores["val"]

# ---- TEST ----
test_acc, test_f1, test_auc = split_scores["test"]

# ---- SUMMARY TABLE ----
# A single wide row: model name followed by the three scores of every split.
summary_columns = [
    "Model",
    "Train_Accuracy", "Train_F1", "Train_ROC-AUC",
    "Val_Accuracy",   "Val_F1",   "Val_ROC-AUC",
    "Test_Accuracy",  "Test_F1",  "Test_ROC-AUC",
]
summary_row = [
    "Random Forest (tuned)",
    *split_scores["train"],
    *split_scores["val"],
    *split_scores["test"],
]
metrics_santander = pd.DataFrame([summary_row], columns=summary_columns)

display(metrics_santander)