#### Import needed dependencies

In [None]:
import os
import numpy as np
import pandas as pd
import torch, torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, f1_score, classification_report, confusion_matrix
import joblib

In [None]:
# Runs all cells in data_preprocessing.ipynb inside this notebook's namespace.
# NOTE(review): later cells use X_train, y_train, X_train_transformed and
# preprocessor, none of which are defined in this file -- presumably that
# notebook creates them; confirm before running cells out of order.
%run data_preprocessing.ipynb

In [None]:
# Create the directory that later cells write serialized models into
# (no error if it already exists).
os.makedirs("models", exist_ok=True)

## Train and evaluate without cross-validation

#### Simple train/validation split (no CV)

In [None]:
# Hold out a stratified 20% of the already-transformed training data for
# validation; the seed is fixed so the split is repeatable.
_flat_labels = y_train.values.ravel()
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_transformed,
    _flat_labels,
    stratify=y_train,
    random_state=42,
    test_size=0.2,
)

#### Fit & evaluate Decision tree and KNN classifiers

In [None]:
# Decision Tree: fit on the training split, then print the per-class report
# for the held-out validation split.
dt = DecisionTreeClassifier(random_state=42).fit(X_tr, y_tr)
y_val_pred = dt.predict(X_val)
dt_report = classification_report(y_val, y_val_pred)
print("Decision Tree\n", dt_report)

In [None]:
# KNN with scikit-learn's default settings: fit on the training split and
# print the per-class report for the validation split.
knn = KNeighborsClassifier().fit(X_tr, y_tr)
knn_report = classification_report(y_val, knn.predict(X_val))
print("KNN\n", knn_report)

#### PyTorch MLP on a fixed split

In [None]:
# Convert the NumPy splits to float tensors and wrap them in DataLoaders.
def _to_float_tensor(array):
    """Return the NumPy array *array* as a float tensor."""
    return torch.from_numpy(array).float()


train_ds = TensorDataset(_to_float_tensor(X_tr), _to_float_tensor(y_tr))
val_ds = TensorDataset(_to_float_tensor(X_val), _to_float_tensor(y_val))

# Only the training loader shuffles; validation batches keep dataset order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=32)
val_loader = DataLoader(val_ds, batch_size=32)

In [None]:
# simple MLP
class MLP(nn.Module):
    """Small feed-forward network that emits one raw logit per sample.

    Layout: in_dim -> 128 -> 64 -> 1, with ReLU and Dropout(0.3) after each
    of the two hidden layers. No sigmoid is applied here; the training cell
    pairs this model with ``BCEWithLogitsLoss``.
    """

    def __init__(self, in_dim):
        super().__init__()
        hidden_1, hidden_2, drop_p = 128, 64, 0.3
        layers = [
            nn.Linear(in_dim, hidden_1),
            nn.ReLU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_1, hidden_2),
            nn.ReLU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_2, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # Drop the trailing size-1 dimension: (batch, 1) -> (batch,)
        logits = self.net(x)
        return logits.squeeze(1)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLP(X_tr.shape[1]).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
crit = nn.BCEWithLogitsLoss()

# training loop: 20 passes over the shuffled training batches
for epoch in range(20):
    model.train()
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        opt.zero_grad()
        batch_loss = crit(model(batch_x), batch_y)
        batch_loss.backward()
        opt.step()

# evaluate: threshold the sigmoid of each logit at 0.5 to get hard labels
model.eval()
preds, trues = [], []
with torch.no_grad():
    for batch_x, batch_y in val_loader:
        batch_logits = model(batch_x.to(device))
        batch_probs = torch.sigmoid(batch_logits).cpu().numpy()
        preds.extend((batch_probs > 0.5).astype(int))
        trues.extend(batch_y.numpy().astype(int))

mlp_report = classification_report(trues, preds)
print("MLP\n", mlp_report)
print("MLP F1:", f1_score(trues, preds))


## Train and evaluate with cross-validation

#### Flatten labels

In [None]:
# Flattened 1-D array of the training labels; the cross-validation cells
# below pass this as the target.
y = y_train.values.ravel()

#### Decision Tree and KNN via scikit-learn CV

In [None]:
# Shared F1 scorer and seeded 5-fold stratified splitter.
f1_scorer = make_scorer(f1_score)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

save_dir = "models/"

# (display name, unfitted estimator) pairs evaluated below
decision_tree_clf = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
knn_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform')
models = [
    ("Decision Tree", decision_tree_clf),
    ("KNN", knn_clf),
]

cv_results = {}
model_avg_val_f1 = {}

# Cross-validate every model inside a preprocessing pipeline, report the
# per-fold scores, then refit on the full training set and persist it.
for name, clf in models:
    pipe = Pipeline(steps=[("preproc", preprocessor), ("clf", clf)])
    results = cross_validate(
        pipe,
        X_train,
        y,
        scoring=f1_scorer,
        cv=cv,
        n_jobs=-1,
        return_train_score=True,
    )
    cv_results[name] = results

    # Per-fold F1 on the training and validation parts of each split
    train_f1 = results['train_score']
    val_f1 = results['test_score']
    print(f"\n{name} F1 scores by fold:")
    for fold, tr, va in zip(range(1, len(val_f1) + 1), train_f1, val_f1):
        print(f"  Fold {fold:>1d}: train F1 = {tr:.3f},  val F1 = {va:.3f}")

    # Mean validation F1 is kept so the models can be compared later
    model_avg_val_f1[name] = val_f1.mean()

    # Final fit on all training data, saved as <snake_case_name>.pkl
    pipe.fit(X_train, y)
    fname = name.lower().replace(' ', '_') + ".pkl"
    path = os.path.join(save_dir, fname)
    joblib.dump(pipe, path)
    print(f"→ Saved `{name}` pipeline to {path}")

In [None]:
# Flatten the cross_validate output into one row per (model, fold) pair.
rows = [
    {
        'model'       : model_name,
        'fold'        : fold_idx,
        'train_f1'    : train_score,
        'val_f1'      : test_score,
        'fit_seconds' : fit_time,
        'score_seconds': score_time,
    }
    for model_name, res in cv_results.items()
    for fold_idx, (train_score, test_score, fit_time, score_time) in enumerate(
        zip(res['train_score'], res['test_score'],
            res['fit_time'], res['score_time']),
        start=1,
    )
]

df_cv = pd.DataFrame(rows)

#### PyTorch MLP with 5-fold CV

In [None]:
# Pick the compute device once for every fold.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}\n")

# Hyperparameters
EPOCHS, BATCH_SIZE, LR = 20, 32, 1e-3

# Transform the entire feature matrix a single time, up front.
# NOTE(review): the preprocessor is fitted on every row before the folds are
# split, so any statistics it learns also see the validation folds. The
# sklearn models above refit it per fold inside a Pipeline -- confirm whether
# this difference is acceptable for the comparison.
X_all = preprocessor.fit_transform(X_train, y)
INPUT_DIM = X_all.shape[1]

In [None]:
# Define a simple MLP for binary classification
class MLP(nn.Module):
    """Two-hidden-layer perceptron producing a single logit per sample.

    Layout: input_dim -> 128 -> 64 -> 1. Each hidden layer is followed by
    ReLU and Dropout(0.3); the output is a raw logit (no sigmoid).
    """

    @staticmethod
    def _hidden(n_in, n_out):
        """Return the (Linear, ReLU, Dropout) modules of one hidden layer."""
        return nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout(0.3)

    def __init__(self, input_dim):
        super().__init__()
        # Unpacked into one flat Sequential so module indices stay 0..6.
        self.net = nn.Sequential(
            *self._hidden(input_dim, 128),
            *self._hidden(128, 64),
            nn.Linear(64, 1),  # single logit
        )

    def forward(self, x):
        out = self.net(x)
        return out.squeeze(1)  # (batch,)

In [None]:
import os
import matplotlib.pyplot as plt

def _train_one_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over *loader*; return the per-sample mean loss."""
    model.train()
    total = 0.0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()
        # Weight by batch size so a smaller last batch is not over-counted.
        total += loss.item() * xb.size(0)
    return total / len(loader.dataset)


def _mean_loss(model, loader, criterion):
    """Return the per-sample mean loss over *loader* without updating the model."""
    model.eval()
    total = 0.0
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            total += criterion(model(xb), yb).item() * xb.size(0)
    return total / len(loader.dataset)


def _predict_labels(model, loader):
    """Return (true_labels, predicted_labels) lists for every sample in *loader*.

    Predictions are the sigmoid of the model's logits thresholded at 0.5.
    The caller is responsible for putting the model in eval mode.
    """
    trues, preds = [], []
    with torch.no_grad():
        for xb, yb in loader:
            probs = torch.sigmoid(model(xb.to(device))).cpu().numpy()
            preds.extend((probs > 0.5).astype(int))
            trues.extend(yb.numpy().astype(int))
    return trues, preds


def _save_learning_curves(train_losses, val_losses, fold=None):
    """Plot train/val loss per epoch, save the PNG under ``plots/`` and show it."""
    epochs = range(1, len(train_losses) + 1)
    plt.figure(figsize=(6, 4))
    plt.plot(epochs, train_losses, label='Train Loss')
    plt.plot(epochs, val_losses,   label='Val   Loss')
    plt.xlabel('Epoch')
    plt.ylabel('BCEWithLogitsLoss')
    plt.title('MLP Learning Curves' + (f' — Fold {fold}' if fold is not None else ''))
    plt.legend()

    # Ensure directory exists and save
    os.makedirs('plots', exist_ok=True)
    fold_suffix = f'_fold{fold}' if fold is not None else ''
    plt.savefig(f'plots/mlp_learning_curve{fold_suffix}.png',
                bbox_inches='tight', dpi=300)

    plt.show()
    # Release the figure so repeated calls (one per fold) do not pile up
    # open figures.
    plt.close()


def train_and_eval(train_idx, val_idx, fold=None):
    """Train a fresh MLP on one split and return its F1 scores.

    Reads the module-level ``X_all``, ``y``, ``BATCH_SIZE``, ``INPUT_DIM``,
    ``LR``, ``EPOCHS``, ``device`` and ``save_dir``.

    Args:
        train_idx: Row indices into ``X_all`` / ``y`` used for training.
        val_idx: Row indices into ``X_all`` / ``y`` used for validation.
        fold: Optional fold number; only used in the plot title and the
            learning-curve file name.

    Returns:
        Tuple ``(train_f1, val_f1)`` computed with ``f1_score``.

    Side effects:
        Writes the trained weights to ``<save_dir>/MLP.pth`` and a
        learning-curve PNG under ``plots/``, and displays the plot.
    """
    # Split data
    X_tr = torch.from_numpy(X_all[train_idx]).float()
    y_tr = torch.from_numpy(y[train_idx]).float()
    X_va = torch.from_numpy(X_all[val_idx]).float()
    y_va = torch.from_numpy(y[val_idx]).float()

    # DataLoaders
    tr_loader = DataLoader(TensorDataset(X_tr, y_tr),
                           batch_size=BATCH_SIZE, shuffle=True)
    va_loader = DataLoader(TensorDataset(X_va, y_va),
                           batch_size=BATCH_SIZE, shuffle=False)

    # Model, loss, optimizer
    model     = MLP(INPUT_DIM).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)

    # Training loop: record both losses after every epoch
    train_losses, val_losses = [], []
    for _ in range(EPOCHS):
        train_losses.append(_train_one_epoch(model, tr_loader, criterion, optimizer))
        val_losses.append(_mean_loss(model, va_loader, criterion))

    # os.path.join avoids the doubled slash that "models/" + "/MLP.pth" gave.
    # NOTE: the path is the same for every fold, so when this runs in a CV
    # loop the file ends up holding the weights of the last fold trained.
    torch.save(model.state_dict(), os.path.join(save_dir, "MLP.pth"))

    _save_learning_curves(train_losses, val_losses, fold)

    # Final F1 on the training and validation parts of this split
    model.eval()
    trues_tr, preds_tr = _predict_labels(model, tr_loader)
    trues, preds = _predict_labels(model, va_loader)

    return f1_score(trues_tr, preds_tr), f1_score(trues, preds)


In [None]:
# Run 5-fold CV
# Train and score one MLP per fold of the shared splitter, collecting the
# validation F1 of each fold.
mlp_scores = []
for fold, (tr_idx, va_idx) in enumerate(cv.split(X_all, y), start=1):
    f1_tr, f1 = train_and_eval(tr_idx, va_idx, fold=fold)
    mlp_scores.append(f1)
    # ".3f" = three decimals. The previous "3f" meant "minimum width 3",
    # which printed the training score with six decimals.
    print(f"MLP Fold {fold} →  F1 (training) = {f1_tr:.3f} F1 (val) = {f1:.3f}")

# Mean validation F1, stored next to the sklearn models' averages
model_avg_val_f1["MLP"] = np.mean(mlp_scores)

print(f"\nMLP  → mean F1 = {np.mean(mlp_scores):.3f},  std = {np.std(mlp_scores):.3f}")

#### Plot bar chart for Decision Tree, KNN, and MLP

In [None]:
# Per-fold validation F1 of the MLP, shaped like the DT/KNN results table
n_mlp_folds = len(mlp_scores)
df_mlp = pd.DataFrame({
    'model': ['MLP'] * n_mlp_folds,
    'fold' : list(range(1, n_mlp_folds + 1)),
    'val_f1': mlp_scores
})

# Stack the DT/KNN rows on top of the MLP rows; only model, fold and val_f1
# are needed for the chart.
sklearn_part = df_cv[['model', 'fold', 'val_f1']]
df_all = pd.concat([sklearn_part, df_mlp], ignore_index=True)

# Mean and standard deviation of val_f1 per model
grouped = df_all.groupby('model')['val_f1']
means = grouped.mean()
stds = grouped.std()

# Bar chart of the means, with the standard deviation as error bars
plt.figure(figsize=(6, 4))
bars = plt.bar(means.index, means.values,
               yerr=stds.values, capsize=5, alpha=0.8)
plt.ylim(0, 1)
plt.ylabel("Mean 5-fold Val F1")
plt.title("Cross-Validated F1 by Model")

# Write each mean just above the top of its bar
for bar, m in zip(bars, means.values):
    bar_centre = bar.get_x() + bar.get_width() / 2
    plt.text(bar_centre, m + 0.01, f"{m:.3f}", ha='center', va='bottom')

# Save the figure, then display it
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/cv_f1_by_model_all.png', bbox_inches='tight', dpi=300)
plt.show()


#### Plot confusion matrices for Decision Tree, KNN, and MLP

In [None]:
# Ensure the output directory for the saved figures exists
# (no error if it is already there).
os.makedirs('plots', exist_ok=True)

def plot_cm(cm, model_name, classes=None):
    """Draw, save and show a confusion matrix as an annotated heat map.

    Args:
        cm: Square array of integer counts, indexed ``[actual, predicted]``
            (rows are labelled "Actual", columns "Predicted").
        model_name: Used in the plot title and, lower-cased with spaces
            replaced by underscores, in the output file name.
        classes: Optional tick labels, one per row/column. Defaults to
            ``'0', '1', ...`` matching the size of ``cm``, which reproduces
            the previous fixed ``['0', '1']`` for a 2x2 matrix.

    Side effects:
        Saves ``plots/<model_name>_cm.png`` (creating ``plots/`` if needed)
        and displays the figure.
    """
    cm = np.asarray(cm)
    if classes is None:
        # Derive labels from the matrix so non-2x2 inputs get matching ticks.
        classes = [str(k) for k in range(cm.shape[0])]

    plt.figure(figsize=(4, 4))
    plt.imshow(cm, cmap='Blues', interpolation='nearest')
    plt.title(f"{model_name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")

    # set proper ticks and labels
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes)
    plt.yticks(tick_positions, classes)

    # Annotate every cell; use white text on cells darker than the midpoint
    # so the count stays readable.
    thresh = cm.max() / 2
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            color = "white" if cm[i, j] > thresh else "black"
            plt.text(j, i, f"{cm[i, j]:d}",
                     ha='center', va='center',
                     color=color)

    plt.tight_layout()
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f"plots/{model_name.lower().replace(' ','_')}_cm.png",
                dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()

# ——— Decision Tree & KNN———
from sklearn.base import clone  # local import: only this cell needs it

# Same seeded, stratified 80/20 split as the fixed-split section, but on the
# raw (untransformed) features because each pipeline preprocesses them itself.
X_tr_raw, X_val_raw, y_tr, y_val = train_test_split(
    X_train, y_train.values.ravel(),
    test_size=0.2, random_state=42, stratify=y_train
)

for name, clf in models:
    # Build a pipeline around *this* classifier. The previous code fitted the
    # global `pipe` left over from the CV cell (the last model in `models`),
    # so every matrix here was produced by that one model regardless of name.
    # Cloning keeps the already-fitted preprocessor and estimators untouched.
    cm_pipe = Pipeline([
        ("preproc", clone(preprocessor)),
        ("clf",     clone(clf)),
    ])
    cm_pipe.fit(X_tr_raw, y_tr)
    y_pred = cm_pipe.predict(X_val_raw)

    cm = confusion_matrix(y_val, y_pred)
    plot_cm(cm, name)

# ——— MLP ———
# `trues` / `preds` are the validation labels and predictions collected by
# the fixed-split MLP evaluation cell earlier in this notebook; that cell
# must have been run before this one.
cm_mlp = confusion_matrix(trues, preds)
plot_cm(cm_mlp, "MLP")
