In [2]:

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix, roc_auc_score,
                             roc_curve, auc)
import matplotlib.pyplot as plt
import seaborn as sns
import joblib


RANDOM_SEED = 42
RESULTS_DIR = "task2_results"
os.makedirs(RESULTS_DIR, exist_ok=True)

def save_fig(fig, name):
    path = os.path.join(RESULTS_DIR, name)
    fig.savefig(path, bbox_inches='tight')
    print("Saved:", path)

def multiclass_roc_auc(y_true, y_score, labels):
    
    from sklearn.preprocessing import label_binarize
    y_bin = label_binarize(y_true, classes=labels)
    n_classes = y_bin.shape[1]
    aucs = {}
    for i, cls in enumerate(labels):
        try:
            aucs[cls] = roc_auc_score(y_bin[:, i], y_score[:, i])
        except Exception:
            aucs[cls] = np.nan
    macro = np.nanmean([v for v in aucs.values() if not np.isnan(v)])
    return macro, aucs


df = pd.read_csv("cleaned_worldcup_matches.csv")
print("Rows:", len(df))
print("Columns:", df.columns.tolist())


possible_features = [
    
    "goal_diff_home", "goal_diff_away", "home_score", "away_score",
    "home_advantage", "year",
    
    "win_rate_last10", "wins_last10_count", "form_score_last10",
    
    "home_rank_proxy", "away_rank_proxy", "ranking_diff_proxy",
    "home_avg_age", "away_avg_age", "avg_age_diff",
    "home_avg_caps", "away_avg_caps", "avg_caps_diff",
    "home_total_market_value", "away_total_market_value", "market_value_diff",

]

features = [c for c in possible_features if c in df.columns]
print("Using features:", features)


if "match_outcome" in df.columns:
    y = df["match_outcome"].astype(str)
else:
    
    if "outcome_numeric" in df.columns:
        y = df["outcome_numeric"].map({1: "home_win", 0: "draw", -1: "away_win"})
    else:
        raise ValueError("No suitable target column found (match_outcome or outcome_numeric).")

X = df[features].copy()


numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()
print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)


numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

if categorical_features:
    categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=False))
    ])
    preprocessor = ColumnTransformer([
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ])
else:
    preprocessor = ColumnTransformer([
        ("num", numeric_transformer, numeric_features)
    ])


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
)
print("Train / Test:", X_train.shape, X_test.shape)
print("Class distribution (train):", y_train.value_counts(normalize=True).to_dict())


pipe_lr = Pipeline([
    ("pre", preprocessor),
    ("clf", LogisticRegression(random_state=RANDOM_SEED, multi_class="multinomial", solver="saga", max_iter=5000))
])


pipe_rf = Pipeline([
    ("pre", preprocessor),
    ("clf", RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=-1))
])


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

def evaluate_cv(pipe, X, y):
    acc = cross_val_score(pipe, X, y, cv=skf, scoring="accuracy", n_jobs=-1)
    f1 = cross_val_score(pipe, X, y, cv=skf, scoring="f1_macro", n_jobs=-1)
    return {"accuracy_mean": acc.mean(), "accuracy_std": acc.std(), "f1_macro_mean": f1.mean(), "f1_macro_std": f1.std()}

print("CV baseline Logistic Regression:", evaluate_cv(pipe_lr, X_train, y_train))
print("CV baseline Random Forest:", evaluate_cv(pipe_rf, X_train, y_train))


param_grid_lr = {
    "clf__C": [0.01, 0.1, 1.0, 10],
    "clf__penalty": ["l2"],  
}

gs_lr = GridSearchCV(pipe_lr, param_grid_lr, cv=skf, scoring="f1_macro", n_jobs=-1, verbose=2)
gs_lr.fit(X_train, y_train)
print("Best LR params:", gs_lr.best_params_, "Best score:", gs_lr.best_score_)


param_grid_rf = {
    "clf__n_estimators": [100, 300],
    "clf__max_depth": [None, 10, 20],
    "clf__min_samples_split": [2, 5],
    "clf__class_weight": [None, "balanced"]
}

gs_rf = GridSearchCV(pipe_rf, param_grid_rf, cv=skf, scoring="f1_macro", n_jobs=-1, verbose=2)
gs_rf.fit(X_train, y_train)
print("Best RF params:", gs_rf.best_params_, "Best score:", gs_rf.best_score_)

best_lr = gs_lr.best_estimator_
best_rf = gs_rf.best_estimator_

def evaluate_on_test(model, X_test, y_test, label="model"):
    y_pred = model.predict(X_test)
    y_prob = None
    try:
        y_prob = model.predict_proba(X_test)
    except Exception:
        pass

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="macro", zero_division=0)
    rec = recall_score(y_test, y_pred, average="macro", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    print(f"\n== {label} Test Metrics ==")
    print("Accuracy:", acc)
    print("Precision (macro):", prec)
    print("Recall (macro):", rec)
    print("F1 (macro):", f1)
    print("\nClassification report:\n", classification_report(y_test, y_pred, zero_division=0))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    return {"acc": acc, "prec": prec, "rec": rec, "f1": f1, "y_pred": y_pred, "y_prob": y_prob}

res_lr = evaluate_on_test(best_lr, X_test, y_test, "Logistic Regression")
res_rf = evaluate_on_test(best_rf, X_test, y_test, "Random Forest")


def plot_confusion(y_true, y_pred, title):
    cm = confusion_matrix(y_true, y_pred, labels=sorted(y.unique()))
    fig, ax = plt.subplots(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt="d", ax=ax, cmap="Blues",
                xticklabels=sorted(y.unique()), yticklabels=sorted(y.unique()))
    ax.set_ylabel("True"); ax.set_xlabel("Predicted"); ax.set_title(title)
    save_fig(fig, f"confusion_{title.replace(' ','_')}.png")
    plt.close(fig)

plot_confusion(y_test, res_lr['y_pred'], "Logistic Regression")
plot_confusion(y_test, res_rf['y_pred'], "Random Forest")

labels = sorted(y.unique())
if res_lr['y_prob'] is not None:
    macro_lr, aucs_lr = multiclass_roc_auc(y_test, res_lr['y_prob'], labels)
    print("LR ROC-AUC macro:", macro_lr, "per-class:", aucs_lr)
    
    from sklearn.preprocessing import label_binarize
    y_test_bin = label_binarize(y_test, classes=labels)
    n_classes = y_test_bin.shape[1]
    # For each class, compute ROC
    fig, ax = plt.subplots(figsize=(6,5))
    for i, cls in enumerate(labels):
        if n_classes == 1: continue
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], res_lr['y_prob'][:, i])
        ax.plot(fpr, tpr, label=f"{cls} (AUC={auc(res_lr['y_prob'][:, i], y_test_bin[:, i]):.2f})" if False else f"{cls}")
    ax.plot([0,1],[0,1],"k--")
    ax.set_title("LR ROC (per-class)")
    ax.legend()
    save_fig(fig, "roc_lr.png")
    plt.close(fig)

if res_rf['y_prob'] is not None:
    macro_rf, aucs_rf = multiclass_roc_auc(y_test, res_rf['y_prob'], labels)
    print("RF ROC-AUC macro:", macro_rf, "per-class:", aucs_rf)
    
def get_feature_names(preprocessor):
    
    num_cols = numeric_features
    if categorical_features:
        
        ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
        cat_names = ohe.get_feature_names_out(categorical_features).tolist()
        return num_cols + cat_names
    else:
        return num_cols

feature_names = get_feature_names(best_rf.named_steps['pre'])


try:
    lr_coef = best_lr.named_steps['clf'].coef_  
    lr_classes = best_lr.named_steps['clf'].classes_
    coef_df = pd.DataFrame(lr_coef.T, index=feature_names, columns=[f"coef_{c}" for c in lr_classes])
    coef_df.to_csv(os.path.join(RESULTS_DIR, "lr_coefficients.csv"))
    print("Saved logistic regression coefficients.")
except Exception as e:
    print("Could not extract LR coefficients:", e)


try:
    rf_imp = best_rf.named_steps['clf'].feature_importances_
    imp_df = pd.DataFrame({"feature": feature_names, "importance": rf_imp}).sort_values("importance", ascending=False)
    imp_df.to_csv(os.path.join(RESULTS_DIR, "rf_feature_importance.csv"), index=False)
    print("Saved RF feature importance.")
    
    fig, ax = plt.subplots(figsize=(6,6))
    imp_df.head(15).plot.barh(x="feature", y="importance", ax=ax, legend=False)
    ax.invert_yaxis()
    ax.set_title("Top 15 RF Feature Importances")
    save_fig(fig, "rf_top15_importance.png")
    plt.close(fig)
except Exception as e:
    print("Could not extract RF feature importances:", e)


joblib.dump(best_lr, os.path.join(RESULTS_DIR, "best_logistic.joblib"))
joblib.dump(best_rf, os.path.join(RESULTS_DIR, "best_randomforest.joblib"))
print("Saved trained models to", RESULTS_DIR)

summary = {
    "lr_test": {"acc": res_lr['acc'], "f1": res_lr['f1']},
    "rf_test": {"acc": res_rf['acc'], "f1": res_rf['f1']},
    "best_lr_params": gs_lr.best_params_,
    "best_rf_params": gs_rf.best_params_
}
pd.DataFrame(summary).to_csv(os.path.join(RESULTS_DIR, "model_summary.csv"))
print("Saved summary to", os.path.join(RESULTS_DIR, "model_summary.csv"))



Rows: 41794
Columns: ['date', 'home_team', 'away_team', 'home_score', 'away_score', 'tournament', 'city', 'country', 'neutral', 'year', 'goal_diff_home', 'goal_diff_away', 'match_outcome', 'home_advantage', 'wins_last10_count', 'draws_last10_count', 'losses_last10_count', 'win_rate_last10', 'form_score_last10', 'wins_last10_count_away', 'draws_last10_count_away', 'losses_last10_count_away', 'win_rate_last10_away', 'form_score_last10_away', 'home_rank_proxy', 'away_rank_proxy', 'ranking_diff_proxy', 'outcome_numeric']
Using features: ['goal_diff_home', 'goal_diff_away', 'home_score', 'away_score', 'home_advantage', 'year', 'win_rate_last10', 'wins_last10_count', 'form_score_last10', 'home_rank_proxy', 'away_rank_proxy', 'ranking_diff_proxy']
Numeric features: ['goal_diff_home', 'goal_diff_away', 'home_score', 'away_score', 'home_advantage', 'year', 'win_rate_last10', 'wins_last10_count', 'form_score_last10', 'home_rank_proxy', 'away_rank_proxy', 'ranking_diff_proxy']
Categorical feature



Best LR params: {'clf__C': 0.01, 'clf__penalty': 'l2'} Best score: 1.0
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best RF params: {'clf__class_weight': None, 'clf__max_depth': None, 'clf__min_samples_split': 2, 'clf__n_estimators': 100} Best score: 1.0

== Logistic Regression Test Metrics ==
Accuracy: 1.0
Precision (macro): 1.0
Recall (macro): 1.0
F1 (macro): 1.0

Classification report:
               precision    recall  f1-score   support

    away_win       1.00      1.00      1.00      2364
        draw       1.00      1.00      1.00      1924
    home_win       1.00      1.00      1.00      4071

    accuracy                           1.00      8359
   macro avg       1.00      1.00      1.00      8359
weighted avg       1.00      1.00      1.00      8359

Confusion matrix:
 [[2364    0    0]
 [   0 1924    0]
 [   0    0 4071]]

== Random Forest Test Metrics ==
Accuracy: 1.0
Precision (macro): 1.0
Recall (macro): 1.0
F1 (macro): 1.0

Classification report:
    