In [4]:
# ============================
# TASK 4: Feature Importance & Interpretation
# Fully SELF-CONTAINED version
# ============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Switch seaborn's global plotting style to "whitegrid" before any figure is drawn.
sns.set(style="whitegrid")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# ------------------------------------------------
# 1. LOAD CLEANED DATA
# ------------------------------------------------

# Read the match table from the working directory and echo its
# (rows, columns) shape as a quick sanity check.
csv_path = "cleaned_worldcup_matches.csv"
df = pd.read_csv(csv_path)
print("Loaded:", df.shape)

# ------------------------------------------------
# 2. FEATURES (NO LEAKAGE)
# ------------------------------------------------

# Candidate predictor columns. NOTE(review): the heading asserts these carry no
# post-match information; that depends on how the CSV was built — confirm.
candidate_features = [
    'home_advantage', 'year',
    'win_rate_last10', 'wins_last10_count', 'form_score_last10',
    'win_rate_last10_away', 'wins_last10_count_away', 'form_score_last10_away',
    'home_rank_proxy', 'away_rank_proxy', 'ranking_diff_proxy'
]

# Keep only the candidates that actually exist in the table (order preserved),
# but say which ones were dropped instead of discarding them silently.
features = [f for f in candidate_features if f in df.columns]
missing_features = [f for f in candidate_features if f not in df.columns]
if missing_features:
    print("Skipping features not found in data:", missing_features)
if not features:
    # Fail here with a clear message rather than later inside the scaler
    # or an estimator complaining about a zero-column matrix.
    raise ValueError(
        "None of the candidate feature columns are present in the data: "
        f"{candidate_features}"
    )
print("Using features:", features)

# Missing feature values are imputed with 0; the target is the outcome label.
X = df[features].fillna(0)
y = df["match_outcome"]

# ------------------------------------------------
# 3. TRAIN-TEST SPLIT
# ------------------------------------------------

# Hold out 20% of the rows for testing. The split is stratified on the label
# and seeded so that it is reproducible from run to run.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ------------------------------------------------
# 4. SCALING + TRAIN MODELS
# ------------------------------------------------

# Standardise using statistics learned from the training rows only, then
# apply the same transformation to the held-out rows.
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

# Logistic Regression
# `multi_class` is deliberately not passed: the argument is deprecated in
# scikit-learn >= 1.5 (it emits a FutureWarning and is scheduled for removal),
# and with the "saga" solver a problem with three or more classes is already
# fitted as a multinomial model by default.
lr = LogisticRegression(
    max_iter=5000,
    solver="saga",
    random_state=42
)
lr.fit(X_train_sc, y_train)

# Random Forest
# Fitted on the same scaled matrix as the logistic regression so both models
# see identical inputs.
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train_sc, y_train)

print("Models trained successfully.")

# ------------------------------------------------
# 5. FEATURE IMPORTANCE — LOGISTIC REGRESSION
# ------------------------------------------------

# Use the coefficients of the "home_win" class for meaningful interpretation.
lr_classes = list(lr.classes_)
if "home_win" not in lr_classes:
    # Same exception type as list.index() would raise, with a useful message.
    raise ValueError(
        f'Label "home_win" not found among model classes {lr_classes}; '
        "cannot extract home-win coefficients."
    )
class_index = lr_classes.index("home_win")

if lr.coef_.shape[0] == 1:
    # Two-class fit: scikit-learn stores a single coefficient row that
    # describes classes_[1]; the row for classes_[0] is its negation.
    lr_coef = lr.coef_[0] if class_index == 1 else -lr.coef_[0]
else:
    # Multiclass fit: one coefficient row per class, in classes_ order.
    lr_coef = lr.coef_[class_index]

# "importance" holds the signed coefficient (on standardised inputs), sorted
# from most positive to most negative effect on the home-win score.
lr_importance = pd.DataFrame({
    "feature": features,
    "importance": lr_coef
}).sort_values("importance", ascending=False)

# Horizontal bar chart of the coefficients, written to disk and then closed
# so the figure does not stay open.
plt.figure(figsize=(8, 6))
sns.barplot(x="importance", y="feature", data=lr_importance)
plt.title("Logistic Regression Feature Importance (Home Win Coefficients)")
plt.tight_layout()
plt.savefig("task4_lr_importance.png")
plt.close()

# ------------------------------------------------
# 6. FEATURE IMPORTANCE — RANDOM FOREST
# ------------------------------------------------

# Pair each feature name with the forest's importance score, then rank the
# rows from highest to lowest score.
rf_scores = pd.DataFrame(
    {"feature": features, "importance": rf.feature_importances_}
)
rf_importance = rf_scores.sort_values(by="importance", ascending=False)

# Draw the ranking as a horizontal bar chart, save it, and release the figure.
rf_fig, rf_ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=rf_importance, x="importance", y="feature", ax=rf_ax)
rf_ax.set_title("Random Forest Feature Importance")
rf_fig.tight_layout()
rf_fig.savefig("task4_rf_importance.png")
plt.close(rf_fig)

# ------------------------------------------------
# 7. SAVE CSV FILES
# ------------------------------------------------

# Write both importance tables to CSV without the index column.
csv_outputs = (
    (lr_importance, "task4_lr_importance.csv"),
    (rf_importance, "task4_rf_importance.csv"),
)
for importance_table, csv_name in csv_outputs:
    importance_table.to_csv(csv_name, index=False)

# Emit the completion summary in a single call; the joined text is identical
# to printing each line separately.
summary_lines = [
    "\nTask 4 Completed Successfully ✅",
    "Saved:",
    " - task4_lr_importance.png",
    " - task4_rf_importance.png",
    " - task4_lr_importance.csv",
    " - task4_rf_importance.csv",
]
print("\n".join(summary_lines))


Loaded: (41794, 28)
Using features: ['home_advantage', 'year', 'win_rate_last10', 'wins_last10_count', 'form_score_last10', 'win_rate_last10_away', 'wins_last10_count_away', 'form_score_last10_away', 'home_rank_proxy', 'away_rank_proxy', 'ranking_diff_proxy']




Models trained successfully.

Task 4 Completed Successfully ✅
Saved:
 - task4_lr_importance.png
 - task4_rf_importance.png
 - task4_lr_importance.csv
 - task4_rf_importance.csv
