In [4]:

import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Load the match table used to train the outcome classifier.
df = pd.read_csv("cleaned_worldcup_matches.csv")

# Candidate predictors. Any name that is not a column of the CSV is dropped
# below, so the selection never fails on a missing column.
candidate_features = [
    'home_advantage', 'year',
    'win_rate_last10', 'wins_last10_count', 'form_score_last10',
    'win_rate_last10_away', 'wins_last10_count_away', 'form_score_last10_away',
    'home_rank_proxy', 'away_rank_proxy', 'ranking_diff_proxy'
]
features = [name for name in candidate_features if name in df.columns]

# Design matrix (missing values replaced by 0) and target labels.
y = df["match_outcome"]
X = df[features].fillna(0)

# Hold out 20% of the matches, keeping the outcome proportions equal in
# both parts (stratified on y) and the split reproducible (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Standardise using statistics learned from the training part only, then
# apply the same transformation to both parts.
scaler = StandardScaler().fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

# Best model from Task 3 = Random Forest (300 trees, fixed seed).
rf = RandomForestClassifier(random_state=42, n_estimators=300).fit(X_train_sc, y_train)

# ----------------------------
# 2026 World Cup teams
# ----------------------------
# The "team" column holds the team names; they are echoed as a quick check
# that the file was read as expected.
teams_2026 = pd.read_csv("2026_teams.csv")
team_names = teams_2026["team"].tolist()
print("Teams:", team_names)

# ----------------------------
# Simulate a single match between two teams
# ----------------------------
def simulate_match(teamA, teamB):
    """Predict the outcome of one match, with teamA in the "home" feature slots.

    Parameters
    ----------
    teamA, teamB : mapping
        Per-team records (for example rows of ``teams_2026`` as dicts).
        ``win_rate_last10``, ``wins_last10_count`` and ``form_score_last10``
        are read from both; ``home_rank_proxy`` is read from ``teamA`` and
        ``away_rank_proxy`` from ``teamB``.

    Returns
    -------
    pred
        The label from ``rf.classes_`` with the highest predicted probability.
    prob : numpy.ndarray
        Class probabilities for this match, in the order of ``rf.classes_``.

    Raises
    ------
    KeyError
        If a team record lacks one of the fields listed above.
    """
    # Build match-level feature row
    row = {
        "home_advantage": 0,  # Neutral venue assumed for prediction
        "year": 2026,
        "win_rate_last10": teamA["win_rate_last10"],
        "wins_last10_count": teamA["wins_last10_count"],
        "form_score_last10": teamA["form_score_last10"],
        "win_rate_last10_away": teamB["win_rate_last10"],
        "wins_last10_count_away": teamB["wins_last10_count"],
        "form_score_last10_away": teamB["form_score_last10"],
        "home_rank_proxy": teamA["home_rank_proxy"],
        "away_rank_proxy": teamB["away_rank_proxy"],
        "ranking_diff_proxy": teamA["home_rank_proxy"] - teamB["away_rank_proxy"]
    }

    # Select the model's feature columns in the order given by `features`,
    # and treat missing values as 0.
    row_df = pd.DataFrame([row])[features].fillna(0)
    row_sc = scaler.transform(row_df)

    # One pass through the forest is enough: the classifier's predict() is the
    # argmax of predict_proba(), so the label is derived from the probabilities
    # instead of evaluating every tree a second time.
    prob = rf.predict_proba(row_sc)[0]
    pred = rf.classes_[np.argmax(prob)]

    return pred, prob

# ----------------------------
# Predict outcome probabilities for all matchups
# ----------------------------
# Position of each outcome label in the model's probability vector. Resolved
# once, before the loop: it does not change between matchups, and a label the
# model was not trained on raises ValueError here, before any simulation runs.
class_labels = rf.classes_.tolist()
home_win_idx = class_labels.index("home_win")
draw_idx = class_labels.index("draw")
away_win_idx = class_labels.index("away_win")

results = []

# Every unordered pair of teams is simulated once; the first team of the pair
# is passed as teamA and the second as teamB.
for teamA, teamB in combinations(teams_2026.to_dict("records"), 2):
    pred, prob = simulate_match(teamA, teamB)
    results.append({
        "teamA": teamA["team"],
        "teamB": teamB["team"],
        "predicted_outcome": pred,
        "prob_home_win": prob[home_win_idx],
        "prob_draw": prob[draw_idx],
        "prob_away_win": prob[away_win_idx]
    })

# Explicit columns keep the header (and the attribute names used when
# iterating the frame) in place even when there are no matchups.
results_df = pd.DataFrame(
    results,
    columns=[
        "teamA", "teamB", "predicted_outcome",
        "prob_home_win", "prob_draw", "prob_away_win",
    ],
)
results_df.to_csv("task5_all_match_predictions.csv", index=False)

print("Saved match predictions → task5_all_match_predictions.csv")

# Determine likely finalists
# Tally over the pairwise predictions: 1 point to the predicted winner,
# otherwise 0.5 points to each side.
scores = {}
teams_seen = {}  # dict used as an insertion-ordered set of every team name

for r in results_df.itertuples():
    teams_seen[r.teamA] = None
    teams_seen[r.teamB] = None
    if r.predicted_outcome == "home_win":
        scores[r.teamA] = scores.get(r.teamA, 0) + 1
    elif r.predicted_outcome == "away_win":
        scores[r.teamB] = scores.get(r.teamB, 0) + 1
    else:
        scores[r.teamA] = scores.get(r.teamA, 0) + 0.5
        scores[r.teamB] = scores.get(r.teamB, 0) + 0.5

# Teams that never earned a point would otherwise be absent from the table.
# They are added with 0 only after the tally, so the relative order of the
# scoring teams (and therefore tie-breaking in the stable sort) is unchanged.
for team in teams_seen:
    scores.setdefault(team, 0)

ranking = sorted(scores.items(), key=lambda x: x[1], reverse=True)

finalists = ranking[:2]  # top 2 teams
print("\nPredicted 2026 Finalists:")
print(finalists)


Teams: ['Qatar', 'Japan', 'South Korea', 'Saudi Arabia', 'Oman', 'United States', 'Bahrain', 'Tunisia', 'Algeria', 'United Arab Emirates', 'Zimbabwe', 'Uganda', 'Ivory Coast', 'China PR', 'Brazil', 'Iraq', 'Senegal', 'Jordan', 'Mexico', 'Peru', 'Nigeria', 'Zambia', 'Mali', 'Tanzania', 'Namibia', 'Madagascar', 'South Africa', 'Nicaragua', 'Mauritania', 'North Korea', 'El Salvador', 'Malaysia']
Saved match predictions → task5_all_match_predictions.csv

Predicted 2026 Finalists:
[('Mexico', 28.0), ('Brazil', 27.5)]
