In [52]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

# --- Load data ---
matches = pd.read_csv("results.csv")
matches.columns = matches.columns.str.strip()
rankings = pd.read_csv("fifa_ranking-2024-06-20.csv")

# --- Parse dates ---
matches["date"] = pd.to_datetime(matches["date"])
rankings["rank_date"] = pd.to_datetime(rankings["rank_date"])

# --- Encode categorical variables ---
matches["oppteam_code"] = matches["away_team"].astype("category").cat.codes
matches["hometeam_code"] = matches["home_team"].astype("category").cat.codes
matches["city_code"] = matches["city"].astype("category").cat.codes
matches["country_code"] = matches["country"].astype("category").cat.codes
matches["tournament_code"] = matches["tournament"].astype("category").cat.codes
matches["day_code"] = matches["date"].dt.dayofweek

# --- Get match result (3 = win, 2 = draw, 1 = loss) ---
def get_result(row):
    if row["home_score"] > row["away_score"]:
        return 3
    elif row["home_score"] == row["away_score"]:
        return 2
    else:
        return 1

matches["result"] = matches.apply(get_result, axis=1)

# --- Sort for merging ---
matches = matches.sort_values("date")
rankings = rankings.sort_values("rank_date")

# --- Merge FIFA rankings ---
matches = pd.merge_asof(
    matches, rankings,
    left_on="date", right_on="rank_date",
    left_by="home_team", right_by="country_full",
    direction="backward"
).rename(columns={"rank": "home_rank", "total_points": "home_points"})

matches = pd.merge_asof(
    matches, rankings,
    left_on="date", right_on="rank_date",
    left_by="away_team", right_by="country_full",
    direction="backward"
).rename(columns={"rank": "away_rank", "total_points": "away_points"})

matches.dropna(subset=["home_rank", "away_rank", "home_points", "away_points"], inplace=True)

# --- Derived features ---
matches["rank_diff"] = matches["home_rank"] - matches["away_rank"]
matches["points_diff"] = matches["home_points"] - matches["away_points"]

# --- Recent Form (last 5 matches) ---
def compute_recent_form(df, team_col, result_col, date_col, N=5):
    df = df.sort_values(by=date_col)
    form_data = []
    grouped = df.groupby(team_col)
    for team, group in grouped:
        results = group[[date_col, result_col]].copy()
        results["recent_form"] = results[result_col].shift().rolling(window=N, min_periods=1).mean()
        results[team_col] = team
        form_data.append(results[[date_col, team_col, "recent_form"]])
    return pd.concat(form_data)

home_form = compute_recent_form(matches, "home_team", "result", "date")
away_form = compute_recent_form(matches, "away_team", "result", "date")

matches = pd.merge(matches, home_form, how="left", on=["date", "home_team"]).rename(columns={"recent_form": "home_form"})
matches = pd.merge(matches, away_form, how="left", on=["date", "away_team"]).rename(columns={"recent_form": "away_form"})

# --- Average Goals Scored & Conceded ---
def compute_goal_stats(df, team_col, goals_for, goals_against, date_col, N=5):
    df = df.sort_values(by=date_col)
    goal_data = []
    grouped = df.groupby(team_col)
    for team, group in grouped:
        stats = group[[date_col, goals_for, goals_against]].copy()
        stats["avg_scored"] = stats[goals_for].shift().rolling(window=N, min_periods=1).mean()
        stats["avg_conceded"] = stats[goals_against].shift().rolling(window=N, min_periods=1).mean()
        stats[team_col] = team
        goal_data.append(stats[[date_col, team_col, "avg_scored", "avg_conceded"]])
    return pd.concat(goal_data)

home_stats = compute_goal_stats(matches, "home_team", "home_score", "away_score", "date")
away_stats = compute_goal_stats(matches, "away_team", "away_score", "home_score", "date")

matches = pd.merge(matches, home_stats, how="left", on=["date", "home_team"]).rename(columns={
    "avg_scored": "home_avg_scored", "avg_conceded": "home_avg_conceded"
})
matches = pd.merge(matches, away_stats, how="left", on=["date", "away_team"]).rename(columns={
    "avg_scored": "away_avg_scored", "avg_conceded": "away_avg_conceded"
})

# --- Head-to-head win % ---
def get_matchup_key(row):
    teams = sorted([row["home_team"], row["away_team"]])
    return "_vs_".join(teams)

matches["matchup"] = matches.apply(get_matchup_key, axis=1)

def compute_head_to_head(matches, N=5):
    matches = matches.sort_values("date")
    h2h_win_pct = []
    grouped = matches.groupby("matchup")
    for matchup, group in grouped:
        past_results = []
        for idx, row in group.iterrows():
            past = group[group["date"] < row["date"]].tail(N)
            if past.empty:
                h2h_win_pct.append(None)
                continue
            home_team = row["home_team"]
            wins = 0
            for _, past_row in past.iterrows():
                if past_row["home_team"] == home_team and past_row["home_score"] > past_row["away_score"]:
                    wins += 1
                elif past_row["away_team"] == home_team and past_row["away_score"] > past_row["home_score"]:
                    wins += 1
            h2h_win_pct.append(wins / len(past))
    matches = matches.copy()
    matches["h2h_win_pct"] = h2h_win_pct
    return matches

matches = compute_head_to_head(matches, N=5)

# --- Drop rows with missing features ---
matches.dropna(subset=[
    "home_form", "away_form",
    "home_avg_scored", "home_avg_conceded",
    "away_avg_scored", "away_avg_conceded",
    "h2h_win_pct"
], inplace=True)

# --- Set up predictors ---
predictors = [
    "hometeam_code", "oppteam_code", "city_code", "country_code",
    "tournament_code", "neutral", "day_code",
    "home_rank", "away_rank", "home_points", "away_points",
    "rank_diff", "points_diff",
    "home_form", "away_form",
    "home_avg_scored", "home_avg_conceded",
    "away_avg_scored", "away_avg_conceded",
    "h2h_win_pct"
]

# --- Train/test split ---
train = matches[matches["date"] < '2015-01-01']
test = matches[matches["date"] >= '2015-01-01']

# --- Train model ---
rf = RandomForestClassifier(
    n_estimators= 300,
    min_samples_split=5,
    min_samples_leaf=2,
    max_depth= 20,
    class_weight={3: 2.0, 2: 1.0, 1: 1.0},
    random_state=1,
    n_jobs=-1
)
rf.fit(train[predictors], train["result"])

# --- Evaluate ---
preds = rf.predict(test[predictors])
accuracy = accuracy_score(test["result"], preds)
precision = precision_score(test["result"], preds, average=None)

print("Accuracy:", accuracy)
print("Precision per class:", precision)

Accuracy: 0.5821275242486882
Precision per class: [0.56581858 0.30666667 0.59350885]


In [53]:
def predict_match(team1, team2, team1_rank, team2_rank, city, country, date_str, is_neutral):
    import numpy as np
    from datetime import datetime

    date = pd.to_datetime(date_str)
    matchup_key = "_vs_".join(sorted([team1, team2]))
    
    # Encoding categorical variables
    def safe_encode(val, series):
        return series.cat.categories.get_loc(val) if val in series.cat.categories else -1

    hometeam_code = safe_encode(team1, matches["home_team"].astype("category"))
    oppteam_code = safe_encode(team2, matches["away_team"].astype("category"))
    city_code = safe_encode(city, matches["city"].astype("category"))
    country_code = safe_encode(country, matches["country"].astype("category"))
    tournament_code = safe_encode("FIFA World Cup", matches["tournament"].astype("category"))  # or pass as arg
    day_code = date.dayofweek

    # Get rank/point differences
    home_points = rankings[
        (rankings["country_full"] == team1) & (rankings["rank_date"] <= date)
    ].sort_values("rank_date").iloc[-1]["total_points"]
    away_points = rankings[
        (rankings["country_full"] == team2) & (rankings["rank_date"] <= date)
    ].sort_values("rank_date").iloc[-1]["total_points"]

    rank_diff = team1_rank - team2_rank
    points_diff = home_points - away_points

    # Get recent form, avg scored/conceded
    def recent_stat(team, stat_col):
        sub = matches[(matches["date"] < date) & ((matches["home_team"] == team) | (matches["away_team"] == team))].copy()
        if sub.empty:
            return np.nan
        if "form" in stat_col:
            sub["res"] = sub.apply(get_result, axis=1)
            return sub["res"].tail(5).mean()
        elif "scored" in stat_col:
            if "home" in stat_col:
                return sub[sub["home_team"] == team]["home_score"].tail(5).mean()
            else:
                return sub[sub["away_team"] == team]["away_score"].tail(5).mean()
        elif "conceded" in stat_col:
            if "home" in stat_col:
                return sub[sub["home_team"] == team]["away_score"].tail(5).mean()
            else:
                return sub[sub["away_team"] == team]["home_score"].tail(5).mean()

    home_form = recent_stat(team1, "home_form")
    away_form = recent_stat(team2, "away_form")
    home_avg_scored = recent_stat(team1, "home_avg_scored")
    home_avg_conceded = recent_stat(team1, "home_avg_conceded")
    away_avg_scored = recent_stat(team2, "away_avg_scored")
    away_avg_conceded = recent_stat(team2, "away_avg_conceded")

    # Head-to-head win %
    past = matches[(matches["matchup"] == matchup_key) & (matches["date"] < date)].tail(5)
    if not past.empty:
        wins = 0
        for _, row in past.iterrows():
            if row["home_team"] == team1 and row["home_score"] > row["away_score"]:
                wins += 1
            elif row["away_team"] == team1 and row["away_score"] > row["home_score"]:
                wins += 1
        h2h_win_pct = wins / len(past)
    else:
        h2h_win_pct = 0.5  # default if no history

    # Create feature array
    input_data = pd.DataFrame([{
        "hometeam_code": hometeam_code,
        "oppteam_code": oppteam_code,
        "city_code": city_code,
        "country_code": country_code,
        "tournament_code": tournament_code,
        "neutral": int(is_neutral),
        "day_code": day_code,
        "home_rank": team1_rank,
        "away_rank": team2_rank,
        "home_points": home_points,
        "away_points": away_points,
        "rank_diff": rank_diff,
        "points_diff": points_diff,
        "home_form": home_form,
        "away_form": away_form,
        "home_avg_scored": home_avg_scored,
        "home_avg_conceded": home_avg_conceded,
        "away_avg_scored": away_avg_scored,
        "away_avg_conceded": away_avg_conceded,
        "h2h_win_pct": h2h_win_pct
    }])

    input_data.fillna(0.5, inplace=True)  # handle missing values gracefully

    prediction = rf.predict(input_data)[0]
    if prediction == 3:
        return f"{team1} win"
    elif prediction == 2:
        return "Draw"
    else:
        return f"{team2} win"


In [73]:
result = predict_match(
    team1="Germany",
    team2="Argentina",
    team1_rank=2,
    team2_rank=5,
    city="Rio de Janeiro",
    country="Brazil",
    date_str="2014-07-13",
    is_neutral=False
)

print("Prediction:", result["result"])
print("Predicted Scoreline:", result["score"])

Prediction: Germany win
Predicted Scoreline: 2–2
