In [2]:
import os
os.chdir(r"C:\WEB_PROJECTS\Ball_Knowledge")

import pandas as pd
from pathlib import Path
from models.elo_engine import EloEngine

csv_path = Path(r"C:\WEB_PROJECTS\Ball_Knowledge\data\premier_league_2023_24.csv")
df = pd.read_csv(csv_path)

df.head()



Unnamed: 0,date,home,away,home_goals,away_goals
0,2023-08-11,Burnley,Manchester City,0,3
1,2023-08-12,Arsenal,Nottingham Forest,2,1
2,2023-08-12,Bournemouth,West Ham,1,1
3,2023-08-12,Brighton,Luton,4,1
4,2023-08-12,Everton,Fulham,0,1


In [3]:
# STANDARDIZE TEAM NAMES

name_map = {
    "Man City": "Manchester City",
    "Manchester City FC": "Manchester City",
    "Man Utd": "Manchester United",
    "Man United": "Manchester United",
    "Spurs": "Tottenham",
    "Totenham": "Tottenham",
    "Nottingham Forest": "Nottm Forest",
    "Forest": "Nottm Forest",
}

df['home'] = df['home'].replace(name_map)
df['away'] = df['away'].replace(name_map)


In [4]:
sorted(pd.unique(df['home']))


['Arsenal',
 'Aston Villa',
 'Bournemouth',
 'Brentford',
 'Brighton',
 'Burnley',
 'Chelsea',
 'Crystal Palace',
 'Everton',
 'Fulham',
 'Liverpool',
 'Luton',
 'Manchester City',
 'Manchester United',
 'Newcastle',
 'Nottm Forest',
 'Sheffield United',
 'Tottenham',
 'West Ham',
 'Wolves']

In [5]:
elo = EloEngine()
final_elos = elo.compute_season(df)
final_elos


{'Burnley': 1399.147213864908,
 'Manchester City': 1665.8265676086185,
 'Arsenal': 1652.7848352093765,
 'Nottm Forest': 1429.7106014050596,
 'Bournemouth': 1471.1392299224399,
 'West Ham': 1509.7506701884272,
 'Brighton': 1489.7624042288255,
 'Luton': 1426.5096019444165,
 'Everton': 1502.6009406416424,
 'Fulham': 1485.3780122819235,
 'Sheffield United': 1371.5274711558131,
 'Crystal Palace': 1474.1165294877503,
 'Newcastle': 1509.7581957364962,
 'Aston Villa': 1524.0693013142168,
 'Brentford': 1422.7794102813727,
 'Tottenham': 1520.2416545384633,
 'Chelsea': 1546.5917575049832,
 'Liverpool': 1610.3335769377961,
 'Manchester United': 1504.8621852856475,
 'Wolves': 1483.1098404618228}

In [6]:
import pandas as pd
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date').reset_index(drop=True)

df.head()


Unnamed: 0,date,home,away,home_goals,away_goals
0,2023-08-11,Burnley,Manchester City,0,3
1,2023-08-12,Arsenal,Nottm Forest,2,1
2,2023-08-12,Bournemouth,West Ham,1,1
3,2023-08-12,Brighton,Luton,4,1
4,2023-08-12,Everton,Fulham,0,1


In [7]:
# STEP 2: Team-wise match rows

# Home team rows
home_df = df[['date', 'home', 'home_goals', 'away_goals']].copy()
home_df.rename(columns={
    'home': 'team',
    'home_goals': 'goals_for',
    'away_goals': 'goals_against'
}, inplace=True)
home_df['is_home'] = 1

# Away team rows
away_df = df[['date', 'away', 'home_goals', 'away_goals']].copy()
away_df.rename(columns={
    'away': 'team',
    'away_goals': 'goals_for',
    'home_goals': 'goals_against'
}, inplace=True)
away_df['is_home'] = 0

# Combine both
import pandas as pd
team_matches = pd.concat([home_df, away_df], ignore_index=True)

# Points: win=3, draw=1, loss=0
def result_points(row):
    if row['goals_for'] > row['goals_against']:
        return 3
    elif row['goals_for'] < row['goals_against']:
        return 0
    else:
        return 1

team_matches['points'] = team_matches.apply(result_points, axis=1)

# Sort by team → date
team_matches = team_matches.sort_values(['team', 'date']).reset_index(drop=True)

team_matches.head()


Unnamed: 0,date,team,goals_for,goals_against,is_home,points
0,2023-08-12,Arsenal,2,1,1,3
1,2023-08-21,Arsenal,1,0,0,3
2,2023-08-26,Arsenal,2,2,1,1
3,2023-09-03,Arsenal,3,1,1,3
4,2023-09-17,Arsenal,1,0,0,3


In [8]:
# STEP 3: Rolling form, attack & defence stats

group = team_matches.groupby('team')

# Last 5 match averages
team_matches['gf_last5'] = group['goals_for'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=True)
team_matches['ga_last5'] = group['goals_against'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=True)
team_matches['pts_last5'] = group['points'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=True)

# Last 10 match averages (more stable for the Power Score)
team_matches['gf_last10'] = group['goals_for'].rolling(10, min_periods=1).mean().reset_index(level=0, drop=True)
team_matches['ga_last10'] = group['goals_against'].rolling(10, min_periods=1).mean().reset_index(level=0, drop=True)

team_matches.tail()


Unnamed: 0,date,team,goals_for,goals_against,is_home,points,gf_last5,ga_last5,pts_last5,gf_last10,ga_last10
685,2024-04-07,Wolves,1,2,0,0,1.0,1.4,1.4,1.9,1.5
686,2024-04-14,Wolves,0,2,0,0,0.8,1.8,0.8,1.5,1.6
687,2024-04-28,Wolves,2,1,1,3,1.2,1.4,1.4,1.4,1.7
688,2024-05-05,Wolves,1,5,0,0,1.0,2.2,0.8,1.4,1.8
689,2024-05-12,Wolves,0,2,0,0,0.8,2.4,0.6,1.0,1.8


In [9]:
# STEP 4: Extract final snapshot per team

final_stats = (
    team_matches.sort_values('date')
                .groupby('team')
                .tail(1)  # last match per team
)

final_stats = final_stats[['team', 'gf_last5', 'ga_last5', 'pts_last5', 'gf_last10', 'ga_last10']].reset_index(drop=True)

final_stats


Unnamed: 0,team,gf_last5,ga_last5,pts_last5,gf_last10,ga_last10
0,Aston Villa,1.4,2.0,0.6,1.5,2.2
1,Sheffield United,0.8,3.0,0.0,1.0,3.0
2,West Ham,1.8,2.2,1.6,1.9,2.0
3,Tottenham,1.4,2.8,0.6,1.7,2.0
4,Crystal Palace,2.0,1.2,2.0,1.6,1.2
5,Nottm Forest,0.8,2.2,0.0,1.1,1.7
6,Arsenal,2.8,0.6,3.0,2.8,0.5
7,Newcastle,3.6,1.0,2.6,2.6,1.4
8,Manchester United,1.4,1.6,1.4,1.4,1.8
9,Bournemouth,1.2,1.8,1.2,1.3,1.6


In [11]:
# Build Elo dataframe from final Elo ratings

elo = EloEngine()
final_elos = elo.compute_season(df)

elo_df = pd.DataFrame([
    {"team": team, "elo": rating}
    for team, rating in final_elos.items()
])

elo_df


Unnamed: 0,team,elo
0,Burnley,1399.147214
1,Manchester City,1665.826568
2,Arsenal,1652.784835
3,Nottm Forest,1429.710601
4,Bournemouth,1471.13923
5,West Ham,1509.75067
6,Brighton,1489.762404
7,Luton,1426.509602
8,Everton,1502.600941
9,Fulham,1485.378012


In [12]:
# STEP 5: Merge Elo with team form/attack/defence stats

team_features = final_stats.merge(elo_df, on='team', how='left')

team_features


Unnamed: 0,team,gf_last5,ga_last5,pts_last5,gf_last10,ga_last10,elo
0,Aston Villa,1.4,2.0,0.6,1.5,2.2,1524.069301
1,Sheffield United,0.8,3.0,0.0,1.0,3.0,1371.527471
2,West Ham,1.8,2.2,1.6,1.9,2.0,1509.75067
3,Tottenham,1.4,2.8,0.6,1.7,2.0,1520.241655
4,Crystal Palace,2.0,1.2,2.0,1.6,1.2,1474.116529
5,Nottm Forest,0.8,2.2,0.0,1.1,1.7,1429.710601
6,Arsenal,2.8,0.6,3.0,2.8,0.5,1652.784835
7,Newcastle,3.6,1.0,2.6,2.6,1.4,1509.758196
8,Manchester United,1.4,1.6,1.4,1.4,1.8,1504.862185
9,Bournemouth,1.2,1.8,1.2,1.3,1.6,1471.13923


In [13]:
# STEP 6: Normalize features to 0–1

import numpy as np

tf = team_features.copy()

# Convert defence: LOWER goals conceded = BETTER → invert
tf['defence_strength'] = -tf['ga_last10']

# Attack strength
tf['attack_strength'] = tf['gf_last10']

# Form strength
tf['form_strength'] = tf['pts_last5']

# Keep Elo separately
tf['elo_strength'] = tf['elo']

# Columns to normalize
cols_to_norm = ['elo_strength', 'attack_strength', 'defence_strength', 'form_strength']

# Min-max normalization
for col in cols_to_norm:
    col_min = tf[col].min()
    col_max = tf[col].max()

    if col_max == col_min:
        tf[col + '_norm'] = 0.5  # neutral value
    else:
        tf[col + '_norm'] = (tf[col] - col_min) / (col_max - col_min)

tf.head()


Unnamed: 0,team,gf_last5,ga_last5,pts_last5,gf_last10,ga_last10,elo,defence_strength,attack_strength,form_strength,elo_strength,elo_strength_norm,attack_strength_norm,defence_strength_norm,form_strength_norm
0,Aston Villa,1.4,2.0,0.6,1.5,2.2,1524.069301,-2.2,1.5,0.6,1524.069301,0.518322,0.346154,0.32,0.2
1,Sheffield United,0.8,3.0,0.0,1.0,3.0,1371.527471,-3.0,1.0,0.0,1371.527471,0.0,0.153846,0.0,0.0
2,West Ham,1.8,2.2,1.6,1.9,2.0,1509.75067,-2.0,1.9,1.6,1509.75067,0.469669,0.5,0.4,0.533333
3,Tottenham,1.4,2.8,0.6,1.7,2.0,1520.241655,-2.0,1.7,0.6,1520.241655,0.505316,0.423077,0.4,0.2
4,Crystal Palace,2.0,1.2,2.0,1.6,1.2,1474.116529,-1.2,1.6,2.0,1474.116529,0.348588,0.384615,0.72,0.666667


In [14]:
# STEP 7: Compute final Power Score (0–100)

w_elo = 0.4
w_att = 0.25
w_def = 0.20
w_form = 0.15

tf['raw_power'] = (
    w_elo  * tf['elo_strength_norm'] +
    w_att  * tf['attack_strength_norm'] +
    w_def  * tf['defence_strength_norm'] +
    w_form * tf['form_strength_norm']
)

# Scale to 0–100
col_min = tf['raw_power'].min()
col_max = tf['raw_power'].max()

tf['power_score'] = 100 * (tf['raw_power'] - col_min) / (col_max - col_min)
tf['power_score'] = tf['power_score'].round(1)

# Final sorted table
power_table = tf[['team', 'power_score', 'elo', 'gf_last10', 'ga_last10', 'pts_last5']] \
                .sort_values('power_score', ascending=False) \
                .reset_index(drop=True)

power_table


Unnamed: 0,team,power_score,elo,gf_last10,ga_last10,pts_last5
0,Manchester City,100.0,1665.826568,3.2,0.7,3.0
1,Arsenal,95.7,1652.784835,2.8,0.5,3.0
2,Liverpool,66.7,1610.333577,2.0,1.5,1.8
3,Newcastle,63.4,1509.758196,2.6,1.4,2.6
4,Chelsea,54.7,1546.591758,2.2,2.2,2.0
5,Everton,51.2,1502.600941,1.1,0.8,2.4
6,Crystal Palace,46.7,1474.116529,1.6,1.2,2.0
7,West Ham,45.9,1509.75067,1.9,2.0,1.6
8,Fulham,41.3,1485.378012,1.5,1.9,2.0
9,Manchester United,40.8,1504.862185,1.4,1.8,1.4


In [15]:
power_lookup = dict(zip(power_table['team'], power_table['power_score']))
power_lookup


{'Manchester City': 100.0,
 'Arsenal': 95.7,
 'Liverpool': 66.7,
 'Newcastle': 63.4,
 'Chelsea': 54.7,
 'Everton': 51.2,
 'Crystal Palace': 46.7,
 'West Ham': 45.9,
 'Fulham': 41.3,
 'Manchester United': 40.8,
 'Tottenham': 40.1,
 'Aston Villa': 37.0,
 'Bournemouth': 35.6,
 'Brighton': 30.7,
 'Wolves': 29.4,
 'Brentford': 28.3,
 'Nottm Forest': 20.4,
 'Luton': 19.2,
 'Burnley': 17.9,
 'Sheffield United': 0.0}

In [17]:
import numpy as np

def predict_match(home, away, elo_engine, power_lookup):
    # --- ELO COMPONENT ---
    elo_home = elo_engine.get_elo(home)
    elo_away = elo_engine.get_elo(away)
    elo_diff = elo_home - elo_away

    # Elo probability (classic formula)
    prob_home_elo = 1 / (1 + 10 ** (-elo_diff / 400))

    # --- POWER SCORE COMPONENT ---
    ps_home = power_lookup[home]
    ps_away = power_lookup[away]
    ps_diff = ps_home - ps_away

    # Logistic function for Power Score impact
    prob_home_power = 1 / (1 + np.exp(-ps_diff / 12))

    # --- COMBINE BOTH MODELS ---
    final_home = 0.55 * prob_home_elo + 0.45 * prob_home_power
    final_away = 1 - final_home

    # Add draw probability baseline
    base_draw = 0.22  # Later we can make dynamic draw model

    # Normalize so all sum to 1
    total = final_home + final_away + base_draw

    return {
        "home": home,
        "away": away,
        "home_win": round(final_home / total, 3),
        "draw": round(base_draw / total, 3),
        "away_win": round(final_away / total, 3),
        "elo_diff": round(elo_diff, 2),
        "power_diff": round(ps_diff, 2)
    }


In [18]:
predict_match("Arsenal", "Liverpool", elo, power_lookup)


{'home': 'Arsenal',
 'away': 'Liverpool',
 'home_win': np.float64(0.591),
 'draw': np.float64(0.18),
 'away_win': np.float64(0.228),
 'elo_diff': 42.45,
 'power_diff': 29.0}