In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import poisson

In [24]:
url = "https://www.football-data.co.uk/mmz4281/2526/D1.csv"
url2 = "https://www.football-data.co.uk/mmz4281/2425/D1.csv"

df = pd.read_csv(url)

# Ensure Date is datetime and drop rows with invalid dates
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
df = df.dropna(subset=['Date'])

# Ensure 'Date' is parsed correctly (football-data uses DD/MM/YY)
print(df.head())

  Div       Date   Time       HomeTeam       AwayTeam  FTHG  FTAG FTR  HTHG  \
0  D1 2025-08-22  19:30  Bayern Munich     RB Leipzig     6     0   H     3   
1  D1 2025-08-23  14:30  Ein Frankfurt  Werder Bremen     4     1   H     2   
2  D1 2025-08-23  14:30       Freiburg       Augsburg     1     3   A     0   
3  D1 2025-08-23  14:30     Heidenheim      Wolfsburg     1     3   A     1   
4  D1 2025-08-23  14:30     Leverkusen     Hoffenheim     1     2   A     1   

   HTAG  ... B365CAHH  B365CAHA  PCAHH  PCAHA  MaxCAHH  MaxCAHA  AvgCAHH  \
0     0  ...     1.98      1.88   1.98   1.93     1.99     1.93     1.90   
1     0  ...     1.83      2.03   2.02   1.91     1.91     2.03     1.83   
2     3  ...     1.93      1.93   1.97   1.95     1.97     1.93     1.90   
3     1  ...     2.03      1.83   2.06   1.87     2.03     1.85     1.97   
4     1  ...     1.98      1.88   1.97   1.95     1.98     2.02     1.88   

   AvgCAHA  BFECAHH  BFECAHA  
0     1.86     2.07     1.92  
1     

In [71]:
# Aggregate goals and games
home = df.groupby('HomeTeam').agg(
    HGF=('FTHG','sum'), HGA=('FTAG','sum'), HG=('HomeTeam','size')
)
away = df.groupby('AwayTeam').agg(
    AGF=('FTAG','sum'), AGA=('FTHG','sum'), AG=('AwayTeam','size')
)

teams = home.join(away, how='outer').fillna(0)
teams['GF'] = teams['HGF'] + teams['AGF']
teams['GA'] = teams['HGA'] + teams['AGA']
teams['G']  = teams['HG'] + teams['AG']

# League baselines
avg_home = df['FTHG'].mean()
avg_away = df['FTAG'].mean()
avg_all  = (df['FTHG'].sum() + df['FTAG'].sum()) / len(df)

# Rates
teams['rate_overall_scored']   = teams['GF'] / teams['G']
teams['rate_overall_conceded'] = teams['GA'] / teams['G']
teams['rate_home_scored']      = teams['HGF'] / teams['HG'].replace(0,np.nan)
teams['rate_home_conceded']    = teams['HGA'] / teams['HG'].replace(0,np.nan)
teams['rate_away_scored']      = teams['AGF'] / teams['AG'].replace(0,np.nan)
teams['rate_away_conceded']    = teams['AGA'] / teams['AG'].replace(0,np.nan)

# Shrinkage (blend venue with overall). k controls strength of shrinkage.
k = 1.5
teams['attack_home'] = (
    (teams['rate_home_scored'] * teams['HG']) + (k * teams['rate_overall_scored'])
) / (teams['HG'] + k)
teams['attack_away'] = (
    (teams['rate_away_scored'] * teams['AG']) + (k * teams['rate_overall_scored'])
) / (teams['AG'] + k)

teams['defense_home'] = (
    (teams['rate_home_conceded'] * teams['HG']) + (k * teams['rate_overall_conceded'])
) / (teams['HG'] + k)
teams['defense_away'] = (
    (teams['rate_away_conceded'] * teams['AG']) + (k * teams['rate_overall_conceded'])
) / (teams['AG'] + k)

# Convert to strengths relative to league (higher attack/defense = better)
teams['AttackStrengthHome'] = teams['attack_home'] / avg_home
teams['AttackStrengthAway'] = teams['attack_away'] / avg_away
teams['DefenseStrengthHome'] = avg_away / teams['defense_home']   # fewer conceded -> higher
teams['DefenseStrengthAway'] = avg_home / teams['defense_away']

# Example: expected goals using venue-aware strengths

def expected_goals(home_team, away_team, tbl, avg_home, avg_away, max_goals = 12):
    ha = tbl.loc[home_team, 'AttackStrengthHome']
    hd = tbl.loc[home_team, 'DefenseStrengthHome']
    aa = tbl.loc[away_team, 'AttackStrengthAway']
    ad = tbl.loc[away_team, 'DefenseStrengthAway']
    lam_home = ha * (1 / ad) * avg_home
    lam_away = aa * (1 / hd) * avg_away
    best = (0,0); best_p = 0.0
    for i in range(max_goals+1):
        for j in range(max_goals+1):
            p = poisson.pmf(i, lam_home) * poisson.pmf(j, lam_away)
            if p > best_p:
                best_p = p; best = (i,j)
    return best 

lam_h, lam_a = expected_goals('Werder Bremen', 'Stuttgart', teams, avg_home, avg_away)

In [72]:

# Last 9 league games overall
last9_league = df.sort_values('Date').tail(9)
print(last9_league[['Date','HomeTeam','AwayTeam','FTHG','FTAG']])
for i, row in enumerate(last9_league.itertuples(index=False),start=1):
    print("HomeTeam: "+row.HomeTeam + " AwayTeam "+ row.AwayTeam)
    score = expected_goals(row.HomeTeam, row.AwayTeam, teams,avg_home, avg_away)
    print(score)


          Date    HomeTeam       AwayTeam  FTHG  FTAG
108 2025-12-05       Mainz     M'gladbach     0     1
112 2025-12-06   Stuttgart  Bayern Munich     0     5
111 2025-12-06  Heidenheim       Freiburg     2     1
113 2025-12-06   Wolfsburg   Union Berlin     3     1
109 2025-12-06    Augsburg     Leverkusen     2     0
110 2025-12-06     FC Koln       St Pauli     1     1
114 2025-12-06  RB Leipzig  Ein Frankfurt     6     0
115 2025-12-07     Hamburg  Werder Bremen     3     2
116 2025-12-07    Dortmund     Hoffenheim     2     0
HomeTeam: Mainz AwayTeam M'gladbach
(0, 1)
HomeTeam: Stuttgart AwayTeam Bayern Munich
(0, 3)
HomeTeam: Heidenheim AwayTeam Freiburg
(1, 1)
HomeTeam: Wolfsburg AwayTeam Union Berlin
(1, 1)
HomeTeam: Augsburg AwayTeam Leverkusen
(1, 2)
HomeTeam: FC Koln AwayTeam St Pauli
(2, 0)
HomeTeam: RB Leipzig AwayTeam Ein Frankfurt
(4, 0)
HomeTeam: Hamburg AwayTeam Werder Bremen
(1, 1)
HomeTeam: Dortmund AwayTeam Hoffenheim
(1, 0)


In [47]:

def match_probabilities(home_team, away_team, df_stats, avg_home, avg_away, max_goals=6):
    try:
        ha = df_stats.loc[home_team, 'AttackStrength']
        hd = df_stats.loc[home_team, 'DefenseStrength']
        aa = df_stats.loc[away_team, 'AttackStrength']
        ad = df_stats.loc[away_team, 'DefenseStrength']
    except KeyError as e:
        print(f"Team not found: {e}")
        return

    lam_home = ha * (1 / ad) * avg_home
    lam_away = aa * (1 / hd) * avg_away
    print(f"Expected goals: {home_team} {lam_home:.2f}, {away_team} {lam_away:.2f}")

    best = (0,0); best_p = 0.0
    for i in range(max_goals+1):
        for j in range(max_goals+1):
            p = poisson.pmf(i, lam_home) * poisson.pmf(j, lam_away)
            if p > best_p:
                best_p = p; best = (i,j)
                print(f"Most probable score: {best} (p={best_p:.3f})")


match_probabilities('Werder Bremen', 'Stuttgart', teams, avg_home, avg_away)

Team not found: 'AttackStrength'
