In [1]:
import pandas as pd

In [2]:
matches = pd.read_csv("match_data.csv", index_col=0)

In [3]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2.0,0.0,Ipswich Town,...,Match Report,,18.0,5.0,14.8,0.0,0,0,2025,Liverpool
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2.0,0.0,Brentford,...,Match Report,,19.0,8.0,13.6,1.0,0,0,2025,Liverpool
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3.0,0.0,Manchester Utd,...,Match Report,,11.0,3.0,13.4,0.0,0,0,2025,Liverpool
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0.0,1.0,Nott'ham Forest,...,Match Report,,14.0,5.0,14.9,0.0,0,0,2025,Liverpool
5,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Home,W,3.0,0.0,Bournemouth,...,Match Report,,19.0,12.0,16.6,0.0,0,0,2025,Liverpool


In [4]:
matches.dtypes

date              object
time              object
comp              object
round             object
day               object
venue             object
result            object
gf               float64
ga               float64
opponent          object
xg               float64
xga              float64
poss             float64
attendance       float64
captain           object
formation         object
opp formation     object
referee           object
match report      object
notes            float64
sh               float64
sot              float64
dist             float64
fk               float64
pk                 int64
pkatt              int64
season             int64
team              object
dtype: object

In [5]:
del matches["comp"]
del matches["notes"]

In [6]:
matches.dtypes
#need to clean data

date              object
time              object
round             object
day               object
venue             object
result            object
gf               float64
ga               float64
opponent          object
xg               float64
xga              float64
poss             float64
attendance       float64
captain           object
formation         object
opp formation     object
referee           object
match report      object
sh               float64
sot              float64
dist             float64
fk               float64
pk                 int64
pkatt              int64
season             int64
team              object
dtype: object

In [7]:
matches["date"] = pd.to_datetime(matches["date"])

In [8]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [9]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [10]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

In [11]:
matches["day_code"] = matches["date"].dt.dayofweek

In [12]:
matches["target"] = (matches["result"] == "W").astype("int")
#0 for loss and draw, 1 for win

In [13]:
matches

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
0,2024-08-17,12:30,Matchweek 1,Sat,Away,W,2.0,0.0,Ipswich Town,2.6,...,0.0,0,0,2025,Liverpool,0,10,12,5,1
1,2024-08-25,16:30,Matchweek 2,Sun,Home,W,2.0,0.0,Brentford,2.5,...,1.0,0,0,2025,Liverpool,1,3,16,6,1
2,2024-09-01,16:00,Matchweek 3,Sun,Away,W,3.0,0.0,Manchester Utd,1.8,...,0.0,0,0,2025,Liverpool,0,16,16,6,1
3,2024-09-14,15:00,Matchweek 4,Sat,Home,L,0.0,1.0,Nott'ham Forest,0.9,...,0.0,0,0,2025,Liverpool,1,19,15,5,0
5,2024-09-21,15:00,Matchweek 5,Sat,Home,W,3.0,0.0,Bournemouth,2.0,...,0.0,0,0,2025,Liverpool,1,2,15,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2022-04-30,15:00,Matchweek 35,Sat,Away,L,0.0,2.0,Aston Villa,0.5,...,0.0,0,0,2022,Norwich City,0,1,15,5,0
39,2022-05-08,14:00,Matchweek 36,Sun,Home,L,0.0,4.0,West Ham,0.8,...,1.0,0,0,2022,Norwich City,1,24,14,6,0
40,2022-05-11,19:45,Matchweek 21,Wed,Away,L,0.0,3.0,Leicester City,1.1,...,0.0,0,0,2022,Norwich City,0,12,19,2,0
41,2022-05-15,14:00,Matchweek 37,Sun,Away,D,1.0,1.0,Wolves,1.1,...,0.0,0,0,2022,Norwich City,0,25,14,6,0


In [14]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=5000, min_samples_split=5, random_state=1)
train = matches[matches["date"] < '2022-01-01']
test = matches[matches["date"] > '2022-01-01']
predictors = ["venue_code", "opp_code", "hour", "day_code"]
rf.fit(train[predictors], train["target"])
preds = rf.predict(test[predictors])
from sklearn.metrics import accuracy_score
acc = accuracy_score(test["target"], preds)
acc

0.5848899958489

In [15]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=preds))
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1146,330
1,670,263


In [16]:
from sklearn.metrics import precision_score

precision_score(test["target"], preds)

0.44350758853288363

In [17]:
grouped_matches = matches.groupby("team")

In [18]:
group = grouped_matches.get_group("Manchester City").sort_values("date")

In [19]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [20]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2021-09-11,15:00,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,2.8,...,5,1,3.333333,0.333333,19.666667,6.000000,16.866667,0.666667,0.0,0.000000
6,2021-09-18,15:00,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,1.0,...,5,0,3.666667,0.000000,22.000000,7.333333,15.866667,0.333333,0.0,0.000000
8,2021-09-25,12:30,Matchweek 6,Sat,Away,W,1.0,0.0,Chelsea,1.7,...,5,1,2.000000,0.000000,22.000000,6.333333,15.166667,0.333333,0.0,0.000000
10,2021-10-03,16:30,Matchweek 7,Sun,Away,D,2.0,2.0,Liverpool,1.2,...,6,0,0.666667,0.000000,18.666667,4.000000,15.933333,0.333333,0.0,0.000000
11,2021-10-16,15:00,Matchweek 8,Sat,Home,W,2.0,0.0,Burnley,2.1,...,5,1,1.000000,0.666667,14.333333,2.333333,16.833333,0.666667,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30,2025-01-14,19:30,Matchweek 21,Tue,Away,D,2.0,2.0,Brentford,2.2,...,1,0,2.333333,0.666667,15.666667,5.333333,17.366667,0.666667,0.0,0.333333
31,2025-01-19,16:30,Matchweek 22,Sun,Away,W,6.0,0.0,Ipswich Town,3.0,...,6,1,2.666667,1.000000,15.000000,6.666667,16.666667,0.333333,0.0,0.000000
33,2025-01-25,17:30,Matchweek 23,Sat,Home,W,3.0,1.0,Chelsea,2.2,...,5,1,4.000000,1.000000,16.000000,8.000000,15.533333,0.000000,0.0,0.000000
35,2025-02-02,16:30,Matchweek 24,Sun,Away,L,1.0,5.0,Arsenal,0.8,...,6,0,3.666667,1.000000,17.666667,7.666667,15.700000,0.333333,0.0,0.000000


In [21]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,4,2021-09-11,15:00,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,2.7,...,5,1,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.0,0.0
Arsenal,5,2021-09-18,15:00,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,1.1,...,5,1,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.0,0.0
Arsenal,7,2021-09-26,16:30,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,1.1,...,6,1,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.0,0.0
Arsenal,8,2021-10-02,17:30,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,0.4,...,5,0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.0,0.0
Arsenal,9,2021-10-18,20:00,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,1.7,...,0,0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,23,2025-01-15,19:30,Matchweek 21,Wed,Away,L,0.0,3.0,Newcastle Utd,1.6,...,2,0,1.333333,1.666667,10.333333,4.000000,19.800000,0.666667,0.0,0.0
Wolverhampton Wanderers,24,2025-01-20,20:00,Matchweek 22,Mon,Away,L,1.0,3.0,Chelsea,0.8,...,0,0,0.666667,2.666667,12.333333,5.000000,17.566667,0.333333,0.0,0.0
Wolverhampton Wanderers,25,2025-01-25,15:00,Matchweek 23,Sat,Home,L,0.0,1.0,Arsenal,0.7,...,5,0,0.333333,3.000000,11.666667,5.333333,16.466667,0.000000,0.0,0.0
Wolverhampton Wanderers,26,2025-02-01,17:30,Matchweek 24,Sat,Home,W,2.0,0.0,Aston Villa,1.6,...,5,1,0.333333,2.333333,10.333333,5.000000,18.700000,0.000000,0.0,0.0


In [22]:
matches_rolling = matches_rolling.droplevel('team')

In [23]:
matches_rolling

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2021-09-11,15:00,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,2.7,...,5,1,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.0,0.0
5,2021-09-18,15:00,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,1.1,...,5,1,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.0,0.0
7,2021-09-26,16:30,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,1.1,...,6,1,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.0,0.0
8,2021-10-02,17:30,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,0.4,...,5,0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.0,0.0
9,2021-10-18,20:00,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,1.7,...,0,0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23,2025-01-15,19:30,Matchweek 21,Wed,Away,L,0.0,3.0,Newcastle Utd,1.6,...,2,0,1.333333,1.666667,10.333333,4.000000,19.800000,0.666667,0.0,0.0
24,2025-01-20,20:00,Matchweek 22,Mon,Away,L,1.0,3.0,Chelsea,0.8,...,0,0,0.666667,2.666667,12.333333,5.000000,17.566667,0.333333,0.0,0.0
25,2025-01-25,15:00,Matchweek 23,Sat,Home,L,0.0,1.0,Arsenal,0.7,...,5,0,0.333333,3.000000,11.666667,5.333333,16.466667,0.000000,0.0,0.0
26,2025-02-01,17:30,Matchweek 24,Sat,Home,W,2.0,0.0,Aston Villa,1.6,...,5,1,0.333333,2.333333,10.333333,5.000000,18.700000,0.000000,0.0,0.0


In [24]:
matches_rolling.index = range(matches_rolling.shape[0])

In [25]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] > '2022-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    error = precision_score(test["target"], preds)
    return combined, error

In [26]:
combined, prec = make_predictions(matches_rolling, predictors + new_cols)
acc = accuracy_score(test["target"], preds)
acc

0.5848899958489

In [27]:
prec

0.528046421663443

In [28]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [29]:
combined.head(10)

Unnamed: 0,actual,predicted,date,team,opponent,result
17,0,0,2022-01-23,Arsenal,Burnley,D
18,1,0,2022-02-10,Arsenal,Wolves,W
19,1,0,2022-02-19,Arsenal,Brentford,W
20,1,0,2022-02-24,Arsenal,Wolves,W
21,1,1,2022-03-06,Arsenal,Watford,W
22,1,1,2022-03-13,Arsenal,Leicester City,W
23,0,1,2022-03-16,Arsenal,Liverpool,L
24,1,0,2022-03-19,Arsenal,Aston Villa,W
25,0,0,2022-04-04,Arsenal,Crystal Palace,L
26,0,0,2022-04-09,Arsenal,Brighton,L


In [30]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {"Brighton and Hove Albion": "Brighton", "Manchester United": "Manchester Utd", "Newcastle United": "Newcastle Utd", "Tottenham Hotspur": "Tottenham", "West Ham United": "West Ham", "Wolverhampton Wanderers": "Wolves"} 
mapping = MissingDict(**map_values)

In [31]:
combined["new_team"] = combined["team"].map(mapping)

In [32]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [33]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,0,0,2022-01-23,Arsenal,Burnley,D,Arsenal,0,0,Burnley,Arsenal,D,Burnley
1,1,0,2022-02-10,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
2,1,0,2022-02-19,Arsenal,Brentford,W,Arsenal,0,0,Brentford,Arsenal,L,Brentford
3,1,0,2022-02-24,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
4,1,1,2022-03-06,Arsenal,Watford,W,Arsenal,0,0,Watford,Arsenal,L,Watford
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,0,0,2025-01-15,Wolverhampton Wanderers,Newcastle Utd,L,Wolves,1,1,Newcastle United,Wolves,W,Newcastle Utd
2236,0,0,2025-01-20,Wolverhampton Wanderers,Chelsea,L,Wolves,1,0,Chelsea,Wolves,W,Chelsea
2237,0,0,2025-01-25,Wolverhampton Wanderers,Arsenal,L,Wolves,1,0,Arsenal,Wolves,W,Arsenal
2238,1,0,2025-02-01,Wolverhampton Wanderers,Aston Villa,W,Wolves,0,0,Aston Villa,Wolves,L,Aston Villa


In [42]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] ==0)]["actual_x"].value_counts()

actual_x
1    229
0    181
Name: count, dtype: int64

In [49]:
new_data = matches[matches["date"] > '2022-01-01']

rolling_features = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
rolling_cols = [f"{c}_rolling" for c in rolling_features]

matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, rolling_features, rolling_cols))
matches_rolling = matches_rolling.droplevel("team")

new_data = matches_rolling[matches_rolling["date"] > '2022-01-01']

full_train_data = pd.concat([matches_rolling[matches_rolling["date"] < '2022-01-01'], new_data], ignore_index=True)

predictors = ["venue_code", "opp_code", "hour", "day_code"] + rolling_cols

# Retrain the model
rf = RandomForestClassifier(n_estimators=5000, min_samples_split=5, random_state=1)
rf.fit(full_train_data[predictors], full_train_data["target"])

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, rolling_features, rolling_cols))


In [68]:
# Define home team and opponent
home_team = "Manchester United"
opponent = "Wolves"

# Encode teams
home_team_code = matches[matches["team"] == home_team]["team"].astype("category").cat.codes.iloc[0]
opp_code = matches[matches["opponent"] == opponent]["opponent"].astype("category").cat.codes.iloc[0]
venue_code = matches[matches["team"] == home_team]["venue"].astype("category").cat.codes.iloc[0]

# Compute rolling averages (last 3 games)
rolling_features = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
rolling_cols = [f"{c}_rolling" for c in rolling_features]
home_team_matches = matches[matches["team"] == home_team].sort_values("date").tail(3)
rolling_avg_values = home_team_matches[rolling_features].mean().to_dict()

# Define match details with rolling averages
upcoming_match = pd.DataFrame({
    "venue_code": [venue_code],
    "opp_code": [opp_code],
    "hour": [9],  
    "day_code": [6],  
    **{col: [rolling_avg_values[col.replace("_rolling", "")]] for col in rolling_cols}  # Add rolling averages
})

# Predict result
prediction = rf.predict(upcoming_match)[0]
probs = rf.predict_proba(upcoming_match)[0]

# Output result
result = "Win" if prediction == 1 else "Draw/Loss"
print(f"{home_team} vs {opponent}")
print(f"Prediction: {result}")
print(f"Win Probability: {probs[1]:.2f}, Draw/Loss Probability: {probs[0]:.2f}")

Manchester United vs Wolves
Prediction: Draw/Loss
Win Probability: 0.34, Draw/Loss Probability: 0.66
