In [3]:
import pandas as pd

matches = pd.read_csv("matches.csv", index_col = 0)

matches["date"] = pd.to_datetime(matches["date"])

matches["venue_code"] = matches["venue"].astype("category").cat.codes 

matches["opp_code"] = matches["opponent"].astype("category").cat.codes

matches["hour"] = matches["time"].str.replace(":.+","",regex=True).astype("int")

matches["day_code"] = matches["date"].dt.dayofweek

matches["target"] = (matches["result"] == "W").astype("int")

In [4]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [6]:
train = matches[matches["date"] < '2022-01-01']

In [7]:
test = matches[matches["date"] > '2022-01-01']

In [10]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [11]:
rf.fit(train[predictors], train["target"])

In [12]:
preds = rf.predict(test[predictors])

In [13]:
from sklearn.metrics import accuracy_score

In [14]:
acc = accuracy_score(test["target"], preds)

In [15]:
acc

0.6056701030927835

In [16]:
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))

In [18]:
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,186,47
1,106,49


In [19]:
from sklearn.metrics import precision_score

In [20]:
precision_score(test["target"], preds)

0.5104166666666666

In [21]:
grouped_matches = matches.groupby("team")

In [22]:
group = grouped_matches.get_group("Manchester City")

In [23]:
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,1.0,0,0,2022,Manchester City,0,18,16,6,0
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,1.0,0,0,2022,Manchester City,1,15,15,5,1
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,0.0,0,0,2022,Manchester City,1,0,12,5,1
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,0.0,0,0,2022,Manchester City,0,10,15,5,1
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,1.0,0,0,2022,Manchester City,1,17,15,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54,2021-05-01,12:30,Premier League,Matchweek 34,Sat,Away,W,2,0,Crystal Palace,...,1.0,0,0,2021,Manchester City,0,6,12,5,1
56,2021-05-08,17:30,Premier League,Matchweek 35,Sat,Home,L,1,2,Chelsea,...,0.0,0,1,2021,Manchester City,1,5,17,5,0
57,2021-05-14,20:00,Premier League,Matchweek 36,Fri,Away,W,4,3,Newcastle Utd,...,1.0,0,0,2021,Manchester City,0,14,20,4,1
58,2021-05-18,19:00,Premier League,Matchweek 37,Tue,Away,L,2,3,Brighton,...,1.0,0,0,2021,Manchester City,0,3,19,1,0


In [24]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [25]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_roling" for c in cols]

In [26]:
new_cols

['gf_roling',
 'ga_roling',
 'sh_roling',
 'sot_roling',
 'dist_roling',
 'fk_roling',
 'pk_roling',
 'pkatt_roling']

In [27]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_roling,ga_roling,sh_roling,sot_roling,dist_roling,fk_roling,pk_roling,pkatt_roling
5,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Home,W,1,0,Arsenal,...,5,1,2.000000,2.333333,17.333333,4.666667,19.700000,1.333333,0.333333,0.333333
7,2020-10-24,12:30,Premier League,Matchweek 6,Sat,Away,D,1,1,West Ham,...,5,0,1.333333,2.000000,17.333333,3.666667,18.566667,0.666667,0.000000,0.000000
9,2020-10-31,12:30,Premier League,Matchweek 7,Sat,Away,W,1,0,Sheffield Utd,...,5,1,1.000000,0.666667,16.666667,4.333333,18.933333,0.666667,0.000000,0.000000
11,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Home,D,1,1,Liverpool,...,6,0,1.000000,0.333333,14.333333,6.666667,19.033333,1.000000,0.000000,0.000000
12,2020-11-21,17:30,Premier League,Matchweek 9,Sat,Away,L,0,2,Tottenham,...,5,0,1.000000,0.666667,12.000000,5.666667,20.000000,1.000000,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2022-04-30,17:30,Premier League,Matchweek 35,Sat,Away,W,4,0,Leeds United,...,5,1,3.333333,1.000000,16.000000,5.333333,17.466667,0.333333,0.333333,0.333333
54,2022-05-08,16:30,Premier League,Matchweek 36,Sun,Home,W,5,0,Newcastle Utd,...,6,1,4.000000,0.333333,18.666667,6.000000,16.933333,0.000000,0.333333,0.333333
55,2022-05-11,20:15,Premier League,Matchweek 33,Wed,Away,W,5,1,Wolves,...,2,1,4.666667,0.333333,20.000000,7.333333,15.600000,0.333333,0.333333,0.333333
56,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,2,2,West Ham,...,6,0,4.666667,0.333333,18.333333,6.666667,15.633333,0.333333,0.000000,0.000000


In [29]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [30]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_roling,ga_roling,sh_roling,sot_roling,dist_roling,fk_roling,pk_roling,pkatt_roling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,6,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2,1,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
Arsenal,7,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0,1,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
Arsenal,9,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0,1,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
Arsenal,11,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1,0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
Arsenal,13,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0,3,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,37,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Home,L,0,3,Brighton,...,5,0,0.666667,1.000000,8.666667,3.333333,17.400000,0.000000,0.000000,0.000000
Wolverhampton Wanderers,38,2022-05-07,15:00,Premier League,Matchweek 36,Sat,Away,D,2,2,Chelsea,...,5,0,0.000000,1.666667,8.666667,2.333333,18.633333,0.333333,0.000000,0.000000
Wolverhampton Wanderers,39,2022-05-11,20:15,Premier League,Matchweek 33,Wed,Home,L,1,5,Manchester City,...,2,0,0.666667,2.000000,11.666667,3.000000,17.766667,0.333333,0.000000,0.000000
Wolverhampton Wanderers,40,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Home,D,1,1,Norwich City,...,6,0,1.000000,3.333333,10.666667,2.666667,17.066667,0.333333,0.000000,0.000000


In [31]:
matches_rolling = matches_rolling.droplevel('team')

In [32]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_roling,ga_roling,sh_roling,sot_roling,dist_roling,fk_roling,pk_roling,pkatt_roling
6,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2,1,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
7,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0,1,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
9,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0,1,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
11,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1,0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
13,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0,3,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Home,L,0,3,Brighton,...,5,0,0.666667,1.000000,8.666667,3.333333,17.400000,0.000000,0.000000,0.000000
38,2022-05-07,15:00,Premier League,Matchweek 36,Sat,Away,D,2,2,Chelsea,...,5,0,0.000000,1.666667,8.666667,2.333333,18.633333,0.333333,0.000000,0.000000
39,2022-05-11,20:15,Premier League,Matchweek 33,Wed,Home,L,1,5,Manchester City,...,2,0,0.666667,2.000000,11.666667,3.000000,17.766667,0.333333,0.000000,0.000000
40,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Home,D,1,1,Norwich City,...,6,0,1.000000,3.333333,10.666667,2.666667,17.066667,0.333333,0.000000,0.000000


In [33]:
matches_rolling.index = range(matches_rolling.shape[0])

In [34]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_roling,ga_roling,sh_roling,sot_roling,dist_roling,fk_roling,pk_roling,pkatt_roling
0,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2,1,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
1,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0,1,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
2,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0,1,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
3,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1,0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
4,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0,3,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1443,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Home,L,0,3,Brighton,...,5,0,0.666667,1.000000,8.666667,3.333333,17.400000,0.000000,0.000000,0.000000
1444,2022-05-07,15:00,Premier League,Matchweek 36,Sat,Away,D,2,2,Chelsea,...,5,0,0.000000,1.666667,8.666667,2.333333,18.633333,0.333333,0.000000,0.000000
1445,2022-05-11,20:15,Premier League,Matchweek 33,Wed,Home,L,1,5,Manchester City,...,2,0,0.666667,2.000000,11.666667,3.000000,17.766667,0.333333,0.000000,0.000000
1446,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Home,D,1,1,Norwich City,...,6,0,1.000000,3.333333,10.666667,2.666667,17.066667,0.333333,0.000000,0.000000


In [35]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] > '2022-01-01']
    rf.fit(train[predictors], train["target"])
    pred = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [36]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [37]:
precision

0.4375

In [41]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [42]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
55,0,0,2022-01-23,Arsenal,Burnley,D
56,1,0,2022-02-10,Arsenal,Wolves,W
57,1,0,2022-02-19,Arsenal,Brentford,W
58,1,1,2022-02-24,Arsenal,Wolves,W
59,1,1,2022-03-06,Arsenal,Watford,W
...,...,...,...,...,...,...
1443,0,0,2022-04-30,Wolverhampton Wanderers,Brighton,L
1444,0,0,2022-05-07,Wolverhampton Wanderers,Chelsea,D
1445,0,0,2022-05-11,Wolverhampton Wanderers,Manchester City,L
1446,0,0,2022-05-15,Wolverhampton Wanderers,Norwich City,D


In [43]:
class MissingDict(dict):
    __missing__ = lambda self, key: key 

map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manuchest Utd",
    "Newcastle United": "Newcastle Utd",
    "Totthenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves"
}

mapping = MissingDict(**map_values)

In [44]:
mapping["West Ham United"]

'West Ham'

In [46]:
combined["new_team"] = combined["team"].map(mapping)

In [49]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [51]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()

actual_x
0    40
1    28
Name: count, dtype: int64

In [53]:
matches.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'fk', 'pk',
       'pkatt', 'season', 'team', 'venue_code', 'opp_code', 'hour', 'day_code',
       'target'],
      dtype='object')