In [4]:
import pandas as pd

In [21]:
matches = pd.read_csv("matches.csv", index_col = 0)

In [22]:
matches.head()

Unnamed: 0_level_0,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR
Season_End_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1993,1,1992-08-15,Coventry City,2,1,Middlesbrough,H
1993,1,1992-08-15,Leeds United,2,1,Wimbledon,H
1993,1,1992-08-15,Sheffield Utd,2,1,Manchester Utd,H
1993,1,1992-08-15,Crystal Palace,3,3,Blackburn,D
1993,1,1992-08-15,Arsenal,2,4,Norwich City,A


In [23]:
matches.shape

(12026, 7)

In [27]:
38 * 20 * 30

22800

In [29]:
matches["FTR"].value_counts()

FTR
H    5519
A    3410
D    3097
Name: count, dtype: int64

In [32]:
# checking the number of match weeks
matches["Wk"].value_counts()

Wk
1     313
30    313
23    313
24    313
28    313
25    313
26    313
27    313
29    313
33    313
21    313
31    313
32    313
34    313
35    313
36    313
37    313
38    313
2     313
22    313
20    313
10    313
3     313
4     313
5     313
6     313
7     313
8     313
19    313
9     313
11    313
12    313
13    313
14    313
15    313
16    313
17    313
18    313
40     33
39     33
41     33
42     33
Name: count, dtype: int64

In [33]:
matches.dtypes

Wk            int64
Date         object
Home         object
HomeGoals     int64
AwayGoals     int64
Away         object
FTR          object
dtype: object

In [34]:
matches["Date"] = pd.to_datetime(matches["Date"])

In [35]:
matches.dtypes

Wk                    int64
Date         datetime64[ns]
Home                 object
HomeGoals             int64
AwayGoals             int64
Away                 object
FTR                  object
dtype: object

In [36]:
matches["Home_adv_code"] = matches["Home"].astype("category").cat.codes

In [38]:
matches["Away_adv_code"] = matches["Away"].astype("category").cat.codes

In [40]:
matches["day_code"] = matches["Date"].dt.dayofweek

In [44]:
matches["target"] = (matches["FTR"] == "H").astype("int")

In [46]:
# machine learning model
from sklearn.ensemble import RandomForestClassifier

In [47]:
rf  = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [49]:
# splitting the training data and test 
training = matches[matches["Date"] < '2023-01-01']

In [50]:
# testing data
testing  = matches[matches["Date"] > '2023-01-01']

In [51]:
predictors = ["Home_adv_code", "Away_adv_code", "day_code"]

In [99]:
rf.fit(training[predictors], training["target"])

In [100]:
newPredictors = rf.predict(testing[predictors])

In [101]:
from sklearn.metrics import accuracy_score

In [102]:
acc = accuracy_score(testing["target"], newPredictors)

In [103]:
# predicting the accuracy of the intial model 
acc

0.5841121495327103

In [104]:
combined = pd.DataFrame(dict(actual=testing["target"], prediction=newPredictors))

In [105]:
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,75,32
1,57,50


In [106]:
from sklearn.metrics import precision_score

In [107]:
precision_score(testing["target"], newPredictors)

0.6097560975609756

In [108]:
group_matches = matches.groupby("Home")

In [109]:
group = group_matches.get_group("Liverpool")

In [110]:
group

Unnamed: 0_level_0,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR,Home_adv_code,Away_adv_code,day_code,target
Season_End_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1993,2,1992-08-19,Liverpool,2,1,Sheffield Utd,H,25,36,2,1
1993,3,1992-08-23,Liverpool,0,2,Arsenal,A,25,0,6,0
1993,6,1992-09-01,Liverpool,1,1,Southampton,D,25,38,1,0
1993,7,1992-09-05,Liverpool,2,1,Chelsea,H,25,14,5,1
1993,10,1992-09-26,Liverpool,2,3,Wimbledon,A,25,48,5,0
...,...,...,...,...,...,...,...,...,...,...,...
2023,32,2023-04-22,Liverpool,3,2,Nott'ham Forest,H,25,31,5,1
2023,34,2023-04-30,Liverpool,4,3,Tottenham,H,25,43,6,1
2023,28,2023-05-03,Liverpool,1,0,Fulham,H,25,19,2,1
2023,35,2023-05-06,Liverpool,1,0,Brentford,H,25,9,5,1


In [111]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(3, closed="left").mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [112]:
cols = ["HomeGoals", "AwayGoals"]
new_cols = [f"{c}_rolling" for c in cols]

In [113]:
new_cols

['HomeGoals_rolling', 'AwayGoals_rolling']

In [114]:
rolling_averages(group, cols, new_cols)

Unnamed: 0_level_0,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR,Home_adv_code,Away_adv_code,day_code,target,HomeGoals_rolling,AwayGoals_rolling
Season_End_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1993,7,1992-09-05,Liverpool,2,1,Chelsea,H,25,14,5,1,1.000000,1.333333
1993,10,1992-09-26,Liverpool,2,3,Wimbledon,A,25,48,5,0,1.000000,1.333333
1993,11,1992-10-03,Liverpool,1,0,Sheffield Weds,H,25,37,5,1,1.666667,1.666667
1993,13,1992-10-25,Liverpool,4,1,Norwich City,H,25,30,6,1,1.666667,1.333333
1993,15,1992-11-07,Liverpool,4,1,Middlesbrough,H,25,28,5,1,2.333333,1.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,32,2023-04-22,Liverpool,3,2,Nott'ham Forest,H,25,31,5,1,3.666667,0.666667
2023,34,2023-04-30,Liverpool,4,3,Tottenham,H,25,43,6,1,4.000000,1.333333
2023,28,2023-05-03,Liverpool,1,0,Fulham,H,25,19,2,1,3.000000,2.333333
2023,35,2023-05-06,Liverpool,1,0,Brentford,H,25,9,5,1,2.666667,1.666667


In [115]:
matches_rolling = matches.groupby("Home").apply(lambda x: rolling_averages(x, cols, new_cols))

In [124]:
matches_rolling

Unnamed: 0,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR,Home_adv_code,Away_adv_code,day_code,target,HomeGoals_rolling,AwayGoals_rolling
0,8,1992-09-12,Arsenal,0,1,Blackburn,A,0,4,5,0,2.000000,1.666667
1,10,1992-09-28,Arsenal,1,0,Manchester City,H,0,26,0,1,1.333333,0.666667
2,11,1992-10-03,Arsenal,2,1,Chelsea,H,0,14,5,1,1.000000,0.666667
3,13,1992-10-24,Arsenal,2,0,Everton,H,0,18,5,1,1.000000,0.666667
4,15,1992-11-07,Arsenal,3,0,Coventry City,H,0,15,5,1,1.666667,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11871,30,2023-04-08,Wolves,1,0,Chelsea,H,49,14,5,1,1.000000,1.666667
11872,31,2023-04-15,Wolves,2,0,Brentford,H,49,9,5,1,1.333333,1.333333
11873,33,2023-04-25,Wolves,2,0,Crystal Palace,H,49,16,1,1,1.666667,1.333333
11874,35,2023-05-06,Wolves,1,0,Aston Villa,H,49,1,5,1,1.666667,0.000000


In [126]:
matches_rolling.index = range(matches_rolling.shape[0])

In [127]:
matches_rolling

Unnamed: 0,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR,Home_adv_code,Away_adv_code,day_code,target,HomeGoals_rolling,AwayGoals_rolling
0,8,1992-09-12,Arsenal,0,1,Blackburn,A,0,4,5,0,2.000000,1.666667
1,10,1992-09-28,Arsenal,1,0,Manchester City,H,0,26,0,1,1.333333,0.666667
2,11,1992-10-03,Arsenal,2,1,Chelsea,H,0,14,5,1,1.000000,0.666667
3,13,1992-10-24,Arsenal,2,0,Everton,H,0,18,5,1,1.000000,0.666667
4,15,1992-11-07,Arsenal,3,0,Coventry City,H,0,15,5,1,1.666667,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11871,30,2023-04-08,Wolves,1,0,Chelsea,H,49,14,5,1,1.000000,1.666667
11872,31,2023-04-15,Wolves,2,0,Brentford,H,49,9,5,1,1.333333,1.333333
11873,33,2023-04-25,Wolves,2,0,Crystal Palace,H,49,16,1,1,1.666667,1.333333
11874,35,2023-05-06,Wolves,1,0,Aston Villa,H,49,1,5,1,1.666667,0.000000


In [133]:
def make_predictions(data, predictors):
    training = data[data["Date"] < '2023-01-01']
    testing  = data[data["Date"] > '2023-01-01']
    rf.fit(training[predictors], training["target"])
    preds = rf.predict(testing[predictors])
    combined = pd.DataFrame(dict(actual=testing["target"], predicted=preds), index=testing.index)
    precision = precision_score(testing["target"], preds)
    return combined,precision

In [136]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [137]:
precision

0.6025641025641025

In [139]:
combined.head()


Unnamed: 0,actual,predicted
580,0,1
581,1,1
582,0,1
583,0,0
584,1,0
