In [1]:
import pandas as pd 
matches = pd.read_csv("matches.csv", index_col = 0)
matches.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,Match Report,,17.0,8.0,13.9,0.0,0,0,2024,Manchester City
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,Match Report,,14.0,4.0,17.9,0.0,0,0,2024,Manchester City
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,Match Report,,29.0,9.0,17.3,2.0,0,1,2024,Manchester City
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5,1,Fulham,...,Match Report,,6.0,4.0,14.8,0.0,1,1,2024,Manchester City
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3,1,West Ham,...,Match Report,,29.0,13.0,16.4,1.0,0,0,2024,Manchester City


In [2]:
matches["Date"] = pd.to_datetime(matches["Date"])
matches.dtypes


Date            datetime64[ns]
Time                    object
Comp                    object
Round                   object
Day                     object
Venue                   object
Result                  object
GF                       int64
GA                       int64
Opponent                object
xG                     float64
xGA                    float64
Poss                   float64
Attendance             float64
Captain                 object
Formation               object
Referee                 object
Match Report            object
Notes                  float64
Sh                     float64
SoT                    float64
Dist                   float64
FK                     float64
PK                       int64
PKatt                    int64
Season                   int64
Team                    object
dtype: object

In [3]:
matches["venue_code"] = matches["Venue"].astype("category").cat.codes

In [4]:
matches["opponent_code"] = matches["Opponent"].astype("category").cat.codes

In [5]:
matches["hour"] = matches["Time"].str.replace(":.+", "", regex=True).astype("int")

In [6]:
matches["day_code"] = matches["Date"].dt.dayofweek
matches["target"] = (matches["Result"] == "W").astype("int")

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
rf = RandomForestClassifier(n_estimators = 50, min_samples_split=10, random_state=1) 

In [9]:
train = matches[matches["Date"] < '2023-01-01']

In [10]:
test = matches[matches["Date"] > '2023-01-01']

In [11]:
predictors = ["venue_code", "opponent_code", "hour", "day_code"]

In [12]:
rf.fit(train[predictors], train["target"])

In [13]:
preds = rf.predict(test[predictors]) 

In [14]:
from sklearn.metrics import accuracy_score

In [15]:
acc = accuracy_score(test["target"], preds)

In [16]:
acc

0.6136363636363636

In [17]:
combined = pd.DataFrame(dict(actual=test["target"], prediction = preds))

In [18]:
pd.crosstab(index=combined["actual"], columns = combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,579,146
1,313,150


In [19]:
from sklearn.metrics import precision_score

In [20]:
precision_score(test["target"], preds)

0.5067567567567568

In [21]:
grouped_matches = matches.groupby("Team")

In [22]:
group = grouped_matches.get_group("Manchester City")

In [23]:
def rolling_averages(group, cols, new_cols): 
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(3, closed = 'left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset= new_cols) 
    return group
    

In [24]:
cols = ["GF", "GA", "Sh", "SoT", "Dist" , "FK", "PK", "PKatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [25]:
new_cols

['GF_rolling',
 'GA_rolling',
 'Sh_rolling',
 'SoT_rolling',
 'Dist_rolling',
 'FK_rolling',
 'PK_rolling',
 'PKatt_rolling']

In [26]:
rolling_averages(group, cols, new_cols) 

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,day_code,target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,...,5,1,3.000000,1.000000,17.666667,6.000000,17.466667,0.666667,0.333333,0.333333
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,...,2,1,3.666667,1.666667,19.333333,7.333333,15.933333,0.333333,0.000000,0.000000
6,2022-09-03,17:30,Premier League,Matchweek 6,Sat,Away,D,1,1,Aston Villa,...,5,0,4.333333,1.666667,18.666667,8.000000,15.033333,0.333333,0.000000,0.000000
9,2022-09-17,12:30,Premier League,Matchweek 8,Sat,Away,W,3,0,Wolves,...,5,1,3.666667,1.000000,16.000000,6.000000,15.233333,0.333333,0.000000,0.000000
10,2022-10-02,14:00,Premier League,Matchweek 9,Sun,Home,W,6,3,Manchester Utd,...,6,1,3.333333,0.333333,15.333333,6.666667,17.000000,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51,2024-04-28,16:30,Premier League,Matchweek 35,Sun,Away,W,2,0,Nott'ham Forest,...,6,1,4.333333,1.000000,22.666667,8.666667,16.666667,0.333333,0.333333,0.333333
52,2024-05-04,17:30,Premier League,Matchweek 36,Sat,Home,W,5,1,Wolves,...,5,1,3.666667,0.333333,20.333333,7.666667,17.533333,0.666667,0.333333,0.333333
53,2024-05-11,12:30,Premier League,Matchweek 37,Sat,Away,W,4,0,Fulham,...,5,1,3.666667,0.333333,14.333333,7.000000,17.000000,0.666667,0.666667,0.666667
54,2024-05-14,20:00,Premier League,Matchweek 34,Tue,Away,W,2,0,Tottenham,...,1,1,3.666667,0.333333,14.666667,7.666667,17.200000,0.333333,1.000000,1.000000


In [27]:
matches_rolling = matches.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))

  matches_rolling = matches.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [28]:
matches_rolling = matches_rolling.droplevel("Team")

In [29]:
matches_rolling

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,day_code,target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2,1,Fulham,...,5,1,3.000000,0.666667,14.333333,5.000000,14.133333,0.333333,0.000000,0.000000
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2,1,Aston Villa,...,2,1,3.000000,1.000000,18.333333,7.000000,14.433333,0.333333,0.000000,0.000000
5,2022-09-04,16:30,Premier League,Matchweek 6,Sun,Away,L,1,3,Manchester Utd,...,6,0,2.333333,0.666667,19.333333,7.333333,15.533333,0.666667,0.000000,0.000000
7,2022-09-18,12:00,Premier League,Matchweek 8,Sun,Away,W,3,0,Brentford,...,6,1,1.666667,1.666667,20.000000,6.333333,16.800000,1.000000,0.000000,0.000000
8,2022-10-01,12:30,Premier League,Matchweek 9,Sat,Home,W,3,1,Tottenham,...,5,1,2.000000,1.333333,17.000000,6.000000,17.700000,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40,2024-04-24,19:45,Premier League,Matchweek 29,Wed,Home,L,0,1,Bournemouth,...,2,0,1.000000,2.000000,9.666667,4.000000,20.533333,0.333333,0.333333,0.333333
41,2024-04-27,15:00,Premier League,Matchweek 35,Sat,Home,W,2,1,Luton Town,...,5,1,0.666667,1.666667,10.333333,3.333333,18.766667,0.000000,0.000000,0.000000
42,2024-05-04,17:30,Premier League,Matchweek 36,Sat,Away,L,1,5,Manchester City,...,5,0,0.666667,1.333333,11.000000,4.000000,19.666667,0.000000,0.000000,0.000000
43,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Home,L,1,3,Crystal Palace,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000


In [30]:
matches_rolling.index = range(matches_rolling.shape[0])

In [31]:
matches_rolling

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,day_code,target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
0,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2,1,Fulham,...,5,1,3.000000,0.666667,14.333333,5.000000,14.133333,0.333333,0.000000,0.000000
1,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2,1,Aston Villa,...,2,1,3.000000,1.000000,18.333333,7.000000,14.433333,0.333333,0.000000,0.000000
2,2022-09-04,16:30,Premier League,Matchweek 6,Sun,Away,L,1,3,Manchester Utd,...,6,0,2.333333,0.666667,19.333333,7.333333,15.533333,0.666667,0.000000,0.000000
3,2022-09-18,12:00,Premier League,Matchweek 8,Sun,Away,W,3,0,Brentford,...,6,1,1.666667,1.666667,20.000000,6.333333,16.800000,1.000000,0.000000,0.000000
4,2022-10-01,12:30,Premier League,Matchweek 9,Sat,Home,W,3,1,Tottenham,...,5,1,2.000000,1.333333,17.000000,6.000000,17.700000,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1446,2024-04-24,19:45,Premier League,Matchweek 29,Wed,Home,L,0,1,Bournemouth,...,2,0,1.000000,2.000000,9.666667,4.000000,20.533333,0.333333,0.333333,0.333333
1447,2024-04-27,15:00,Premier League,Matchweek 35,Sat,Home,W,2,1,Luton Town,...,5,1,0.666667,1.666667,10.333333,3.333333,18.766667,0.000000,0.000000,0.000000
1448,2024-05-04,17:30,Premier League,Matchweek 36,Sat,Away,L,1,5,Manchester City,...,5,0,0.666667,1.333333,11.000000,4.000000,19.666667,0.000000,0.000000,0.000000
1449,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Home,L,1,3,Crystal Palace,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000


In [32]:
def make_predictions(data, predictors): 
    train = data[data["Date"] < '2023-01-01']
    test = data[data["Date"] > '2023-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors]) 
    combined = pd.DataFrame(dict(actual = test["target"], predicted = preds), index = test.index) 
    precision = precision_score(test["target"], preds)
    return combined, precision 

In [33]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols) 

In [34]:
precision

0.5294117647058824

In [35]:
combined= combined.merge(matches_rolling[["Date", "Team", "Opponent", "Result"]], left_index = True, right_index = True)

In [36]:
combined

Unnamed: 0,actual,predicted,Date,Team,Opponent,Result
13,0,1,2023-01-03,Arsenal,Newcastle Utd,D
14,1,1,2023-01-15,Arsenal,Tottenham,W
15,1,1,2023-01-22,Arsenal,Manchester Utd,W
16,0,0,2023-02-04,Arsenal,Everton,L
17,0,1,2023-02-11,Arsenal,Brentford,D
...,...,...,...,...,...,...
1446,0,0,2024-04-24,Wolverhampton Wanderers,Bournemouth,L
1447,1,0,2024-04-27,Wolverhampton Wanderers,Luton Town,W
1448,0,0,2024-05-04,Wolverhampton Wanderers,Manchester City,L
1449,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L


In [37]:
class MissingDict(dict):
    __missing__ = lambda self, key: key    

map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United" : "Manchester Utd",
    "Newcastle United": "Newcastle Utd", 
    "Tottenham Hotspur": "Tottentam", 
    "West Ham United" : "West Ham", 
    "Wolverhampton Wanderers": "Wolves"
}
mapping = MissingDict(**map_values)

In [38]:
combined["new_team"] = combined["Team"].map(mapping)

In [39]:
merged = combined.merge(combined, left_on=["Date", "new_team"], right_on=["Date", "Opponent"])

In [40]:
merged

Unnamed: 0,actual_x,predicted_x,Date,Team_x,Opponent_x,Result_x,new_team_x,actual_y,predicted_y,Team_y,Opponent_y,Result_y,new_team_y
0,0,1,2023-01-03,Arsenal,Newcastle Utd,D,Arsenal,0,0,Newcastle United,Arsenal,D,Newcastle Utd
1,1,1,2023-01-15,Arsenal,Tottenham,W,Arsenal,0,0,Tottenham Hotspur,Arsenal,L,Tottentam
2,1,1,2023-01-22,Arsenal,Manchester Utd,W,Arsenal,0,0,Manchester United,Arsenal,L,Manchester Utd
3,0,0,2023-02-04,Arsenal,Everton,L,Arsenal,1,0,Everton,Arsenal,W,Everton
4,0,1,2023-02-11,Arsenal,Brentford,D,Arsenal,0,0,Brentford,Arsenal,D,Brentford
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1014,0,0,2024-04-24,Wolverhampton Wanderers,Bournemouth,L,Wolves,1,0,Bournemouth,Wolves,W,Bournemouth
1015,1,0,2024-04-27,Wolverhampton Wanderers,Luton Town,W,Wolves,0,0,Luton Town,Wolves,L,Luton Town
1016,0,0,2024-05-04,Wolverhampton Wanderers,Manchester City,L,Wolves,1,1,Manchester City,Wolves,W,Manchester City
1017,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L,Wolves,1,1,Crystal Palace,Wolves,W,Crystal Palace


In [41]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] ==0)]["actual_x"].value_counts()


actual_x
1    133
0    104
Name: count, dtype: int64