In [None]:
import pandas as pd
matches = pd.read_csv("matches.csv", index_col=0)

matches.dtypes

In [None]:
matches["date"] = pd.to_datetime(matches["date"]) # Convert existing date column to type datetime
matches.dtypes

In [14]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes # convert string to category then category to integers in new col
matches["opp_code"] = matches["opponent"].astype("category").cat.codes # same thing but for opponents
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int") # new col of time using the hour
matches["day_code"] = matches["date"].dt.dayofweek # convert day to a number (1 to 7) in new col
matches["target"] = (matches["result"] == "W").astype("int") # col that has a 1 for a win, 0 for draw/loss

In [19]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, min_samples_leaf=10, random_state=1)
train = matches[matches["date"] < '2023-01-01']
test = matches[matches["date"] > '2023-01-01']
predictors = ["venue_code", "opp_code", "hour", "day_code"]
rf.fit(train[predictors], train["target"])
preds = rf.predict(test[predictors])

In [None]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(test["target"], preds)
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

In [22]:
from sklearn.metrics import precision_score
precision_score(test["target"], preds)

0.47761194029850745

In [None]:
grouped_matches = matches.groupby("team")
group = grouped_matches.get_group("Manchester City")

In [26]:
def rolling_averages(group, cols, new_cols): # use past gameweek stats to help predict future gameweeks
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [34]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"] # diff stats from the games
new_cols = [f"{c}_rolling" for c in cols] # rolling avg cols for each stat
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
5,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Home,W,1,0,Arsenal,...,5,1,2.000000,2.333333,17.333333,4.666667,19.700000,1.333333,0.333333,0.333333
7,2020-10-24,12:30,Premier League,Matchweek 6,Sat,Away,D,1,1,West Ham,...,5,0,1.333333,2.000000,17.333333,3.666667,18.566667,0.666667,0.000000,0.000000
9,2020-10-31,12:30,Premier League,Matchweek 7,Sat,Away,W,1,0,Sheffield Utd,...,5,1,1.000000,0.666667,16.666667,4.333333,18.933333,0.666667,0.000000,0.000000
11,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Home,D,1,1,Liverpool,...,6,0,1.000000,0.333333,14.333333,6.666667,19.033333,1.000000,0.000000,0.000000
12,2020-11-21,17:30,Premier League,Matchweek 9,Sat,Away,L,0,2,Tottenham,...,5,0,1.000000,0.666667,12.000000,5.666667,20.000000,1.000000,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2023-05-06,15:00,Premier League,Matchweek 35,Sat,Home,W,2,1,Leeds United,...,5,1,3.000000,0.666667,13.666667,8.000000,15.433333,0.000000,0.333333,0.333333
54,2023-05-14,14:00,Premier League,Matchweek 36,Sun,Away,W,3,0,Everton,...,6,1,2.333333,0.666667,14.666667,7.000000,16.366667,0.666667,0.333333,0.666667
56,2023-05-21,16:00,Premier League,Matchweek 37,Sun,Home,W,1,0,Chelsea,...,6,1,2.666667,0.333333,14.000000,5.666667,18.100000,1.333333,0.000000,0.333333
57,2023-05-24,20:00,Premier League,Matchweek 32,Wed,Away,D,1,1,Brighton,...,2,0,2.000000,0.333333,13.666667,4.000000,18.933333,1.333333,0.000000,0.333333


In [35]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('team')
matches_rolling.index = range(matches_rolling.shape[0]) # make unique indices for each row in dataframe

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [36]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2023-01-01']
    test = data[data["date"] > '2023-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [45]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [46]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

maps_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
    "NottingHam Forest": "Nott'ham Forest",
    "West Bromwich Albion": "West Brom",
    "Sheffield United": "Sheffield Utd"
}
mapping = MissingDict(**maps_values)

In [49]:
combined["new_team"] = combined["team"].map(mapping)
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])
merged[(merged["predicted_x"] == 1) &  (merged["predicted_y"] == 0)]["actual_x"].value_counts()

actual_x
1    37
0    22
Name: count, dtype: int64

In [51]:
matches.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'fk', 'pk',
       'pkatt', 'season', 'team', 'venue_code', 'opp_code', 'hour', 'day_code',
       'target'],
      dtype='object')