In [2]:
import pandas as pd

In [4]:
matches = pd.read_csv("liverpool_data.csv", index_col = 0)

In [10]:
matches.dtypes

Date              object
Time              object
Round             object
Day               object
Venue             object
Result            object
GF                 int64
GA                 int64
Opponent          object
xG_x             float64
xGA              float64
Poss               int64
Attendance         int64
Captain           object
Formation         object
Opp Formation     object
Referee           object
Match Report      object
Notes            float64
Sh                 int64
SoT                int64
Dist             float64
FK                 int64
PK                 int64
PKatt              int64
xG_y             float64
dtype: object

In [28]:
matches["Date"] = pd.to_datetime(matches["Date"]) # converts "Date" column values to datetime data type

In [30]:
matches["venue_code"] = matches["Venue"].astype("category").cat.codes # gives away or home a value of 0 or 1

In [32]:
matches["opp_code"] = matches["Opponent"].astype("category").cat.codes # gives each unique opponent a number

In [38]:
matches["hour"] = matches["Time"].str.replace(":.+", "", regex=True).astype("int") # allows us to convert the time to an int

In [56]:
matches["day_code"] = matches["Date"].astype("category").dt.dayofweek # gets day of week property from the date

In [58]:
matches["target"] = (matches["Result"] == "W").astype("int") # converts win or loss/draw to numerical value

In [62]:
from sklearn.ensemble import RandomForestClassifier

In [66]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
# n_estimators=50 - number of individual decision trees to train
# min_samples_split=10 - number of samples in a leaf of the decision tree before splitting node
# random_state=1 - same results when ran multiple times

In [70]:
train = matches[matches["Date"] < '2024-01-01'] # training our data with first half of the season

In [72]:
test = matches[matches["Date"] > '2024-01-01'] # testing on the last half of the season

In [74]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [76]:
rf.fit(train[predictors], train["target"]) # train random forest model with predictors (defined above) to try to predict the target (win or loss/draw)

In [78]:
preds = rf.predict(test[predictors]) # generate predictions

In [80]:
from sklearn.metrics import accuracy_score

In [84]:
acc = accuracy_score(test["target"], preds) # passing in the actual data, to compare with the predictions, gives an accuracy score

In [86]:
acc # yields 72% accuracy, meaning our model made the right prediction 72% of the time

0.7222222222222222

In [90]:
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds)) 
# creating a new dataframe with the actual results and predictions so we can compare

In [94]:
pd.crosstab(index=combined["actual"], columns=combined["prediction"]) # consolodating our new dataframe
# results yield that 
# the model predicted 2 loss/draws, and was wrong 0 times
# the model predicted 11 wins, and was wrong 5 times

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2,5
1,0,11


In [96]:
from sklearn.metrics import precision_score

In [98]:
precision_score(test["target"], preds) # when we predicted a win, the team actually won 68.75% of the time

0.6875

In [102]:
# allows model to predict score based on 3 games before it
def rolling_averages(matches, cols, new_cols):
    matches = matches.sort_values("Date") # sort matches by dates
    rolling_stats = matches[cols].rolling(3, closed='left').mean() # take columns and compute rolling averages for the columns not including current week
    matches[new_cols] = rolling_stats # set the new columns as the rolling averages
    matches = matches.dropna(subset=new_cols) # drops missing values from dataframe (specifically in case where 3 games haven't been played yet)
    return matches

In [116]:
cols = ["GF", "GA", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"] # the columns we want rolling averages for
new_cols = [f"{c}_rolling" for c in cols] # adding the word "rolling" to the end of the names for the column titles

In [158]:
matches = rolling_averages(matches, cols, new_cols)

In [160]:
matches

Unnamed: 0,Date,Time,Round,Day,Venue,Result,GF,GA,Opponent,xG_x,...,day_code,target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
3,2023-09-03,14:00,Matchweek 4,Sun,Home,W,3,0,Aston Villa,2.5,...,6,1,2.0,1.0,15.666667,4.666667,17.266667,0.666667,0.0,0.333333
4,2023-09-16,12:30,Matchweek 5,Sat,Away,W,3,1,Wolves,2.5,...,5,1,2.666667,0.666667,17.0,5.666667,16.233333,0.666667,0.0,0.333333
5,2023-09-24,14:00,Matchweek 6,Sun,Home,W,3,1,West Ham,3.0,...,6,1,2.666667,0.666667,14.0,4.333333,15.9,0.333333,0.0,0.0
6,2023-09-30,17:30,Matchweek 7,Sat,Away,L,1,2,Tottenham,1.3,...,5,0,3.0,0.666667,18.0,5.0,16.6,0.666667,0.333333,0.333333
7,2023-10-08,14:00,Matchweek 8,Sun,Away,D,2,2,Brighton,2.3,...,6,0,2.333333,1.333333,16.333333,5.0,16.666667,0.666667,0.333333,0.333333
8,2023-10-21,12:30,Matchweek 9,Sat,Home,W,2,0,Everton,2.2,...,5,1,2.0,1.666667,15.333333,4.333333,17.666667,0.666667,0.666667,0.666667
9,2023-10-29,14:00,Matchweek 10,Sun,Home,W,3,0,Nott'ham Forest,3.2,...,6,1,1.666667,1.333333,16.666667,4.0,18.7,0.333333,0.666667,0.666667
10,2023-11-05,16:30,Matchweek 11,Sun,Away,D,1,1,Luton Town,2.6,...,6,0,2.333333,0.666667,19.666667,5.333333,20.233333,1.0,0.666667,0.666667
11,2023-11-12,14:00,Matchweek 12,Sun,Home,W,3,0,Brentford,1.7,...,6,1,2.0,0.333333,23.333333,6.333333,19.8,1.0,0.333333,0.333333
12,2023-11-25,12:30,Matchweek 13,Sat,Away,D,1,1,Manchester City,0.6,...,5,0,2.333333,0.333333,20.666667,8.0,18.666667,0.666667,0.0,0.0


In [162]:
matches.index = range(matches.shape[0]) # assign proper indices for matchweek

In [168]:
# this function does the predictions pretty similar to how they were made above, this time with the rolling averages included
def make_predictions(data, predictors):
    train = data[data["Date"] < '2024-01-01'] # training our data with first half of the season
    test = data[data["Date"] > '2024-01-01'] # testing on the last half of the season
    rf.fit(train[predictors], train["target"]) # train random forest model with predictors (defined above) to try to predict the target (win or loss/draw)
    preds = rf.predict(test[predictors]) # generate predictions
    combined = pd.DataFrame(dict(actual=test["target"], prediction=preds)) # creating a new dataframe with the actual results and predictions so we can compare
    precision= precision_score(test["target"], preds) # check precision
    return combined, precision

In [170]:
combined, precision = make_predictions(matches, predictors + new_cols)

In [172]:
precision

0.6363636363636364

In [174]:
combined

Unnamed: 0,actual,prediction
17,1,0
18,1,0
19,0,0
20,1,1
21,1,0
22,1,1
23,1,1
24,0,1
25,1,1
26,1,1


In [180]:
combined = combined.merge(matches[["Date", "Opponent", "Result"]], left_index=True, right_index=True) # adding data to our combined dataframe to get more information

In [182]:
combined

Unnamed: 0,actual,prediction,Date,Opponent,Result
17,1,0,2024-01-21,Bournemouth,W
18,1,0,2024-01-31,Chelsea,W
19,0,0,2024-02-04,Arsenal,L
20,1,1,2024-02-10,Burnley,W
21,1,0,2024-02-17,Brentford,W
22,1,1,2024-02-21,Luton Town,W
23,1,1,2024-03-02,Nott'ham Forest,W
24,0,1,2024-03-10,Manchester City,D
25,1,1,2024-03-31,Brighton,W
26,1,1,2024-04-04,Sheffield Utd,W


In [2]:
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

NameError: name 'pd' is not defined