In [37]:
#Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier


In [38]:
#load in datasets
matches = pd.read_csv("epl_matches.csv", index_col=0) 
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2.0,1.0,Nott'ham Forest,...,Match Report,,15.0,7.0,19.1,0.0,0,0,2024,Arsenal
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1.0,0.0,Crystal Palace,...,Match Report,,13.0,2.0,16.4,0.0,1,1,2024,Arsenal
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2.0,2.0,Fulham,...,Match Report,,18.0,9.0,13.8,0.0,1,1,2024,Arsenal
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,...,Match Report,,17.0,5.0,15.0,0.0,0,0,2024,Arsenal
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,...,Match Report,,13.0,4.0,17.4,0.0,0,0,2024,Arsenal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,2019-04-13,12:30,Premier League,Matchweek 34,Sat,Away,L,0.0,4.0,Tottenham,...,Match Report,,7.0,1.0,18.9,1.0,0,0,2019,Huddersfield Town
36,2019-04-20,15:00,Premier League,Matchweek 35,Sat,Home,L,1.0,2.0,Watford,...,Match Report,,13.0,3.0,18.1,1.0,0,0,2019,Huddersfield Town
37,2019-04-26,20:00,Premier League,Matchweek 36,Fri,Away,L,0.0,5.0,Liverpool,...,Match Report,,5.0,1.0,21.6,0.0,0,0,2019,Huddersfield Town
38,2019-05-05,14:00,Premier League,Matchweek 37,Sun,Home,D,1.0,1.0,Manchester Utd,...,Match Report,,7.0,3.0,19.0,1.0,0,0,2019,Huddersfield Town


In [39]:
# Get the shape
matches.shape

(4366, 27)

In [40]:
# How many Matches each team has played. Keep in mind of relegation and promotion rules.
matches["team"].value_counts()

team
West Ham United             219
Arsenal                     218
Crystal Palace              218
Manchester City             218
Everton                     218
Tottenham Hotspur           218
Manchester United           218
Brighton and Hove Albion    218
Wolverhampton Wanderers     218
Newcastle United            218
Liverpool                   218
Chelsea                     217
Leicester City              190
Southampton                 190
Aston Villa                 181
Burnley                     181
Fulham                      143
Bournemouth                 142
Watford                     114
Leeds United                114
Brentford                   105
Sheffield United            104
Norwich City                 76
Nottingham Forest            67
West Bromwich Albion         38
Cardiff City                 38
Huddersfield Town            38
Luton Town                   29
Name: count, dtype: int64

In [41]:
# Datatype of each value
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk                int64
pkatt             int64
season            int64
team             object
dtype: object

In [42]:
#convert date to date-time
matches["date"] = pd.to_datetime(matches["date"])
matches.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                     float64
ga                     float64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                  float64
sh                     float64
sot                    float64
dist                   float64
fk                     float64
pk                       int64
pkatt                    int64
season                   int64
team                    object
dtype: object

In [43]:
# setting the venue code

matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches
# 0 = away 
# 1 = home

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,notes,sh,sot,dist,fk,pk,pkatt,season,team,venue_code
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2.0,1.0,Nott'ham Forest,...,,15.0,7.0,19.1,0.0,0,0,2024,Arsenal,1
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1.0,0.0,Crystal Palace,...,,13.0,2.0,16.4,0.0,1,1,2024,Arsenal,0
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2.0,2.0,Fulham,...,,18.0,9.0,13.8,0.0,1,1,2024,Arsenal,1
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,...,,17.0,5.0,15.0,0.0,0,0,2024,Arsenal,1
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,...,,13.0,4.0,17.4,0.0,0,0,2024,Arsenal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,2019-04-13,12:30,Premier League,Matchweek 34,Sat,Away,L,0.0,4.0,Tottenham,...,,7.0,1.0,18.9,1.0,0,0,2019,Huddersfield Town,0
36,2019-04-20,15:00,Premier League,Matchweek 35,Sat,Home,L,1.0,2.0,Watford,...,,13.0,3.0,18.1,1.0,0,0,2019,Huddersfield Town,1
37,2019-04-26,20:00,Premier League,Matchweek 36,Fri,Away,L,0.0,5.0,Liverpool,...,,5.0,1.0,21.6,0.0,0,0,2019,Huddersfield Town,0
38,2019-05-05,14:00,Premier League,Matchweek 37,Sun,Home,D,1.0,1.0,Manchester Utd,...,,7.0,3.0,19.0,1.0,0,0,2019,Huddersfield Town,1


In [44]:
# Create numeric codes for each unique 'opponent' value and store them in a new column 'opp_code'.
matches["opp_code"] = matches["opponent"].astype("category").cat.codes


In [45]:
# Extract the hour component from the 'time' column and store it as integers in a new column named 'hour'.
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sot,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2.0,1.0,Nott'ham Forest,...,7.0,19.1,0.0,0,0,2024,Arsenal,1,20,12
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1.0,0.0,Crystal Palace,...,2.0,16.4,0.0,1,1,2024,Arsenal,0,8,20
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2.0,2.0,Fulham,...,9.0,13.8,0.0,1,1,2024,Arsenal,1,10,15
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,...,5.0,15.0,0.0,0,0,2024,Arsenal,1,17,16
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,...,4.0,17.4,0.0,0,0,2024,Arsenal,0,9,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,2019-04-13,12:30,Premier League,Matchweek 34,Sat,Away,L,0.0,4.0,Tottenham,...,1.0,18.9,1.0,0,0,2019,Huddersfield Town,0,23,12
36,2019-04-20,15:00,Premier League,Matchweek 35,Sat,Home,L,1.0,2.0,Watford,...,3.0,18.1,1.0,0,0,2019,Huddersfield Town,1,24,15
37,2019-04-26,20:00,Premier League,Matchweek 36,Fri,Away,L,0.0,5.0,Liverpool,...,1.0,21.6,0.0,0,0,2019,Huddersfield Town,0,14,20
38,2019-05-05,14:00,Premier League,Matchweek 37,Sun,Home,D,1.0,1.0,Manchester Utd,...,3.0,19.0,1.0,0,0,2019,Huddersfield Town,1,17,14


In [46]:
# Add a new column 'day_code' to the 'matches' DataFrame, containing the day of the week from the 'date' column.
matches["day_code"] = matches["date"].dt.dayofweek
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2.0,1.0,Nott'ham Forest,...,19.1,0.0,0,0,2024,Arsenal,1,20,12,5
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1.0,0.0,Crystal Palace,...,16.4,0.0,1,1,2024,Arsenal,0,8,20,0
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2.0,2.0,Fulham,...,13.8,0.0,1,1,2024,Arsenal,1,10,15,5
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,...,15.0,0.0,0,0,2024,Arsenal,1,17,16,6
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,...,17.4,0.0,0,0,2024,Arsenal,0,9,16,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,2019-04-13,12:30,Premier League,Matchweek 34,Sat,Away,L,0.0,4.0,Tottenham,...,18.9,1.0,0,0,2019,Huddersfield Town,0,23,12,5
36,2019-04-20,15:00,Premier League,Matchweek 35,Sat,Home,L,1.0,2.0,Watford,...,18.1,1.0,0,0,2019,Huddersfield Town,1,24,15,5
37,2019-04-26,20:00,Premier League,Matchweek 36,Fri,Away,L,0.0,5.0,Liverpool,...,21.6,0.0,0,0,2019,Huddersfield Town,0,14,20,4
38,2019-05-05,14:00,Premier League,Matchweek 37,Sun,Home,D,1.0,1.0,Manchester Utd,...,19.0,1.0,0,0,2019,Huddersfield Town,1,17,14,6


In [47]:
# target will be if team won lost or drawn
conditions = [
    (matches["result"] == "W"),  # Win condition
    (matches["result"] == "L"),  # Lose condition
    (matches["result"] == "D")   # Draw condition
]

values = [1, -1, 0]  # 1 for win, -1 for lose, 0 for draw
matches["target"] = np.select(conditions, values, default=np.nan)
# Apply np.select to create the 'target' column based on the defined conditions and values
matches


Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2.0,1.0,Nott'ham Forest,...,0.0,0,0,2024,Arsenal,1,20,12,5,1.0
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1.0,0.0,Crystal Palace,...,0.0,1,1,2024,Arsenal,0,8,20,0,1.0
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2.0,2.0,Fulham,...,0.0,1,1,2024,Arsenal,1,10,15,5,0.0
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,...,0.0,0,0,2024,Arsenal,1,17,16,6,1.0
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,...,0.0,0,0,2024,Arsenal,0,9,16,6,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,2019-04-13,12:30,Premier League,Matchweek 34,Sat,Away,L,0.0,4.0,Tottenham,...,1.0,0,0,2019,Huddersfield Town,0,23,12,5,-1.0
36,2019-04-20,15:00,Premier League,Matchweek 35,Sat,Home,L,1.0,2.0,Watford,...,1.0,0,0,2019,Huddersfield Town,1,24,15,5,-1.0
37,2019-04-26,20:00,Premier League,Matchweek 36,Fri,Away,L,0.0,5.0,Liverpool,...,0.0,0,0,2019,Huddersfield Town,0,14,20,4,-1.0
38,2019-05-05,14:00,Premier League,Matchweek 37,Sun,Home,D,1.0,1.0,Manchester Utd,...,1.0,0,0,2019,Huddersfield Town,1,17,14,6,0.0


In [48]:
#random forest classifier
# Define the list of predictors including venue code, opponent code, hour, and day code.
predictors = ["venue_code", "opp_code", "hour", "day_code"]

# Define the features (predictors) and the target variable
X = matches[predictors]
y = matches["target"]

# Split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Initialize a Random Forest classifier with 50 trees, minimum samples split of 10, and a fixed random state.
rf_model = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

# Train the Random Forest classifier on the training data using specified predictors.
rf_model = rf_model.fit(X_train, y_train)

In [49]:
# Generate predictions using the trained Random Forest classifier on the test data using specified predictors.
y_preds = rf_model.predict(X_test)

In [50]:
# measurements of model accuracy
acc = accuracy_score(y_test, y_preds)
acc

0.44047619047619047

In [51]:
# Create a contingency table showing the counts of actual versus predicted labels
combined = pd.DataFrame({"actual": y_test, "prediction": y_preds})
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,-1.0,0.0,1.0
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.0,227,43,151
0.0,107,32,131
1.0,138,41,222


In [52]:
# Generate classification report
print(classification_report(y_test, y_preds))


              precision    recall  f1-score   support

        -1.0       0.48      0.54      0.51       421
         0.0       0.28      0.12      0.17       270
         1.0       0.44      0.55      0.49       401

    accuracy                           0.44      1092
   macro avg       0.40      0.40      0.39      1092
weighted avg       0.42      0.44      0.42      1092



In [53]:
# Group the matches DataFrame by the "team" column.
grouped_matches = matches.groupby("team")

In [54]:
# Get the group of matches specifically for the team "Liverpool".
group = grouped_matches.get_group("Liverpool")
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,...,0.0,0,0,2024,Liverpool,0,7,16,6,0.0
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,...,1.0,0,1,2024,Liverpool,1,2,15,5,1.0
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,...,1.0,0,0,2024,Liverpool,0,18,16,6,1.0
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,...,0.0,0,0,2024,Liverpool,1,1,14,6,1.0
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,...,0.0,0,0,2024,Liverpool,0,27,12,5,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,2019-04-14,16:30,Premier League,Matchweek 34,Sun,Home,W,2.0,0.0,Chelsea,...,0.0,0,0,2019,Liverpool,1,7,16,6,1.0
46,2019-04-21,16:00,Premier League,Matchweek 35,Sun,Away,W,2.0,0.0,Cardiff City,...,1.0,1,1,2019,Liverpool,0,6,16,6,1.0
47,2019-04-26,20:00,Premier League,Matchweek 36,Fri,Home,W,5.0,0.0,Huddersfield,...,0.0,0,0,2019,Liverpool,1,11,20,4,1.0
49,2019-05-04,19:45,Premier League,Matchweek 37,Sat,Away,W,3.0,2.0,Newcastle Utd,...,0.0,0,0,2019,Liverpool,0,18,19,5,1.0


In [55]:
# Calculate rolling averages for specified columns within a group.

def rolling_avg(group, cols, new_cols):
    """
    Args:
    - group: DataFrame group corresponding to matches of a particular team.
    - cols: List of columns for which rolling averages need to be calculated.
    - new_cols: List of new column names for the rolling average statistics.

    Returns:
    - DataFrame with the calculated rolling averages added as new columns.
    """
    # Sort the group by date to ensure chronological order.
    group = group.sort_values("date")

    # Calculate rolling averages for specified columns with a window size of 3.
    rolling_stats = group[cols].rolling(3, closed='left').mean()

    # Add the calculated rolling averages as new columns to the DataFrame.
    group[new_cols] = rolling_stats

    # Drop rows with NaN values resulting from the rolling window calculation.
    group= group.dropna(subset=new_cols)
    return group

In [56]:
# Define the columns for which rolling averages will be calculated.
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]

# Generate new column names for the corresponding rolling averages.
new_cols = [f"{c}_rolling" for c in cols]

new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [57]:
# Calculate rolling averages for cols in new_cols within the 'group' DataFrame.
rolling_avg(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
3,2018-09-01,12:30,Premier League,Matchweek 4,Sat,Away,W,2.0,1.0,Leicester City,...,5,1.0,2.333333,0.000000,18.333333,6.666667,16.800000,1.333333,0.333333,0.333333
4,2018-09-15,12:30,Premier League,Matchweek 5,Sat,Away,W,2.0,1.0,Tottenham,...,5,1.0,1.666667,0.333333,15.666667,5.333333,17.633333,0.666667,0.333333,0.333333
6,2018-09-22,15:00,Premier League,Matchweek 6,Sat,Home,W,3.0,0.0,Southampton,...,5,1.0,1.666667,0.666667,16.333333,7.000000,15.666667,0.666667,0.000000,0.000000
8,2018-09-29,17:30,Premier League,Matchweek 7,Sat,Away,D,1.0,1.0,Chelsea,...,5,0.0,2.333333,0.666667,13.000000,5.666667,15.133333,0.666667,0.000000,0.000000
10,2018-10-07,16:30,Premier League,Matchweek 8,Sun,Home,D,0.0,0.0,Manchester City,...,6,0.0,2.000000,0.666667,14.000000,5.666667,15.666667,1.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,2024-02-10,15:00,Premier League,Matchweek 24,Sat,Home,W,3.0,1.0,Burnley,...,5,1.0,3.000000,1.333333,17.000000,7.000000,19.600000,0.666667,0.000000,0.333333
37,2024-02-17,12:30,Premier League,Matchweek 25,Sat,Away,W,4.0,1.0,Brentford,...,5,1.0,2.666667,1.666667,20.666667,8.000000,18.333333,0.666667,0.000000,0.333333
38,2024-02-21,19:30,Premier League,Matchweek 26,Wed,Home,W,4.0,1.0,Luton Town,...,2,1.0,2.666667,1.666667,16.666667,6.333333,17.000000,0.333333,0.000000,0.000000
41,2024-03-02,15:00,Premier League,Matchweek 27,Sat,Away,W,1.0,0.0,Nott'ham Forest,...,5,1.0,3.666667,1.000000,23.000000,10.333333,16.066667,0.333333,0.000000,0.000000


In [58]:
# Define a lambda function to calculate rolling averages for each team's matches data.
# The lambda function takes each group 'x', representing matches data for a specific team,
# and applies the 'rolling_avg' function to calculate rolling averages for specified columns.
# 'x' represents a DataFrame containing the matches data for a particular team.
matches_rolling = matches.groupby("team").apply(lambda x: rolling_avg(x, cols, new_cols))
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,3,2018-09-02,13:30,Premier League,Matchweek 4,Sun,Away,W,3.0,2.0,Cardiff City,...,6,1.0,1.666667,2.000000,13.666667,6.333333,18.166667,0.333333,0.000000,0.000000
Arsenal,4,2018-09-15,15:00,Premier League,Matchweek 5,Sat,Away,W,2.0,1.0,Newcastle Utd,...,5,1.0,2.666667,2.000000,16.333333,9.000000,17.300000,0.333333,0.000000,0.000000
Arsenal,6,2018-09-23,16:00,Premier League,Matchweek 6,Sun,Home,W,2.0,0.0,Everton,...,6,1.0,2.666667,1.333333,15.333333,7.666667,17.333333,0.666667,0.000000,0.000000
Arsenal,8,2018-09-29,15:00,Premier League,Matchweek 7,Sat,Home,W,2.0,0.0,Watford,...,5,1.0,2.333333,1.000000,12.666667,6.000000,17.133333,0.333333,0.000000,0.000000
Arsenal,10,2018-10-07,12:00,Premier League,Matchweek 8,Sun,Away,W,5.0,1.0,Fulham,...,6,1.0,2.000000,0.333333,10.000000,3.000000,16.666667,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,28,2024-02-10,15:00,Premier League,Matchweek 24,Sat,Home,L,0.0,2.0,Brentford,...,5,-1.0,2.333333,2.000000,12.000000,4.666667,17.266667,1.000000,0.666667,0.666667
Wolverhampton Wanderers,29,2024-02-17,15:00,Premier League,Matchweek 25,Sat,Away,W,2.0,1.0,Tottenham,...,5,1.0,2.333333,2.666667,15.000000,5.333333,16.800000,1.333333,0.666667,0.666667
Wolverhampton Wanderers,30,2024-02-25,13:30,Premier League,Matchweek 26,Sun,Home,W,1.0,0.0,Sheffield Utd,...,6,1.0,2.000000,1.666667,14.000000,6.000000,16.566667,1.000000,0.333333,0.333333
Wolverhampton Wanderers,32,2024-03-02,15:00,Premier League,Matchweek 27,Sat,Away,L,0.0,3.0,Newcastle Utd,...,5,-1.0,1.000000,1.000000,14.000000,4.666667,15.900000,0.333333,0.000000,0.000000


In [59]:
# Drop the 'team' level from the index of the 'matches_rolling' DataFrame.
matches_rolling = matches_rolling.droplevel('team')
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
3,2018-09-02,13:30,Premier League,Matchweek 4,Sun,Away,W,3.0,2.0,Cardiff City,...,6,1.0,1.666667,2.000000,13.666667,6.333333,18.166667,0.333333,0.000000,0.000000
4,2018-09-15,15:00,Premier League,Matchweek 5,Sat,Away,W,2.0,1.0,Newcastle Utd,...,5,1.0,2.666667,2.000000,16.333333,9.000000,17.300000,0.333333,0.000000,0.000000
6,2018-09-23,16:00,Premier League,Matchweek 6,Sun,Home,W,2.0,0.0,Everton,...,6,1.0,2.666667,1.333333,15.333333,7.666667,17.333333,0.666667,0.000000,0.000000
8,2018-09-29,15:00,Premier League,Matchweek 7,Sat,Home,W,2.0,0.0,Watford,...,5,1.0,2.333333,1.000000,12.666667,6.000000,17.133333,0.333333,0.000000,0.000000
10,2018-10-07,12:00,Premier League,Matchweek 8,Sun,Away,W,5.0,1.0,Fulham,...,6,1.0,2.000000,0.333333,10.000000,3.000000,16.666667,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,2024-02-10,15:00,Premier League,Matchweek 24,Sat,Home,L,0.0,2.0,Brentford,...,5,-1.0,2.333333,2.000000,12.000000,4.666667,17.266667,1.000000,0.666667,0.666667
29,2024-02-17,15:00,Premier League,Matchweek 25,Sat,Away,W,2.0,1.0,Tottenham,...,5,1.0,2.333333,2.666667,15.000000,5.333333,16.800000,1.333333,0.666667,0.666667
30,2024-02-25,13:30,Premier League,Matchweek 26,Sun,Home,W,1.0,0.0,Sheffield Utd,...,6,1.0,2.000000,1.666667,14.000000,6.000000,16.566667,1.000000,0.333333,0.333333
32,2024-03-02,15:00,Premier League,Matchweek 27,Sat,Away,L,0.0,3.0,Newcastle Utd,...,5,-1.0,1.000000,1.000000,14.000000,4.666667,15.900000,0.333333,0.000000,0.000000


In [60]:
# Reset the index of the 'matches_rolling' DataFrame to range from 0 to the number of rows.
# This reindexes the DataFrame with a simple integer index, replacing the previous index.
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2018-09-02,13:30,Premier League,Matchweek 4,Sun,Away,W,3.0,2.0,Cardiff City,...,6,1.0,1.666667,2.000000,13.666667,6.333333,18.166667,0.333333,0.000000,0.000000
1,2018-09-15,15:00,Premier League,Matchweek 5,Sat,Away,W,2.0,1.0,Newcastle Utd,...,5,1.0,2.666667,2.000000,16.333333,9.000000,17.300000,0.333333,0.000000,0.000000
2,2018-09-23,16:00,Premier League,Matchweek 6,Sun,Home,W,2.0,0.0,Everton,...,6,1.0,2.666667,1.333333,15.333333,7.666667,17.333333,0.666667,0.000000,0.000000
3,2018-09-29,15:00,Premier League,Matchweek 7,Sat,Home,W,2.0,0.0,Watford,...,5,1.0,2.333333,1.000000,12.666667,6.000000,17.133333,0.333333,0.000000,0.000000
4,2018-10-07,12:00,Premier League,Matchweek 8,Sun,Away,W,5.0,1.0,Fulham,...,6,1.0,2.000000,0.333333,10.000000,3.000000,16.666667,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4271,2024-02-10,15:00,Premier League,Matchweek 24,Sat,Home,L,0.0,2.0,Brentford,...,5,-1.0,2.333333,2.000000,12.000000,4.666667,17.266667,1.000000,0.666667,0.666667
4272,2024-02-17,15:00,Premier League,Matchweek 25,Sat,Away,W,2.0,1.0,Tottenham,...,5,1.0,2.333333,2.666667,15.000000,5.333333,16.800000,1.333333,0.666667,0.666667
4273,2024-02-25,13:30,Premier League,Matchweek 26,Sun,Home,W,1.0,0.0,Sheffield Utd,...,6,1.0,2.000000,1.666667,14.000000,6.000000,16.566667,1.000000,0.333333,0.333333
4274,2024-03-02,15:00,Premier League,Matchweek 27,Sat,Away,L,0.0,3.0,Newcastle Utd,...,5,-1.0,1.000000,1.000000,14.000000,4.666667,15.900000,0.333333,0.000000,0.000000


In [61]:
def make_predictions(matches_rolling, predictors):
    """
    Make predictions using a random forest classifier.

    Args:
    - data: DataFrame containing the dataset.
    - predictors: List of column names used as predictors for the model.

    Returns:
    - combined: DataFrame containing actual and predicted labels.
    - cr: Classification report containing precision, recall, F1-score, and support.
    """

    # Split the dataset into features (X) and target variable (y)
    X = matches_rolling[predictors]
    y = matches_rolling["target"]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    # Train the random forest classifier on the training data
    rf_model.fit(X_train, y_train)
    # Generate predictions on the test data
    y_preds = rf_model.predict(X_test)
    # Create a DataFrame to combine actual and predicted labels
    combined = pd.DataFrame({"actual": y_test, "prediction": y_preds})
    # Generate a classification report
    cr = classification_report(y_test, y_preds)
    return combined, cr

In [62]:
# Make predictions using the random forest classifier on the 'matches_rolling' dataset,
# with predictors including both original features and new columns for rolling averages.
combined, cr = make_predictions(matches_rolling, predictors + new_cols)
print(cr)

              precision    recall  f1-score   support

        -1.0       0.47      0.64      0.54       406
         0.0       0.29      0.03      0.06       231
         1.0       0.49      0.57      0.53       432

    accuracy                           0.48      1069
   macro avg       0.42      0.41      0.38      1069
weighted avg       0.44      0.48      0.43      1069



In [63]:
# Merge additional match information from the 'matches_rolling' DataFrame into the 'combined' DataFrame.
# This additional information includes the date, team, opponent, and result of each match.
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
combined

Unnamed: 0,actual,prediction,date,team,opponent,result
1080,1.0,1.0,2019-02-02,Chelsea,Huddersfield,W
2461,1.0,1.0,2019-11-23,Manchester City,Chelsea,W
662,-1.0,-1.0,2019-04-23,Brighton and Hove Albion,Tottenham,L
2929,0.0,-1.0,2020-12-19,Newcastle United,Fulham,D
453,-1.0,-1.0,2020-03-07,Bournemouth,Liverpool,L
...,...,...,...,...,...,...
3767,1.0,1.0,2020-07-07,Watford,Norwich City,W
1147,0.0,-1.0,2020-12-28,Chelsea,Aston Villa,D
3386,1.0,-1.0,2021-01-04,Southampton,Liverpool,W
4078,-1.0,1.0,2019-01-02,Wolverhampton Wanderers,Crystal Palace,L


In [64]:
# Define a custom dictionary class to handle missing keys by returning the key itself.
# This class ensures that if a key is not found in the dictionary, it returns the key itself.
class MissingDict(dict):
    __missing__ = lambda self, key: key

# Define a dictionary containing mappings of team names to their shortened versions.
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchested Utd",
    "Newcastle United": "Newcastle Utd",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves"
}

# Create an instance of the MissingDict class with the provided mapping.
mapping = MissingDict(**map_values)

In [65]:
# Map team names in the 'team' column of the 'combined' DataFrame to their shortened versions
# using the custom dictionary 'mapping' created earlier, and store the results in a new column 'new_team'.
combined["new_team"] = combined ["team"].map(mapping)
combined

Unnamed: 0,actual,prediction,date,team,opponent,result,new_team
1080,1.0,1.0,2019-02-02,Chelsea,Huddersfield,W,Chelsea
2461,1.0,1.0,2019-11-23,Manchester City,Chelsea,W,Manchester City
662,-1.0,-1.0,2019-04-23,Brighton and Hove Albion,Tottenham,L,Brighton
2929,0.0,-1.0,2020-12-19,Newcastle United,Fulham,D,Newcastle Utd
453,-1.0,-1.0,2020-03-07,Bournemouth,Liverpool,L,Bournemouth
...,...,...,...,...,...,...,...
3767,1.0,1.0,2020-07-07,Watford,Norwich City,W,Watford
1147,0.0,-1.0,2020-12-28,Chelsea,Aston Villa,D,Chelsea
3386,1.0,-1.0,2021-01-04,Southampton,Liverpool,W,Southampton
4078,-1.0,1.0,2019-01-02,Wolverhampton Wanderers,Crystal Palace,L,Wolves


In [66]:
# Merge the 'combined' DataFrame with itself based on matching dates and team/opponent pairs.
# This operation matches rows where the 'date' column and 'new_team' column in the left DataFrame
# match the 'date' column and 'opponent' column in the right DataFrame, respectively.
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])
merged

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,prediction_y,team_y,opponent_y,result_y,new_team_y
0,0.0,-1.0,2020-12-19,Newcastle United,Fulham,D,Newcastle Utd,0.0,-1.0,Fulham,Newcastle Utd,D,Fulham
1,-1.0,-1.0,2018-12-22,Brighton and Hove Albion,Bournemouth,L,Brighton,1.0,-1.0,Bournemouth,Brighton,W,Bournemouth
2,1.0,-1.0,2023-12-02,Everton,Nott'ham Forest,W,Everton,-1.0,-1.0,Nottingham Forest,Everton,L,Nottingham Forest
3,1.0,1.0,2022-10-16,Liverpool,Manchester City,W,Liverpool,-1.0,1.0,Manchester City,Liverpool,L,Manchester City
4,1.0,-1.0,2023-02-25,Aston Villa,Everton,W,Aston Villa,-1.0,1.0,Everton,Aston Villa,L,Everton
...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,1.0,-1.0,2021-09-25,West Ham United,Leeds United,W,West Ham,-1.0,1.0,Leeds United,West Ham,L,Leeds United
217,1.0,1.0,2022-08-31,Liverpool,Newcastle Utd,W,Liverpool,-1.0,-1.0,Newcastle United,Liverpool,L,Newcastle Utd
218,-1.0,-1.0,2021-05-23,Fulham,Newcastle Utd,L,Fulham,1.0,1.0,Newcastle United,Fulham,W,Newcastle Utd
219,-1.0,-1.0,2022-04-10,Burnley,Norwich City,L,Burnley,1.0,-1.0,Norwich City,Burnley,W,Norwich City


In [67]:
# Filter the 'merged' DataFrame to select rows where one prediction is a win (1) and the other prediction is a loss (-1),
# then count the occurrences of actual labels for those filtered rows.
merged[(merged["prediction_x"] == 1) & (merged["prediction_y"] == -1)]["actual_x"].value_counts()

actual_x
 1.0    33
-1.0    15
 0.0    10
Name: count, dtype: int64