<a href="https://colab.research.google.com/github/gkbichara/Match-Predictor/blob/main/Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# when reading in a CSV file, have to specify which col is the index
matches = pd.read_csv("/content/drive/My Drive/Prediction Project/matches.csv", index_col = 0)

In [3]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,Chelsea,...,Match Report,,11.0,5.0,19.1,0.0,0,0,2025,Manchester City
2,2024-08-24,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,1.0,Ipswich Town,...,Match Report,,13.0,4.0,17.8,1.0,1,1,2025,Manchester City
3,2024-08-31,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,1.0,West Ham,...,Match Report,,23.0,8.0,15.0,1.0,0,0,2025,Manchester City
0,2024-08-17,15:00,Premier League,Matchweek 1,Sat,Away,W,3.0,0.0,Everton,...,Match Report,,10.0,5.0,13.8,0.0,0,0,2025,Brighton and Hove Albion
1,2024-08-24,12:30,Premier League,Matchweek 2,Sat,Home,W,2.0,1.0,Manchester Utd,...,Match Report,,14.0,4.0,14.2,1.0,0,0,2025,Brighton and Hove Albion


In [4]:
matches.shape

(3100, 28)

In [5]:
# trying to figure out how many matches we have per team
matches["team"].value_counts()

Unnamed: 0_level_0,count
team,Unnamed: 1_level_1
Manchester City,155
Newcastle United,155
Everton,155
Crystal Palace,155
Wolverhampton Wanderers,155
Brighton and Hove Albion,155
Manchester United,155
Chelsea,155
West Ham United,155
Tottenham Hotspur,155


In [6]:
# selecting all rows where team is Arsenal
matches[matches["team"] == "Arsenal"]

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2024-08-17,15:00,Premier League,Matchweek 1,Sat,Home,W,2.0,0.0,Wolves,...,Match Report,,18.0,6.0,17.3,0.0,0,0,2025,Arsenal
1,2024-08-24,17:30,Premier League,Matchweek 2,Sat,Away,W,2.0,0.0,Aston Villa,...,Match Report,,9.0,4.0,18.8,0.0,0,0,2025,Arsenal
2,2024-08-31,12:30,Premier League,Matchweek 3,Sat,Home,D,1.0,1.0,Brighton,...,Match Report,,11.0,7.0,12.9,0.0,0,0,2025,Arsenal
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2.0,1.0,Nott'ham Forest,...,Match Report,,15.0,7.0,19.1,0.0,0,0,2024,Arsenal
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1.0,0.0,Crystal Palace,...,Match Report,,13.0,2.0,16.4,0.0,1,1,2024,Arsenal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2021-05-02,14:00,Premier League,Matchweek 34,Sun,Away,W,2.0,0.0,Newcastle Utd,...,Match Report,,19.0,4.0,17.0,1.0,0,0,2021,Arsenal
54,2021-05-09,19:00,Premier League,Matchweek 35,Sun,Home,W,3.0,1.0,West Brom,...,Match Report,,15.0,7.0,19.7,1.0,0,0,2021,Arsenal
55,2021-05-12,20:15,Premier League,Matchweek 36,Wed,Away,W,1.0,0.0,Chelsea,...,Match Report,,5.0,2.0,20.7,0.0,0,0,2021,Arsenal
56,2021-05-19,19:00,Premier League,Matchweek 37,Wed,Away,W,3.0,1.0,Crystal Palace,...,Match Report,,6.0,3.0,14.2,0.0,0,0,2021,Arsenal


In [7]:
# looking at data types in columns, as ML models only work with nums
matches.dtypes

Unnamed: 0,0
date,object
time,object
comp,object
round,object
day,object
venue,object
result,object
gf,float64
ga,float64
opponent,object


In [8]:
# converting date to a date-time data type
# does not create a new date, just overwrites the current one
matches["date"] = pd.to_datetime(matches["date"])

In [9]:
# first predictor we are creating
# turning home/away into a numeric column (0 when away or 1 when home) so that it is quantifiable
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [10]:
# doing the same thing again here, but for opponents
# each team has its own
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [11]:
# again, doing the same with time
# focusing on just the hour to see if teams perform better during a specific time of day
# removing minutes as it is useless,
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

In [12]:
# converting the day of the week to a number, Monday starts at 0
matches["day_code"] = matches["date"].dt.dayofweek

In [13]:
# creating a target of what we are trying to predict, if the team won or not
# if result is a loss or a draw, it is a 0, a win is 1
matches["target"] = (matches["result"] == "W").astype("int")

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
# creating a random forest classifier
# n_estimators is the number of decision trees we want to train
# min_samples_split is how many samples in a leaf of the decision tree
# random state means that we will get the same result when we rerun the decision tree

rf = RandomForestClassifier(n_estimators = 50, min_samples_split = 10, random_state = 1)

In [16]:
# training set of data is anything before 2024
train = matches[matches["date"] < '2024-01-01']

In [17]:
# rest set of data is anything after 2024
test = matches[matches["date"] >= '2024-01-01']

In [18]:
# things we will be fitting the data on
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [19]:
# fitting the model on said predictors
rf.fit(train[predictors], train["target"])

In [22]:
# using rf to train our model
rf.fit(train[predictors], train["target"])

In [23]:
# now we predict the results based on the predictores in the test data set
preds = rf.predict(test[predictors])

In [24]:
# package that looks at how accurate your model was
from sklearn.metrics import accuracy_score

In [25]:
# passing in our predicted vs actual results
acc = accuracy_score(test["target"], preds)

In [26]:
acc

0.6518691588785047

In [27]:
# our accuracy is 65%, meaning that we predict a team would win, they would actually win 65%

In [28]:
# now we check in which results does our model fall short
combined = pd.DataFrame(dict(actual=test["target"], predictions=preds ))

In [31]:
# creating a crosstable (a 2-way table)
# creating a table that shows the actual vs predicted values
pd.crosstab(index = combined["actual"], columns = combined["predictions"])

predictions,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,216,51
1,98,63


In [32]:
# last table shows us that when we predicted a team to lose or draw (0), we were correct 2x as many times as we were wrong
# when we predict a team to win, we're barely more correct with predicting correct results
# since the focus of this model is to predict wins, we need to revisit the model

In [33]:
# importing a different package
# the reason for this is to look at the accuracy of a specific instance, in this case it will be the accuracy of predicting wins
# this is needed because we don't know the precision of what we are actually trying to measure
from sklearn.metrics import precision_score

In [35]:
# goes down 10%
precision_score(test["target"], preds)

0.5526315789473685

In [36]:
# splitting into teams
grouped_matches = matches.groupby("team")

In [43]:
group = grouped_matches.get_group("Arsenal")

In [46]:
# closed = left excludes the current row when making predictions, so that data of the game you are predicting does not leak
# have to drop rows with missing values

def rolling_averages(group, cols, new_cols):
  """
    Calculate the rolling average for specified columns over a 3-row window, excluding the current row.

    Parameters:
    -----------
    group : pd.DataFrame
        Input DataFrame containing the data to compute rolling averages on. Must include a 'date' column.

    cols : list of str
        List of column names for which the rolling averages will be calculated.

    new_cols : list of str
        List of new column names where the computed rolling averages will be stored.

    Returns:
    --------
    pd.DataFrame
        DataFrame with new columns containing rolling averages, excluding rows where these values
        cannot be calculated.
    """
  group = group.sort_values("date")
  rolling_stats = group[cols].rolling(3, closed = 'left').mean()
  group[new_cols] = rolling_stats
  group = group.dropna(subset = new_cols)
  return group

In [47]:
# columns that we will be creating rolling averages for
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]

# new columns that will contain all the calculated rolling averages
new_cols = [f"{c}_rolling" for c in cols]

In [48]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
6,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
7,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
9,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
11,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
13,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50,2024-05-12,16:30,Premier League,Matchweek 37,Sun,Away,W,1.0,0.0,Manchester Utd,...,6,1,3.666667,0.666667,20.000000,7.000000,14.866667,0.333333,0.333333,0.333333
51,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Home,W,2.0,1.0,Everton,...,6,1,2.333333,0.666667,14.666667,5.333333,13.900000,0.333333,0.333333,0.333333
0,2024-08-17,15:00,Premier League,Matchweek 1,Sat,Home,W,2.0,0.0,Wolves,...,5,1,2.000000,0.333333,20.333333,6.000000,13.833333,0.000000,0.333333,0.333333
1,2024-08-24,17:30,Premier League,Matchweek 2,Sat,Away,W,2.0,0.0,Aston Villa,...,5,1,1.666667,0.333333,18.333333,5.333333,14.933333,0.000000,0.000000,0.000000


In [50]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [51]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,6,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
Arsenal,7,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
Arsenal,9,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
Arsenal,11,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
Arsenal,13,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,43,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Home,L,1.0,3.0,Crystal Palace,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000
Wolverhampton Wanderers,44,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Away,L,0.0,2.0,Liverpool,...,6,0,1.333333,3.000000,9.666667,4.666667,14.133333,0.000000,0.000000,0.000000
Wolverhampton Wanderers,0,2024-08-17,15:00,Premier League,Matchweek 1,Sat,Away,L,0.0,2.0,Arsenal,...,5,0,0.666667,3.333333,6.666667,3.666667,14.200000,0.333333,0.000000,0.000000
Wolverhampton Wanderers,1,2024-08-25,14:00,Premier League,Matchweek 2,Sun,Home,L,2.0,6.0,Chelsea,...,6,0,0.333333,2.333333,9.000000,4.333333,17.633333,0.666667,0.000000,0.000000


In [52]:
matches_rolling = matches_rolling.droplevel('team')

In [53]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
6,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
7,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
9,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
11,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
13,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Home,L,1.0,3.0,Crystal Palace,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000
44,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Away,L,0.0,2.0,Liverpool,...,6,0,1.333333,3.000000,9.666667,4.666667,14.133333,0.000000,0.000000,0.000000
0,2024-08-17,15:00,Premier League,Matchweek 1,Sat,Away,L,0.0,2.0,Arsenal,...,5,0,0.666667,3.333333,6.666667,3.666667,14.200000,0.333333,0.000000,0.000000
1,2024-08-25,14:00,Premier League,Matchweek 2,Sun,Home,L,2.0,6.0,Chelsea,...,6,0,0.333333,2.333333,9.000000,4.333333,17.633333,0.666667,0.000000,0.000000


In [54]:
# assigning indexes to rows so that each row has a unique index
matches_rolling.index = range(matches_rolling.shape[0])

In [64]:
def make_predictions(data, predictors):
  train = data[data["date"] < '2024-01-01']
  test = data[data["date"] >= '2024-01-01']
  rf.fit(train[predictors], train["target"])
  preds = rf.predict(test[predictors])
  combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
  precision = precision_score(test["target"], preds)
  return combined, precision

In [65]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [67]:
# increase of 7%
precision

0.6271186440677966

In [68]:
combined

Unnamed: 0,actual,predicted
131,1,0
132,1,1
133,1,0
134,1,1
135,1,1
...,...,...
3011,0,0
3012,0,0
3013,0,0
3014,0,0


In [69]:
# issue with combined table you cannot table where the mistakes are, just given indexes
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index = True, right_index = True)

In [70]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
131,1,0,2024-01-20,Arsenal,Crystal Palace,W
132,1,1,2024-01-30,Arsenal,Nott'ham Forest,W
133,1,0,2024-02-04,Arsenal,Liverpool,W
134,1,1,2024-02-11,Arsenal,West Ham,W
135,1,1,2024-02-17,Arsenal,Burnley,W
...,...,...,...,...,...,...
3011,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L
3012,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L
3013,0,0,2024-08-17,Wolverhampton Wanderers,Arsenal,L
3014,0,0,2024-08-25,Wolverhampton Wanderers,Chelsea,L


In [71]:
# our algorithm might make different predictions on the same game
# if Villa play City at home, it might predict Villa to beat City at home
# whereas when it predicts the game for City, they might predict them to beat despite being away
# we need to unify the predictions

In [73]:
class MissingDict(dict):
  __missing__ = lambda self, key: key

map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves"
}
mapping = MissingDict(**map_values)

In [74]:
combined["new_team"] = combined["team"].map(mapping)

In [75]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result,new_team
131,1,0,2024-01-20,Arsenal,Crystal Palace,W,Arsenal
132,1,1,2024-01-30,Arsenal,Nott'ham Forest,W,Arsenal
133,1,0,2024-02-04,Arsenal,Liverpool,W,Arsenal
134,1,1,2024-02-11,Arsenal,West Ham,W,Arsenal
135,1,1,2024-02-17,Arsenal,Burnley,W,Arsenal
...,...,...,...,...,...,...,...
3011,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L,Wolves
3012,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L,Wolves
3013,0,0,2024-08-17,Wolverhampton Wanderers,Arsenal,L,Wolves
3014,0,0,2024-08-25,Wolverhampton Wanderers,Chelsea,L,Wolves


In [76]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [78]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,1,0,2024-01-20,Arsenal,Crystal Palace,W,Arsenal,0,0,Crystal Palace,Arsenal,L,Crystal Palace
1,1,1,2024-01-30,Arsenal,Nott'ham Forest,W,Arsenal,0,0,Nottingham Forest,Arsenal,L,Nottingham Forest
2,1,0,2024-02-04,Arsenal,Liverpool,W,Arsenal,0,1,Liverpool,Arsenal,L,Liverpool
3,1,1,2024-02-11,Arsenal,West Ham,W,Arsenal,0,0,West Ham United,Arsenal,L,West Ham
4,1,1,2024-02-17,Arsenal,Burnley,W,Arsenal,0,0,Burnley,Arsenal,L,Burnley
...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L,Wolves,1,1,Crystal Palace,Wolves,W,Crystal Palace
379,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L,Wolves,1,1,Liverpool,Wolves,W,Liverpool
380,0,0,2024-08-17,Wolverhampton Wanderers,Arsenal,L,Wolves,1,1,Arsenal,Wolves,W,Arsenal
381,0,0,2024-08-25,Wolverhampton Wanderers,Chelsea,L,Wolves,1,0,Chelsea,Wolves,W,Chelsea


In [81]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()

Unnamed: 0_level_0,count
actual_x,Unnamed: 1_level_1
1,67
0,38


In [84]:
# 64% accuracy
67/105

0.638095238095238