In [1]:
import pandas as pd

In [2]:
#reading the dataset
matches = pd.read_csv("La_liga_matches.csv", index_col=0)

In [3]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,referee,match report,notes,sh,sot,dist,pk,pkatt,season,team
0,2023-08-12,21:30,La Liga,Matchweek 1,Sat,Away,W,2.0,0.0,Athletic Club,...,Jesús Gil,Match Report,,14.0,8.0,16.0,0,0,2023,Real Madrid
1,2023-08-19,19:30,La Liga,Matchweek 2,Sat,Away,W,3.0,1.0,Almería,...,José Sánchez,Match Report,,25.0,9.0,17.0,0,0,2023,Real Madrid
2,2023-08-25,21:30,La Liga,Matchweek 3,Fri,Away,W,1.0,0.0,Celta Vigo,...,Isidro Díaz de Mera,Match Report,,9.0,2.0,19.4,0,1,2023,Real Madrid
3,2023-09-02,16:15,La Liga,Matchweek 4,Sat,Home,W,2.0,1.0,Getafe,...,Mario Melero,Match Report,,26.0,12.0,17.7,0,0,2023,Real Madrid
4,2023-09-17,21:00,La Liga,Matchweek 5,Sun,Home,W,2.0,1.0,Real Sociedad,...,César Soto,Match Report,,17.0,8.0,15.9,0,0,2023,Real Madrid


In [4]:
matches.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'pk', 'pkatt',
       'season', 'team'],
      dtype='object')

In [5]:
matches.shape

(2020, 26)

In [6]:
matches["team"].value_counts().sort_index()

Alaves              63
Almeria             63
Athletic Club      101
Atletico Madrid    101
Barcelona          101
Cadiz              101
Celta Vigo         101
Elche               76
Espanyol            76
Getafe             101
Girona              63
Granada             63
Las Palmas          25
Levante             38
Mallorca           101
Osasuna            101
Rayo Vallecano     101
Real Betis         101
Real Madrid        101
Real Sociedad      101
Sevilla            101
Valencia           101
Valladolid          38
Villarreal         101
Name: team, dtype: int64

In [7]:
matches["round"].value_counts()

Matchweek 1     60
Matchweek 14    60
Matchweek 25    60
Matchweek 24    60
Matchweek 23    60
Matchweek 20    60
Matchweek 22    60
Matchweek 2     60
Matchweek 19    60
Matchweek 18    60
Matchweek 17    60
Matchweek 16    60
Matchweek 15    60
Matchweek 21    60
Matchweek 13    60
Matchweek 7     60
Matchweek 12    60
Matchweek 3     60
Matchweek 4     60
Matchweek 6     60
Matchweek 5     60
Matchweek 8     60
Matchweek 9     60
Matchweek 10    60
Matchweek 11    60
Matchweek 32    40
Matchweek 37    40
Matchweek 36    40
Matchweek 35    40
Matchweek 34    40
Matchweek 33    40
Matchweek 27    40
Matchweek 31    40
Matchweek 30    40
Matchweek 29    40
Matchweek 28    40
Matchweek 26    40
Matchweek 38    40
Name: round, dtype: int64

In [8]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
pk                int64
pkatt             int64
season            int64
team             object
dtype: object

Creating Predictors:

In [9]:
matches["date"] = pd.to_datetime(matches["date"]) #convert date type

In [10]:
#for showing if team played in home or away (0 for away, 1 for home)
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [11]:
#for showing if opponent team played in home or away (0 for away, 1 for home)
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [12]:
#for showing at what hour
matches["hour"] = matches["time"].str.replace(":.+","",regex=True).astype("int")

In [13]:
#represent days with a number (Monday as 0,Tuesday as 1,Saturday as 5 etc.)
matches["day_code"] = matches["date"].dt.dayofweek

In [14]:
#0 for team loss or draw, 1 for win
matches["target"] = (matches["result"]=="W").astype("int")

In [15]:
#for simplicity we take loss and draw as 0 and win as 1
#we can also take a scenario where loss is -1, draw is 0 and win is 1
#matches["target"] = matches["result"].apply(lambda x: 1 if x == "W" else (0 if x == "D" else -1))

Using Random Forest Model

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [17]:
rf = RandomForestClassifier(n_estimators = 100, min_samples_split=15, random_state=42)

In [18]:
train = matches[matches["date"] < '2023-09-10']

In [19]:
test = matches[matches["date"] > '2023-09-10']

In [20]:
train.shape

(1598, 31)

In [21]:
test.shape

(422, 31)

In [22]:
predictors=["venue_code","opp_code","hour","day_code"]

In [23]:
rf.fit(train[predictors],train["target"])

In [24]:
preds=rf.predict(test[predictors])

In [25]:
acc = accuracy_score(test["target"],preds)

In [26]:
acc

0.6516587677725119

In [27]:
combined = pd.DataFrame(dict(actual=test["target"],prediction=preds))

In [28]:
combined

Unnamed: 0,actual,prediction
4,1,0
6,0,0
7,1,1
8,1,0
10,1,1
...,...,...
22,0,0
23,0,1
24,0,0
25,0,0


In [29]:
pd.crosstab(index=combined["actual"],columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,218,62
1,85,57


In [30]:
precision_score(test["target"],preds)

0.4789915966386555

Improving Precision score for the Random Forest Model

In [31]:
grouped_matches = matches.groupby("team")

In [32]:
#calculates rolling averages for specified columns within each group
def rolling_averages(group,cols,new_cols):
  group=group.sort_values("date")
  rolling_stats = group[cols].rolling(4).mean()
  group[new_cols]=rolling_stats
  group=group.dropna(subset=new_cols)
  return group

In [33]:
#taking more predictors from the dataset
cols=["gf","ga","sh","sot","dist","pk","pkatt","xg","xga","poss","attendance"]
new_cols=[f"{c}_rolling" for c in cols]

In [34]:
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'pk_rolling',
 'pkatt_rolling',
 'xg_rolling',
 'xga_rolling',
 'poss_rolling',
 'attendance_rolling']

In [35]:
#grouped data by the "team" column, and then applying the function rolling_averages to each group
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [36]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,ga_rolling,sh_rolling,sot_rolling,dist_rolling,pk_rolling,pkatt_rolling,xg_rolling,xga_rolling,poss_rolling,attendance_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alaves,3,2021-09-18,21:00,La Liga,Matchweek 5,Sat,Home,L,0.0,2.0,Osasuna,...,2.50,9.00,3.00,18.875,0.25,0.25,0.775,1.350,48.50,6416.75
Alaves,4,2021-09-22,19:30,La Liga,Matchweek 6,Wed,Away,L,0.0,1.0,Espanyol,...,1.75,8.25,2.75,18.000,0.00,0.00,0.475,1.350,49.75,8988.25
Alaves,5,2021-09-25,14:00,La Liga,Matchweek 7,Sat,Home,W,1.0,0.0,Atlético Madrid,...,1.50,7.75,2.00,18.275,0.00,0.00,0.525,1.350,45.75,10723.25
Alaves,6,2021-10-01,21:00,La Liga,Matchweek 8,Fri,Away,L,0.0,1.0,Athletic Club,...,1.00,9.75,2.75,16.950,0.00,0.00,0.650,1.450,44.25,15232.75
Alaves,7,2021-10-18,19:00,La Liga,Matchweek 9,Mon,Home,L,0.0,1.0,Betis,...,0.75,11.25,2.50,15.225,0.00,0.00,0.750,1.450,39.75,15745.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Villarreal,29,2024-01-20,16:15,La Liga,Matchweek 21,Sat,Home,D,1.0,1.0,Mallorca,...,2.25,13.50,4.75,15.875,0.25,0.25,1.700,1.325,50.25,25156.00
Villarreal,30,2024-01-27,18:30,La Liga,Matchweek 22,Sat,Away,W,5.0,3.0,Barcelona,...,2.50,13.00,4.50,16.050,0.00,0.00,1.625,1.550,45.50,33041.50
Villarreal,31,2024-02-04,14:00,La Liga,Matchweek 23,Sun,Home,D,0.0,0.0,Cádiz,...,1.75,15.75,4.50,16.825,0.00,0.00,1.800,1.025,46.50,26094.00
Villarreal,32,2024-02-10,14:00,La Liga,Matchweek 24,Sat,Away,D,1.0,1.0,Alavés,...,1.25,14.75,4.50,16.925,0.00,0.00,1.600,1.275,47.25,23603.50


In [37]:
#dropping the team column from the dataframe
matches_rolling = matches_rolling.droplevel('team')

In [38]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,ga_rolling,sh_rolling,sot_rolling,dist_rolling,pk_rolling,pkatt_rolling,xg_rolling,xga_rolling,poss_rolling,attendance_rolling
3,2021-09-18,21:00,La Liga,Matchweek 5,Sat,Home,L,0.0,2.0,Osasuna,...,2.50,9.00,3.00,18.875,0.25,0.25,0.775,1.350,48.50,6416.75
4,2021-09-22,19:30,La Liga,Matchweek 6,Wed,Away,L,0.0,1.0,Espanyol,...,1.75,8.25,2.75,18.000,0.00,0.00,0.475,1.350,49.75,8988.25
5,2021-09-25,14:00,La Liga,Matchweek 7,Sat,Home,W,1.0,0.0,Atlético Madrid,...,1.50,7.75,2.00,18.275,0.00,0.00,0.525,1.350,45.75,10723.25
6,2021-10-01,21:00,La Liga,Matchweek 8,Fri,Away,L,0.0,1.0,Athletic Club,...,1.00,9.75,2.75,16.950,0.00,0.00,0.650,1.450,44.25,15232.75
7,2021-10-18,19:00,La Liga,Matchweek 9,Mon,Home,L,0.0,1.0,Betis,...,0.75,11.25,2.50,15.225,0.00,0.00,0.750,1.450,39.75,15745.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29,2024-01-20,16:15,La Liga,Matchweek 21,Sat,Home,D,1.0,1.0,Mallorca,...,2.25,13.50,4.75,15.875,0.25,0.25,1.700,1.325,50.25,25156.00
30,2024-01-27,18:30,La Liga,Matchweek 22,Sat,Away,W,5.0,3.0,Barcelona,...,2.50,13.00,4.50,16.050,0.00,0.00,1.625,1.550,45.50,33041.50
31,2024-02-04,14:00,La Liga,Matchweek 23,Sun,Home,D,0.0,0.0,Cádiz,...,1.75,15.75,4.50,16.825,0.00,0.00,1.800,1.025,46.50,26094.00
32,2024-02-10,14:00,La Liga,Matchweek 24,Sat,Away,D,1.0,1.0,Alavés,...,1.25,14.75,4.50,16.925,0.00,0.00,1.600,1.275,47.25,23603.50


In [39]:
matches_rolling.index = range(matches_rolling.shape[0])

In [40]:
#creating a function to make prediction when using random forest model
def make_predictions_rf(data,predictors):
  train = data[data["date"] < '2023-09-10']
  test = data[data["date"] > '2023-09-10']
  rf.fit(train[predictors],train["target"])
  preds = rf.predict(test[predictors])
  combined = pd.DataFrame(dict(actual=test["target"],prediction=preds), index=test.index)
  precision = precision_score(test["target"],preds)
  acc = accuracy_score(test["target"], preds)
  return combined,precision,acc

In [41]:
combined, precision, acc = make_predictions_rf(matches_rolling, predictors + new_cols)

In [42]:
precision

0.7217391304347827

In [43]:
#We can see that our precision has improved after using the rolling function and selecting more predictors

In [44]:
pd.crosstab(index=combined["actual"],columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,248,32
1,59,83


In [45]:
#combining and merging dataframes
combined_rf = combined.merge(matches_rolling[["date", "team", "opponent", "result","venue"]], left_index=True, right_index=True)

In [46]:
combined_rf.head()

Unnamed: 0,actual,prediction,date,team,opponent,result,venue
39,0,0,2023-09-15,Alaves,Rayo Vallecano,L,Away
40,0,0,2023-09-22,Alaves,Athletic Club,L,Home
41,0,0,2023-09-28,Alaves,Celta Vigo,D,Away
42,0,0,2023-10-01,Alaves,Osasuna,L,Home
43,0,0,2023-10-08,Alaves,Betis,D,Home


In [47]:
predictions_rf = combined_rf.merge(combined_rf, left_on=["date", "team"], right_on=["date", "opponent"])

In [48]:
predictions_rf

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,venue_x,actual_y,prediction_y,team_y,opponent_y,result_y,venue_y
0,1,1,2023-09-16,Athletic Club,Cádiz,W,Home,0,0,Cadiz,Athletic Club,L,Away
1,1,1,2023-09-22,Athletic Club,Alavés,W,Away,0,0,Alaves,Athletic Club,L,Home
2,0,1,2023-09-27,Athletic Club,Getafe,D,Home,0,0,Getafe,Athletic Club,D,Away
3,0,0,2023-09-30,Athletic Club,Real Sociedad,L,Away,1,1,Real Sociedad,Athletic Club,W,Home
4,1,1,2023-10-06,Athletic Club,Almería,W,Home,0,0,Almeria,Athletic Club,L,Away
...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,0,0,2024-01-20,Villarreal,Mallorca,D,Home,0,0,Mallorca,Villarreal,D,Away
312,1,0,2024-01-27,Villarreal,Barcelona,W,Away,0,1,Barcelona,Villarreal,L,Home
313,0,0,2024-02-04,Villarreal,Cádiz,D,Home,0,0,Cadiz,Villarreal,D,Away
314,0,0,2024-02-10,Villarreal,Alavés,D,Away,0,1,Alaves,Villarreal,D,Home


In [49]:
predictions_rf[(predictions_rf["prediction_x"] == 1) & (predictions_rf["prediction_y"] ==0)]["actual_x"].value_counts()

1    60
0    17
Name: actual_x, dtype: int64

In [50]:
predictions_rf[(predictions_rf["team_x"]=="Real Madrid") & (predictions_rf["team_y"]=="Osasuna")]

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,venue_x,actual_y,prediction_y,team_y,opponent_y,result_y,venue_y
214,1,1,2023-10-07,Real Madrid,Osasuna,W,Home,0,0,Osasuna,Real Madrid,L,Away


Using Logistic Regression Model

In [51]:
from sklearn.linear_model import LogisticRegression

In [52]:
#creating a function to make prediction when using logistic regression
def make_predictions_lr(data,predictors):
  train = data[data["date"] < '2023-09-10']
  test = data[data["date"] > '2023-09-10']
  lr = LogisticRegression(max_iter=1000)
  lr.fit(train[predictors], train["target"])
  preds = lr.predict(test[predictors])
  combined = pd.DataFrame(dict(actual=test["target"],prediction=preds), index=test.index)
  precision = precision_score(test["target"],preds)
  acc = accuracy_score(test["target"], preds)
  return combined,precision,acc

In [53]:
combined, precision, acc = make_predictions_lr(matches_rolling, predictors + new_cols)

In [54]:
precision

0.6850393700787402

In [55]:
pd.crosstab(index=combined["actual"],columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,240,40
1,55,87


In [56]:
combined_lr = combined.merge(matches_rolling[["date", "team", "opponent", "result","venue"]], left_index=True, right_index=True)

In [57]:
combined_lr.head()

Unnamed: 0,actual,prediction,date,team,opponent,result,venue
39,0,0,2023-09-15,Alaves,Rayo Vallecano,L,Away
40,0,0,2023-09-22,Alaves,Athletic Club,L,Home
41,0,0,2023-09-28,Alaves,Celta Vigo,D,Away
42,0,0,2023-10-01,Alaves,Osasuna,L,Home
43,0,0,2023-10-08,Alaves,Betis,D,Home


In [58]:
predictions_lr = combined_lr.merge(combined_lr, left_on=["date", "team"], right_on=["date", "opponent"])

In [59]:
predictions_lr

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,venue_x,actual_y,prediction_y,team_y,opponent_y,result_y,venue_y
0,1,1,2023-09-16,Athletic Club,Cádiz,W,Home,0,0,Cadiz,Athletic Club,L,Away
1,1,1,2023-09-22,Athletic Club,Alavés,W,Away,0,0,Alaves,Athletic Club,L,Home
2,0,1,2023-09-27,Athletic Club,Getafe,D,Home,0,0,Getafe,Athletic Club,D,Away
3,0,0,2023-09-30,Athletic Club,Real Sociedad,L,Away,1,1,Real Sociedad,Athletic Club,W,Home
4,1,1,2023-10-06,Athletic Club,Almería,W,Home,0,0,Almeria,Athletic Club,L,Away
...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,0,0,2024-01-20,Villarreal,Mallorca,D,Home,0,0,Mallorca,Villarreal,D,Away
312,1,0,2024-01-27,Villarreal,Barcelona,W,Away,0,1,Barcelona,Villarreal,L,Home
313,0,0,2024-02-04,Villarreal,Cádiz,D,Home,0,0,Cadiz,Villarreal,D,Away
314,0,0,2024-02-10,Villarreal,Alavés,D,Away,0,0,Alaves,Villarreal,D,Home


In [60]:
predictions_lr[(predictions_lr["team_x"]=="Real Madrid") & (predictions_lr["team_y"]=="Osasuna")]

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,venue_x,actual_y,prediction_y,team_y,opponent_y,result_y,venue_y
214,1,1,2023-10-07,Real Madrid,Osasuna,W,Home,0,0,Osasuna,Real Madrid,L,Away


Using  Multi-layer Perceptron

In [61]:
from sklearn.neural_network import MLPClassifier

In [62]:
#creating a function to make prediction when using multi-layer perceptron
def make_predictions_nn(data,predictors):
  train = data[data["date"] < '2023-09-10']
  test = data[data["date"] > '2023-09-10']
  nn = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)
  nn.fit(train[predictors], train["target"])
  preds = nn.predict(test[predictors])
  combined = pd.DataFrame(dict(actual=test["target"],prediction=preds), index=test.index)
  precision = precision_score(test["target"],preds)
  acc = accuracy_score(test["target"], preds)
  return combined,precision,acc

In [63]:
combined, precision, acc = make_predictions_nn(matches_rolling, predictors + new_cols)

In [64]:
precision

0.334916864608076

In [65]:
pd.crosstab(index=combined["actual"],columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,280
1,1,141


In [66]:
combined_nn = combined.merge(matches_rolling[["date", "team", "opponent", "result","venue"]], left_index=True, right_index=True)

In [67]:
combined_nn.head(15)

Unnamed: 0,actual,prediction,date,team,opponent,result,venue
39,0,1,2023-09-15,Alaves,Rayo Vallecano,L,Away
40,0,1,2023-09-22,Alaves,Athletic Club,L,Home
41,0,1,2023-09-28,Alaves,Celta Vigo,D,Away
42,0,1,2023-10-01,Alaves,Osasuna,L,Home
43,0,1,2023-10-08,Alaves,Betis,D,Home
44,0,1,2023-10-22,Alaves,Villarreal,D,Away
45,0,1,2023-10-29,Alaves,Atlético Madrid,L,Away
46,1,1,2023-11-05,Alaves,Almería,W,Home
47,0,1,2023-11-12,Alaves,Barcelona,L,Away
48,1,1,2023-11-24,Alaves,Granada,W,Home


In [68]:
predictions_nn = combined_nn.merge(combined_nn, left_on=["date", "team"], right_on=["date", "opponent"])

In [69]:
predictions_nn

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,venue_x,actual_y,prediction_y,team_y,opponent_y,result_y,venue_y
0,1,1,2023-09-16,Athletic Club,Cádiz,W,Home,0,1,Cadiz,Athletic Club,L,Away
1,1,1,2023-09-22,Athletic Club,Alavés,W,Away,0,1,Alaves,Athletic Club,L,Home
2,0,1,2023-09-27,Athletic Club,Getafe,D,Home,0,1,Getafe,Athletic Club,D,Away
3,0,1,2023-09-30,Athletic Club,Real Sociedad,L,Away,1,1,Real Sociedad,Athletic Club,W,Home
4,1,1,2023-10-06,Athletic Club,Almería,W,Home,0,1,Almeria,Athletic Club,L,Away
...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,0,1,2024-01-20,Villarreal,Mallorca,D,Home,0,1,Mallorca,Villarreal,D,Away
312,1,1,2024-01-27,Villarreal,Barcelona,W,Away,0,1,Barcelona,Villarreal,L,Home
313,0,1,2024-02-04,Villarreal,Cádiz,D,Home,0,1,Cadiz,Villarreal,D,Away
314,0,1,2024-02-10,Villarreal,Alavés,D,Away,0,1,Alaves,Villarreal,D,Home


In [70]:
predictions_nn[(predictions_nn["team_x"]=="Real Madrid") & (predictions_nn["team_y"]=="Osasuna")]

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,venue_x,actual_y,prediction_y,team_y,opponent_y,result_y,venue_y
214,1,1,2023-10-07,Real Madrid,Osasuna,W,Home,0,1,Osasuna,Real Madrid,L,Away


After trying out different models and improving precision by taking more factors, we have noticed that Random Forest Model performs the best out of all the models that have been tried for this dataset.