This notebook contains our best-performing model.

In [1]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
import seaborn as sns
import matplotlib.pyplot as plt

The first step is to load the matches into a dataframe and to remove the rows without a score, as these rows can't be used either to train the model or to check its performance.

In [2]:
# Read every row of the Matches table into a DataFrame.
# sqlite3's connection context manager only commits or rolls back the current
# transaction; it does not close the connection, so close it explicitly.
con = sqlite3.connect("../laliga.sqlite")
try:
    df = pd.read_sql_query("SELECT * FROM Matches", con)
finally:
    con.close()

# Drop the rows that have no score.
df = df.dropna(subset=['score'])

After that, the dataframe has to be transformed into something that our model can understand and learn from. In the next cells, the rank of each team in the previous season is first calculated and then added to our dataframe df.

The procedure to calculate the rank is very similar to exercise 7 from the data analysis notebook, with the exception that, for the second division, the number of teams playing in the first division is added to each team's rank, so that we have a single rank across both divisions for each season. Moreover, the rank attached to each team is the one from its previous season in the data (the most recent earlier season in which the team appears) instead of the current one.

In [3]:
# Split the "home:away" score string once and convert each side to float.
score_parts = df["score"].str.split(":")
df["score_home_team"] = score_parts.str[0].astype(float)
df["score_away_team"] = score_parts.str[1].astype(float)
df["goal_difference"] = df["score_home_team"] - df["score_away_team"]

# Encode the outcome: '1' when the home side scored more, '2' when the away
# side scored more, and 'X' in every other case.
home_scored_more = df['score_home_team'] > df['score_away_team']
away_scored_more = df['score_home_team'] < df['score_away_team']
df["match_result"] = np.select([home_scored_more, away_scored_more], ['1', '2'], default='X')

In [4]:
def func_home_wins(data):
    """Return how many entries of ``data`` are equal to ``'1'``."""
    is_home_win = data == '1'
    return is_home_win.sum()

def func_away_wins(data):
    """Return how many entries of ``data`` are equal to ``'2'``."""
    is_away_win = data == '2'
    return is_away_win.sum()

def func_tie(data):
    """Return how many entries of ``data`` are equal to ``'X'``."""
    is_tie = data == 'X'
    return is_tie.sum()

In [5]:
# Season totals per team from its home matches. In match_result, func_home_wins
# counts '1' and func_away_wins counts '2', so for the home side a '1' is a win
# (W) and a '2' is a loss (L).
df_columns_home = (
    df.groupby(['division', 'season', 'home_team'])
    .agg(
        GF=pd.NamedAgg(column='score_home_team', aggfunc='sum'),
        GA=pd.NamedAgg(column='score_away_team', aggfunc='sum'),
        W=pd.NamedAgg(column='match_result', aggfunc=func_home_wins),
        L=pd.NamedAgg(column='match_result', aggfunc=func_away_wins),
        T=pd.NamedAgg(column='match_result', aggfunc=func_tie),
    )
    .reset_index()
    .rename(columns={'home_team': 'team'})
)

# Same totals from the away matches: goals for/against and W/L are mirrored.
df_columns_away = (
    df.groupby(['division', 'season', 'away_team'])
    .agg(
        GF=pd.NamedAgg(column='score_away_team', aggfunc='sum'),
        GA=pd.NamedAgg(column='score_home_team', aggfunc='sum'),
        W=pd.NamedAgg(column='match_result', aggfunc=func_away_wins),
        L=pd.NamedAgg(column='match_result', aggfunc=func_home_wins),
        T=pd.NamedAgg(column='match_result', aggfunc=func_tie),
    )
    .reset_index()
    .rename(columns={'away_team': 'team'})
)

# Stack the home and away totals and add them up per team. A row-wise concat is
# used instead of a key-less outer merge: such a merge joins on every shared
# column, so a team whose home and away rows happened to be identical would be
# collapsed into a single row and its season totals would be halved.
df_classification = pd.concat([df_columns_away, df_columns_home], ignore_index=True)
df_classification = df_classification.groupby(['season', 'team', 'division']).sum().reset_index()

# Goal difference, and points as 3 per win plus 1 per tie.
df_classification['GD'] = df_classification['GF'] - df_classification['GA']
df_classification['Pts'] = (df_classification['W']) * 3 + df_classification['T']

# First year of the season label (the part before the "-"), used for ordering.
df_classification['year_of_start'] = df_classification['season'].str.split("-").str[0].astype(int)

# Newest season first, then division, then best team first: points, goal
# difference and goals for act as successive tie-breakers. One multi-key sort
# is enough; sorting by each key prefix beforehand does not change the result.
df_classification_ordered = df_classification.sort_values(
    by=['year_of_start', 'division', 'Pts', 'GD', 'GF'],
    ascending=[False, True, False, False, False],
)

# Rank = 1-based position inside each (season start year, division) table.
df_classification_ordered = df_classification_ordered.reset_index(drop=True)
df_classification_ordered['rank'] = df_classification_ordered.groupby(['year_of_start', 'division']).cumcount() + 1

Here we change the rank for the teams in the second division, as explained before.

In [None]:
# Split the ranked table by division.
first_division_rows = df_classification_ordered['division'] == 1
second_division_rows = df_classification_ordered['division'] == 2
df_classification_1_div = df_classification_ordered[first_division_rows]
df_classification_2_div = df_classification_ordered[second_division_rows]

# Highest first-division rank per season, i.e. the number of ranked
# first-division teams in that season.
num_teams_1_div = (
    df_classification_1_div
    .groupby('season')['rank']
    .max()
    .reset_index()
)

# Offset every second-division rank by that season's first-division size.
# After the merge, 'rank_x' is the team's own rank and 'rank_y' the offset.
merged_df = df_classification_2_div.merge(num_teams_1_div, on='season')
merged_df = (
    merged_df
    .assign(rank=merged_df['rank_x'] + merged_df['rank_y'])
    .drop(columns=['rank_x', 'rank_y'])
)

# Put both divisions back into one table, newest season first.
df_classification_ordered_rank2_updated = (
    merged_df
    .merge(df_classification_1_div, how='outer')
    .sort_values(
        by=['year_of_start', 'division', 'Pts', 'GD', 'GF'],
        ascending=[False, True, False, False, False],
    )
)

# With seasons in descending order, shift(-1) inside each team gives the rank
# from the next row of that team, i.e. its most recent earlier season.
# NOTE(review): if a team is missing from some seasons, this is the rank from
# its last recorded season, not necessarily the immediately preceding one.
rank_per_team = df_classification_ordered_rank2_updated.groupby(['team'])['rank']
df_classification_ordered_rank2_updated['delayed_rank'] = rank_per_team.shift(-1)

df_with_rank = df_classification_ordered_rank2_updated[['season','division','team','delayed_rank']]


In the next cell, the ranks of the home team and the away team are added to a selection of columns from the original dataframe to create the dataframe that will be fed to the model.

In [6]:
df_useful_columns = df[['season','division','home_team','match_result','away_team']]


def attach_previous_rank(matches, ranks, team_column, rank_column):
    """Left-join the ``delayed_rank`` of one side of each match onto ``matches``.

    Parameters
    ----------
    matches : DataFrame with 'season', 'division' and ``team_column`` columns.
    ranks : DataFrame with 'season', 'division', 'team' and 'delayed_rank'.
    team_column : name of the column in ``matches`` holding the team to look up.
    rank_column : name given to the joined 'delayed_rank' column.

    Returns
    -------
    A new DataFrame with the rows of ``matches`` in their original order plus
    ``rank_column`` (NaN where ``ranks`` has no matching row).

    Raises
    ------
    pandas.errors.MergeError if ``ranks`` has more than one row for the same
    (season, division, team), which would silently duplicate matches.
    """
    return (
        matches
        .merge(
            ranks,
            left_on=['season', 'division', team_column],
            right_on=['season', 'division', 'team'],
            how='left',
            validate='many_to_one',
        )
        .rename(columns={'delayed_rank': rank_column})
        .drop(columns=['team'])
    )


home_team_rank = attach_previous_rank(df_useful_columns, df_with_rank, 'home_team', 'home_team_rank')
away_team_rank = attach_previous_rank(df_useful_columns, df_with_rank, 'away_team', 'away_team_rank')

# Add the away rank on top of the frame that already has the home rank.
# Re-joining the two per-side frames on (season, division, home_team,
# away_team) would multiply rows whenever a fixture appears more than once in
# a season, and it leaves duplicated match_result_x / match_result_y columns.
df_both_teams_rank = attach_previous_rank(home_team_rank, df_with_rank, 'away_team', 'away_team_rank')

df_to_train = df_both_teams_rank[['season','home_team','away_team','home_team_rank','away_team_rank','match_result']]
# Matches whose team has no rank to look up get 0 in place of NaN.
# NOTE(review): 0 sorts ahead of rank 1 (the best rank) - confirm this
# encoding of "unknown" is intended.
df_to_train = df_to_train.fillna(0)

Now we define the features that we will use to train the model; in this case, just the ranks of the two teams that are playing.

In [7]:
# Column to predict, kept as a one-element list.
target = ["match_result"]
# Model inputs: the rank columns of the away and the home team.
features = ["away_team_rank", "home_team_rank"]

From all the data we have, we select the seasons after 2000-2001 and before 2020-2021 to train the model, and the 2020-2021 season to test its accuracy.

In [14]:
# Training rows: season label strictly after '2000-2001' and strictly before
# '2020-2021' (plain string comparison on the season column).
season_labels = df_to_train['season']
in_training_window = (season_labels > '2000-2001') & (season_labels < '2020-2021')
df_train = df_to_train[in_training_window]
x_train = df_train[features]
# Flatten the one-column target frame into a 1-D array for fitting.
y_train = df_train[target].values.ravel()

# Test rows: the single season '2020-2021'.
is_test_season = season_labels == '2020-2021'
df_test = df_to_train[is_test_season]
x_test = df_test[features]
y_test = df_test[target]

For our model we used the *GradientBoostingClassifier()* from the scikit-learn library, which is trained on the data prepared above and then tested.

In [16]:
# Fix the seed so that re-running the notebook trains the same model and
# reports the same score.
clf = GradientBoostingClassifier(random_state=42)
clf.fit(x_train, y_train)
clf_y_pred = clf.predict(x_test)

# Keep each prediction next to its test row (season, teams, ranks, true result).
results_df = df_test.copy()
results_df["match_result_prediction"] = clf_y_pred

# Share of test matches whose predicted result equals the real one, in percent.
model_is_correct = (results_df["match_result_prediction"] == results_df["match_result"]).sum()
total_tries = len(results_df)
sucess_rate = model_is_correct/total_tries*100

print(f"This model guesses right {sucess_rate:.2f} % of the time")

This model guesses right 46.67 % of the time
