In [1]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [2]:
with sqlite3.connect("../laliga.sqlite") as con:
    df=pd.read_sql_query("SELECT * FROM Matches",con)
    dr=pd.read_sql_query("SELECT * FROM Predictions",con)

df = df.dropna(subset=['score'])

In [3]:

df["score_home_team"] = df["score"].str.split(":").str[0].astype(float)
df["score_away_team"] = df["score"].str.split(":").str[1].astype(float)
df["goal_difference"] = df["score_home_team"] - df["score_away_team"]

df["match_result"] = np.where(df['score_home_team'] > df['score_away_team'], '1', np.where(df['score_home_team'] < df['score_away_team'], '2', 'X'))


In [4]:
teams = df["home_team"].unique()

In [5]:
df['Winner'] = np.where(df['score_home_team'] > df['score_away_team'], 1, np.where(df['score_home_team'] < df['score_away_team'], 0, -1))

df_class_home = df.groupby(['division', 'season', 'matchday', 'home_team','Winner']).agg(
    GF_safe = pd.NamedAgg(column='score_home_team', aggfunc='sum'),
    GA_safe = pd.NamedAgg(column='score_away_team', aggfunc='sum')
    ).reset_index()
df_class_away = df.groupby(['division', 'season', 'matchday', 'away_team', 'Winner']).agg(
    GF_safe = pd.NamedAgg(column='score_away_team', aggfunc='sum'),
    GA_safe = pd.NamedAgg(column='score_home_team', aggfunc='sum')
    ).reset_index()

df_class_home['W_safe'] = np.where(df_class_home['Winner'] == 1 , 1 ,0)
df_class_home['L_safe'] = np.where(df_class_home['Winner'] == 0 , 1 ,0)
df_class_home['T_safe'] = np.where(df_class_home['Winner'] == -1 , 1 ,0)

df_class_away['W_safe'] = np.where(df_class_away['Winner'] == 0 , 1 ,0)
df_class_away['L_safe'] = np.where(df_class_away['Winner'] == 1 , 1 ,0)
df_class_away['T_safe'] = np.where(df_class_away['Winner'] == -1 , 1 ,0)

df_class_away.rename(columns={'away_team':'team'}, inplace=True)
df_class_home.rename(columns={'home_team':'team'}, inplace=True)
df_classification = df_class_away.merge(df_class_home,how='outer')
df_classification = df_classification.groupby(['season', 'division','matchday','team']).sum().reset_index()

df_classification[['W','L','T','GF','GA']] = df_classification.groupby([ 'division','season','team'])[['W_safe','L_safe','T_safe','GF_safe','GA_safe']].cumsum()
df_classification['result_matchday'] = np.where(df_classification['W_safe']==1,'W',np.where(df_classification['L_safe']==1,'L','T'))

for i in range(5):
    df_classification[f"last_{i}"] = df_classification.groupby(['division','season' ,'team'])['result_matchday'].shift(i+1)

df_classification['GD'] = df_classification['GF'] - df_classification['GA']
df_classification['Pts'] = (df_classification['W']) * 3 + df_classification['T']
df_classification['year_of_start']=df_classification['season'].str.split("-").str[0].astype(int)

df_classification["last_5"] = df_classification[[f"last_{i}" for i in range(5)]].agg(lambda x: [i for i in x if not pd.isna(i)], axis=1)

df_classification_ordered = df_classification.sort_values(by=['year_of_start'], ascending=False)
df_classification_ordered = df_classification_ordered.sort_values(by=['year_of_start','division'],ascending=[False,True])
df_classification_ordered = df_classification_ordered.sort_values(by=['year_of_start','division','matchday'], ascending=[False,True,True])
df_classification_ordered = df_classification_ordered.sort_values(by=['year_of_start','division','matchday', 'Pts'],ascending=[False,True,True,False])
df_classification_ordered = df_classification_ordered.sort_values(by=['year_of_start','division','matchday', 'Pts','GD'],ascending=[False,True,True,False,False])
df_classification_ordered = df_classification_ordered.sort_values(by=['year_of_start','division','matchday', 'Pts', 'GD', 'GF'],ascending=[False,True,True,False,False,False]).reset_index(drop=True)

df_classification_ordered['rank']=df_classification_ordered.groupby(['year_of_start','division','matchday']).cumcount()+1
df_classification_ordered['delayed_rank'] = df_classification_ordered.groupby(['year_of_start','division','team'])['rank'].shift(1)

df_with_rank = df_classification_ordered[['season','division','matchday','team','delayed_rank']]


In [6]:

df_useful = df[['season','division','matchday','home_team','match_result','away_team']]

home_team_rank = df_useful.merge(df_with_rank, left_on=['season','division', 'matchday', 'home_team'], right_on=['season','division', 'matchday', 'team'], how='left')
home_team_rank.rename(columns={'delayed_rank': 'home_team_rank'}, inplace=True)
home_team_rank.drop(columns=['team'], inplace=True)

away_team_rank = df_useful.merge(df_with_rank, left_on=['season', 'division', 'matchday', 'away_team'], right_on=['season','division', 'matchday', 'team'], how='left')
away_team_rank.rename(columns={'delayed_rank': 'away_team_rank'}, inplace=True)
away_team_rank.drop(columns=['team'], inplace=True)

df_new = away_team_rank.merge(home_team_rank, on=['season', 'division', 'matchday', 'home_team','away_team'], how='left')

df_new.rename(columns={'match_result_x': 'match_result'},inplace=True)

df_to_train = df_new[['season','home_team','away_team','away_team_rank','home_team_rank','match_result','matchday']]
df_to_train = df_to_train.fillna(0)
df_to_train.tail(30)


Unnamed: 0,season,home_team,away_team,away_team_rank,home_team_rank,match_result,matchday
47970,2021-2022,Real Sociedad B,CD Leganés,0.0,0.0,1,1
47971,2021-2022,Girona,SD Amorebieta,0.0,0.0,1,1
47972,2021-2022,Ponferradina,Alcorcón,0.0,0.0,1,1
47973,2021-2022,Sporting Gijón,Burgos CF,0.0,0.0,1,1
47974,2021-2022,Real Oviedo,CD Lugo,0.0,0.0,X,1
47975,2021-2022,CF Fuenlabrada,CD Tenerife,0.0,0.0,2,1
47976,2021-2022,UD Las Palmas,Real Valladolid,0.0,0.0,X,1
47977,2021-2022,Málaga CF,CD Mirandés,0.0,0.0,X,1
47978,2021-2022,FC Cartagena,UD Almería,0.0,0.0,2,1
47979,2021-2022,Real Valladolid,Real Zaragoza,14.0,10.0,1,2


In [11]:
features = ['away_team_rank','home_team_rank','matchday']
target = ["match_result"]

In [12]:
df_train = df_to_train[df_to_train['season'] < '2020-2021']
x_train = df_train[features]
y_train = df_train[target]

df_test = df_to_train[df_to_train['season']=='2020-2021']
x_test = df_test[features]
y_test = df_test[target]

In [14]:
clf = GradientBoostingClassifier()
clf.fit(x_train, y_train)
clf_y_pred = clf.predict(x_test)

results_df = x_test.copy()
results_df = df_test.copy()
results_df["match_result_prediction"] = clf_y_pred # el nome haura de ser pred per a que funcioni despres 

model_is_correct = (results_df["match_result_prediction"] == results_df["match_result"]).sum()
total_tries = len(results_df)

print(model_is_correct/total_tries*100)

results_df.tail(40)

  y = column_or_1d(y, warn=True)


44.53681710213777


Unnamed: 0,season,home_team,away_team,away_team_rank,home_team_rank,match_result,matchday,match_result_prediction
47898,2020-2021,UD Las Palmas,Real Zaragoza,15.0,11.0,2,39,1
47899,2020-2021,CE Sabadell,CD Tenerife,14.0,20.0,2,39,1
47900,2020-2021,CF Fuenlabrada,Rayo Vallecano,7.0,10.0,2,39,1
47901,2020-2021,RCD Mallorca,Alcorcón,17.0,2.0,1,39,1
47902,2020-2021,CD Leganés,UD Logroñés,19.0,4.0,1,39,1
47903,2020-2021,Real Oviedo,Málaga CF,12.0,13.0,1,39,1
47904,2020-2021,Girona,Sporting Gijón,5.0,6.0,1,39,1
47905,2020-2021,FC Cartagena,UD Almería,3.0,16.0,1,40,1
47906,2020-2021,Albacete,CD Lugo,21.0,22.0,X,40,1
47907,2020-2021,Ponferradina,Espanyol,1.0,8.0,2,40,1
