In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
train

Unnamed: 0,fixture_id,date,home_team,home_team_id,away_team,away_team_id,home_score,away_score,league_season,home_result,...,away_points_last_3,away_points_last_5,away_points_last_10,scoring_diff_last_3,scoring_diff_last_5,scoring_diff_last_10,scoring_diff_season,points_diff_last_3,points_diff_last_5,points_diff_last_10
0,720746,2021-08-13 19:00:00+00:00,Valencia,532,Getafe,546,1,0,2021,2,...,0.0,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0
1,720743,2021-08-14 17:30:00+00:00,Cadiz,724,Levante,539,1,1,2021,1,...,0.0,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0
2,720742,2021-08-14 17:30:00+00:00,Mallorca,798,Real Betis,543,1,1,2021,1,...,0.0,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0
3,720739,2021-08-14 20:00:00+00:00,Alaves,542,Real Madrid,541,1,4,2021,0,...,0.0,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0
4,720744,2021-08-14 20:00:00+00:00,Osasuna,727,Espanyol,540,0,0,2021,1,...,0.0,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1038189,2024-02-11 13:00:00+00:00,Getafe,546,Celta Vigo,538,3,2,2023,2,...,3.0,7.0,13.0,-0.333,-1.0,-0.2,0.261,1.0,-3.0,1.0
996,1038184,2024-02-11 15:15:00+00:00,Mallorca,798,Rayo Vallecano,728,2,1,2023,2,...,1.0,4.0,6.0,-1.001,-0.8,0.4,-0.044,0.0,-2.0,5.0
997,1038187,2024-02-11 17:30:00+00:00,Sevilla,536,Atletico Madrid,530,1,0,2023,2,...,7.0,10.0,17.0,-2.000,-1.8,-1.0,-1.218,-3.0,-6.0,-9.0
998,1038183,2024-02-11 20:00:00+00:00,Barcelona,529,Granada CF,715,3,3,2023,1,...,1.0,4.0,5.0,1.333,1.2,1.4,1.696,5.0,8.0,15.0


In [4]:
X = train.drop(columns=['fixture_id','date','home_team','home_team_id','away_team','away_team_id','home_score','away_score','league_season','home_result'])
# X = train[['scoring_diff_last_3','scoring_diff_last_5','scoring_diff_last_10','scoring_diff_season','points_diff_last_3','points_diff_last_5','points_diff_last_10']]
y = train['home_result']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=20, random_state=42)

In [6]:
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])
pipe

In [7]:
rf_params = {
    'scaler': [StandardScaler(), MinMaxScaler(), None],
    'classifier__max_depth': np.arange(2,10),
    'classifier__min_samples_leaf': np.arange(2,15)
}

xgb_params = {
    'classifier' : [XGBClassifier()],
    'scaler': [StandardScaler(), MinMaxScaler(), None],
    'classifier__max_depth': np.arange(2,10),
    'classifier__min_samples_leaf': np.arange(2,15),
    # 'classifier__learning_rate': [0.05, 0.1, 0.2],
    # 'classifier__subsample': [0.8, 1.0],
    # 'classifier__colsample_bytree': [0.8, 1.0],
    # 'classifier__gamma': [0, 1],
    # 'classifier__min_child_weight': [1, 5],
    # 'classifier__reg_alpha': [0, 0.1],
    # 'classifier__reg_lambda': [1, 2],
}

search = [rf_params, xgb_params]

In [8]:
gs = GridSearchCV(pipe, search,scoring='accuracy', cv=10,n_jobs=-1)
gs.fit(X_train,y_train)

In [9]:
print(gs.best_score_)
print(gs.best_estimator_)
print(gs.best_params_)

0.5132653061224489
Pipeline(steps=[('scaler', None),
                ('classifier',
                 RandomForestClassifier(max_depth=np.int64(9),
                                        min_samples_leaf=np.int64(14)))])
{'classifier__max_depth': np.int64(9), 'classifier__min_samples_leaf': np.int64(14), 'scaler': None}


In [10]:
h_model = gs.best_estimator_

In [11]:
feat_importance = h_model.named_steps['classifier'].feature_importances_

pd.DataFrame({
    'Feature': X.columns,
    "importance": feat_importance
})

Unnamed: 0,Feature,importance
0,home_avg_goals_last_3,0.010068
1,home_avg_goals_against_last_3,0.012967
2,home_avg_goals_last_5,0.013482
3,home_avg_goals_against_last_5,0.019123
4,home_avg_goals_last_10,0.028443
5,home_avg_goals_against_last_10,0.018889
6,home_avg_goals_season,0.036314
7,home_avg_goals_against_season,0.040894
8,home_avg_scoring_last_3,0.019146
9,home_avg_scoring_last_5,0.013385


In [12]:
test_pred = h_model.predict(test.drop(columns=['fixture_id','date','home_team','home_team_id','away_team','away_team_id','home_score','away_score','league_season','home_result']))
test_pred

array([1, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 2,
       2, 0, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2, 1, 2, 2, 0, 0, 2,
       2, 2, 2, 0, 0, 2, 1, 1, 2, 2, 2, 2, 0, 2, 2, 1, 2, 2, 2, 0, 0, 2,
       1, 2, 2, 2, 2, 2, 2, 1, 2, 0, 0, 0, 2, 2, 0, 0, 1, 2, 2, 2, 1, 2,
       2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 0, 1, 0, 2, 2, 1, 2, 2, 2, 2, 0, 2, 2, 0, 2, 1,
       2, 2, 0, 2, 2, 2, 1, 0])

In [13]:
print(f'accuracy: {accuracy_score(test['home_result'], test_pred)}')

accuracy: 0.5357142857142857


In [None]:
pickle.dump(h_model, open('../models/model_model.pkl', 'wb'))