In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [24]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [25]:
train

Unnamed: 0,fixture_id,date,league_season,league,home_team,away_team,home_score,away_score,home_result,home_avg_goals_last_3,...,points_diff_last_10,streak_diff,elo_diff,h2h_matches,h2h_home_wins,h2h_away_wins,h2h_draws,h2h_home_goals,h2h_away_goals,h2h_goal_diff
0,720752,2021-08-20 19:00:00+00:00,2021,La Liga,Real Betis,Cadiz,1,1,1,1.000,...,0.0,0,0.00,0,0,0,0,0,0,0
1,720749,2021-08-21 15:00:00+00:00,2021,La Liga,Alaves,Mallorca,0,1,0,1.000,...,-1.0,-1,-10.00,0,0,0,0,0,0,0
2,720755,2021-08-21 17:30:00+00:00,2021,La Liga,Espanyol,Villarreal,0,0,1,0.000,...,0.0,0,0.00,0,0,0,0,0,0,0
3,720754,2021-08-21 17:30:00+00:00,2021,La Liga,Granada CF,Valencia,1,1,1,0.000,...,-2.0,-1,-10.00,0,0,0,0,0,0,0
4,720750,2021-08-21 20:00:00+00:00,2021,La Liga,Athletic Club,Barcelona,1,1,1,0.000,...,-2.0,-1,-10.00,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,1038189,2024-02-11 13:00:00+00:00,2023,La Liga,Getafe,Celta Vigo,3,2,2,1.000,...,1.0,-1,45.69,5,2,1,2,6,6,0
986,1038184,2024-02-11 15:15:00+00:00,2023,La Liga,Mallorca,Rayo Vallecano,2,1,2,0.333,...,5.0,0,-3.35,5,3,1,1,10,6,4
987,1038187,2024-02-11 17:30:00+00:00,2023,La Liga,Sevilla,Atletico Madrid,1,0,2,1.333,...,-9.0,1,-188.93,5,1,3,1,4,11,-7
988,1038183,2024-02-11 20:00:00+00:00,2023,La Liga,Barcelona,Granada CF,3,3,1,2.333,...,15.0,2,294.78,3,0,0,3,4,4,0


In [26]:
X = train.drop(columns=['fixture_id','date', 'league','home_team','away_team','home_score','away_score','league_season','home_result'])
# X = train[['scoring_diff_last_3','scoring_diff_last_5','scoring_diff_last_10','scoring_diff_season','points_diff_last_3','points_diff_last_5','points_diff_last_10', 'streak_diff', 'elo_diff']]
y = train['home_result']

In [28]:
pca = PCA(n_components=10)
# pca.fit(X)
# pca.explained_variance_ratio_.cumsum()

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=20, random_state=42)

In [31]:
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', pca),
    ('classifier', RandomForestClassifier())
])
pipe

In [32]:
lreg_params = {
    'classifier': [LogisticRegression(max_iter=1000)],
    'classifier__C': [0.1, 1, 10],
    'classifier__penalty': ['l2']
}

# Random Forest
rf_params = {
    'classifier': [RandomForestClassifier()],
    'classifier__max_depth': [5, 10],
    'classifier__min_samples_leaf': [5, 10],
    'classifier__criterion': ['gini']
}

# XGBoost
xgb_params = {
    'classifier': [XGBClassifier()],
    'scaler': [None],  # XGBoost handles scaling internally
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [5, 7],
    'classifier__learning_rate': [0.1],
    'classifier__subsample': [0.8],
    'classifier__colsample_bytree': [0.8]
}

# LightGBM
lgmb_params = {
    'classifier': [LGBMClassifier()],
    'scaler': [None],  # LightGBM handles scaling internally
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [5, 7],
    'classifier__learning_rate': [0.1],
    'classifier__num_leaves': [31],
    'classifier__subsample': [0.8]
}

# CatBoost
catb_params = {
    'classifier': [CatBoostClassifier(verbose=0)],
    'scaler': [None],  # CatBoost handles scaling internally
    'classifier__iterations': [100, 200],
    'classifier__depth': [5, 7],
    'classifier__learning_rate': [0.1]
}


# search = [lreg_params,rf_params, xgb_params,lgmb_params,catb_params]

In [33]:
gs = GridSearchCV(pipe, lreg_params, scoring='accuracy', cv=5, n_jobs=-1)
gs.fit(X_train,y_train)

print(gs.best_score_)
print(gs.best_estimator_)
print(gs.best_params_)

h_lreg = gs.best_estimator_

0.5082474226804123
Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=10)),
                ('classifier', LogisticRegression(C=1, max_iter=1000))])
{'classifier': LogisticRegression(max_iter=1000), 'classifier__C': 1, 'classifier__penalty': 'l2'}


In [34]:
gs = GridSearchCV(pipe, rf_params, scoring='accuracy', cv=5 ,n_jobs=-1)
gs.fit(X_train,y_train)

print(gs.best_score_)
print(gs.best_estimator_)
print(gs.best_params_)

h_rf = gs.best_estimator_

0.49690721649484537
Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=10)),
                ('classifier',
                 RandomForestClassifier(max_depth=5, min_samples_leaf=5))])
{'classifier': RandomForestClassifier(), 'classifier__criterion': 'gini', 'classifier__max_depth': 5, 'classifier__min_samples_leaf': 5}


In [35]:
gs = GridSearchCV(pipe, xgb_params, scoring='accuracy', cv=5 ,n_jobs=-1)
gs.fit(X_train,y_train)

print(gs.best_score_)
print(gs.best_estimator_)
print(gs.best_params_)

h_xgb = gs.best_estimator_

0.4731958762886597
Pipeline(steps=[('scaler', None), ('pca', PCA(n_components=10)),
                ('classifier',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=0.8, device=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, feature_weights=None,
                               gamma=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=0.1,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=5, max_leaves=None,
                               min_child_weight=None, mi

In [36]:
gs = GridSearchCV(pipe, lgmb_params, scoring='accuracy', cv=5 ,n_jobs=-1)
gs.fit(X_train,y_train)

print(gs.best_score_)
print(gs.best_estimator_)
print(gs.best_params_)

h_lgmb = gs.best_estimator_

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 970, number of used features: 10
[LightGBM] [Info] Start training from score -1.275177
[LightGBM] [Info] Start training from score -1.308952
[LightGBM] [Info] Start training from score -0.797363
0.4484536082474227
Pipeline(steps=[('scaler', None), ('pca', PCA(n_components=10)),
                ('classifier', LGBMClassifier(max_depth=7, subsample=0.8))])
{'classifier': LGBMClassifier(), 'classifier__learning_rate': 0.1, 'classifier__max_depth': 7, 'classifier__n_estimators': 100, 'classifier__num_leaves': 31, 'classifier__subsample': 0.8, 'scaler': None}


In [37]:
gs = GridSearchCV(pipe, catb_params, scoring='accuracy', cv=5 ,n_jobs=-1)
gs.fit(X_train,y_train)

print(gs.best_score_)
print(gs.best_estimator_)
print(gs.best_params_)

h_catb = gs.best_estimator_

0.48453608247422686
Pipeline(steps=[('scaler', None), ('pca', PCA(n_components=10)),
                ('classifier',
                 <catboost.core.CatBoostClassifier object at 0x000001C78F145D60>)])
{'classifier': <catboost.core.CatBoostClassifier object at 0x000001C792DFF8C0>, 'classifier__depth': 5, 'classifier__iterations': 100, 'classifier__learning_rate': 0.1, 'scaler': None}


In [None]:
# print(gs.best_score_)
# print(gs.best_estimator_)
# print(gs.best_params_)

In [None]:
# h_model = gs.best_estimator_

In [44]:
h_rf.named_steps['classifier'].feature_importances_

array([0.29538919, 0.09169334, 0.06656672, 0.12892432, 0.08409509,
       0.05216115, 0.07693606, 0.08458866, 0.0672276 , 0.05241786])

In [None]:
# feat_importance = h_model.named_steps['classifier'].feature_importances_

# pd.DataFrame({
#     'Feature': X.columns,
#     "importance": feat_importance
# }).sort_values(by='importance', ascending=False)

In [46]:
test_pred = h_rf.predict(test.drop(columns=['fixture_id','league','date','home_team','away_team','home_score','away_score','league_season','home_result']))
# test_pred = h_model.predict(test[['scoring_diff_last_3','scoring_diff_last_5','scoring_diff_last_10','scoring_diff_season','points_diff_last_3','points_diff_last_5','points_diff_last_10', 'streak_diff']])
test_pred



array([2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2,
       2, 0, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2,
       2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 0, 0, 2,
       0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 0, 2, 2, 2, 0, 2,
       2, 2, 0, 2, 2, 0, 0, 0, 0, 2, 2, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 0,
       2, 2, 0, 2, 2, 2, 2, 0])

In [48]:
print(f'accuracy: {accuracy_score(test['home_result'], test_pred)}')

accuracy: 0.5357142857142857


In [None]:
# pickle.dump(h_model, open('../models/model_model.pkl', 'wb'))