In [1]:
import pandas as pd

matches = pd.read_csv("matches_3classes_rolling.csv", index_col=0)

In [2]:
from sklearn import preprocessing

In [3]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

predictors += new_cols
#matches[predictors] = preprocessing.scale(matches[predictors]) 

train = matches[matches["date"] < '2024-01-01']
test = matches[matches["date"] > '2024-01-01']

In [4]:
matches_scaled = matches.copy()
matches_scaled[predictors] = preprocessing.scale(matches_scaled[predictors]) 

train_scaled = matches_scaled[matches_scaled["date"] < '2024-01-01']
test_scaled = matches_scaled[matches_scaled["date"] > '2024-01-01']

In [5]:
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import (confusion_matrix,
                             accuracy_score,
                             precision_score,
                             recall_score,
                             f1_score,
                            )

In [6]:
param_grid = {'max_iter': [1000],
              'random_state': [1, ],
              'solver': ['lbfgs', 'liblinear'],  
              'penalty': ['l2', 'l1'], 
              'class_weight': ['balanced', {0:2, 1:1}, {0:3, 1:1}, {0:4, 1:1}, {0:3, 1:2},
                              {1:2, 0:1}, {1:3, 0:1}, {1:4, 0:1}]
             }

In [7]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def evaluation_(model, train, test, param_grid):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=50)
    grid_search.fit(train[predictors], train['target'])

    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Evaluate the best model
    preds_best = best_model.predict(test[predictors])
    accuracy_best = accuracy_score(test['target'], preds_best)
    precision_best = precision_score(test["target"], preds_best)
    recall_best = recall_score(test["target"], preds_best)
    f1score_best = f1_score(test["target"], preds_best)
    print(f"Best model Accuracy: {accuracy_best:.5f}")
    print(f"Best model Precision: {precision_best:.5f}")
    print(f"Best model Recall: {recall_best:.5f}")
    print(f"Best model F1score: {f1score_best:.5f}")
    print(f"Best Hyperparameters: {best_params}")
    
def random_evaluation_(model, train, test, parameters):
    ran_search = RandomizedSearchCV(estimator=model, param_distributions=parameters,
                                    cv = 2, n_iter = 30, n_jobs=-1
                                   )
    ran_search.fit(train[predictors], train['target'])

    best_params = ran_search.best_params_
    best_model = ran_search.best_estimator_

    # Evaluate the best model
    preds_best = best_model.predict(test[predictors])
    accuracy_best = accuracy_score(test['target'], preds_best)
    precision_best = precision_score(test["target"], preds_best, average='weighted', zero_division=0)
    recall_best = recall_score(test["target"], preds_best, average='weighted', zero_division=0)
    f1score_best = f1_score(test["target"], preds_best, average='weighted', zero_division=0)
    print(f"Best model Accuracy: {accuracy_best:.5f}")
    print(f"Best model Precision: {precision_best:.5f}")
    print(f"Best model Recall: {recall_best:.5f}")
    print(f"Best model F1score: {f1score_best:.5f}")
    print(f"Best Hyperparameters: {best_params}")

In [8]:
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

In [9]:
rf = RandomForestClassifier()
rf_param = {'n_estimators': sp_randInt(100, 1000), 
            'min_samples_split': sp_randInt(2, 20), 
            'random_state': sp_randInt(1, 5),
            'min_samples_leaf': sp_randInt(1, 10),
            'max_features': sp_randInt(2, 20),
            'max_depth': sp_randInt(2, 20),
            
           }
random_evaluation_(rf, train, test, rf_param)

Best model Accuracy: 0.49688
Best model Precision: 0.36082
Best model Recall: 0.49688
Best model F1score: 0.41727
Best Hyperparameters: {'max_depth': 5, 'max_features': 12, 'min_samples_leaf': 7, 'min_samples_split': 14, 'n_estimators': 865, 'random_state': 4}


In [10]:
dt = DecisionTreeClassifier(criterion='entropy',
                            max_depth=10,)
dt_param = {'criterion': ['gini','entropy'],
            'max_depth': sp_randInt(2, 20),
           }
random_evaluation_(dt, train, test, dt_param)

Best model Accuracy: 0.40625
Best model Precision: 0.38555
Best model Recall: 0.40625
Best model F1score: 0.36820
Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 5}


In [25]:
svc = SVC()

svc_param = {'C': sp_randFloat(0.01, 100),  
             'gamma': sp_randFloat(0.01, 100), 
             'kernel': ['linear', 'rbf'],
             'random_state':sp_randInt(0, 5),
             
             } 
random_evaluation_(svc, train_scaled, test_scaled, svc_param)

Best model Accuracy: 0.46875
Best model Precision: 0.34034
Best model Recall: 0.46875
Best model F1score: 0.39299
Best Hyperparameters: {'C': 96.73712975691015, 'gamma': 65.52467019338333, 'kernel': 'linear', 'random_state': 3}


In [24]:
import numpy as np

lr = LogisticRegression()

l1 = np.arange(10, 1)
l2 = np.arange(10, 1)
class_weight = [{0:i,1:j} for i,j in zip(l1,l2)]

lr_param = {'max_iter':[1000],
            'random_state':sp_randInt(1, 5),
            'solver':['lbfgs','liblinear'] ,
            'penalty':['l2','l1'],
            'class_weight':[{0:1, 1:1, 2:1}, {0:2, 1:1, 2:1}, {0:2, 1:1, 2:2}, {0:3, 1:1, 2:3}],
           } 
random_evaluation_(lr, train_scaled, test_scaled, lr_param)

Best model Accuracy: 0.44688
Best model Precision: 0.41266
Best model Recall: 0.44688
Best model F1score: 0.36730
Best Hyperparameters: {'class_weight': {0: 2, 1: 1, 2: 2}, 'max_iter': 1000, 'penalty': 'l1', 'random_state': 2, 'solver': 'liblinear'}


12 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Hoang Hiep\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Hoang Hiep\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Hoang Hiep\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [20]:
knn = KNeighborsClassifier()

knn_param = {'n_neighbors':sp_randInt(2,50),
            } 
random_evaluation_(knn, train_scaled, test_scaled, knn_param)

Best model Accuracy: 0.44375
Best model Precision: 0.43406
Best model Recall: 0.44375
Best model F1score: 0.38135
Best Hyperparameters: {'n_neighbors': 29}


In [14]:
xgb = GradientBoostingClassifier()
xgb_param = {'n_estimators': sp_randInt(100, 1000), 
             'min_samples_leaf': sp_randInt(2, 10),
             'min_samples_split': sp_randInt(5, 100),
             'max_features': sp_randInt(2, 10),
             'max_depth': sp_randInt(2, 10),
             'learning_rate': [0.1, 1, 2],
             'subsample': [0.5, 0.7, 0.8, 1],
             'random_state':sp_randInt(1, 5),
            } 
random_evaluation_(xgb, train, test, xgb_param)

Best model Accuracy: 0.50000
Best model Precision: 0.47639
Best model Recall: 0.50000
Best model F1score: 0.46507
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 6, 'max_features': 7, 'min_samples_leaf': 9, 'min_samples_split': 25, 'n_estimators': 256, 'random_state': 4, 'subsample': 0.5}


In [15]:
cb = CatBoostClassifier()
cb_param = {'n_estimators': sp_randInt(100, 1000),
            'learning_rate': sp_randFloat(0.01, 0.05),
            #'iterations':sp_randInt(5, 30),
            'depth':sp_randInt(4, 10),
            'verbose':sp_randInt(0, 5),
            'random_state':sp_randInt(1, 5),
            } 
random_evaluation_(cb, train, test, cb_param)

0:	learn: 1.0930424	total: 135ms	remaining: 1m 3s
2:	learn: 1.0828569	total: 137ms	remaining: 21.4s
4:	learn: 1.0735929	total: 139ms	remaining: 13s
6:	learn: 1.0647023	total: 140ms	remaining: 9.33s
8:	learn: 1.0566637	total: 142ms	remaining: 7.32s
10:	learn: 1.0499949	total: 144ms	remaining: 6.04s
12:	learn: 1.0428628	total: 145ms	remaining: 5.14s
14:	learn: 1.0358156	total: 147ms	remaining: 4.49s
16:	learn: 1.0294007	total: 149ms	remaining: 4s
18:	learn: 1.0217433	total: 151ms	remaining: 3.6s
20:	learn: 1.0161406	total: 153ms	remaining: 3.28s
22:	learn: 1.0114335	total: 154ms	remaining: 3.02s
24:	learn: 1.0073427	total: 156ms	remaining: 2.8s
26:	learn: 1.0026044	total: 158ms	remaining: 2.61s
28:	learn: 0.9991758	total: 160ms	remaining: 2.44s
30:	learn: 0.9955438	total: 162ms	remaining: 2.31s
32:	learn: 0.9916301	total: 163ms	remaining: 2.18s
34:	learn: 0.9877411	total: 165ms	remaining: 2.07s
36:	learn: 0.9833722	total: 167ms	remaining: 1.97s
38:	learn: 0.9793794	total: 169ms	remaining

Best model Accuracy: 0.46875
Best model Precision: 0.61511
Best model Recall: 0.46875
Best model F1score: 0.39783
Best Hyperparameters: {'depth': 4, 'learning_rate': 0.030855479723730178, 'n_estimators': 473, 'random_state': 1, 'verbose': 2}
