In [1]:
import pandas as pd

matches = pd.read_csv("matches_3classes.csv", index_col=0)

In [2]:
from sklearn import preprocessing

In [3]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

#matches[predictors] = preprocessing.scale(matches[predictors]) 

train = matches[matches["date"] < '2024-01-01']
test = matches[matches["date"] > '2024-01-01']

In [4]:
matches_scaled = matches.copy()
matches_scaled[predictors] = preprocessing.scale(matches_scaled[predictors]) 

train_scaled = matches_scaled[matches_scaled["date"] < '2024-01-01']
test_scaled = matches_scaled[matches_scaled["date"] > '2024-01-01']

In [5]:
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import (confusion_matrix,
                             accuracy_score,
                             precision_score,
                             recall_score,
                             f1_score,
                            )

In [6]:
param_grid = {'max_iter': [1000],
              'random_state': [1, ],
              'solver': ['lbfgs', 'liblinear'],  
              'penalty': ['l2', 'l1'], 
              'class_weight': ['balanced', {0:2, 1:1}, {0:3, 1:1}, {0:4, 1:1}, {0:3, 1:2},
                              {1:2, 0:1}, {1:3, 0:1}, {1:4, 0:1}]
             }

In [13]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def evaluation_(model, train, test, param_grid):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=50)
    grid_search.fit(train[predictors], train['target'])

    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Evaluate the best model
    preds_best = best_model.predict(test[predictors])
    accuracy_best = accuracy_score(test['target'], preds_best)
    precision_best = precision_score(test["target"], preds_best)
    recall_best = recall_score(test["target"], preds_best)
    f1score_best = f1_score(test["target"], preds_best)
    print(f"Best model Accuracy: {accuracy_best:.5f}")
    print(f"Best model Precision: {precision_best:.5f}")
    print(f"Best model Recall: {recall_best:.5f}")
    print(f"Best model F1score: {f1score_best:.5f}")
    print(f"Best Hyperparameters: {best_params}")
    
def random_evaluation_(model, train, test, parameters):
    ran_search = RandomizedSearchCV(estimator=model, param_distributions=parameters,
                                    cv = 2, n_iter = 30, n_jobs=-1
                                   )
    ran_search.fit(train[predictors], train['target'])

    best_params = ran_search.best_params_
    best_model = ran_search.best_estimator_

    # Evaluate the best model
    preds_best = best_model.predict(test[predictors])
    accuracy_best = accuracy_score(test['target'], preds_best)
    precision_best = precision_score(test["target"], preds_best, average='weighted')
    recall_best = recall_score(test["target"], preds_best, average='weighted')
    f1score_best = f1_score(test["target"], preds_best, average='weighted')
    print(f"Best model Accuracy: {accuracy_best:.5f}")
    print(f"Best model Precision: {precision_best:.5f}")
    print(f"Best model Recall: {recall_best:.5f}")
    print(f"Best model F1score: {f1score_best:.5f}")
    print(f"Best Hyperparameters: {best_params}")

In [14]:
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

In [15]:
rf = RandomForestClassifier()
rf_param = {'n_estimators': sp_randInt(100, 1000), 
            'min_samples_split': sp_randInt(2, 20), 
            'random_state': sp_randInt(1, 5),
            'min_samples_leaf': sp_randInt(1, 10),
            'max_features': sp_randInt(2, 20),
            'max_depth': sp_randInt(2, 20),
            
           }
random_evaluation_(rf, train, test, rf_param)

Best model Accuracy: 0.44375
Best model Precision: 0.37841
Best model Recall: 0.44375
Best model F1score: 0.38628
Best Hyperparameters: {'max_depth': 15, 'max_features': 16, 'min_samples_leaf': 7, 'min_samples_split': 16, 'n_estimators': 391, 'random_state': 1}


In [16]:
dt = DecisionTreeClassifier(criterion='entropy',
                            max_depth=10,)
dt_param = {'criterion': ['gini','entropy'],
            'max_depth': sp_randInt(2, 20),
           }
random_evaluation_(dt, train, test, dt_param)

Best model Accuracy: 0.37188
Best model Precision: 0.32167
Best model Recall: 0.37188
Best model F1score: 0.30219
Best Hyperparameters: {'criterion': 'gini', 'max_depth': 4}


In [17]:
svc = SVC()

svc_param = {'C': sp_randFloat(0.01, 100),  
             'gamma': sp_randFloat(0.01, 100), 
             'kernel': ['linear', 'rbf'],
             'random_state':sp_randInt(0, 5),
             
             } 
random_evaluation_(svc, train_scaled, test_scaled, svc_param)

Best model Accuracy: 0.44375
Best model Precision: 0.32172
Best model Recall: 0.44375
Best model F1score: 0.37301
Best Hyperparameters: {'C': 96.66964393682444, 'gamma': 52.629165518744415, 'kernel': 'linear', 'random_state': 1}


  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
import numpy as np

lr = LogisticRegression()

l1 = np.arange(10, 1)
l2 = np.arange(10, 1)
class_weight = [{0:i,1:j} for i,j in zip(l1,l2)]

lr_param = {'max_iter':[1000],
            'random_state':sp_randInt(1, 5),
            'solver':['lbfgs','liblinear'] ,
            'penalty':['l2','l1'],
            'class_weight':[{0:1, 1:1}],
           } 
random_evaluation_(lr, train_scaled, test_scaled, lr_param)

Best model Accuracy: 0.44375
Best model Precision: 0.32172
Best model Recall: 0.44375
Best model F1score: 0.37301
Best Hyperparameters: {'class_weight': {0: 1, 1: 1}, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 3, 'solver': 'liblinear'}


8 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Hoang Hiep\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Hoang Hiep\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Hoang Hiep\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [19]:
knn = KNeighborsClassifier()

knn_param = {'n_neighbors':sp_randInt(2,50),
            } 
random_evaluation_(knn, train_scaled, test_scaled, knn_param)

Best model Accuracy: 0.36875
Best model Precision: 0.26980
Best model Recall: 0.36875
Best model F1score: 0.31157
Best Hyperparameters: {'n_neighbors': 28}


In [20]:
xgb = GradientBoostingClassifier()
xgb_param = {'n_estimators': sp_randInt(100, 1000), 
             'min_samples_leaf': sp_randInt(2, 10),
             'min_samples_split': sp_randInt(5, 100),
             'max_features': sp_randInt(2, 10),
             'max_depth': sp_randInt(2, 10),
             'learning_rate': [0.1, 1, 2],
             'subsample': [0.5, 0.7, 0.8, 1],
             'random_state':sp_randInt(1, 5),
            } 
random_evaluation_(xgb, train, test, xgb_param)

Best model Accuracy: 0.45937
Best model Precision: 0.43061
Best model Recall: 0.45937
Best model F1score: 0.42133
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'max_features': 4, 'min_samples_leaf': 7, 'min_samples_split': 93, 'n_estimators': 806, 'random_state': 1, 'subsample': 0.5}


In [21]:
cb = CatBoostClassifier()
cb_param = {'n_estimators': sp_randInt(100, 1000),
            'learning_rate': sp_randFloat(0.01, 0.05),
            #'iterations':sp_randInt(5, 30),
            'depth':sp_randInt(4, 10),
            'verbose':sp_randInt(0, 5),
            'random_state':sp_randInt(1, 5),
            } 
random_evaluation_(cb, train, test, cb_param)

0:	learn: 1.0970274	total: 144ms	remaining: 1m 34s
2:	learn: 1.0934348	total: 145ms	remaining: 31.8s
4:	learn: 1.0897113	total: 146ms	remaining: 19.1s
6:	learn: 1.0862910	total: 147ms	remaining: 13.7s
8:	learn: 1.0827386	total: 148ms	remaining: 10.7s
10:	learn: 1.0792587	total: 149ms	remaining: 8.8s
12:	learn: 1.0759288	total: 150ms	remaining: 7.47s
14:	learn: 1.0721174	total: 151ms	remaining: 6.5s
16:	learn: 1.0688052	total: 152ms	remaining: 5.75s
18:	learn: 1.0657876	total: 153ms	remaining: 5.16s
20:	learn: 1.0622677	total: 154ms	remaining: 4.69s
22:	learn: 1.0596039	total: 155ms	remaining: 4.29s
24:	learn: 1.0567115	total: 156ms	remaining: 3.96s
26:	learn: 1.0538244	total: 157ms	remaining: 3.68s
28:	learn: 1.0510040	total: 158ms	remaining: 3.44s
30:	learn: 1.0488851	total: 159ms	remaining: 3.23s
32:	learn: 1.0466616	total: 160ms	remaining: 3.04s
34:	learn: 1.0444780	total: 161ms	remaining: 2.87s
36:	learn: 1.0423144	total: 161ms	remaining: 2.72s
38:	learn: 1.0402410	total: 162ms	rem

418:	learn: 0.8966238	total: 324ms	remaining: 187ms
420:	learn: 0.8963174	total: 325ms	remaining: 185ms
422:	learn: 0.8959178	total: 327ms	remaining: 184ms
424:	learn: 0.8952811	total: 327ms	remaining: 182ms
426:	learn: 0.8946748	total: 328ms	remaining: 180ms
428:	learn: 0.8942116	total: 329ms	remaining: 178ms
430:	learn: 0.8938092	total: 330ms	remaining: 176ms
432:	learn: 0.8931789	total: 331ms	remaining: 174ms
434:	learn: 0.8926482	total: 334ms	remaining: 173ms
436:	learn: 0.8921538	total: 335ms	remaining: 172ms
438:	learn: 0.8916999	total: 336ms	remaining: 170ms
440:	learn: 0.8911408	total: 337ms	remaining: 168ms
442:	learn: 0.8909874	total: 338ms	remaining: 166ms
444:	learn: 0.8903971	total: 339ms	remaining: 164ms
446:	learn: 0.8899760	total: 340ms	remaining: 163ms
448:	learn: 0.8895627	total: 341ms	remaining: 161ms
450:	learn: 0.8891129	total: 341ms	remaining: 159ms
452:	learn: 0.8886624	total: 342ms	remaining: 157ms
454:	learn: 0.8881744	total: 343ms	remaining: 155ms
456:	learn: 