In [1]:
import pandas as pd

matches = pd.read_csv("matches_2classes.csv", index_col=0)

In [2]:
from sklearn import preprocessing

In [3]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

#matches[predictors] = preprocessing.scale(matches[predictors]) 

train = matches[matches["date"] < '2024-01-01']
test = matches[matches["date"] > '2024-01-01']

In [4]:
matches_scaled = matches.copy()
matches_scaled[predictors] = preprocessing.scale(matches_scaled[predictors]) 

train_scaled = matches_scaled[matches_scaled["date"] < '2024-01-01']
test_scaled = matches_scaled[matches_scaled["date"] > '2024-01-01']

In [5]:
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import (confusion_matrix,
                             accuracy_score,
                             precision_score,
                             recall_score,
                             f1_score,
                            )

In [6]:
param_grid = {'max_iter': [1000],
              'random_state': [1, ],
              'solver': ['lbfgs', 'liblinear'],  
              'penalty': ['l2', 'l1'], 
              'class_weight': ['balanced', {0:2, 1:1}, {0:3, 1:1}, {0:4, 1:1}, {0:3, 1:2},
                              {1:2, 0:1}, {1:3, 0:1}, {1:4, 0:1}]
             }

In [46]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def evaluation_(model, train, test, param_grid):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=50)
    grid_search.fit(train[predictors], train['target'])

    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Evaluate the best model
    preds_best = best_model.predict(test[predictors])
    accuracy_best = accuracy_score(test['target'], preds_best)
    precision_best = precision_score(test["target"], preds_best)
    recall_best = recall_score(test["target"], preds_best)
    f1score_best = f1_score(test["target"], preds_best)
    print(f"Best model Accuracy: {accuracy_best:.5f}")
    print(f"Best model Precision: {precision_best:.5f}")
    print(f"Best model Recall: {recall_best:.5f}")
    print(f"Best model F1score: {f1score_best:.5f}")
    print(f"Best Hyperparameters: {best_params}")
    
def random_evaluation_(model, train, test, parameters):
    ran_search = RandomizedSearchCV(estimator=model, param_distributions=parameters,
                                    cv = 2, n_iter = 30, n_jobs=-1
                                   )
    ran_search.fit(train[predictors], train['target'])

    best_params = ran_search.best_params_
    best_model = ran_search.best_estimator_

    # Evaluate the best model
    preds_best = best_model.predict(test[predictors])
    accuracy_best = accuracy_score(test['target'], preds_best)
    precision_best = precision_score(test["target"], preds_best)
    recall_best = recall_score(test["target"], preds_best)
    f1score_best = f1_score(test["target"], preds_best)
    print(f"Best model Accuracy: {accuracy_best:.5f}")
    print(f"Best model Precision: {precision_best:.5f}")
    print(f"Best model Recall: {recall_best:.5f}")
    print(f"Best model F1score: {f1score_best:.5f}")
    print(f"Best Hyperparameters: {best_params}")

In [8]:
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

In [56]:
rf = RandomForestClassifier()
rf_param = {'n_estimators': sp_randInt(100, 1000), 
            'min_samples_split': sp_randInt(2, 20), 
            'random_state': sp_randInt(1, 5),
            'min_samples_leaf': sp_randInt(1, 10),
            'max_features': sp_randInt(2, 20),
            'max_depth': sp_randInt(2, 20),
            
           }
random_evaluation_(rf, train, test, rf_param)

Best model Accuracy: 0.62813
Best model Precision: 0.48571
Best model Recall: 0.43966
Best model F1score: 0.46154
Best Hyperparameters: {'max_depth': 19, 'max_features': 14, 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 634, 'random_state': 4}


In [57]:
dt = DecisionTreeClassifier(criterion='entropy',
                            max_depth=10,)
dt_param = {'criterion': ['gini','entropy'],
            'max_depth': sp_randInt(2, 20),
           }
random_evaluation_(dt, train, test, dt_param)

Best model Accuracy: 0.58437
Best model Precision: 0.42735
Best model Recall: 0.43103
Best model F1score: 0.42918
Best Hyperparameters: {'criterion': 'gini', 'max_depth': 18}


In [58]:
svc = SVC()

svc_param = {'C': sp_randFloat(0.01, 100),  
             'gamma': sp_randFloat(0.01, 100), 
             'kernel': ['linear', 'rbf'],
             'random_state':sp_randInt(0, 5),
             
             } 
random_evaluation_(svc, train_scaled, test_scaled, svc_param)

Best model Accuracy: 0.59062
Best model Precision: 0.39437
Best model Recall: 0.24138
Best model F1score: 0.29947
Best Hyperparameters: {'C': 5.357879515778587, 'gamma': 96.38124757217304, 'kernel': 'rbf', 'random_state': 0}


In [59]:
import numpy as np

lr = LogisticRegression()

l1 = np.arange(10, 1)
l2 = np.arange(10, 1)
class_weight = [{0:i,1:j} for i,j in zip(l1,l2)]

lr_param = {'max_iter':[1000],
            'random_state':sp_randInt(1, 5),
            'solver':['lbfgs','liblinear'] ,
            'penalty':['l2','l1'],
            'class_weight':[{0:1, 1:1}],
           } 
random_evaluation_(lr, train_scaled, test_scaled, lr_param)

Best model Accuracy: 0.65938
Best model Precision: 0.57143
Best model Recall: 0.24138
Best model F1score: 0.33939
Best Hyperparameters: {'class_weight': {0: 1, 1: 1}, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 4, 'solver': 'lbfgs'}


14 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
14 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Hoang Hiep\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Hoang Hiep\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Hoang Hiep\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [60]:
knn = KNeighborsClassifier()

knn_param = {'n_neighbors':sp_randInt(2,50),
            } 
random_evaluation_(knn, train_scaled, test_scaled, knn_param)

Best model Accuracy: 0.59062
Best model Precision: 0.40000
Best model Recall: 0.25862
Best model F1score: 0.31414
Best Hyperparameters: {'n_neighbors': 28}


In [61]:
xgb = GradientBoostingClassifier()
xgb_param = {'n_estimators': sp_randInt(100, 1000), 
             'min_samples_leaf': sp_randInt(2, 10),
             'min_samples_split': sp_randInt(5, 100),
             'max_features': sp_randInt(2, 10),
             'max_depth': sp_randInt(2, 10),
             'learning_rate': [0.1, 1, 2],
             'subsample': [0.5, 0.7, 0.8, 1],
             'random_state':sp_randInt(1, 5),
            } 
random_evaluation_(xgb, train, test, xgb_param)

Best model Accuracy: 0.61875
Best model Precision: 0.47656
Best model Recall: 0.52586
Best model F1score: 0.50000
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 8, 'max_features': 9, 'min_samples_leaf': 3, 'min_samples_split': 92, 'n_estimators': 585, 'random_state': 4, 'subsample': 1}


In [62]:
cb = CatBoostClassifier()
cb_param = {'n_estimators': sp_randInt(100, 1000),
            'learning_rate': sp_randFloat(0.01, 0.05),
            #'iterations':sp_randInt(5, 30),
            'depth':sp_randInt(4, 10),
            'verbose':sp_randInt(0, 5),
            'random_state':sp_randInt(1, 5),
            } 
random_evaluation_(cb, train, test, cb_param)

Best model Accuracy: 0.61250
Best model Precision: 0.46491
Best model Recall: 0.45690
Best model F1score: 0.46087
Best Hyperparameters: {'depth': 4, 'learning_rate': 0.03593524093347264, 'n_estimators': 339, 'random_state': 2, 'verbose': 0}
