In [48]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [49]:
# Load the prepared match dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./src/final_dataset.csv')

In [50]:
# Display the column index of the loaded frame to see which fields are available.
df.columns

Index(['date', 'season', 'league_name', 'match_result', 'team_long_name_home',
       'team_short_name_home', 'home_team_goal', 'home_players_average_rating',
       'home_players_average_score', 'home_team_score',
       'home_team_prob_receive_red_cards', 'home_team_prob_case_win',
       'team_long_name_away', 'team_short_name_away', 'away_team_goal',
       'away_players_average_rating', 'away_players_average_score',
       'away_team_score', 'away_team_prob_receive_red_cards',
       'away_team_prob_case_win', 'home_avg_bet', 'draw_avg_bet',
       'away_avg_bet'],
      dtype='object')

Признаки для обучения

In [51]:
# Columns selected from the dataset as model inputs, grouped by feature family.
# 'league_name' is the only non-rating/probability/odds column in the list.
columns_to_test = [
    'league_name',
    # average player ratings per side
    'home_players_average_rating', 'away_players_average_rating',
    # team-level scores per side
    'home_team_score', 'away_team_score',
    # win-probability columns per side
    'home_team_prob_case_win', 'away_team_prob_case_win',
    # red-card probability columns per side (away listed first, as in the original order)
    'away_team_prob_receive_red_cards', 'home_team_prob_receive_red_cards',
    # average betting columns for the three outcomes
    'home_avg_bet', 'draw_avg_bet', 'away_avg_bet',
]

Получим список уникальных лиг

In [52]:
# Distinct league names, in order of first appearance in the dataset.
leagues = pd.unique(df['league_name'])

Создадим модель для каждой лиги

In [53]:
# Search space for the per-league random forests (keys are RandomForestClassifier arguments).
param_distributions_for_leagues = dict(
    n_estimators=np.arange(100, 501, 50),  # 100, 150, ..., 500
    max_features=['sqrt', 'log2'],
    max_depth=np.arange(3, 12),            # 3 through 11 inclusive
)

In [54]:
# Fitted search objects, keyed by league name.
models = {}

for league in leagues:
    # Keep only this league's matches; the league label is constant here, so it is dropped.
    league_data = df.loc[df['league_name'] == league]
    X = league_data.loc[:, columns_to_test].drop(columns='league_name')
    y = league_data['match_result']

    # Hold out a quarter of the league's matches for the accuracy report printed below.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )

    # Randomized search (10 sampled candidates, 5-fold CV) over the shared search space.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    updated_model = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_distributions_for_leagues,
        n_iter=10,
        cv=5,
        scoring='accuracy',
        random_state=42,
        error_score='raise',
        n_jobs=-1,
    )
    updated_model.fit(X_train, y_train)
    models[league] = updated_model

    # Evaluate the best estimator found by the search on the held-out matches.
    best_model = updated_model.best_estimator_
    y_pred = best_model.predict(X_test)
    print(f'Accuracy for {league}:', accuracy_score(y_test, y_pred))

Accuracy for Belgium Jupiler League: 0.547945205479452
Accuracy for England Premier League: 0.5831842576028623
Accuracy for France Ligue 1: 0.49452554744525545
Accuracy for Germany 1. Bundesliga: 0.4868995633187773
Accuracy for Italy Serie A: 0.5602294455066922
Accuracy for Netherlands Eredivisie: 0.5118733509234829
Accuracy for Portugal Liga ZON Sagres: 0.5276595744680851
Accuracy for Scotland Premier League: 0.5
Accuracy for Spain LIGA BBVA: 0.5503731343283582


Включение лиги как признака

Предобработка

In [55]:
# One-hot encode the league label and pass every other selected column through unchanged.
# handle_unknown='ignore' makes a league that was absent from the fitted data encode as
# an all-zero row instead of raising at transform time; with the default ('error') a
# single such row would abort cross-validation (the search uses error_score='raise')
# or any later prediction on a new league.
column_transformer = ColumnTransformer(
    transformers=[
        ('league_encoder', OneHotEncoder(handle_unknown='ignore'), ['league_name']),
    ],
    remainder='passthrough',
)

In [56]:
# Two-step pipeline: column preprocessing followed by the random forest classifier.
# The step names are referenced by the 'classifier__*' keys of the search space.
pipeline = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

Параметры для RandomizedSearchCV

In [57]:
# Search space for the pooled pipeline; the 'classifier__' prefix routes each
# setting to the pipeline step named 'classifier'.
param_distributions = dict([
    ('classifier__n_estimators', np.arange(100, 501, 50)),  # 100, 150, ..., 500
    ('classifier__max_features', ['sqrt', 'log2']),
    ('classifier__max_depth', np.arange(3, 12)),            # 3 through 11 inclusive
])

Настройка RandomizedSearchCV

In [58]:
# Randomized search over the whole pipeline: 10 sampled candidates, 5-fold CV, scored by accuracy.
# error_score='raise' surfaces any failing fit immediately instead of recording a NaN score.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
    error_score='raise',
    n_jobs=-1,
    verbose=1,
)

In [59]:
# Pooled data across all leagues: the selected feature columns (league label included) and the target.
X, y = df[columns_to_test], df['match_result']

In [60]:
# Hold out 25% of all matches for the final evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [61]:
# Run the search on the training part only; the held-out part is not touched here.
random_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [62]:
# Take the best pipeline found by the search and predict outcomes for the held-out matches.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X=X_test)

In [63]:
# Report held-out metrics; precision and recall are macro-averaged over the result classes.
metric_specs = [
    ('Overall accuracy:', accuracy_score, {}),
    ('Overall precision:', precision_score, {'average': 'macro'}),
    ('Overall recall:', recall_score, {'average': 'macro'}),
]
for label, metric, extra_kwargs in metric_specs:
    print(label, metric(y_test, y_pred, **extra_kwargs))

Overall accuracy: 0.5395548404397962
Overall precision: 0.45340387262296306
Overall recall: 0.45471321416974186
