In [100]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [101]:
# Load the prepared match dataset from the project's src directory.
df = pd.read_csv(filepath_or_buffer='./src/final_dataset.csv')

In [102]:
# Show the available columns before choosing the model features.
df.columns

Index(['date', 'season', 'league_name', 'team_long_name_home',
       'team_short_name_home', 'team_short_name_away', 'team_long_name_away',
       'home_team_goal', 'away_team_goal', 'match_result',
       'home_players_average_rating', 'away_players_average_rating',
       'home_players_average_score', 'away_players_average_score',
       'away_team_score', 'home_team_score', 'avg_bet_home', 'avg_bet_draw',
       'avg_bet_away'],
      dtype='object')

Признаки для обучения

In [103]:
# Columns used as model inputs. Order matters: it fixes the column order of
# every feature matrix built from this list.
columns_to_test = [
    # League label (categorical; one-hot encoded or dropped further down).
    'league_name',
    # Per-side player aggregates.
    'home_players_average_rating',
    'away_players_average_rating',
    'home_players_average_score',
    'away_players_average_score',
    # Per-side team scores.
    'away_team_score',
    'home_team_score',
    # Presumably average bookmaker odds for home win / draw / away win -- TODO confirm.
    'avg_bet_home',
    'avg_bet_draw',
    'avg_bet_away',
]

Вытащим уникальные лиги

In [104]:
# Unique league names, in order of first appearance.
# Missing names are dropped first: Series.unique() would keep NaN, and a NaN
# label never compares equal to any row, so filtering the frame by it would
# yield an empty subset that cannot be split or fitted.
leagues = df['league_name'].dropna().unique()

Создадим модель для каждой лиги

In [105]:
# Train an independent random forest for every league and report its accuracy
# on a held-out quarter of that league's matches.
models = {}
for league in leagues:
    # Restrict the data to the current league.
    league_data = df.loc[df['league_name'] == league]

    # Features: everything in columns_to_test except the league label, which is
    # constant inside this subset and therefore carries no information.
    X = league_data[columns_to_test].drop('league_name', axis=1)
    y = league_data['match_result']

    # 75/25 train/test split with a fixed seed so the split is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=0.25
    )

    # fit() returns the fitted estimator itself, so it can be kept directly.
    model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
    models[league] = model

    y_pred = model.predict(X_test)
    print(f'Accuracy for {league}:', accuracy_score(y_test, y_pred))

Accuracy for Belgium Jupiler League: 0.5068493150684932
Accuracy for England Premier League: 0.5295169946332737
Accuracy for France Ligue 1: 0.45072992700729925
Accuracy for Germany 1. Bundesliga: 0.4388646288209607
Accuracy for Italy Serie A: 0.5200764818355641
Accuracy for Netherlands Eredivisie: 0.48284960422163586
Accuracy for Portugal Liga ZON Sagres: 0.5106382978723404
Accuracy for Scotland Premier League: 0.45985401459854014
Accuracy for Spain LIGA BBVA: 0.5


Включение лиги как признака

In [106]:
# One-hot encode the league label and pass every other feature column through
# unchanged.
# handle_unknown='ignore' makes the encoder emit an all-zero vector for a
# league it did not see during fit, instead of raising at transform/predict
# time (the default is handle_unknown='error').
column_transformer = ColumnTransformer(
    [('league_encoder', OneHotEncoder(handle_unknown='ignore'), ['league_name'])],
    remainder='passthrough')

In [107]:
# Chain preprocessing and the classifier so that both are fitted and applied
# together through a single estimator.
pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100)),
])

In [108]:
# Pooled data set over all leagues: the feature matrix keeps 'league_name'
# (it is encoded by the pipeline's preprocessor); the target is the match result.
X, y = df[columns_to_test], df['match_result']

In [109]:
# Hold out 25% of the pooled matches for evaluation; the fixed seed keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

In [110]:
# Fit the encoder and the random forest on the training part only.
pipeline.fit(X=X_train, y=y_train)

In [111]:
# Predict match results for the held-out matches.
y_pred = pipeline.predict(X=X_test)

In [112]:
# Evaluate the pooled model on the held-out matches. Precision and recall are
# macro-averaged, i.e. the unweighted mean over the result classes.
accuracy = accuracy_score(y_test, y_pred)
macro_precision = precision_score(y_test, y_pred, average='macro')
macro_recall = recall_score(y_test, y_pred, average='macro')

print('Overall accuracy:', accuracy)
print('Overall precision:', macro_precision)
print('Overall recall:', macro_recall)

Overall accuracy: 0.5073746312684366
Overall precision: 0.4322190288870453
Overall recall: 0.4384201487479882
