In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load the prepared match dataset from a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./src/final_dataset.csv')

In [3]:
# Inspect the available column names; as the cell's last expression the Index is rendered as output.
df.columns

Index(['date', 'season', 'league_name', 'team_long_name_home',
       'team_short_name_home', 'team_short_name_away', 'team_long_name_away',
       'home_team_goal', 'away_team_goal', 'match_result',
       'home_players_average_rating', 'away_players_average_rating',
       'home_players_average_score', 'away_players_average_score',
       'away_team_score', 'home_team_score',
       'home_team_prob_receive_red_cards', 'away_team_prob_receive_red_cards',
       'avg_bet_home', 'avg_bet_draw', 'avg_bet_away'],
      dtype='object')

Признаки (входные столбцы), используемые для обучения моделей

In [4]:
# Input columns used by the models below. 'match_result' is the target and is
# selected separately; the goal columns are not part of this list
# (presumably because they determine the result directly — confirm).
# NOTE: the order of entries is kept exactly as-is, since it defines the column
# order of the feature matrix handed to the estimators.
columns_to_test = [
    # categorical: dropped for per-league models, one-hot encoded for the shared model
    'league_name',
    # player-level aggregates
    'home_players_average_rating',
    'away_players_average_rating',
    'home_players_average_score',
    'away_players_average_score',
    # team-level aggregates
    'away_team_score',
    'away_team_prob_receive_red_cards',
    'home_team_score',
    'home_team_prob_receive_red_cards',
    # averaged betting columns
    'avg_bet_home',
    'avg_bet_draw',
    'avg_bet_away',
]

Получим список уникальных лиг

In [5]:
# Distinct league names, in order of first appearance in the dataset.
leagues = df.loc[:, 'league_name'].unique()

Создадим модель для каждой лиги

In [6]:
# Train one independent random forest per league and report its hold-out accuracy.
# The fitted estimators are kept in `models`, keyed by league name.
models = {}

# 'league_name' is constant inside a single league, so it is left out of the
# per-league feature matrix; the remaining columns keep their original order.
per_league_features = [col for col in columns_to_test if col != 'league_name']

for league in leagues:
    in_league = df['league_name'] == league
    league_data = df.loc[in_league]

    X = league_data[per_league_features]
    y = league_data['match_result']

    # Fixed seed so the 75/25 split (and the accuracies printed below) are reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        random_state=42,
        test_size=0.25,
    )

    model = RandomForestClassifier(random_state=42, n_estimators=100)
    model.fit(X_train, y_train)
    models[league] = model

    y_pred = model.predict(X_test)
    print(f'Accuracy for {league}:', accuracy_score(y_test, y_pred))

Accuracy for Belgium Jupiler League: 0.502283105022831
Accuracy for England Premier League: 0.518783542039356
Accuracy for France Ligue 1: 0.458029197080292
Accuracy for Germany 1. Bundesliga: 0.4519650655021834
Accuracy for Italy Serie A: 0.5181644359464627
Accuracy for Netherlands Eredivisie: 0.49076517150395776
Accuracy for Portugal Liga ZON Sagres: 0.4808510638297872
Accuracy for Scotland Premier League: 0.44525547445255476
Accuracy for Spain LIGA BBVA: 0.5223880597014925


Единая модель для всех лиг: лига включена как категориальный признак (one-hot кодирование)

In [7]:
# Preprocessing for the single all-leagues model: one-hot encode the categorical
# 'league_name' column and pass every other input column through unchanged.
#
# handle_unknown='ignore' makes transform() encode a league that was not present
# in the fitted data as an all-zero vector instead of raising an error. Without
# it, predict() fails whenever a league ends up only in the test split or a new
# league is scored later. When every league was seen during fit, the encoding is
# identical to the default behaviour.
column_transformer = ColumnTransformer(
    transformers=[
        ('league_encoder', OneHotEncoder(handle_unknown='ignore'), ['league_name']),
    ],
    remainder='passthrough',
)

In [8]:
# End-to-end estimator: preprocessing first, then a random forest with a fixed
# seed so repeated runs give the same model.
pipeline = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', RandomForestClassifier(random_state=42, n_estimators=100)),
    ]
)

In [9]:
# Feature matrix (all selected columns, including 'league_name') and target
# for the shared all-leagues model.
X = df.loc[:, columns_to_test]
y = df.loc[:, 'match_result']

In [10]:
# Hold out 25% of the matches for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

In [11]:
# Fit the preprocessing and the classifier on the training split only.
# Kept as the cell's last expression so the fitted pipeline is displayed.
pipeline.fit(X=X_train, y=y_train)

In [12]:
# Predicted match results for the held-out split.
y_pred = pipeline.predict(X=X_test)

In [13]:
# Hold-out metrics for the shared model. Precision and recall are macro-averaged,
# i.e. every class contributes equally regardless of how often it occurs.
overall_report = (
    ('Overall accuracy:', accuracy_score(y_test, y_pred)),
    ('Overall precision:', precision_score(y_test, y_pred, average='macro')),
    ('Overall recall:', recall_score(y_test, y_pred, average='macro')),
)
for label, value in overall_report:
    print(label, value)

Overall accuracy: 0.5164923572003218
Overall precision: 0.4577090545928757
Overall recall: 0.45111701977391405
