Importações iniciais: primeiro as bibliotecas para manipulação do DataFrame e, em seguida, as usadas para treinar o modelo.

In [None]:
import pandas as pd
import numpy as np
from features import create
from datetime import datetime
from features.required import REQUIRED_FEATURES

In [None]:
from model import calculate
import matplotlib.pyplot as plt
from xgboost import XGBRegressor, plot_importance
from model.config import grid_search, random_search
from sklearn.metrics import mean_absolute_error, mean_squared_error

Pré-processamento inicial dos dados.

In [None]:
# Load the raw events and parse the date column.
df = pd.read_csv(r'training_testing/dados/dados_eventos.csv')
df['date'] = pd.to_datetime(df['date'])

# drop_duplicates() and sort_values() return NEW frames; the results must be
# assigned back, otherwise the data stays duplicated and unsorted and the
# positional train/val/test split below is not chronological.
# kind='stable' keeps the file order for rows sharing the same date, and the
# index is rebuilt so it follows the chronological order.
df = (
    df.drop_duplicates()
    .sort_values(by='date', kind='stable')
    .reset_index(drop=True)
)
df.info()

# Translate the Portuguese column names to the English names used downstream.
df = df.rename(columns={
    'jogador_casa': 'home_player',
    'jogador_fora': 'away_player',
    'time_casa': 'home_team',
    'time_fora': 'away_team',
    'gols_casa': 'home_score',
    'gols_fora': 'away_score',
    'gols_totais': 'total_score',
})

Divisão dos dados em treino, validação e teste, e criação das features separadamente para cada conjunto.


In [None]:
# Positional split of df: first 60% of the rows -> train, next 20% ->
# validation, remaining rows -> test.
train_size, val_size = (int(fraction * len(df)) for fraction in (0.6, 0.2))
val_end = train_size + val_size

train_df, val_df, test_df = (
    df.iloc[:train_size],
    df.iloc[train_size:val_end],
    df.iloc[val_end:],
)

In [None]:
# Build the features for each split independently and drop rows that end up
# with missing values.
# TODO: normalize?

train_featured = create.features(
    data=train_df, live=False
).dropna()

# Validation rows may look back into the training rows.
val_featured = create.features(
    data=val_df, live=False,
    lookback_data=train_df
).dropna()

# Test rows look back into everything that precedes them (train + validation).
# The previous version passed [train_df, test_df], which skipped the
# validation rows and handed the test rows to themselves as history.
# NOTE(review): assumes lookback_data is meant to hold only rows that precede
# `data` -- confirm against create.features.
test_featured = create.features(
    data=test_df, live=False,
    lookback_data=pd.concat([train_df, val_df])
).dropna()

In [None]:
# Split each featured frame into the model inputs (X) and the target (y).

train_X = train_featured[REQUIRED_FEATURES]
train_y = train_featured['total_score']

# Validation must come from val_featured and test from test_featured; these
# two were swapped before, so the final model was trained on test rows and
# scored on validation rows.
val_X = val_featured[REQUIRED_FEATURES]
val_y = val_featured['total_score']

test_X = test_featured[REQUIRED_FEATURES]
test_y = test_featured['total_score']

# Split the validation set into two positional halves: the first half is
# appended to the training data of the final model, the second half is kept
# as its early-stopping evaluation set.
val_half = len(val_X) // 2
val_X_1, val_X_2 = val_X.iloc[:val_half], val_X.iloc[val_half:]
val_y_1, val_y_2 = val_y.iloc[:val_half], val_y.iloc[val_half:]

final_train_X = pd.concat([train_X, val_X_1])
final_train_y = pd.concat([train_y, val_y_1])


In [None]:
# Linearly increasing per-row weights (later rows weigh more): 0.5 -> 1 over
# the training rows and 0.7 -> 1 over the validation rows.
# NOTE(review): presumably rows are in chronological order so that recent
# events get the larger weights -- confirm.
pesos_treinamento = np.linspace(start=0.5, stop=1, num=train_X.shape[0])
pesos_validacao = np.linspace(start=0.7, stop=1, num=val_X.shape[0])

Treinamento do Modelo baseado em Random Search

In [None]:
# Hyperparameter search on the training split, with per-row weights and the
# first validation half as evaluation set.
# NOTE(review): random_search comes from model.config; presumably a
# scikit-learn style search object -- confirm there.
random_search.fit(
    train_X,
    train_y,
    sample_weight=pesos_treinamento,
    eval_set=[(val_X_1, val_y_1)],
    verbose=2,
)

best_random_model, best_random_params = (
    random_search.best_estimator_,
    random_search.best_params_,
)

# Fresh Poisson regressor using the best parameters found by the search.
final_random_model = XGBRegressor(
    objective='count:poisson',
    eval_metric='poisson-nloglik',
    tree_method='hist',
    early_stopping_rounds=50,
    random_state=42,
    **best_random_params,
)

# Fit on train + first validation half; the second validation half is the
# evaluation set used for early stopping.
final_random_model.fit(
    final_train_X,
    final_train_y,
    eval_set=[(val_X_2, val_y_2)],
    verbose=10,
)

test_pred_random_model = final_random_model.predict(test_X)

# Report the test-set metrics.
print('-' * 60)
print('Teste:')
random_test_metrics = (
    ("MAE", mean_absolute_error(test_y, test_pred_random_model)),
    ("RMSE", np.sqrt(mean_squared_error(test_y, test_pred_random_model))),
    ("Log Loss", calculate.poisson_log_loss(test_y, test_pred_random_model)),
)
for metric_name, metric_value in random_test_metrics:
    print(f"{metric_name}: {metric_value:.4f}")

plot_importance(final_random_model)
plt.show()

In [None]:
# Export the test rows together with the random-search model predictions to
# an .xlsx file.

# The predictions were computed from test_X, so they are labelled with
# test_X's index. A frame with a fresh 0..n-1 index would be aligned by label
# against test_featured (whose index has gaps after dropna) and attach the
# predictions to the wrong rows.
test_pred_df = pd.DataFrame(
    {'prediction': test_pred_random_model}, index=test_X.index
)
df_to_export = test_featured.join(test_pred_df)

# Build the timestamp outside the f-string: reusing the same quote character
# inside a replacement field is a SyntaxError before Python 3.12.
export_timestamp = datetime.now().strftime('%d-%m %H-%M')
df_to_export.to_excel(
    f'training_testing/dados/exports/final_random_df {export_timestamp}.xlsx',
    index=False,
)


Treinamento do Modelo baseado em Grid Search

In [None]:
# Hyperparameter search on the training split.
# NOTE(review): grid_search comes from model.config; presumably a
# scikit-learn style search object -- confirm there.
grid_search.fit(train_X, train_y)

best_grid_model, best_grid_params = (
    grid_search.best_estimator_,
    grid_search.best_params_,
)

# Fresh Poisson regressor using the best parameters found by the search.
final_grid_model = XGBRegressor(
    objective='count:poisson',
    eval_metric='poisson-nloglik',
    early_stopping_rounds=50,
    **best_grid_params,
)

# Fit on train + first validation half; the second validation half is the
# evaluation set used for early stopping.
final_grid_model.fit(
    final_train_X,
    final_train_y,
    eval_set=[(val_X_2, val_y_2)],
    verbose=10,
)

test_pred_grid_model = final_grid_model.predict(test_X)

# Report the test-set metrics.
print('-' * 60)
print('Teste:')
grid_test_metrics = (
    ("MAE", mean_absolute_error(test_y, test_pred_grid_model)),
    ("RMSE", np.sqrt(mean_squared_error(test_y, test_pred_grid_model))),
    ("Log Loss", calculate.poisson_log_loss(test_y, test_pred_grid_model)),
)
for metric_name, metric_value in grid_test_metrics:
    print(f"{metric_name}: {metric_value:.4f}")

plot_importance(final_grid_model)
plt.show()


In [None]:
# Export the test rows together with the grid-search model predictions to an
# .xlsx file.

# The predictions were computed from test_X, so they are labelled with
# test_X's index. A frame with a fresh 0..n-1 index would be aligned by label
# against test_featured (whose index has gaps after dropna) and attach the
# predictions to the wrong rows.
test_pred_df = pd.DataFrame(
    {'prediction': test_pred_grid_model}, index=test_X.index
)
df_to_export = test_featured.join(test_pred_df)

# Build the timestamp outside the f-string: reusing the same quote character
# inside a replacement field is a SyntaxError before Python 3.12.
export_timestamp = datetime.now().strftime('%d-%m %H-%M')
df_to_export.to_excel(
    f'training_testing/dados/exports/final_grid_df {export_timestamp}.xlsx',
    index=False,
)