The aim of this preprocessing notebook is to run the preprocessing pipeline, whose goal is to create the following variables:
- aggregated betting odds: one column with the aggregated odds for a home team victory, one column for an away team victory, and one column for a draw
- current season indicators, based on the matches played during the current season
- absolute recent form indicators, based on the last matches played during the current season, regardless of the confronted teams
- absolute historical form indicators, based on the entire imported data (several seasons)
- relative recent form indicators, based on the last matches played against the same team
- strict relative recent form indicators, based on the last matches played against the same team in the same stadium (home away order kept)
- external factors which could impact the outcome of a match

In [None]:
import os
import sys
root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(root_path)

import pandas as pd

from src.config import load_config
from src.preprocessing import Preprocessing
import src.utils

# Load the project configuration from config.yaml, located under root_path
# (the parent of the current working directory).
config_file = 'config.yaml'
config_path = os.path.join(root_path, config_file)
config = load_config(config_path)

# Preprocessing pipeline for total dataframe (train + test)

In [None]:
# Build the paths of the cleaned train/test CSV files from the config entries
cleaned_data_dir = os.path.join(root_path, config['cleaned_dir'])
cleaned_df_train_path = os.path.join(
    cleaned_data_dir,
    f"{config['cleaned_train_df_name']}.csv",
)
cleaned_df_test_path = os.path.join(
    cleaned_data_dir,
    f"{config['cleaned_test_df_name']}.csv",
)

# Read both cleaned datasets into dataframes
cleaned_df_train = pd.read_csv(cleaned_df_train_path)
cleaned_df_test = pd.read_csv(cleaned_df_test_path)

In [None]:
# Display the first rows of the cleaned training data as a quick visual check
cleaned_df_train.head()

In [None]:
# The preprocessing pipeline is run on the total dataframe (train + test),
# so both parts go through exactly the same feature computation.
# ignore_index=True gives the combined frame a fresh 0..n-1 index: both inputs
# come from pd.read_csv with a default 0-based index, so a plain concat would
# carry duplicate index labels into the pipeline.
total_df = pd.concat([cleaned_df_train, cleaned_df_test], ignore_index=True)
total_preproc = Preprocessing(total_df, config)

preprocessed_df = total_preproc.run_preprocessing_pipeline()
# Last expression of the cell: displays the first preprocessed rows
preprocessed_df.head()

In [None]:
# Train/test split: the last season is held out for test, every other season goes to train
last_date_last_season = config['last_date_last_season']
# Season labels are built as "<year - 1>/<year>", e.g. 2023 -> "2022/2023"
last_season = f"{last_date_last_season - 1}/{last_date_last_season}"

preprocessed_df_train = preprocessed_df[preprocessed_df[config['season_column']] != last_season]
preprocessed_df_test = preprocessed_df[preprocessed_df[config['season_column']] == last_season]

# Sanity check: every preprocessed row must land in either the train or the test subset.
# Both subsets are row selections of the same dataframe, so comparing row counts is enough.
# (Calling the builtin all() on a DataFrame comparison iterates over its column labels,
# not its values, so it cannot be used to validate the split.)
assert len(preprocessed_df_train) + len(preprocessed_df_test) == len(preprocessed_df), "Preproc train and test concatenation doest not give preproc df"

# Preprocessed dataframes export

In [None]:
# Resolve the output directory and the train/test CSV paths from the config entries
preprocessed_data_path = os.path.join(root_path, config['preprocessed_dir'])
# Create the output directory when it is missing: to_csv does not create parent
# directories and would fail on a fresh checkout otherwise.
os.makedirs(preprocessed_data_path, exist_ok=True)
df_train_path = os.path.join(preprocessed_data_path, f"{config['preprocessed_train_df_name']}.csv")
df_test_path = os.path.join(preprocessed_data_path, f"{config['preprocessed_test_df_name']}.csv")

# index=False: the dataframe index is not written to the CSV files
preprocessed_df_train.to_csv(df_train_path, index=False)
preprocessed_df_test.to_csv(df_test_path, index=False)