In [None]:
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath(".."))

from src.config import load_config
from src.data_prep import import_raw_aggregated_dataset, dataframe_cleaning, convert_dates_column

# Raw data importation

In [None]:
# Load the raw aggregated datasets for the training and test periods.
# NOTE(review): whether the year bounds are inclusive is decided inside
# import_raw_aggregated_dataset; both calls mention 2024, so confirm the two
# splits do not overlap.
# NOTE(review): this folder is given as 'data/raw' while the export cell writes
# to '../data/cleaned' — confirm how the helper resolves this path.
raw_data_dir = 'data/raw'

raw_df_train = import_raw_aggregated_dataset(2012, 2024, raw_data_dir)
raw_df_test = import_raw_aggregated_dataset(2024, 2025, raw_data_dir)

# Raw data cleaning

In [None]:
# Columns passed to dataframe_cleaning as `cols_to_delete` for the training frame.
# Each label is listed exactly once (the previous list repeated 'PC<2.5').
cols_to_delete_train = ['Div', 'Time', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR', 'Bb1X2', 'BbMxH', 'BbAvH', 'BbMxD',
                        'BbAvD', 'BbMxA', 'BbAvA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA', 'BbOU', 'BbMx>2.5', 'BbMx<2.5', 'BbAv>2.5',
                        'BbAv<2.5', 'B365>2.5', 'B365<2.5', 'P<2.5', 'P>2.5', 'Max<2.5', 'Max>2.5', 'Avg<2.5', 'Avg>2.5', 'BbAH', 'BbAHh', 'AHh',
                        'BbMxAHH', 'BbAvAHH', 'BbMxAHA', 'BbAvAHA', 'B365AHH', 'B365AHA', 'PAHH', 'PAHA', 'MaxAHH', 'MaxAHA', 'AvgAHH', 'AvgAHA',
                        'AvgC<2.5', 'AvgC>2.5', 'MaxC<2.5', 'MaxC>2.5', 'AHCh', 'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA', 'MaxCAHH', 'MaxCAHA',
                        'AvgCAHH', 'AvgCAHA', 'MaxCH', 'MaxCD', 'MaxCA', 'AvgCH', 'AvgCD', 'AvgCA', 'B365C>2.5', 'B365C<2.5', 'PC<2.5', 'PC>2.5',
                        'BWCA', 'IWCH', 'IWCD', 'IWCA', 'VCCH', 'VCCD', 'VCCA', 'B365CH', 'B365CD', 'B365CA', 'BWCH', 'BWCD', 'PSCH', 'PSCD',
                        'PSCA', 'WHCH', 'WHCD', 'WHCA', 'HTHG', 'HTAG', 'HTR']

# Columns passed to dataframe_cleaning as `cols_to_delete` for the test frame.
# The test frame has its own list because its raw columns differ from the
# training frame's (e.g. the 'BFE*' columns appear only here).
cols_to_delete_test = ['Div', 'Time', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD',
                       'AvgA', 'B365>2.5', 'B365<2.5', 'P<2.5', 'P>2.5', 'Max<2.5', 'Max>2.5', 'Avg<2.5', 'Avg>2.5', 'AHh', 'B365AHH', 'B365AHA', 'PAHH',
                       'PAHA', 'MaxAHH', 'MaxAHA', 'AvgAHH', 'AvgAHA', 'AvgC<2.5', 'AvgC>2.5', 'MaxC<2.5', 'MaxC>2.5', 'AHCh', 'B365CAHH', 'B365CAHA',
                       'PCAHH', 'PCAHA', 'MaxCAHH', 'MaxCAHA', 'AvgCAHH', 'AvgCAHA', 'MaxCH', 'MaxCD', 'MaxCA', 'AvgCH', 'AvgCD', 'AvgCA', 'B365C>2.5',
                       'B365C<2.5', 'PC<2.5', 'PC>2.5', 'BWCA', 'B365CH', 'B365CD', 'B365CA', 'BWCH', 'BWCD', 'PSCH', 'PSCD', 'PSCA', 'WHCH',
                       'WHCD', 'WHCA', 'HTHG', 'HTAG', 'HTR', 'BFE>2.5', 'BFE<2.5', 'BFEAHH', 'BFEAHA', 'BFEC>2.5', 'BFEC<2.5', 'BFECAHH', 'BFECAHA']

# Raw column name -> name used in the rest of the notebook.
cols_to_rename = {'Date': 'date',
                  'HomeTeam': 'home',
                  'AwayTeam': 'away',
                  'FTHG': 'nb_goals_home_final',
                  'FTAG': 'nb_goals_away_final',
                  'FTR': 'final_result'}

# Team-name replacements, defined once so the 'home' and 'away' columns can
# never end up with diverging mappings.
team_aliases = {'Paris SG': 'PSG',
                'St Etienne': 'Saint-Etienne',
                'Evian Thonon Gaillard': 'ETG',
                'Ajaccio GFCO': 'Gazélec Ajaccio'}

# Per-column value replacements, keyed by the *renamed* column names.
# Each team column gets its own copy of the alias mapping so the two entries
# stay independent objects, as they were when written out by hand.
values_to_rename = {'final_result': {'D': 'draw',
                                     'H': 'home',
                                     'A': 'away'},
                    'home': dict(team_aliases),
                    'away': dict(team_aliases)
                   }


# Apply the same renaming rules to both splits; only the column lists differ.
cleaned_df_train = dataframe_cleaning(df=raw_df_train,
                                      cols_to_delete=cols_to_delete_train,
                                      cols_to_rename=cols_to_rename,
                                      values_to_rename=values_to_rename)

cleaned_df_test = dataframe_cleaning(df=raw_df_test,
                                     cols_to_delete=cols_to_delete_test,
                                     cols_to_rename=cols_to_rename,
                                     values_to_rename=values_to_rename)

In [None]:
# Convert the 'date' column of both splits with the project helper.
# NOTE(review): each frame is rebound to the helper's result, so re-running this
# cell feeds already-converted data back in — confirm the helper tolerates that.
date_col = 'date'

cleaned_df_train = convert_dates_column(df=cleaned_df_train, date_column=date_col)
cleaned_df_test = convert_dates_column(df=cleaned_df_test, date_column=date_col)

# Missing values management

In [None]:
# Missing values: per-column count of NaN entries in each split,
# inspected before any column is dropped or imputed.
train_na_before = cleaned_df_train.isna().sum()
test_na_before = cleaned_df_test.isna().sum()

print(f'Train dataframe: \n{train_na_before} \n')
print(f'Test dataframe: \n{test_na_before}')

Columns with many missing values (more than 5% of the rows) will be deleted. The remaining columns with missing values (fewer than 5% of the rows) are betting-odds variables, so their missing values will be replaced by the column mean.

In [None]:
# Second clean-up pass: remove the columns listed below from each split.
# The lists are hard-coded per split (they are not computed from the frames here).
# Note: these names are rebound here and no longer hold the lists from the
# first cleaning cell.
cols_to_delete_train = [
    'GBH', 'GBD', 'GBA',
    'LBH', 'LBD', 'LBA',
    'SJH', 'SJD', 'SJA',
    'BSH', 'BSD', 'BSA',
]
cols_to_delete_test = [
    'BWH', 'BWD', 'BWA',
    'WHH', 'WHD', 'WHA',
]

# Only `cols_to_delete` is supplied; no renaming arguments are passed in this pass.
cleaned_df_train = dataframe_cleaning(df=cleaned_df_train, cols_to_delete=cols_to_delete_train)
cleaned_df_test = dataframe_cleaning(df=cleaned_df_test, cols_to_delete=cols_to_delete_test)

In [None]:
def _fill_numeric_nans_with_mean(frame):
    """Fill NaNs of every numeric column of `frame` with that column's mean.

    The frame is modified in place; non-numeric columns are left untouched.

    Returns:
        The index of numeric column labels that were processed.
    """
    numeric_labels = frame.select_dtypes(include='number').columns
    column_means = frame[numeric_labels].mean()
    frame[numeric_labels] = frame[numeric_labels].fillna(column_means)
    return numeric_labels


# Each split is imputed with its own column means.
# NOTE(review): the test split therefore uses statistics computed on test data;
# confirm this is intended rather than reusing the training means.
numeric_cols_train = _fill_numeric_nans_with_mean(cleaned_df_train)
numeric_cols_test = _fill_numeric_nans_with_mean(cleaned_df_test)

In [None]:
# Re-check the per-column NaN counts after the drop and mean-fill steps.
train_na_after = cleaned_df_train.isna().sum()
test_na_after = cleaned_df_test.isna().sum()

print(f'Train dataframe: \n{train_na_after} \n')
print(f'Test dataframe: \n{test_na_after}')

# Cleaned dataframe storage

In [None]:
# Persist both cleaned splits as CSV files.
# to_csv does not create missing parent directories, so make sure the target
# folder exists first (a no-op when it is already there).
cleaned_dir = '../data/cleaned'
os.makedirs(cleaned_dir, exist_ok=True)

# to_csv is called with its defaults, so the DataFrame index is written as the
# first (unnamed) column of each file.
cleaned_df_train.to_csv(f'{cleaned_dir}/cleaned_df_train.csv')
cleaned_df_test.to_csv(f'{cleaned_dir}/cleaned_df_test.csv')