# **Dataset Preparation Before Modeling**

In [1]:
# Packages
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import itertools
# Pandas display settings: None removes the cap, so frames are rendered
# with every column and every row instead of being truncated.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

### *Preparation: Venue Dataset*

In [2]:
# Build the venue feature table: load the raw venue CSV, keep the modeling
# columns, derive venue/attendance features, bucket kit colors, and store the
# result as 1_venue.csv.

# Import Data
venue_df = pd.read_csv('../../Data/Modeling_Before_Preparation/Venue_data_v01.csv').set_index('id')
# Filter Columns: identifiers, venue info, every column from 'temp' through 'night_game',
# travel distance and the two kit-color columns
venue_columns = ['result', 'league_id', 'season_id', 'season_name', 'venue_id', 'venue_city', 'attendance', 'venue_capacity', 'home_country_id', 'away_country_id', 'home_venue_id', 'away_venue_id', 'venue_surface_isgrass'] + venue_df.loc[:, 'temp':'night_game'].columns.tolist() + ['travel_dist(km)', 'colors_home_color', 'colors_away_color']
# .copy() detaches the selection from the raw frame so the assignments below
# always act on an independent DataFrame (no SettingWithCopy ambiguity)
venue_df = venue_df.loc[:, venue_columns].copy()
# Correct some Errors: negative attendance values are reset to 0
venue_df.loc[venue_df['attendance'] < 0, 'attendance'] = 0

# Create new features
# 1 when the match venue is not the home team's own venue, else 0
venue_df['home_not_home'] = np.where(venue_df['venue_id'] != venue_df['home_venue_id'], 1, 0)
# 1 when home and away teams have different country ids, else 0
venue_df['travel_outside_state'] = np.where(venue_df['home_country_id'] != venue_df['away_country_id'], 1, 0)
# Attendance as a share of venue capacity, capped at 1 (missing values stay missing)
venue_df['attendance_ratio'] = (venue_df['attendance'] / venue_df['venue_capacity']).clip(upper=1)

# Convert colors values: map raw color codes to the coarse labels used below
# ('W', 'RE', 'BL', 'YE', 'GRN', 'PU'). The last key has no leading '#';
# it is kept as-is (presumably it covers a malformed raw value — TODO confirm).
conv_colors = {'#F0F0F0':'W','#C40010':'RE','#0A0A0A':'BL','#FFDF1B':'YE','#002B87':'BL','#0046A8':'BL','#FBED32':'YE','#262626':'W','#022857':'BL','#940014':'RE','#2B72DE':'GRN','#00C1BA':'GRN','#D6003D':'RE','#79ADE2':'GRN','#832034':'RE','#339063':'GRN','#CCCCCC':'W','#C0D6FE':'GRN','#B0E8E6':'GRN','#679FEA':'GRN','#0C183A':'BL','#F4F48A':'YE','#FDBD0F':'RE','#0060BF':'BL','#025C17':'GRN','#EA9C08':'RE','#3B1E4F':'PU','#F61917':'RE','#007848':'GRN','#FC7E00':'RE','#7C45CA':'PU','#E996BC':'PU','#FCC24F':'YE','#4BD43B':'GRN','#A7D6F5':'GRN','#E03A3E':'RE','#B9D9EC':'GRN','#2291FF':'GRN','#482F8A':'PU','#1544B8':'BL','#235234':'GRN','#1E48C0':'BL','#666666':'W','#ABD422':'GRN','#BFFFBF':'GRN','#D50619':'RE','#F18A01':'RE','#2F97DA':'GRN','#EB172B':'RE','#E11B22':'RE','#202A44':'BL','#EEC0C8':'PU','#5B000F':'RE','#004F9F':'BL','#D71920':'RE','#D94B14':'RE','#D82520':'RE','#881319':'RE','#AAAAAA':'W','#8EBADB':'GRN','#999900':'GRN','#D61414':'RE','#316F37':'GRN','#00AA64':'GRN','#0C7662':'GRN','#888888':'W','#34165F':'PU','#2F6452':'GRN','#111111':'BL','#AA5B9C':'PU','#E41B1B':'RE','#D446BA':'PU','#013D38':'GRN','#283964':'BL','#171C28':'BL','#FDE713':'YE','#CE0000':'RE','#451021':'RE','#1F162B':'BL','#04614A':'GRN','#5F002B':'RE', '#800000':'RE', '#D00014':'RE', 'F0F0F0':'W'}
# Assign the replaced Series back: an in-place replace on a column selection
# operates on a temporary object and does not update venue_df under Copy-on-Write.
for color_col in ['colors_home_color', 'colors_away_color']:
    venue_df[color_col] = venue_df[color_col].replace(conv_colors)

# Assign correct data types
categorial_columns = ['colors_home_color','colors_away_color', 'league_id', 'season_id', 'season_name', 'venue_id', 'venue_city']
# These id columns were only needed to derive the features above
venue_df = venue_df.drop(columns=['away_country_id', 'home_venue_id', 'away_venue_id'])
venue_df[categorial_columns] = venue_df[categorial_columns].astype('category')
# Keep the season as integer category codes instead of its name
venue_df['season_name_codes'] = venue_df['season_name'].cat.codes
venue_df = venue_df.drop(columns=['season_name'])

# Print + Store dataset
print(venue_df.shape)
venue_df.to_csv('../../Data/Modeling_Final/1_venue.csv')
# Preview goes last so the notebook actually renders it as the cell output
venue_df.head(2)

(10536, 24)


### *Preparation: Standings Dataset*

In [3]:
# Build the standings feature table: load the raw standings CSV, keep the
# identifier columns plus everything from 'ROUND' onward, downcast numeric
# columns, encode the season, and store the result as 2_standings.csv.

# Import Data
standings_df = pd.read_csv('../../Data/Modeling_Before_Preparation/Standings_v01.csv').set_index('id')
# Filter Features
initial_cols = ['result', 'league_id', 'season_id', 'season_name']
columns_standings = initial_cols + standings_df.loc[:, 'ROUND':].columns.tolist()
# .copy() so the column assignments below act on an independent frame
standings_df = standings_df.loc[:, columns_standings].copy()

# Change some columns dtypes to categorical
standings_df[initial_cols] = standings_df[initial_cols].astype('category')
# Downcast Dtypes
# np.float64 is used because the np.float_ alias was removed in NumPy 2.0
float_cols = standings_df.select_dtypes(np.float64).columns.tolist()
int_cols = standings_df.select_dtypes(np.int_).columns.tolist()
cat_cols = standings_df.select_dtypes('category').columns.tolist()
# Plain column assignment replaces the columns, so the smaller dtype sticks;
# `.loc[:, cols] = ...` writes into the existing arrays on pandas >= 2.0 and
# would silently keep the original 64-bit dtype.
standings_df[float_cols] = standings_df[float_cols].apply(pd.to_numeric, downcast='float')
standings_df[int_cols] = standings_df[int_cols].apply(pd.to_numeric, downcast='integer')
# Assign correct data types: keep the season as integer category codes
standings_df['season_name_codes'] = standings_df['season_name'].astype('category').cat.codes
standings_df = standings_df.drop(columns=['season_name'])
# Numerical and Categorical variables
# Every column whose name contains 'rank', plus 'ROUND', is treated as categorical
rank_cols = standings_df.filter(regex='rank').columns.tolist() + ['ROUND']
standings_df[rank_cols] = standings_df[rank_cols].astype('category')
# Column-name lists kept in the notebook namespace (not used further in this cell)
numerical_cols = standings_df.select_dtypes(np.number).columns.tolist()
cat = ['league_id', 'season_id']

# Print + Store dataset
print(standings_df.shape)
standings_df.to_csv('../../Data/Modeling_Final/2_standings.csv')
# Preview goes last so the notebook actually renders it as the cell output
standings_df.head(2)

  app.launch_new_instance()


(10536, 185)


### *Preparation: Form & Rest Dataset*

In [4]:
# Build the form & rest feature table: merge the form data with the rest-days
# columns, add home-minus-away differentials, drop incomplete rows, and store
# the result as 3_form_rest.csv.

# Import Data for both Rest and Form + Merge
old_form = pd.read_csv('../../Data/Modeling_Before_Preparation/Form_data_v01.csv').set_index('id').drop(columns=['index'])
rest = pd.read_csv('../../Data/Modeling_Before_Preparation/Team_attributes_and_Rest_data_v01.csv').set_index('id')
# From the rest file keep the identifiers plus every column from 'home_n_games_last60d' to the end
rest_cols = ['league_id', 'season_id', 'season_name'] + rest.loc[:, 'home_n_games_last60d':].columns.tolist()
rest = rest.loc[:, rest_cols]
# Left join on the 'id' index: every form row is kept
form_df = pd.merge(old_form, rest, how='left', left_index=True, right_index=True)
# Filter columns: identifiers, rest days and every column whose name contains 'last'
columns_df = ['result', 'league_id', 'season_id', 'season_name', 'home_rest_days', 'away_rest_days']  + form_df.filter(regex='last').columns.tolist()
# .copy() so the column assignments below act on an independent frame
form_df = form_df.loc[:, columns_df].copy()

# Points Differentials columns: home minus away for every metric / window / cup-variant
# combination, for both the plain and the '_team_' flavored columns
for name_parts in itertools.product(['points_', 'goals_scored_', 'goals_conceded_', 'goals_diff_'], ['last1_games_', 'last3_games_', 'last5_games_', 'last10_games_', 'last20_games_'], ['withcups', 'nopcups']):
    feature = ''.join(name_parts)
    form_df['diff_' + feature] = form_df['home_' + feature] - form_df['away_' + feature]
    form_df['diff_team_' + feature] = form_df['home_team_' + feature] - form_df['away_team_' + feature]

# Rest Days differentials columns
form_df['diff_rest_days'] = form_df['home_rest_days'] - form_df['away_rest_days']
for window in ['n_games_last60d', 'n_games_last30d', 'n_games_last15d', 'n_games_last7d']:
    form_df['diff_' + window] = form_df['home_' + window] - form_df['away_' + window]

# Downcast numeric columns & drop any row with NA
float_cols = form_df.select_dtypes(np.number).columns.tolist()
# Plain column assignment replaces the columns, so the downcast dtype sticks;
# `.loc[:, cols] = ...` writes into the existing arrays on pandas >= 2.0.
form_df[float_cols] = form_df[float_cols].apply(pd.to_numeric, downcast='float')
form_df = form_df.dropna()
# Assign correct data types: season as integer category codes, identifiers as
# integer-valued categories (cast to int first to undo the float downcast above)
form_df['season_name_codes'] = form_df['season_name'].astype('category').cat.codes
form_df = form_df.drop(columns=['season_name'])
key_cols = ['result', 'league_id', 'season_id', 'season_name_codes']
form_df[key_cols] = form_df[key_cols].astype(int).astype('category')

# Print + Store dataset
print(form_df.shape)
form_df.to_csv('../../Data/Modeling_Final/3_form_rest.csv')
# Preview goes last so the notebook actually renders it as the cell output
form_df.head(2)

(10438, 259)


### *Preparation: Stats Dataset*

In [5]:
# Build the match-stats feature table: attach result/league/season identifiers
# to the stats data, drop incomplete rows, add home-minus-away differentials,
# and store the result as 4_stats.csv.

# Import Data
stats_df = pd.read_csv('../../Data/Modeling_Before_Preparation/Stats_data_v01.csv').set_index('id')
# Get rest data to add missing result, league_id, season_id, and season_name
rest = pd.read_csv('../../Data/Modeling_Before_Preparation/Team_attributes_and_Rest_data_v01.csv').set_index('id')
rest_cols = ['result', 'league_id', 'season_id', 'season_name'] 
rest = rest.loc[:, rest_cols]
# Right join on the 'id' index: every stats row is kept
stats_df = pd.merge(rest, stats_df, how='right', left_index=True, right_index=True)

# Filter columns: identifiers plus every column whose name contains 'last'
columns_df = ['result', 'league_id', 'season_id', 'season_name']  + stats_df.filter(regex='last').columns.tolist()
# .copy() so the column assignments below act on an independent frame
stats_df = stats_df.loc[:, columns_df].copy()
print('DataFrame shape before dropping NAs: ', stats_df.shape)
# Downcast numeric columns
float_cols = stats_df.select_dtypes(np.number).columns.tolist()
# Plain column assignment replaces the columns, so the downcast dtype sticks;
# `.loc[:, cols] = ...` writes into the existing arrays on pandas >= 2.0.
stats_df[float_cols] = stats_df[float_cols].apply(pd.to_numeric, downcast='float')
# Drop all NAs
stats_df = stats_df.dropna()
print('DataFrame shape after dropping NAs: ', stats_df.shape)

# Points Differentials columns: home minus away for every prefix / stat / window combination.
# The new columns are collected first and attached with a single concat; inserting
# hundreds of columns one at a time fragments the frame (pandas PerformanceWarning).
stat_names = ['shots_total','shots_ongoal','shots_offgoal','shots_blocked','shots_insidebox','shots_outsidebox','fouls','corners', 'offsides', 'possessiontime', 'yellowcards','redcards','yellowredcards','saves','tackles','passes_total','passes_accurate','passes_percentage','attacks_attacks','attacks_dangerous_attacks']
diff_columns = {}
for name_parts in itertools.product(['diff_','team_', 'team_diff_', ''], stat_names, ['_last1_games', '_last3_games', '_last5_games']):
    feature = ''.join(name_parts)
    diff_columns['diff_' + feature] = stats_df['home_' + feature] - stats_df['away_' + feature]
# dict insertion order is preserved, so the column order matches the loop order
stats_df = pd.concat([stats_df, pd.DataFrame(diff_columns)], axis=1)

# Downcast numeric columns & drop any row with NA
float_cols = stats_df.select_dtypes(np.number).columns.tolist()
stats_df[float_cols] = stats_df[float_cols].apply(pd.to_numeric, downcast='float')
stats_df = stats_df.dropna()
# Assign category data type: season as integer category codes, identifiers as
# integer-valued categories (cast to int first to undo the float downcast above)
stats_df['season_name_codes'] = stats_df['season_name'].astype('category').cat.codes
stats_df = stats_df.drop(columns=['season_name'])
key_cols = ['result', 'league_id', 'season_id', 'season_name_codes']
stats_df[key_cols] = stats_df[key_cols].astype(int).astype('category')

# Print + Store dataset
print(stats_df.shape)
stats_df.to_csv('../../Data/Modeling_Final/4_stats.csv')
# Preview goes last so the notebook actually renders it as the cell output
stats_df.head(2)

DataFrame shape before dropping NAs:  (10536, 484)
DataFrame shape after dropping NAs:  (10205, 484)


  self.obj[key] = value


(10205, 724)


### *Preparation: Odds Dataset*

In [6]:
# Build the odds feature table: attach result/league/season identifiers to the
# odds data, drop sparse columns and incomplete rows, remove the 'count'
# columns, and store the result as 5_odds.csv.

# Import Data
odds_df = pd.read_csv('../../Data/Modeling_Before_Preparation/Odds_data_complete_v01.csv').set_index('id')
rest = pd.read_csv('../../Data/Modeling_Before_Preparation/Team_attributes_and_Rest_data_v01.csv').set_index('id').loc[:, ['result', 'league_id', 'season_id', 'season_name']]
# Right join on the 'id' index: every odds row is kept
odds_df = pd.merge(rest, odds_df, how='right', left_index=True, right_index=True)
print('DataFrame shape before dropping NAs: ', odds_df.shape)
# Drop all NAs
# A column is kept only if it has at least this many non-NA values
MIN_NON_NA_PER_COLUMN = 8700
odds_df = odds_df.dropna(axis='columns', thresh=MIN_NON_NA_PER_COLUMN)
# Then drop every row that still contains any NA
odds_df = odds_df.dropna(axis=0, how='any')
print('DataFrame shape after dropping NAs: ', odds_df.shape)

# Downcast numeric columns
float_cols = odds_df.select_dtypes(np.number).columns.tolist()
# Plain column assignment replaces the columns, so the downcast dtype sticks;
# `.loc[:, cols] = ...` writes into the existing arrays on pandas >= 2.0.
odds_df[float_cols] = odds_df[float_cols].apply(pd.to_numeric, downcast='float')
# Drop every column whose name contains 'count'
# NOTE(review): the regex is unanchored, so it would also match e.g. 'country' — confirm no such column exists
imp_cols = odds_df.filter(regex='count').columns.tolist()
odds_df = odds_df.drop(columns=imp_cols)
print('DataFrame shape after dropping count-columns: ', odds_df.shape)
# Assign category data type: season as integer category codes, identifiers as
# integer-valued categories (cast to int first to undo the float downcast above)
odds_df['season_name_codes'] = odds_df['season_name'].astype('category').cat.codes
odds_df = odds_df.drop(columns=['season_name'])
key_cols = ['result', 'league_id', 'season_id', 'season_name_codes']
odds_df[key_cols] = odds_df[key_cols].astype(int).astype('category')

# Print + Store dataset
print(odds_df.shape)
odds_df.to_csv('../../Data/Modeling_Final/5_odds.csv')
# Preview goes last so the notebook actually renders it as the cell output
odds_df.head(2)

DataFrame shape before dropping NAs:  (8915, 2655)
DataFrame shape after dropping NAs:  (8415, 1010)
DataFrame shape after dropping count-columns:  (8415, 780)
(8415, 780)


### *Preparation: Team Attributes Dataset*

In [7]:
# Build the team-attributes feature table: select team/coach columns, derive
# coach ages and home-vs-away differentials, fix dtypes, and store the result
# as 6_team_attr.csv.

# Import Data
team_attr = pd.read_csv('../../Data/Modeling_Before_Preparation/Team_attributes_and_Rest_data_v01.csv').set_index('id')
# Filter out rest data + get desired columns
team_attr = team_attr.loc[:, :'isrival_away']
imp_cols = ['result', 'league_id','season_id','home_id','away_id','formations_home_formation','formations_away_formation','home_founded','home_country_id','away_country_id','away_founded','homecoach_coach_id','homecoach_country_id','homecoach_birthdate','awaycoach_coach_id','awaycoach_country_id','awaycoach_birthdate','time_starting_at_date','season_name'] + team_attr.loc[:, 'home_ObservationDate':].columns.tolist()
# .copy() so the column assignments below act on an independent frame
team_attr = team_attr.loc[:, imp_cols].copy()

# Change transfer budget columns: rescale by a factor of 100000
team_attr['home_TransferBudget'] = team_attr['home_TransferBudget'] / 100000
team_attr['away_TransferBudget'] = team_attr['away_TransferBudget'] / 100000
# Convert columns to datetime
for col in ['homecoach_birthdate','awaycoach_birthdate','time_starting_at_date']:
    team_attr[col] = pd.to_datetime(team_attr[col], format = '%Y-%m-%d')
# Create new features for home/away coach age + Drop unwanted columns
# Age = whole years between the coach's birthdate and the match date
team_attr['home_coach_age'] = team_attr.apply(
    lambda row: relativedelta(row['time_starting_at_date'], row['homecoach_birthdate']).years, axis=1)
team_attr['away_coach_age'] = team_attr.apply(
    lambda row: relativedelta(row['time_starting_at_date'], row['awaycoach_birthdate']).years, axis=1)
team_attr = team_attr.drop(columns=['homecoach_birthdate','awaycoach_birthdate','time_starting_at_date'])
# Create differentials and ratios columns from existing columns
# (most ratio columns are dropped again at the end of the cell)
diff_ratio_cols = ['Attack','Midfield','Defence','TransferBudget','Width','coach_age', 'founded']
for col_type in diff_ratio_cols:
    team_attr['diff_' + col_type] = team_attr['home_' + col_type] - team_attr['away_' + col_type]
    team_attr['ratio_' + col_type] = team_attr['home_' + col_type] / team_attr['away_' + col_type]
# Drop the descriptive columns, selected by label ranges in the current column order
columns_drop = (
    ['home_ObservationDate','home_TeamName']
    + team_attr.loc[:, 'home_Captain':'away_TeamName'].columns.tolist()
    + team_attr.loc[:, 'away_Captain':'away_LoanedPlayers'].columns.tolist()
)
team_attr = team_attr.drop(columns=columns_drop)

# Change categorical columns to category dtype
categorial_columns = (
    team_attr.loc[:, :'formations_away_formation'].columns.tolist()
    + ['home_country_id','away_country_id']
    + ['homecoach_coach_id','awaycoach_coach_id', 'season_name']
    + team_attr.filter(regex='_is_major$').columns.tolist()
    + team_attr.filter(regex='^isrival_').columns.tolist()
)
team_attr[categorial_columns] = team_attr[categorial_columns].astype('category')
# Different dtypes of columns
# Shift each founding-year column so it starts at 0. Series.min() skips NaN,
# whereas the built-in min() can return NaN depending on value order.
# NOTE(review): home and away are shifted by their own minimum, and diff_founded
# was computed above from the unshifted values — confirm this is intended.
team_attr['home_founded'] = team_attr['home_founded'] - team_attr['home_founded'].min()
team_attr['away_founded'] = team_attr['away_founded'] - team_attr['away_founded'].min()
width_int = team_attr.filter(regex='(away|home|diff)_Width').columns
# Plain column assignment replaces the columns, so the integer dtype sticks;
# `.loc[:, cols] = ...` writes into the existing arrays on pandas >= 2.0.
team_attr[width_int] = team_attr[width_int].astype(int)
# np.float64 is used because the np.float_ alias was removed in NumPy 2.0
float_cols = team_attr.select_dtypes(np.float64).columns.tolist()
int_cols = team_attr.select_dtypes(np.int_).columns.tolist()
cat_cols = team_attr.select_dtypes('category').columns.tolist()
# Downcast numerical variables
team_attr[float_cols] = team_attr[float_cols].apply(pd.to_numeric, downcast='float')
team_attr[int_cols] = team_attr[int_cols].apply(pd.to_numeric, downcast='integer')

# Drop columns: raw identifiers and all ratio columns except ratio_TransferBudget
team_attr = team_attr.drop(columns=['home_id', 'away_id', 'home_country_id', 'away_country_id', 'homecoach_coach_id', 'homecoach_country_id', 'awaycoach_country_id', 'awaycoach_coach_id', 'ratio_Attack','ratio_Midfield', 'ratio_Defence', 'ratio_Width', 'ratio_coach_age', 'ratio_founded'])
# Keep the season as integer category codes instead of its name
team_attr['season_name_codes'] = team_attr['season_name'].cat.codes
team_attr = team_attr.drop(columns=['season_name'])

# Print + Store dataset
print(team_attr.shape)
team_attr.to_csv('../../Data/Modeling_Final/6_team_attr.csv')
# Preview goes last so the notebook actually renders it as the cell output
team_attr.head(2)

(10536, 32)


### *Preparation: Player Attributes Dataset*

In [8]:
# Build the player-attributes feature table: attach result/league/season
# identifiers to the player attributes, drop incomplete rows, add
# home-minus-away differentials, and store the result as 7_players.csv.

# Import Data (the unnamed first CSV column holds the match id)
player_attr_df = pd.read_csv('../../Data/Modeling_Before_Preparation/Players_attributes_v01.csv').rename(columns={'Unnamed: 0': 'id'}).set_index('id')
rest = pd.read_csv('../../Data/Modeling_Before_Preparation/Team_attributes_and_Rest_data_v01.csv').set_index('id').loc[:, ['result', 'league_id', 'season_id', 'season_name']]
# Right join on the 'id' index: every player-attributes row is kept
player_attr_df = pd.merge(rest, player_attr_df, how='right', left_index=True, right_index=True)
# Drop all NAs
print('DataFrame shape before dropping NAs: ', player_attr_df.shape)
player_attr_df = player_attr_df.dropna()
print('DataFrame shape after dropping NAs: ', player_attr_df.shape)

# Players attributes available with both a 'home_' and an 'away_' prefix
common_players_attributes = ['Acceleration', 'Age', 'Aggression', 'Agility', 'Balance', 'Crossing', 'Height', 'Jumping', 'LongPass', 'Overall', 'Potential', 'Reactions', 'ShortPass', 'SprintSpeed', 'Stamina', 'Strength', 'Value', 'Vision', 'Wage', 'Weight', 'Att.Position', 'BallControl', 'Curve', 'Dribbling', 'Finishing', 'Heading', 'LongShots', 'ShotPower', 'Volleys', 'Marking', 'SlideTackle', 'StandTackle', 'Interceptions', 'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes', 'FKAcc.', 'Penalties', 'PlayNational']
# Attributes Differentials columns: home minus away
for att in common_players_attributes:
    player_attr_df['diff_' + att] = player_attr_df['home_' + att] - player_attr_df['away_' + att]

# Downcast numeric columns & drop any row with NA
float_cols = player_attr_df.select_dtypes(np.number).columns.tolist()
# Plain column assignment replaces the columns, so the downcast dtype sticks;
# `.loc[:, cols] = ...` writes into the existing arrays on pandas >= 2.0.
player_attr_df[float_cols] = player_attr_df[float_cols].apply(pd.to_numeric, downcast='float')
player_attr_df = player_attr_df.dropna()

# Assign category data type: season as integer category codes, identifiers as
# integer-valued categories (cast to int first to undo the float downcast above)
player_attr_df['season_name_codes'] = player_attr_df['season_name'].astype('category').cat.codes
player_attr_df = player_attr_df.drop(columns=['season_name'])
key_cols = ['result', 'league_id', 'season_id', 'season_name_codes']
player_attr_df[key_cols] = player_attr_df[key_cols].astype(int).astype('category')

# Print + Store dataset
print(player_attr_df.shape)
player_attr_df.to_csv('../../Data/Modeling_Final/7_players.csv')
# Preview goes last so the notebook actually renders it as the cell output
player_attr_df.head(2)

DataFrame shape before dropping NAs:  (10537, 86)
DataFrame shape after dropping NAs:  (10536, 86)
(10536, 127)
