# **TEAM'S ATTRIBUTES PREPARATION**

### Imports

In [1]:
# Packages
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime
from geopy import distance
from datetime import timedelta
import dateutil.parser

# Pandas' options: never truncate columns or rows when a frame is displayed
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## Prepare Data

#### Functions

In [2]:
# Add results column: possible results are 0 = draw, 1 = home-win, 2 = away-win
def add_result(df):
    """Add a ``result`` column encoding the outcome of every match.

    The outcome is derived from ``scores_home_score`` and ``scores_away_score``:
    1 when the home score is higher, 2 when the away score is higher and 0 when
    both are equal. Rows where the scores cannot be compared (a missing score)
    get a missing result.

    The computation is positional, so rows that share an index label each keep
    their own outcome.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``scores_home_score`` and ``scores_away_score``.

    Returns
    -------
    pandas.DataFrame
        The same object, modified in place, with the extra ``result`` column.
        The column is ``int64`` when every row has an outcome, otherwise float
        with NaN for the undecidable rows.
    """
    home = df['scores_home_score']
    away = df['scores_away_score']
    # Comparisons on nullable columns yield <NA> for missing scores; treat
    # those as "no outcome" instead of letting them reach a boolean context.
    home_win = (home > away).fillna(False).to_numpy(dtype=bool)
    away_win = (home < away).fillna(False).to_numpy(dtype=bool)
    draw = (home == away).fillna(False).to_numpy(dtype=bool)
    outcome = pd.Series(
        np.select([home_win, away_win, draw], [1.0, 2.0, 0.0], default=np.nan),
        index=df.index,
    )
    if outcome.notna().all():
        outcome = outcome.astype('int64')
    # Assign positionally so duplicate index labels cannot trigger realignment.
    df['result'] = outcome.to_numpy()
    return df

#### Cleaning

In [3]:
# Import Data, use the fixture id as index and discard the leftover CSV index column
final_data = (
    pd.read_csv('finalmerge_MATCHES.csv', low_memory=False)
    .set_index('id')
    .drop(columns='Unnamed: 0')
)
#### Change data types ####
# Columns that will stay floats
float_columns = ['weather_report_pressure', 'weather_report_temperature_celcius_temp', 'weather_report_wind_degree', 'weather_windspeed(m/s)', 'home_passes_percentage', 'away_passes_percentage']
# Force these weather columns to float64 so they are picked up as numeric below
weather_cols = ['weather_windspeed(m/s)', 'weather_clouds(%)', 'weather_humidity(%)']
final_data[weather_cols] = final_data[weather_cols].astype('float64')
# Convert all numeric columns to nullable int except those listed in float_columns
numeric = final_data.select_dtypes(np.number)
m = numeric.loc[:, ~numeric.columns.isin(float_columns)]
final_data[m.columns] = m.round().astype('Int64')
# Convert dates to datetime format
final_data['time_starting_at_date_time'] = pd.to_datetime(final_data['time_starting_at_date_time'], infer_datetime_format=True)
for date_col in ('round_start', 'round_end', 'Away_ObservationDate', 'Home_ObservationDate'):
    final_data[date_col] = pd.to_datetime(final_data[date_col], format='%Y-%m-%d')
# Chronological order by kick-off time
final_data = final_data.sort_values(by='time_starting_at_date_time')

In [4]:
# Add the match outcome and the home-minus-away goal difference, then preview
final_data = add_result(final_data)
final_data['goal_diff'] = final_data['scores_home_score'].sub(final_data['scores_away_score'])
final_data.head()

Unnamed: 0_level_0,league_id,season_id,stage_id,round_id,venue_id,referee_id,home_id,away_id,winner_team_id,attendance,formations_home_formation,formations_away_formation,scores_home_score,scores_away_score,scores_ht_score,scores_ft_score,time_starting_at_date_time,time_minute,time_injury_time,coaches_home_coach_id,coaches_away_coach_id,standings_home_position,standings_away_position,assistants_first_assistant_id,assistants_second_assistant_id,assistants_fourth_official_id,home_name,home_short_code,home_twitter,home_country_id,home_founded,home_venue_id,away_name,away_short_code,away_twitter,away_country_id,away_founded,away_venue_id,league_country_id,league_name,league_is_cup,season_name,season_league_id,current_season,round_name,round_league_id,round_season_id,round_stage_id,round_start,round_end,venue_name,venue_grass_surface,venue_city,venue_capacity,venue_coordinates,referee_fullname,homecoach_coach_id,homecoach_team_id,homecoach_country_id,homecoach_fullname,homecoach_nationality,homecoach_birthdate,homecoach_birthcountry,homecoach_birthplace,Awaycoach_coach_id,Awaycoach_team_id,Awaycoach_country_id,Awaycoach_fullname,Awaycoach_nationality,Awaycoach_birthdate,Awaycoach_birthcountry,Awaycoach_birthplace,weather_report_code,weather_report_type,weather_windspeed(m/s),weather_report_wind_degree,weather_clouds(%),weather_humidity(%),colors_home_color,colors_away_color,weather_report_pressure,weather_report_temperature_celcius_temp,home_team_id,home_fixture_id,home_shots_total,home_shots_ongoal,home_shots_offgoal,home_shots_blocked,home_shots_insidebox,home_shots_outsidebox,home_passes,home_attacks,home_fouls,home_corners,home_offsides,home_possessiontime,home_yellowcards,home_redcards,home_yellowredcards,home_saves,home_substitutions,home_goal_kick,home_goal_attempts,home_free_kick,home_throw_in,home_ball_safe,home_goals,home_penalties,home_injuries,home_tackles,away_team_id,away_fixture_id,away_shots_total,away_shots_ongoal,away_shots_offgoal,away_shots_blocked,aw
ay_shots_insidebox,away_shots_outsidebox,away_passes,away_attacks,away_fouls,away_corners,away_offsides,away_possessiontime,away_yellowcards,away_redcards,away_yellowredcards,away_saves,away_substitutions,away_goal_kick,away_goal_attempts,away_free_kick,away_throw_in,away_ball_safe,away_goals,away_penalties,away_injuries,away_tackles,home_passes_total,home_passes_accurate,home_passes_percentage,away_passes_total,away_passes_accurate,away_passes_percentage,home_attacks_attacks,home_attacks_dangerous_attacks,away_attacks_attacks,away_attacks_dangerous_attacks,home_shots,away_shots,weather_lat_lon,Home_ObservationDate,Home_TeamName,Home_Attack,Home_Midfield,Home_Defence,Home_TransferBudget,Home_RivalTeam,Home_TeamRoster,Home_is_major,Away_ObservationDate,Away_TeamName,Away_Attack,Away_Midfield,Away_Defence,Away_TransferBudget,Away_RivalTeam,Away_TeamRoster,Away_is_major,result,goal_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1
251711,301,1390,2497,37918,184,48975,690,591,591.0,,,,0,1,0-0,0-1,2015-08-07 20:30:00,90,,474402,37438919,,,12852,18518,15934,Lille,LIL,@losclive,17,1944,184,Paris Saint Germain,PSG,@PSG_inside,17,1970,131,17,Ligue 1,0,2015/2016,301,0,1,301,1390,2497,2015-08-07,2015-08-09,Stade Pierre-Mauroy,1,Villeneuve d'Ascq,50186,"(50.631111,3.137500)",Fredy Fautrel,474402,18562.0,17,Hervé Renard,France,30/09/1968,France,Aix-les-Bains,37438919,13260.0,17,Laurent Blanc,France,19/11/1965,France,Ales,,,,,,,,,,,690,251711,12,2,0,0,0,0,,,20,3,,52,3,,,1,,,,,,,,,,,591,251711,7,2,0,0,0,0,,,17,2,2,48,2.0,1.0,,2,,,,,,,,,,,,,,,,,,,,,,,"(nan, nan)",2015-07-10,Lille,73,75,74,13000000,Racing Club de Lens,"['Vincent Enyeama', 'Sébastien Corchia', 'Davi...",1,2015-07-10,Paris Saint Germain,82,81,80,120000000,Olympique de Marseille,"['Nicolas Douchez', 'Marquinhos', 'Thiago Silv...",1,2,-1
849,8,10,8,85,206,13533,14,6,14.0,75261.0,,,1,0,1-0,1-0,2015-08-08 13:45:00,90,,37523072,455358,,,13561,12245,14533,Manchester United,MUN,@ManUtd,462,1878,206,Tottenham Hotspur,TOT,@SpursOfficial,462,1882,281313,462,Premier League,0,2015/2016,8,0,1,8,10,8,2015-08-08,2015-08-10,Old Trafford,1,Manchester,75635,"(53.463150,-2.291444)",Jonathan Moss,37523072,18694.0,38,Louis van Gaal,Netherlands,08/08/1951,Netherlands,Amsterdam,455358,591.0,44,Mauricio Roberto Pochettino Trossero,Argentina,02/03/1972,Argentina,Murphy,,,,,,,,,,,14,849,9,1,0,0,0,0,,,12,1,1.0,50,2,,,4,,,,,,,,,,,6,849,9,4,0,0,0,0,,,12,2,2,50,3.0,,,1,,,,,,,,,,,,,,,,,,,,,,,"(nan, nan)",2015-07-10,Manchester United,85,80,75,39000000,Manchester City,"['De Gea', 'Antonio Valencia', 'Chris Smalling...",1,2015-07-10,Tottenham Hotspur,78,77,77,26000000,Arsenal,"['Hugo Lloris', 'Kyle Walker', 'Federico Fazio...",1,1,1
864,8,10,8,85,489,14663,33,51,51.0,27036.0,,,1,3,0-1,1-3,2015-08-08 16:00:00,90,,173023,896470,,,12088,12147,15271,Norwich City,NOR,@NorwichCityFC,462,1902,489,Crystal Palace,CRY,@CPFC,462,1905,201,462,Premier League,0,2015/2016,8,0,1,8,10,8,2015-08-08,2015-08-10,Carrow Road,1,Norwich,27244,"(52.622116,1.30914)",Simon Hooper,173023,3.0,1161,Alex Neil,Scotland,09/06/1981,Scotland,Belshill,896470,,462,Alan Pardew,England,18/07/1961,England,London,,,,,,,,,,,33,864,17,6,0,0,0,0,,,14,1,4.0,63,1,,,4,,,,,,,,,,,51,864,11,7,0,0,0,0,,,20,4,2,37,,,,5,,,,,,,,,,,,,,,,,,,,,,,"(nan, nan)",2015-07-10,Norwich City,73,72,71,6500000,Ipswich Town,"['John Ruddy', 'Steven Whittaker', 'Russell Ma...",0,2015-07-10,Crystal Palace,74,74,71,10000000,Fulham,"['Rob Green', 'Nedum Onuoha', 'Richard Dunne',...",1,2,-2
871,8,10,8,85,117,15270,42,3,42.0,32242.0,,,4,2,3-0,4-2,2015-08-08 16:00:00,90,,893654,523970,,,12146,12090,15241,Leicester City,LEI,@LCFC,462,1884,117,Sunderland,SUN,,462,1879,212,462,Premier League,0,2015/2016,8,0,1,8,10,8,2015-08-08,2015-08-10,King Power Stadium,1,Leicester,32312,"(52.620278,-1.142222)",Lee Mason,893654,,251,Claudio Ranieri,Italy,20/10/1951,Italy,Roma,523970,18600.0,38,Dick Advocaat,Netherlands,27/09/1947,Netherlands,Den Haag,,,,,,,,,,,42,871,19,8,0,0,0,0,,,13,6,,44,2,,,3,,,,,,,,,,,3,871,11,5,0,0,0,0,,,17,3,1,56,4.0,,,4,,,,,,,,,,,,,,,,,,,,,,,"(nan, nan)",2015-07-10,Leicester City,71,72,68,9500000,Nottingham Forest,"['Kasper Schmeichel', 'Marc Albrighton', 'Marc...",1,2015-07-10,Sunderland,76,73,72,16000000,Newcastle United,"['Costel Pantilimon', 'Billy Jones', 'Sebastiá...",1,1,2
879,8,10,8,85,12,15272,13,25,,39063.0,,,2,2,0-1,2-2,2015-08-08 16:00:00,90,,474796,455821,,,12018,12091,14532,Everton,EVE,@Everton,462,1878,12,Watford,WAT,@WatfordFC,462,1881,19,462,Premier League,0,2015/2016,8,0,1,8,10,8,2015-08-08,2015-08-10,Goodison Park,1,Liverpool,40569,"(53.438801,-2.966328)",Mike Jones,474796,18743.0,32,Roberto Martínez Montoliú,Spain,13/07/1973,Spain,Balaguer,455821,106.0,32,Enrique Sánchez Flores,Spain,05/02/1965,Spain,Madrid,,,,,,,,,,,13,879,10,5,0,0,0,0,,,7,8,4.0,67,1,,,3,,,,,,,,,,,25,879,11,5,0,0,0,0,,,13,2,1,33,2.0,,,3,,,,,,,,,,,,,,,,,,,,,,,"(nan, nan)",2015-07-10,Everton,78,78,78,14000000,Liverpool,"['Tim Howard', 'Séamus Coleman', 'John Stones'...",1,2015-07-10,Watford,70,70,68,3200000,Reading,"['Gomes', 'Marco Motta', 'Craig Cathcart', 'Ga...",0,0,0


## Visualizations

In [5]:
# (home, away) attribute pairs to plot, one subplot row per pair
attributes = [['Home_Attack', 'Away_Attack'], ['Home_Midfield', 'Away_Midfield'], ['Home_Defence', 'Away_Defence'], ['Home_TransferBudget', 'Away_TransferBudget']]
# (home, away) colors for each row
colors = [['#8106ba', '#f412e6'], ['#d00202', '#ff6633'], ['#000080', '#1e90ff'], ['#048916', '#57f40e']]
# y-axes titles, one per row
yaxes_titles = ['<b>Attacks</b>', '<b>Midfield</b>', '<b>Defence</b>', '<b>Transfer Budget</b>']
# Create subplots grid: four stacked rows, each with a secondary y-axis
fig = make_subplots(rows=4, cols=1, specs=[[{"secondary_y": True}] for _ in range(4)])
# Add traces and update y-axes title
# NOTE(review): the away trace is drawn on the secondary y-axis, so home and away
# boxes in a row are not guaranteed to share the same scale - confirm this is intended.
for row_number, ((home_attr, away_attr), (home_color, away_color), axis_title) in enumerate(zip(attributes, colors, yaxes_titles), start=1):
    home_box = go.Box(x=final_data['result'], y=final_data[home_attr], name=home_attr, marker_color=home_color, offsetgroup='A')
    away_box = go.Box(x=final_data['result'], y=final_data[away_attr], name=away_attr, marker_color=away_color, offsetgroup='B')
    fig.add_trace(home_box, secondary_y=False, row=row_number, col=1)
    fig.add_trace(away_box, secondary_y=True, row=row_number, col=1)
    fig.update_yaxes(title_text=axis_title, row=row_number, col=1)
# Update layout and x-axes title
fig.update_layout(boxmode='group', title_text="<b>FIFA Team Attributes - Home vs Away</b>", width=1300, height=1000)
fig.update_xaxes(title_text="Results")
fig.show()

# **PREPARE DATA BEFORE MODEL**

#### Setup

In [6]:
pd.options.mode.chained_assignment = None  # default='warn'
# Drop columns that contain no data at all
final_data.dropna(axis=1, how='all', inplace=True)
# Columns with only NAs: home_passes, home_attacks, home_shots, away_passes, away_attacks, away_shots
final_data.drop_duplicates(inplace=True)
# Keep matches that kicked off after 2016-07-01 and have possession recorded for both sides
played_after_cutoff = final_data['time_starting_at_date_time'] > '2016-07-01'
has_possession = final_data['home_possessiontime'].notna() & final_data['away_possessiontime'].notna()
final_16 = final_data[played_after_cutoff & has_possession].sort_values(by='time_starting_at_date_time', ascending=True)

## **Functions**

##### Fill Stats

In [7]:
def fill_tackles(df):
    """Fill missing values in ``home_tackles`` and ``away_tackles``.

    Rows where BOTH tackle columns are missing receive the rounded mean of the
    column within the same team and season (``home_id``/``season_id`` for the
    home column, ``away_id``/``season_id`` for the away column). Whatever is
    still missing afterwards - rows with only one side missing, or groups
    without any tackle data - is set to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``home_tackles``, ``away_tackles``, ``home_id``, ``away_id`` and
        ``season_id``.

    Returns
    -------
    pandas.DataFrame
        The same object, modified in place.
    """
    tackles_cols = ['home_tackles', 'away_tackles']
    both_missing = df[tackles_cols[0]].isna() & df[tackles_cols[1]].isna()
    for col in tackles_cols:
        team_col = 'home_id' if 'home' in col.lower() else 'away_id'
        # transform() keeps the frame's own index, so the assignment below aligns
        # row by row; groupby.apply may prepend the group keys to the index.
        # Rounding the Series also copes with groups whose mean is NaN.
        group_means = df.groupby([team_col, 'season_id'])[col].transform('mean').round()
        df.loc[both_missing, col] = group_means[both_missing]
    for col in tackles_cols:
        df[col] = df[col].fillna(0)
    return df

def fill_shots_blocked(df):
    """Fill missing blocked-shot counts for both sides.

    A missing ``<side>_shots_blocked`` value is replaced by
    ``<side>_shots_total - <side>_shots_ongoal - <side>_shots_offgoal``.
    Existing values are left untouched. The frame is modified in place and
    returned.
    """
    for side in ('home', 'away'):
        remainder = df[f'{side}_shots_total'] - df[f'{side}_shots_ongoal'] - df[f'{side}_shots_offgoal']
        df[f'{side}_shots_blocked'] = df[f'{side}_shots_blocked'].fillna(remainder)
    return df

def fill_shot_tackles(df):
    """Fill missing inside/outside-box shot counts, then blocked shots and tackles.

    Rows where all four inside/outside-box columns are missing receive, for each
    column, the rounded mean of that column among matches with the same total
    number of shots for that side (``home_shots_total`` for home columns,
    ``away_shots_total`` for away columns). Anything still missing is set to 0.
    Finally ``fill_shots_blocked`` and ``fill_tackles`` are applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the four ``*_shots_insidebox`` / ``*_shots_outsidebox`` columns,
        ``home_shots_total`` and ``away_shots_total``, plus whatever
        ``fill_shots_blocked`` and ``fill_tackles`` require.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``fill_tackles``.
    """
    shot_cols = ['away_shots_insidebox', 'away_shots_outsidebox', 'home_shots_insidebox', 'home_shots_outsidebox']
    all_missing = df[shot_cols[0]].isna() & df[shot_cols[1]].isna() & df[shot_cols[2]].isna() & df[shot_cols[3]].isna()
    for col in shot_cols:
        # Away columns are grouped by the away side's own shot total; grouping
        # them by the home total looks like a copy-paste slip.
        side = 'home' if 'home' in col.lower() else 'away'
        # transform() keeps the frame's own index so the assignment aligns row
        # by row, and rounding the Series copes with groups whose mean is NaN.
        group_means = df.groupby(f'{side}_shots_total')[col].transform('mean').round()
        df.loc[all_missing, col] = group_means[all_missing]
    for col in shot_cols:
        df[col] = df[col].fillna(0)
    df = fill_shots_blocked(df)
    df = fill_tackles(df)
    return df

In [8]:
def fill_na_stats(df, stats_col):
    """Fill missing values of one match-statistics column.

    Columns whose name contains ``shots_offgoal``, ``saves``, ``offsides``,
    ``card``, ``foul`` or ``corner`` are filled with 0. Any other column is
    filled with the rounded mean of the column for the same team and season:
    ``home_id``/``season_id`` when the name contains ``home``,
    ``away_id``/``season_id`` when it contains ``away``. Groups without any
    data keep their missing values. Names matching none of these rules are
    left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``stats_col`` and, for the group fill, ``season_id`` plus
        ``home_id`` or ``away_id``.
    stats_col : str
        Name of the column to fill.

    Returns
    -------
    pandas.DataFrame
        The same object, modified in place.
    """
    name = stats_col.lower()
    zero_fill_markers = ('shots_offgoal', 'saves', 'offsides', 'card', 'foul', 'corner')
    if any(marker in name for marker in zero_fill_markers):
        # A single branch: zero-filled columns never fall through to the group fill.
        df[stats_col] = df[stats_col].fillna(0)
        return df
    if 'home' in name:
        group_cols = ['home_id', 'season_id']
    elif 'away' in name:
        group_cols = ['away_id', 'season_id']
    else:
        return df
    # transform() keeps the frame's own index; rounding the Series copes with
    # groups whose mean is NaN.
    group_means = df.groupby(group_cols)[stats_col].transform('mean').round()
    df[stats_col] = df[stats_col].fillna(group_means)
    return df

##### Fill Coaches and Refs

In [9]:
def fillna_coach_ref(df, column):
    """Fill missing coach/referee details using the person's id.

    Two passes are applied to ``column``:

    1. every missing value is replaced by the last non-missing value recorded
       for the same person id (``homecoach_coach_id``, ``Awaycoach_coach_id``
       or ``referee_id``, chosen from the column name);
    2. what is still missing is back-filled within the group
       (``home_id``/``season_id``, ``away_id``/``season_id`` or
       ``referee_id``/``season_id``).

    Only genuinely non-missing values (``pd.notna``) enter the lookup, so a NaN
    recorded for a person can no longer overwrite a known value. Rows without a
    person id do not contribute to the lookup.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column``, the matching id column and the group columns.
    column : str
        Column to fill; its lower-cased name must contain ``coach`` (together
        with ``home`` or ``away``) or ``referee``. Otherwise a message is
        printed and the frame is returned unchanged.

    Returns
    -------
    pandas.DataFrame
        The same object, modified in place.
    """
    name = column.lower()
    if 'coach' in name:
        if 'home' in name:
            id_col, group_cols = 'homecoach_coach_id', ['home_id', 'season_id']
        elif 'away' in name:
            id_col, group_cols = 'Awaycoach_coach_id', ['away_id', 'season_id']
        else:
            print('Not clear if coach is home or away')
            return df
    elif 'referee' in name:
        id_col, group_cols = 'referee_id', ['referee_id', 'season_id']
    else:
        print('Not a: coach or referee')
        return df

    # id -> last known value; later rows win, as in a dict comprehension.
    known = df[column].notna() & df[id_col].notna()
    lookup = dict(zip(df.loc[known, id_col], df.loc[known, column]))
    if lookup:
        # Index the column by person id so fillna() can look each gap up by id,
        # then write the values back positionally.
        df[column] = df.set_index(id_col)[column].fillna(lookup).values
    df[column] = df.groupby(group_cols)[column].bfill()
    return df

In [10]:
def fill_na_IDs(df, column):
    """Back-fill a missing coach id, formation or referee id within its group.

    The group is chosen from the lower-cased column name:

    * contains ``coach`` or ``formations`` and ``home`` -> ``home_id``/``season_id``
    * contains ``coach`` or ``formations`` and ``away`` -> ``away_id``/``season_id``
    * contains ``referee`` -> ``season_id``/``stage_id``

    For any other name a message is printed and the frame is returned unchanged.
    The function works on the frame it receives and does not touch module-level
    state.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column`` and the relevant group columns.
    column : str
        Name of the column to back-fill.

    Returns
    -------
    pandas.DataFrame
        The same object, modified in place.
    """
    name = column.lower()
    if 'coach' in name or 'formations' in name:
        if 'home' in name:
            group_cols = ['home_id', 'season_id']
        elif 'away' in name:
            group_cols = ['away_id', 'season_id']
        else:
            print('Not clear if coach/formation is home or away')
            return df
    elif 'referee' in name:
        group_cols = ['season_id', 'stage_id']
    else:
        print('It is neither coach or referee or formation')
        return df
    # Each gap takes the next available value of its own group.
    df[column] = df.groupby(group_cols)[column].bfill()
    return df

##### Fill Venue

In [11]:
def fill_venue_capacity(df):
    """Fill missing ``venue_capacity`` values from a hard-coded venue_id -> capacity table.

    Only rows whose capacity is missing are touched. A warning is printed when
    missing capacities remain afterwards. The frame is modified in place and
    returned.
    """
    known_capacities = {339996:33150, 6154:20000, 339714:54726, 232157:1000, 339832:51500, 339831:42358, 340122:29000, 2088:24000}
    # Each venue id selects a disjoint set of rows, so the mask of missing
    # capacities can be computed once up front.
    capacity_missing = df['venue_capacity'].isna()
    for venue_id, capacity in known_capacities.items():
        df.loc[(df['venue_id'] == venue_id) & capacity_missing, 'venue_capacity'] = capacity
    if df['venue_capacity'].isna().any():
        print('There are still NAs in venue_capacity!!!')
    return df
    
def fill_attendance(df):
    """Fill missing ``attendance`` values and convert the column to integers.

    Steps:

    1. a missing attendance takes the mean attendance of the same venue and
       season (existing values are never overwritten);
    2. what is still missing takes a hard-coded average keyed by ``home_name``;
    3. a warning is printed when missing values remain;
    4. the column is rounded and cast to ``int``. When missing values remain,
       the nullable ``Int64`` dtype is used instead so the cast does not fail
       right after the warning.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``attendance``, ``venue_id``, ``season_id`` and ``home_name``.

    Returns
    -------
    pandas.DataFrame
        The same object, modified in place.
    """
    df['attendance'] = df['attendance'].astype(float)
    venue_season_mean = df.groupby(['venue_id', 'season_id'])['attendance'].transform('mean')
    df['attendance'] = df['attendance'].fillna(venue_season_mean)
    avg_attendance_seriea22 = {'Inter':43549,'Hellas Verona':13350,'Torino':9465,'Empoli':6387,'Udinese':11655,'Bologna':14581,'Napoli':27593,'Roma':40723,'Cagliari':9400,'Sampdoria':8754,'Atalanta':10828,'Lazio':22056,'Fiorentina':20346, 'Juventus':22871, 'Sassuolo':6839, 'Genoa':13026, 'Milan':42388,'Salernitana':14323,'Spezia':6704,'Venezia':6731, 'Bastia':10511}
    for team_name, average in avg_attendance_seriea22.items():
        df.loc[(df['home_name'] == team_name) & (df['attendance'].isna()), 'attendance'] = average
    still_missing = df['attendance'].isna().any()
    if still_missing:
        print('There are still NAs in attendance!!!')
    rounded = df['attendance'].round(0)
    # A plain int column cannot hold missing values; fall back to nullable Int64.
    df['attendance'] = rounded.astype('Int64') if still_missing else rounded.astype(int)
    return df

##### Fill Twitter + Colors

In [12]:
def fill_twitter_names(df):
    """Fill missing ``home_twitter`` / ``away_twitter`` handles from a hard-coded table.

    The table maps a team id to its handle. The home side is processed first,
    then the away side; after each side a warning is printed if handles are
    still missing. The frame is modified in place and returned.
    """
    twitter_names = {430:'@SCBastia', 3520:'@asnlofficiel', 9257:'@FCLorient', 3:'@SunderlandAFC', 344:'@RealSporting', 482:'@sv98', 573:'@Schanzer', 377:'@RayoVallecano', 6967:'@nimesolympiquel', 956:'@1_fc_nuernberg', 274:'@SDHuesca', 266:'@SB29', 271:'@RCLens', 6827:'@Cadiz_CF', 2927:'@arminia', 1099:'@elchecf', 6898:'@ClermontFoot', 3431:'@kleeblattfuerth', 999:'@VfLBochum1848eV', 1393:'@SMCaen'}
    # (team id column, handle column, warning printed when gaps remain)
    targets = (
        ('home_id', 'home_twitter', 'There are still NAs in home_twitter!!!'),
        ('away_id', 'away_twitter', 'There are still NAs in away_twitter!!!'),
    )
    for id_col, handle_col, warning in targets:
        for team_id, handle in twitter_names.items():
            df.loc[(df[id_col] == team_id) & (df[handle_col].isna()), handle_col] = handle
        if df[handle_col].isna().any():
            print(warning)
    return df

In [13]:
def fill_colors(df):
    """Fill missing ``colors_home_color`` / ``colors_away_color`` values.

    Colors are looked up in this order:

    1. the color already recorded for the same team on the same side elsewhere
       in ``df`` (home colors keyed by ``home_id``, away colors by ``away_id``;
       when a team has several recorded colors the last row wins);
    2. a hard-coded table of extra colors keyed by team id.

    A warning is printed when either column still has missing values. The
    lookups are built from the frame passed in, so the function does not depend
    on any module-level frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``home_id``, ``away_id``, ``colors_home_color`` and
        ``colors_away_color``.

    Returns
    -------
    pandas.DataFrame
        The same object, modified in place.
    """
    extra_colors={3:'#EB172B',7:'#E11B22',22:'#F18A01',26:'#E03A3E',30:'#F0F0F0',126:'#A7D6F5',344:'#F0F0F0',429:'#B9D9EC',430:'#202A44',482:'#004F9F',573:'#D71920',1216:'#EEC0C8',1343:'#2F97DA',2708:'#F0F0F0',2921:'#FCC24F',3520:'#F0F0F0'}
    sides = (('home_id', 'colors_home_color'), ('away_id', 'colors_away_color'))

    # Build both lookups before any filling so they only hold recorded colors.
    lookups = []
    for id_col, color_col in sides:
        recorded = df[color_col].notna()
        lookups.append(dict(zip(df.loc[recorded, id_col], df.loc[recorded, color_col])))

    for (id_col, color_col), lookup in zip(sides, lookups):
        for team_id, color in lookup.items():
            df.loc[(df[id_col] == team_id) & (df[color_col].isna()), color_col] = color
    for team_id, color in extra_colors.items():
        for id_col, color_col in sides:
            df.loc[(df[id_col] == team_id) & (df[color_col].isna()), color_col] = color
    # Plain boolean `or`: `a != 0 | b != 0` parses as the chain `a != (0 | b) != 0`.
    if df['colors_home_color'].isna().any() or df['colors_away_color'].isna().any():
        print('There are still NAs!!!')
    return df

##### Fill Rounds

In [14]:
# Round calendar used by round_name_fill / round_start_end for season_id 853:
# one ['YYYY-MM-DD' start, 'YYYY-MM-DD' end] pair per round, where the pair at
# position i describes round i + 1.
# NOTE(review): the name suggests La Liga 2016/17 - confirm against the data source.
laliga_853 = [['2016-08-19','2016-08-22'],['2016-08-26','2016-08-28'],['2016-09-09','2016-09-11'],['2016-09-16','2016-09-19'],['2016-09-20','2016-09-22'],['2016-09-23','2016-09-26'],['2016-09-30','2016-10-02'],['2016-10-14','2016-10-17'],['2016-10-21','2016-10-23'],['2016-10-28','2016-10-31'],['2016-11-04','2016-11-06'],['2016-11-18','2016-11-21'],['2016-11-25','2016-11-28'],['2016-12-03','2016-12-05'],['2016-12-09','2016-12-12'],['2016-12-16','2016-12-19'],['2017-01-06','2017-01-09'],['2017-01-14','2017-01-16'],['2017-01-20','2017-01-22'],['2017-01-27','2017-01-30'],['2017-02-04','2017-02-06'],['2017-02-10','2017-02-13'], ['2017-02-17','2017-02-20'],['2017-02-24','2017-02-26'],['2017-02-28','2017-03-02'],['2017-03-03','2017-03-06'],['2017-03-10','2017-03-13'],['2017-03-17','2017-03-19'],['2017-03-31','2017-04-03'],['2017-04-04','2017-04-06'],['2017-04-07','2017-04-10'],['2017-04-14','2017-04-17'],['2017-04-21','2017-04-24'],['2017-04-25','2017-04-27'],['2017-04-28','2017-05-01'],['2017-05-05','2017-05-08'],['2017-05-13','2017-05-14'],['2017-05-19','2017-05-21']]
# Same structure for season_id 802.
# NOTE(review): the name suggests Serie A 2016/17 - confirm against the data source.
seriea_802 = [['2016-08-20','2016-08-21'],['2016-08-27','2016-08-28'],['2016-09-10','2016-09-12'],['2016-09-16','2016-09-18'],['2016-09-20','2016-09-21'],['2016-09-24','2016-09-26'],['2016-10-01','2016-10-02'],['2016-10-15','2016-10-17'],['2016-10-22','2016-10-23'],['2016-10-25','2016-10-27'],['2016-10-29','2016-10-31'],['2016-11-05','2016-11-06'],['2016-11-19','2016-11-20'],['2016-11-26','2016-11-28'],['2016-12-02','2016-12-05'],['2016-12-10','2016-12-12'],['2016-12-17','2016-12-18'],['2016-12-20','2016-12-22'],['2017-01-07','2017-01-08'],['2017-01-14','2017-01-16'],['2017-01-21','2017-01-22'],['2017-01-28','2017-01-29'],['2017-02-04','2017-02-07'],['2017-02-10','2017-02-13'],['2017-02-17','2017-02-19'],['2017-02-25','2017-02-27'],['2017-03-04','2017-03-05'],['2017-03-10','2017-03-13'],['2017-03-18','2017-03-19'],['2017-04-01','2017-04-03'],['2017-04-08','2017-04-09'],['2017-04-15','2017-04-16'],['2017-04-22','2017-04-24'],['2017-04-28','2017-04-30'],['2017-05-06','2017-05-07'],['2017-05-13','2017-05-14'],['2017-05-20','2017-05-22'],['2017-05-27','2017-05-28']]

def str_date(string):
    """Parse a ``'%Y-%m-%d'`` formatted string and return it as a ``datetime.date``."""
    parsed = datetime.strptime(string, '%Y-%m-%d')
    return parsed.date()
    
def round_name_fill(df):
    """Fill missing ``round_name`` values for seasons 802 and 853.

    For every row whose ``round_name`` is missing and whose ``season_id`` is
    802 or 853, the kick-off date (``time_starting_at_date_time``) is compared
    with the season's round calendar (``seriea_802`` / ``laliga_853``); the
    round whose [start, end] window contains the date is written as a 1-based
    round number. Afterwards a few fixtures are set to hard-coded round
    numbers, regardless of their current value.

    Hard-coded fixture ids that are not present in ``df`` are skipped, so the
    function never appends rows to the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by fixture id; needs ``round_name``, ``season_id`` and a
        datetime ``time_starting_at_date_time`` column.

    Returns
    -------
    pandas.DataFrame
        The same object, modified in place.
    """
    for ind in df.index:
        if not pd.isnull(df.loc[ind, 'round_name']):
            continue
        season = df.loc[ind, 'season_id']
        if season == 802:
            calendar = seriea_802
        elif season == 853:
            calendar = laliga_853
        else:
            continue
        match_day = df.loc[ind, 'time_starting_at_date_time'].date()
        # The loop is not cut short: if several windows contained the date the
        # last one would win.
        for round_number, (first_day, last_day) in enumerate(calendar, start=1):
            if str_date(first_day) <= match_day <= str_date(last_day):
                df.at[ind, 'round_name'] = round_number
    # Fixture id -> round number overrides.
    diz_exc = {299930:3, 301697:19, 301642:18, 301647:18, 404577:16, 405042:21, 405051:21}
    for fixture_id, round_number in diz_exc.items():
        # .loc assignment with an unknown label would append a new, empty row.
        if fixture_id in df.index:
            df.loc[fixture_id, 'round_name'] = round_number
    return df

In [15]:

def round_start_end(df):
    """Fill ``round_start`` / ``round_end`` for seasons 802 and 853.

    A row is filled only when both dates are missing and its ``season_id`` is
    802 or 853. The dates come from the season's round calendar
    (``seriea_802`` / ``laliga_853``), looked up at position ``round_name - 1``.
    The frame is modified in place and returned.
    """
    for ind in df.index:
        both_missing = pd.isnull(df.loc[ind, 'round_start']) and pd.isnull(df.loc[ind, 'round_end'])
        if not both_missing:
            continue
        season = df.loc[ind, 'season_id']
        if season == 802:
            calendar = seriea_802
        elif season == 853:
            calendar = laliga_853
        else:
            continue
        # round_name is 1-based, the calendar list is 0-based
        first_day, last_day = calendar[df.loc[ind, 'round_name'] - 1]
        df.loc[ind, 'round_start'] = datetime.strptime(first_day, '%Y-%m-%d')
        df.loc[ind, 'round_end'] = datetime.strptime(last_day, '%Y-%m-%d')
    return df

##### Create Variables: Travel Distance + Rivals

In [16]:
def create_travel_distance(df):
    """Add ``coordinates_away`` and ``travel_distance`` (whole kilometres) columns.

    Each team's coordinates are taken from ``venue_coordinates`` of the first
    row in which the team appears as ``home_id``. ``coordinates_away`` maps
    ``away_id`` through that table, and ``travel_distance`` is the distance
    between the match venue and the away team's coordinates, rounded and cast
    to ``int``. The frame is modified in place and returned.
    """
    # First-seen venue coordinates per home team
    home_coordinates = {}
    for team_id, coords in zip(df['home_id'], df['venue_coordinates']):
        home_coordinates.setdefault(team_id, coords)
    df['coordinates_away'] = df['away_id'].map(home_coordinates)

    def _kilometres(row):
        # NOTE(review): eval() executes whatever text is stored in the coordinate
        # columns; for "(lat,lon)" strings ast.literal_eval would be a safer parser.
        # An away team that never appears as home has no coordinates and fails here.
        return distance.distance(eval(row['venue_coordinates']), eval(row['coordinates_away'])).km

    df['travel_distance'] = df.apply(_kilometres, axis=1)
    df['travel_distance'] = df['travel_distance'].round(0).astype(int)
    return df

In [17]:
def get_binaryrivals(df):
    """Add 0/1 rivalry flags and drop the rival-name columns.

    ``isrival_home`` is 1 when ``away_name`` equals ``Home_RivalTeam``;
    ``isrival_away`` is 1 when ``home_name`` equals ``Away_RivalTeam``.
    ``Home_RivalTeam`` and ``Away_RivalTeam`` are then removed. The frame is
    modified in place and returned.
    """
    # (flag column, opponent name column, rival name column)
    rival_checks = (
        ('isrival_home', 'away_name', 'Home_RivalTeam'),
        ('isrival_away', 'home_name', 'Away_RivalTeam'),
    )
    for flag_col, opponent_col, rival_col in rival_checks:
        df[flag_col] = np.where(df[opponent_col] == df[rival_col], 1, 0)
    df.drop(columns=['Home_RivalTeam', 'Away_RivalTeam'], inplace=True)
    return df

## **OPERATIONS**

#### Fill NAs

In [18]:
# Id / formation columns back-filled with fill_na_IDs
ids_tofill = [
    'coaches_home_coach_id',
    'referee_id',
    'coaches_away_coach_id',
    'formations_home_formation',
    'formations_away_formation',
]
# Match-statistics columns filled with fill_na_stats
stats_tofill = [
    'home_yellowcards', 'home_redcards', 'home_yellowredcards',
    'away_yellowcards', 'away_redcards', 'away_yellowredcards',
    'home_shots_offgoal', 'away_shots_offgoal',
    'home_fouls', 'away_fouls',
    'home_corners',
    'home_passes_total', 'home_passes_percentage',
    'away_passes_accurate', 'home_passes_accurate',
    'away_passes_total', 'away_passes_percentage',
    'away_corners',
    'home_saves', 'away_saves',
    'home_offsides', 'away_offsides',
]
# Coach / referee detail columns filled with fillna_coach_ref
coach_ref_tofill = [
    'homecoach_country_id', 'homecoach_birthdate', 'homecoach_nationality', 'homecoach_fullname',
    'referee_fullname',
    'Awaycoach_country_id', 'Awaycoach_birthcountry', 'Awaycoach_nationality', 'Awaycoach_fullname', 'Awaycoach_birthdate',
    'homecoach_birthcountry',
]

In [19]:
# Fill NAs for coach ids, referee ids and formations
for col in ids_tofill:
    final_16 = fill_na_IDs(final_16, col)
# Since homecoach_coach_id == coaches_home_coach_id
final_16['homecoach_coach_id'] = final_16['homecoach_coach_id'].fillna(final_16['coaches_home_coach_id'])
final_16['Awaycoach_coach_id'] = final_16['Awaycoach_coach_id'].fillna(final_16['coaches_away_coach_id'])
# Change manually a single missing data
# (every row with a missing half-time score receives '1-0')
final_16.loc[final_16['scores_ht_score'].isna(), 'scores_ht_score'] = '1-0'
# Recode referee id 74217 as 77. Assign the result back: an in-place replace on
# a column selection is not guaranteed to reach the frame.
final_16['referee_id'] = final_16['referee_id'].replace(to_replace=74217, value=77)
# Fill shots columns
final_16 = fill_shot_tackles(final_16)
# Fill NAs for Stats
for col in stats_tofill:
    final_16 = fill_na_stats(final_16, col)
# Fill NAs for coaches and refs knowing their IDs
for col in coach_ref_tofill:
    final_16 = fillna_coach_ref(final_16, col)
# Fill NAs for venue capacity
final_16 = fill_venue_capacity(final_16)
# Fill NAs for twitter names
final_16 = fill_twitter_names(final_16)
# Fill NAs for round_name
final_16 = round_name_fill(final_16)
# Fill NAs for round_start & round_end
final_16 = round_start_end(final_16)
# Fill NAs for Attendance
final_16 = fill_attendance(final_16)
# Fill NAs for winner_team_id (NAs are draws!)
final_16.loc[final_16['winner_team_id'].isna(), 'winner_team_id'] = 0
# Fill NAs for colors_home_color and colors_away_color
final_16 = fill_colors(final_16)

#### Create New Variables

In [20]:
# Create the travel_distance column first, then the binary rival columns
final_16 = get_binaryrivals(create_travel_distance(final_16))

In [21]:
# Create categorical variables for foundation year: bin edges are 12-13 years
# apart, and the oldest bin gets the highest label (13), the most recent the lowest (0)
founded_bins = [1847, 1860, 1872, 1884, 1896, 1908, 1920, 1932, 1944, 1956, 1968, 1980, 1992, 2004, 2016]
founded_labels = [13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
for side in ('home', 'away'):
    final_16[f'{side}_founded_cat'] = pd.cut(final_16[f'{side}_founded'], bins=founded_bins, labels=founded_labels)

In [22]:
# Preview the last three rows of the prepared frame
final_16.tail(3)

Unnamed: 0_level_0,league_id,season_id,stage_id,round_id,venue_id,referee_id,home_id,away_id,winner_team_id,attendance,formations_home_formation,formations_away_formation,scores_home_score,scores_away_score,scores_ht_score,scores_ft_score,time_starting_at_date_time,time_minute,time_injury_time,coaches_home_coach_id,coaches_away_coach_id,standings_home_position,standings_away_position,assistants_first_assistant_id,assistants_second_assistant_id,assistants_fourth_official_id,home_name,home_short_code,home_twitter,home_country_id,home_founded,home_venue_id,away_name,away_short_code,away_twitter,away_country_id,away_founded,away_venue_id,league_country_id,league_name,league_is_cup,season_name,season_league_id,current_season,round_name,round_league_id,round_season_id,round_stage_id,round_start,round_end,venue_name,venue_grass_surface,venue_city,venue_capacity,venue_coordinates,referee_fullname,homecoach_coach_id,homecoach_team_id,homecoach_country_id,homecoach_fullname,homecoach_nationality,homecoach_birthdate,homecoach_birthcountry,homecoach_birthplace,Awaycoach_coach_id,Awaycoach_team_id,Awaycoach_country_id,Awaycoach_fullname,Awaycoach_nationality,Awaycoach_birthdate,Awaycoach_birthcountry,Awaycoach_birthplace,weather_report_code,weather_report_type,weather_windspeed(m/s),weather_report_wind_degree,weather_clouds(%),weather_humidity(%),colors_home_color,colors_away_color,weather_report_pressure,weather_report_temperature_celcius_temp,home_team_id,home_fixture_id,home_shots_total,home_shots_ongoal,home_shots_offgoal,home_shots_blocked,home_shots_insidebox,home_shots_outsidebox,home_fouls,home_corners,home_offsides,home_possessiontime,home_yellowcards,home_redcards,home_yellowredcards,home_saves,home_substitutions,home_goal_kick,home_goal_attempts,home_free_kick,home_throw_in,home_ball_safe,home_goals,home_penalties,home_injuries,home_tackles,away_team_id,away_fixture_id,away_shots_total,away_shots_ongoal,away_shots_offgoal,away_shots_blocked,away_shots_insidebox,away_s
hots_outsidebox,away_fouls,away_corners,away_offsides,away_possessiontime,away_yellowcards,away_redcards,away_yellowredcards,away_saves,away_substitutions,away_goal_kick,away_goal_attempts,away_free_kick,away_throw_in,away_ball_safe,away_goals,away_penalties,away_injuries,away_tackles,home_passes_total,home_passes_accurate,home_passes_percentage,away_passes_total,away_passes_accurate,away_passes_percentage,home_attacks_attacks,home_attacks_dangerous_attacks,away_attacks_attacks,away_attacks_dangerous_attacks,weather_lat_lon,Home_ObservationDate,Home_TeamName,Home_Attack,Home_Midfield,Home_Defence,Home_TransferBudget,Home_TeamRoster,Home_is_major,Away_ObservationDate,Away_TeamName,Away_Attack,Away_Midfield,Away_Defence,Away_TransferBudget,Away_TeamRoster,Away_is_major,result,goal_diff,coordinates_away,travel_distance,isrival_home,isrival_away,home_founded_cat,away_founded_cat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1
18220185,384,18576,77454372,252815,1721,13970,113,102,113,51362,4-3-3,4-3-2-1,2,0,1-0,2-0,2022-04-15 21:00:00,90,,459100,35571,2,18,12816,12059,14082,Milan,MIL,@acmilan,251,1899,1721,Genoa,GEN,@GenoaCFC,251,1893,86,251,Serie A,0,2021/2022,384,1,33,384,18576,77454372,2022-04-15,2022-04-18,Stadio Giuseppe Meazza,1,Milano,80018,"(45.478025,9.124206)",Daniele Chiffi,459100,113,251,Stefano Pioli,Italy,20/10/1965,Italy,Parma,35571,102,11,Alexander Blessin,Germany,28/05/1973,Germany,,clear,clear sky,6.91,120.0,0,49,#C40010,#CCCCCC,1018.0,18.5,113,18220185,9,4,5,1,7,3,10,3,3,59,1,0,0,2,5,,7,,,108,2,0,3.0,17,102,18220185,8,2,6,1,3,4,19,1,1,41,2,0,0,3,5,,5,,,102,0,0,2.0,30,498,412,82.73,342,250,73.1,103,46,101,30,"(45.4643, 9.1895)",2022-04-11,Milan,81,80,81,37000000,"['Mike Maignan', 'Davide Calabria', 'Fikayo To...",1,2022-04-11,Genoa,72,72,72,8500000,"['Salvatore Sirigu', 'Silvan Hefti', 'Nikola M...",1,1,2,"(44.416431,8.952428)",119,0,0,9,10
18157344,301,18441,77453967,249532,135,15484,598,6789,6789,22445,4-2-3-1,4-2-3-1,2,3,1-1,2-3,2022-04-15 21:00:00,90,,523953,2158189,3,6,12253,15840,17292,Rennes,REN,@staderennais,17,1901,135,Monaco,AMO,@AS_Monaco,75285,1919,4451,17,Ligue 1,0,2021/2022,301,1,32,301,18441,77453967,2022-04-15,2022-04-17,Roazhon Park,1,Rennes,29778,"(48.107458,-1.712839)",Stéphanie Frappart,523953,598,17,Bruno Génésio,France,01/09/1966,France,Lyon,2158189,6789,556,Philippe Clement,Belgium,22/03/1974,Belgium,Antwerpen,clear,clear sky,8.05,310.0,0,82,#C40010,#F0F0F0,1026.0,13.6,598,18157344,14,4,10,3,7,6,6,6,0,64,1,0,0,1,4,,9,,,103,2,1,1.0,9,6789,18157344,12,4,8,1,9,3,15,0,1,36,3,0,0,2,3,,11,,,99,3,0,0.0,13,668,569,85.18,383,293,76.5,125,58,74,33,"(48.1667, -1.6667)",2022-04-11,Rennes,78,76,76,14000000,"['Alfred Gomis', 'Hamari Traoré', 'Warmed Omar...",1,2022-04-11,Monaco,83,78,76,26000000,"['Alexander NübelL', 'Axel Disasi', 'Guillermo...",1,2,-1,"(43.727606,7.415614)",858,0,0,9,8
18165743,564,18462,77454016,250047,133,13963,594,485,0,25285,4-3-1-2,4-2-3-1,0,0,0-0,0-0,2022-04-15 21:00:00,90,,530755,523898,6,5,12257,13964,18344,Real Sociedad,SOC,@RealSociedad,32,1909,133,Real Betis,BET,@RealBetis,32,1907,68,32,La Liga,0,2021/2022,564,1,32,564,18462,77454016,2022-04-15,2022-04-18,Reale Arena,1,San Sebastián,32076,"(43.301376,-1.973602)",Isidro Díaz de Mera Escuderos,530755,594,32,Imanol Alguacil Barrenetxea,Spain,04/07/1971,Spain,Orio,523898,485,80,Manuel Luis Pellegrini Ripamonti,Chile,16/09/1953,Chile,Santiago de Chile,clouds,overcast clouds,3.0,207.0,100,85,#2B72DE,#EA9C08,1026.0,13.0,594,18165743,10,3,7,1,7,2,17,8,7,57,3,1,1,2,6,,6,,,77,0,0,,16,485,18165743,9,2,7,1,3,6,7,1,3,43,4,0,0,2,5,,7,,,94,0,0,,26,477,408,85.53,377,305,80.9,108,78,95,37,"(43.3128, -1.975)",2022-04-11,Real Sociedad,82,81,78,15500000,"['Congo', 'Gorosabel', 'Aritz Elustondo', 'Rob...",1,2022-04-11,Real Betis,79,80,78,16000000,"['Rui Silva', 'Héctor Bellerín MorunoL', 'Germ...",1,0,0,"(37.356483,-5.981768)",743,0,0,8,9


# **MERGE TABLES**

### **Function To Merge:** Full Table, Home Table, and Away Table

In [23]:
def merge_tables(df, tables_dir='/Users/enricocattaneo/Desktop/CREATE LEAGUES TABLES'):
    """Join overall, home-only and away-only league-table rows onto each fixture.

    Every fixture is matched on (season, team name, ``round_name - 1``), i.e.
    against the table rows whose round is one less than the fixture's round
    (presumably the standings going into the match -- TODO confirm). Only
    rows with ``Type == 'FT'`` are used from each table file.

    Parameters
    ----------
    df : pandas.DataFrame
        Fixtures indexed by ``id`` with at least the columns ``season_id``,
        ``home_name``, ``away_name`` and a numeric ``round_name``.
        The frame is NOT modified.
    tables_dir : str, optional
        Directory containing ``tables_ready.csv``, ``tables_ready_HOME.csv``
        and ``tables_ready_AWAY.csv``. Defaults to the original location.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``id`` holding the input columns, a ``ROUND``
        column and the joined table columns, suffixed as follows:
        ``_T_FT``   overall table, home team;
        ``_T_FT_A`` overall table, away team;
        ``_FT_OHO`` home-only table, home team;
        ``_FT_OAW`` away-only table, away team.
    """
    def _load_ft(file_name, suffix):
        # Read one table file, keep its 'FT' rows and tag every column
        table = pd.read_csv(f'{tables_dir}/{file_name}')
        return table[table['Type'] == 'FT'].add_suffix(suffix)

    def _keys(suffix):
        # Join keys on the table side, after suffixing
        return ['season_id' + suffix, 'Team' + suffix, 'round_name' + suffix]

    total_ft = _load_ft('tables_ready.csv', '_T_FT')
    home_ft = _load_ft('tables_ready_HOME.csv', '_FT_OHO')
    away_ft = _load_ft('tables_ready_AWAY.csv', '_FT_OAW')

    home_keys = ['season_id', 'home_name', 'ROUND']
    away_keys = ['season_id', 'away_name', 'ROUND']

    # assign() works on a copy, so the caller's frame does not gain a ROUND column
    fixtures = df.assign(ROUND=df['round_name'] - 1).reset_index()

    # Overall table: the home team's columns keep their names, the away team's
    # (second join of the same table) receive the extra '_A' suffix.
    merged = pd.merge(fixtures, total_ft, how='left', left_on=home_keys,
                      right_on=_keys('_T_FT'), suffixes=(None, '_H'))
    merged = pd.merge(merged, total_ft, how='left', left_on=away_keys,
                      right_on=_keys('_T_FT'), suffixes=(None, '_A'))
    # Home-only table for the home team, away-only table for the away team
    merged = pd.merge(merged, home_ft, how='left', left_on=home_keys,
                      right_on=_keys('_FT_OHO'))
    merged = pd.merge(merged, away_ft, how='left', left_on=away_keys,
                      right_on=_keys('_FT_OAW'))
    return merged.set_index('id')

In [24]:
def fill_byregexfilter(df, filter_list, filler):
    """Replace missing values with ``filler`` in each column named in ``filter_list``.

    ``df`` is updated in place and also returned, so the call can be used
    either for its side effect or in an assignment.
    """
    for column_name in filter_list:
        filled_column = df[column_name].fillna(filler)
        df[column_name] = filled_column
    return df

### **Create New Variables** From the Merged League-Table Columns

In [25]:
# Join the overall, home-only and away-only league tables onto the fixtures
final_16 = merge_tables(final_16)

In [26]:
# Overall table, home team: divide each table total by the matches played
home_played_total = final_16['Played_T_FT']
home_total_sources = {
    'FT_W%_home': 'Won_T_FT',
    'FT_D%_home': 'Drawn_T_FT',
    'FT_L%_home': 'Lost_T_FT',
    'FT_GFxGame_home': 'GF_T_FT',
    'FT_GAxGame_home': 'GA_T_FT',
    'FT_GDxGame_home': 'GD_T_FT',
    'FT_PointsxGame_home': 'Points_T_FT',
}
for feature_name, total_col in home_total_sources.items():
    final_16[feature_name] = final_16[total_col] / home_played_total
# The rank is carried over unchanged
final_16['FT_Rank_home'] = final_16['rank_T_FT']

In [27]:
# Overall table, away team: divide each table total by the matches played
away_played_total = final_16['Played_T_FT_A']
away_total_sources = {
    'FT_W%_away': 'Won_T_FT_A',
    'FT_D%_away': 'Drawn_T_FT_A',
    'FT_L%_away': 'Lost_T_FT_A',
    'FT_GFxGame_away': 'GF_T_FT_A',
    'FT_GAxGame_away': 'GA_T_FT_A',
    'FT_GDxGame_away': 'GD_T_FT_A',
    'FT_PointsxGame_away': 'Points_T_FT_A',
}
for feature_name, total_col in away_total_sources.items():
    final_16[feature_name] = final_16[total_col] / away_played_total
# The rank is carried over unchanged
final_16['FT_Rank_away'] = final_16['rank_T_FT_A']

In [28]:
# Home-only table, home team: divide each table total by the matches played
only_home_played = final_16['Played_FT_OHO']
only_home_sources = {
    'W%_onlyHome': 'Won_FT_OHO',
    'D%_onlyHome': 'Drawn_FT_OHO',
    'L%_onlyHome': 'Lost_FT_OHO',
    'GFxGame_onlyHome': 'GF_FT_OHO',
    'GAxGame_onlyHome': 'GA_FT_OHO',
    'GDxGame_onlyHome': 'GD_FT_OHO',
    'PointsxGame_onlyHome': 'Points_FT_OHO',
}
for feature_name, total_col in only_home_sources.items():
    final_16[feature_name] = final_16[total_col] / only_home_played
# The rank is carried over unchanged
final_16['Rank_onlyHome'] = final_16['rank_FT_OHO']

In [29]:
# Away-only table, away team: divide each table total by the matches played
only_away_played = final_16['Played_FT_OAW']
only_away_sources = {
    'W%_onlyAway': 'Won_FT_OAW',
    'D%_onlyAway': 'Drawn_FT_OAW',
    'L%_onlyAway': 'Lost_FT_OAW',
    'GFxGame_onlyAway': 'GF_FT_OAW',
    'GAxGame_onlyAway': 'GA_FT_OAW',
    'GDxGame_onlyAway': 'GD_FT_OAW',
    'PointsxGame_onlyAway': 'Points_FT_OAW',
}
for feature_name, total_col in only_away_sources.items():
    final_16[feature_name] = final_16[total_col] / only_away_played
# The rank is carried over unchanged
final_16['Rank_onlyAway'] = final_16['rank_FT_OAW']

### **Tables Cleaning**

In [30]:
# Drop the raw league-table columns now that the per-game features exist
regex_filters = ['^Team_','^Played_','^season_id_','^round_name_','^Won_','^Drawn_','^Lost_','^Points_','^rank_','^Type_']
raw_table_columns = final_16.filter(regex='|'.join(regex_filters)).columns
final_16.drop(columns=raw_table_columns, inplace=True)
raw_goal_columns = [
    'GF_T_FT', 'GA_T_FT', 'GD_T_FT',
    'GF_T_FT_A', 'GA_T_FT_A', 'GD_T_FT_A',
    'GF_FT_OHO', 'GA_FT_OHO', 'GD_FT_OHO',
    'GF_FT_OAW', 'GA_FT_OAW', 'GD_FT_OAW',
]
final_16.drop(columns=raw_goal_columns, inplace=True)

# Store the ranks as nullable integers
for rank_pattern in ('^FT_Rank_', '^Rank_'):
    rank_columns = final_16.filter(regex=rank_pattern).columns
    final_16[rank_columns] = final_16[rank_columns].astype('Int64')

# Missing per-game features become 0, missing ranks become 20
regs = ['W%_','D%_','L%_','GFxGame_','GAxGame_','GDxGame_','PointsxGame_']
for feature_pattern in regs:
    feature_columns = final_16.filter(regex=feature_pattern).columns
    final_16 = fill_byregexfilter(df = final_16, filter_list = feature_columns, filler = 0)
all_rank_columns = final_16.filter(regex='Rank_').columns
final_16 = fill_byregexfilter(df = final_16, filter_list = all_rank_columns, filler = 20)

In [31]:
# Sanity check: print every column from 'ROUND' onwards that still contains NaNs
engineered_na = final_16.loc[:, 'ROUND':].isna().sum()
for col_name, n_missing in engineered_na[engineered_na != 0].items():
    print(col_name, n_missing)

In [32]:
# Show the last 4 rows of every column from 'ROUND' onwards (the table-derived features)
final_16.loc[:, 'ROUND':].tail(4)

Unnamed: 0_level_0,ROUND,FT_W%_home,FT_D%_home,FT_L%_home,FT_GFxGame_home,FT_GAxGame_home,FT_GDxGame_home,FT_PointsxGame_home,FT_Rank_home,FT_W%_away,FT_D%_away,FT_L%_away,FT_GFxGame_away,FT_GAxGame_away,FT_GDxGame_away,FT_PointsxGame_away,FT_Rank_away,W%_onlyHome,D%_onlyHome,L%_onlyHome,GFxGame_onlyHome,GAxGame_onlyHome,GDxGame_onlyHome,PointsxGame_onlyHome,Rank_onlyHome,W%_onlyAway,D%_onlyAway,L%_onlyAway,GFxGame_onlyAway,GAxGame_onlyAway,GDxGame_onlyAway,PointsxGame_onlyAway,Rank_onlyAway
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
18220188,32,0.28125,0.1875,0.53125,1.0,1.6875,-0.6875,1.03125,15,0.612903,0.290323,0.096774,2.096774,0.774194,1.322581,2.129032,2,0.333333,0.2,0.466667,1.066667,1.133333,-0.066667,1.2,14,0.533333,0.4,0.066667,1.8,0.733333,1.066667,2.0,5
18220185,32,0.625,0.25,0.125,1.75,0.90625,0.84375,2.125,1,0.0625,0.5,0.4375,0.75,1.625,-0.875,0.6875,18,0.5625,0.25,0.1875,1.4375,0.75,0.6875,1.9375,5,0.0625,0.5,0.4375,0.8125,1.8125,-1.0,0.6875,19
18157344,31,0.548387,0.16129,0.290323,2.16129,1.0,1.16129,1.806452,3,0.451613,0.258065,0.290323,1.516129,1.032258,0.483871,1.612903,6,0.733333,0.066667,0.2,2.6,0.666667,1.933333,2.266667,2,0.333333,0.266667,0.4,1.066667,1.2,-0.133333,1.266667,8
18165743,31,0.483871,0.290323,0.225806,1.032258,0.967742,0.064516,1.741935,6,0.548387,0.16129,0.290323,1.806452,1.193548,0.612903,1.806452,5,0.6,0.266667,0.133333,0.8,0.4,0.4,2.066667,7,0.533333,0.2,0.266667,1.666667,1.2,0.466667,1.8,3


# **WEATHER**

##### **Function**

In [33]:
def clean_sunset(df):
    """Parse ``df['sunset']`` into datetimes stored in a new ``new_sunset`` column.

    Each value is parsed with ``dateutil.parser.isoparse``. Entries that are
    not valid ISO-8601 strings (malformed text, NaN/None or other non-string
    values) become ``pd.NaT`` instead of aborting the whole run.

    ``df`` is modified in place and also returned.

    NOTE(review): results are looked up by index label, so rows sharing an
    index label all receive the value parsed for the last such row; this
    presumably never happens because the index is the fixture id -- verify.
    """
    parsed_by_id = {}
    for row_id, raw_sunset in zip(df.index, df['sunset']):
        try:
            parsed_by_id[row_id] = dateutil.parser.isoparse(raw_sunset)
        except (TypeError, ValueError):
            # TypeError: non-string input such as NaN; ValueError: bad format.
            # Anything else (e.g. KeyboardInterrupt, NameError) must surface.
            parsed_by_id[row_id] = pd.NaT
    df['new_sunset'] = df.index.map(parsed_by_id)
    return df

### **Cleaning Weather**

In [34]:
# Import Data
# NOTE(review): absolute local path; consider moving it to a configurable data directory
weat_data = pd.read_csv('/Users/enricocattaneo/Desktop/Weather Output/weather_out_complete.csv').set_index('id')

# Create new columns from Time variable (parsed once, reused for all three)
weat_data['time'] = weat_data['time'].apply(datetime.fromisoformat)
match_time = pd.to_datetime(weat_data['time'])
weat_data['year'] = match_time.dt.year
weat_data['month'] = match_time.dt.month
weat_data['hour'] = match_time.dt.time

# Fill NAs of columns in the list with the group mean (rounded to 1 decimal),
# trying progressively different groupings; each pass sees the previous fills.
# fillna(<transform result>) only touches missing cells, so it behaves the same
# across pandas versions and never blanks rows whose grouping key is itself NaN
# (groupby().apply() + assignment did both).
fill_cols = ['temp', 'precip', 'cloudcover', 'humidity', 'pressure', 'winddir', 'windgust', 'windspeed']
mean_fill_groupings = [['venue_city', 'month'], ['home_country_id', 'month'], ['venue_city', 'year']]
for col in fill_cols:
    for group_keys in mean_fill_groupings:
        group_mean = weat_data.groupby(group_keys)[col].transform('mean').round(1)
        weat_data[col] = weat_data[col].fillna(group_mean)

# Fill NAs for sunset variable: take the next available sunset within the
# same group, widening the group on each pass
weat_data = clean_sunset(weat_data)
sunset_fill_groupings = [['venue_city', 'month'], ['home_country_id', 'month'], ['month']]
for group_keys in sunset_fill_groupings:
    next_sunset_in_group = weat_data.groupby(group_keys)['new_sunset'].bfill()
    weat_data['new_sunset'] = weat_data['new_sunset'].fillna(next_sunset_in_group)

# Create new variable night_game from match time and sunset time:
# 1 when the match time-of-day is at or after (sunset - 30 minutes), else 0.
# The '+hh:mm' UTC offset is stripped so the sunset is handled as a naive timestamp.
# NOTE(review): only times of day are compared; confirm 'time' and 'sunset'
# are expressed in the same timezone.
weat_data['new_sunset'] = weat_data['new_sunset'].astype(str)
weat_data['new_sunset'] = weat_data['new_sunset'].str.replace(r'\+.*$', '', regex=True)
sunset_cutoff = pd.to_datetime(weat_data['new_sunset']) - timedelta(minutes=30)
weat_data['sun_hour'] = sunset_cutoff.dt.time
weat_data['night_game'] = np.where(weat_data['hour'] >= weat_data['sun_hour'], 1, 0)

# Filter Columns
weat_data = weat_data[fill_cols + ['night_game']]

In [35]:
# Attach the cleaned weather features to each fixture by matching index values
final_16 = final_16.merge(weat_data, how='left', left_index=True, right_index=True)
# Preview the columns from 'temp' onwards
final_16.loc[:, 'temp':].head(3)

Unnamed: 0_level_0,temp,precip,cloudcover,humidity,pressure,winddir,windgust,windspeed,night_game
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2188,16.6,0.0,88.0,83.3,1015.0,156.0,10.9,9.5,0
2197,16.7,0.0,58.8,71.61,1021.9,283.0,33.0,18.3,0
2208,22.2,0.0,38.5,62.28,1022.3,243.0,28.0,15.4,0


# **REMAINING MISSING VALUES** (to handle later)

In [36]:
# Columns that still contain at least one missing value
final_16.columns[final_16.isna().any()]

Index(['round_id', 'time_injury_time', 'standings_home_position',
       'standings_away_position', 'assistants_first_assistant_id',
       'assistants_second_assistant_id', 'assistants_fourth_official_id',
       'home_short_code', 'away_short_code', 'round_league_id',
       'round_season_id', 'round_stage_id', 'homecoach_team_id',
       'homecoach_birthplace', 'Awaycoach_team_id', 'Awaycoach_birthplace',
       'weather_report_code', 'weather_report_type', 'weather_windspeed(m/s)',
       'weather_report_wind_degree', 'weather_clouds(%)',
       'weather_humidity(%)', 'weather_report_pressure',
       'weather_report_temperature_celcius_temp', 'home_substitutions',
       'home_goal_kick', 'home_goal_attempts', 'home_free_kick',
       'home_throw_in', 'home_ball_safe', 'home_goals', 'home_penalties',
       'home_injuries', 'away_substitutions', 'away_goal_kick',
       'away_goal_attempts', 'away_free_kick', 'away_throw_in',
       'away_ball_safe', 'away_goals', 'away_penalties'

In [37]:
# Columns whose missing values are deliberately ignored
b = {'assistants_second_assistant_id','round_league_id','homecoach_birthplace','round_id','home_short_code','Awaycoach_birthplace','assistants_first_assistant_id','round_season_id','away_short_code','assistants_fourth_official_id','round_stage_id','standings_away_position','standings_home_position','weather_humidity(%)','weather_report_type','weather_clouds(%)','weather_windspeed(m/s)','weather_report_pressure','weather_report_temperature_celcius_temp','weather_report_code','weather_report_wind_degree','homecoach_team_id','Awaycoach_team_id','home_throw_in','away_throw_in', 'time_injury_time','home_goal_kick','away_goal_kick','home_free_kick','away_free_kick','home_substitutions','away_substitutions', 'home_goals', 'away_goals'}

# Count the missing values of every column once, then bucket the columns
missing_per_column = final_16.isna().sum()
no_NA_columns = set(missing_per_column.index[missing_per_column == 0])
# Cut-off is 7000 missing rows (the name hints at a percentage threshold -- TODO confirm)
less5_NA_columns = set(missing_per_column.index[missing_per_column < 7000])
# Columns under the cut-off that still have gaps and are not in the ignore set
print(less5_NA_columns - no_NA_columns - b)
print(len(less5_NA_columns))

{'home_penalties', 'home_goal_attempts', 'home_injuries', 'home_attacks_attacks', 'away_penalties', 'away_injuries', 'away_attacks_dangerous_attacks', 'away_attacks_attacks', 'away_ball_safe', 'away_goal_attempts', 'home_ball_safe', 'home_attacks_dangerous_attacks'}
204


In [38]:
# Report the missing-value count of every column outside the ignore set `b`
remaining_missing = final_16.isna().sum()
for col in (set(final_16.columns)-b):
    n_missing = remaining_missing[col]
    if n_missing != 0:
        print(col, ' ==> ', n_missing)

home_penalties  ==>  5412
home_ball_safe  ==>  2765
away_penalties  ==>  5412
away_attacks_attacks  ==>  1942
home_injuries  ==>  4305
away_injuries  ==>  4305
away_ball_safe  ==>  2765
home_attacks_dangerous_attacks  ==>  1946
away_attacks_dangerous_attacks  ==>  1942
home_attacks_attacks  ==>  1946
home_goal_attempts  ==>  2825
away_goal_attempts  ==>  2825


In [39]:
#final_16['home_goal_attempts'].unique()
#final_16[final_16['home_goal_attempts'].notna()].head(10)

In [40]:
# Save the prepared dataset (index included) to the Model Performances folder
final_16.to_csv('../Model Performances/almost_finaldata.csv')