# **Data Transformation and Preparation**

##### Initial Setup 

Imports

In [1]:
# Packages
import ast
import re
import warnings
from datetime import datetime, timedelta
from itertools import product

import dateutil.parser
import numpy as np
import pandas as pd
from geopy import distance
from tqdm.notebook import tqdm
# Pandas options
# Display every column and every row of a DataFrame (no truncation of the rendered output)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.options.mode.chained_assignment = None  # default='warn'

Static Import & Data Types

In [2]:
# Load the cleaned matches and use the fixtures' id as index
static = pd.read_csv('../../Data/From_Preparation/match_cleaned.csv', low_memory=False).set_index('id')

# Columns that have to be stored as floats
float_columns = [
    'weather_report_pressure',
    'weather_report_temp_celsius',
    'weather_report_windspeed(m/s)',
    'home_passes_percentage',
    'away_passes_percentage',
]
static[float_columns] = static[float_columns].astype('float64')

# Every remaining numeric column is rounded and stored as nullable integer
numeric_frame = static.select_dtypes(np.number)
m = numeric_frame.loc[:, ~numeric_frame.columns.isin(float_columns)]
static[m.columns] = m.round().astype('Int64')

# Kick-off timestamp: let pandas infer the format
static['time_starting_at_date_time'] = pd.to_datetime(
    static['time_starting_at_date_time'], infer_datetime_format=True
)
# Plain date columns: all parsed with the same explicit format
date_dates = ['time_starting_at_date', 'round_start', 'round_end', 'homecoach_birthdate', 'awaycoach_birthdate']
static[date_dates] = static[date_dates].apply(pd.to_datetime, format='%Y-%m-%d')

# Order the matches by kick-off time
static = static.sort_values('time_starting_at_date_time')
static.head()

Unnamed: 0_level_0,league_id,season_id,stage_id,venue_id,referee_id,home_id,away_id,winner_team_id,commentaries,attendance,formations_home_formation,formations_away_formation,scores_home_score,scores_away_score,scores_ht_score,scores_ft_score,scores_et_score,scores_ps_score,time_starting_at_date_time,time_starting_at_date,time_starting_at_timezone,time_minute,standings_home_position,standings_away_position,home_name,home_twitter,home_country_id,home_founded,home_venue_id,away_name,away_twitter,away_country_id,away_founded,away_venue_id,league_type,league_country_id,league_name,league_is_cup,season_name,round_name,round_start,round_end,venue_name,venue_surface_isgrass,venue_city,venue_capacity,venue_coordinates,referee_fullname,homecoach_coach_id,homecoach_country_id,homecoach_fullname,homecoach_nationality,homecoach_birthdate,homecoach_birthcountry,awaycoach_coach_id,awaycoach_country_id,awaycoach_fullname,awaycoach_nationality,awaycoach_birthdate,awaycoach_birthcountry,weather_report_code,weather_report_type,weather_report_windspeed(m/s),weather_report_wind_degree,weather_report_clouds(%),weather_report_humidity(%),colors_home_color,colors_away_color,weather_report_pressure,home_shots_total,home_shots_ongoal,home_shots_offgoal,home_shots_blocked,home_shots_insidebox,home_shots_outsidebox,home_fouls,home_corners,home_offsides,home_possessiontime,home_yellowcards,home_redcards,home_yellowredcards,home_saves,home_tackles,away_shots_total,away_shots_ongoal,away_shots_offgoal,away_shots_blocked,away_shots_insidebox,away_shots_outsidebox,away_fouls,away_corners,away_offsides,away_possessiontime,away_yellowcards,away_redcards,away_yellowredcards,away_saves,away_tackles,home_passes_total,home_passes_accurate,home_passes_percentage,away_passes_total,away_passes_accurate,away_passes_percentage,home_attacks_attacks,home_attacks_dangerous_attacks,away_attacks_attacks,away_attacks_dangerous_attacks,weather_lat_lon,weather_report_temp_celsius,result,goal_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1
1062005,2,5321,10023,5732,,8643,579,579,0,,,,1,2,1-1,1-2,,,2015-07-01 20:00:00,2015-07-01,Europe/Rome,90,,,B36,,2154,1936,5605,The New Saints,,515,1959,114,cup_international,41,Champions League,1,2015/2016,,NaT,NaT,Tórsvøllur,1.0,Tórshavn,81044.0,"(40.453068,-3.688354)",,466794,2154,Eyðun Klakstein,Faroe Islands,1972-11-28,Faroe Islands,455867,462,Craig Harrison,England,1977-11-10,England,,,,,,,#F0F0F0,#2B72DE,,,,,,,,,0,0,,0,0,0,,,,,,,,,,0,0,,0,0,0,,,,,,,,,,,,,"(nan, nan)",,2,-1
1067032,5,5337,10210,7361,,7345,589,0,0,,,,0,0,0-0,0-0,,,2015-07-02 14:00:00,2015-07-02,Europe/Rome,90,,,Ordabasy,,2427,1998,4763,Beitar Jerusalem,,802,1936,35,cup_international,41,Europa League,1,2015/2016,,NaT,NaT,Stadion im. Qajimuqan Mungaytpasuli,1.0,Astana,12343.0,"(51.16159,71.39925)",,1552713,227,Viktor Kumykov,Russia,1963-05-12,Russia,3188090,296,Slobodan Drapić,Serbia,1965-02-28,Serbia,,,,,,,#F0F0F0,#FBED32,,,,,,,,,0,0,,0,0,0,,,,,,,,,,0,0,,0,0,0,,,,,,,,,,,,,"(nan, nan)",,0,0
1067052,5,5337,10210,3653,,5500,6213,5500,0,,,,2,0,1-0,2-0,,,2015-07-02 15:30:00,2015-07-02,Europe/Rome,90,,,Shirak,,919,1958,3632,Zrinjski,,507,1905,4032,cup_international,41,Europa League,1,2015/2016,,NaT,NaT,Gyumri,,Gyumri,,"(40.80272,43.84833)",,462696,919,Vardan Bichakhchyan,Armenia,1977-10-09,Armenia,1477271,296,Vinko Marinović,Serbia,1971-03-03,Austria,,,,,,,,#002B87,,,,,,,,,0,0,,0,0,0,,,,,,,,,,0,0,,0,0,0,,,,,,,,,,,,,"(nan, nan)",,1,2
1067071,5,5337,10210,2536,,6497,890,890,0,,,,0,1,0-0,0-1,,,2015-07-02 17:00:00,2015-07-02,Europe/Rome,90,,,Pakruojis,,2079,0,338658,Jagiellonia Białystok,,2,1927,316,cup_international,41,Europa League,1,2015/2016,,NaT,NaT,Šiaulių m. savivaldybes stadionas,1.0,Šiauliai,4000.0,"(55.936667,23.295556)",,18922742,86,Mykola Trubachov,Ukraine,1957-05-20,Ukraine,456327,2,Michal Probierz,Poland,1972-09-24,Poland,,,,,,,,,,,,,,,,,0,0,,0,0,0,,,,,,,,,,0,0,,0,0,0,,,,,,,,,,,,,"(nan, nan)",,2,-1
1067093,5,5337,10210,3627,,5495,734,5495,0,,,,1,0,0-0,1-0,,,2015-07-02 17:00:00,2015-07-02,Europe/Rome,90,,,Alashkert,,919,1990,260195,St. Johnstone,,1161,1885,219,cup_international,41,Europa League,1,2015/2016,,NaT,NaT,Vazgen Sargsyan anvan Hanrapetakan Marzadasht,1.0,Yerevan,53849.0,"(40.180403,44.494967)",,462666,919,Abraham Khashmanyan,Armenia,1967-11-11,Armenia,896466,491,Tommy Wright,Northern Ireland,1963-08-29,Northern Ireland,,,,,,,#F0F0F0,#0046A8,,,,,,,,,0,0,,0,0,0,,,,,,,,,,0,0,,0,0,0,,,,,,,,,,,,,"(nan, nan)",,1,1


## *Standings Data*

Prepare static dataframe

In [3]:
# Bring the fixture id back as a regular column.
# Guarded so that running the cell a second time does not add a spurious 'index' column.
if 'id' not in static.columns:
    static = static.reset_index()

# Standings attributes are only of interest for leagues' matches; every other match goes to static_cups.
# Explicit copies: the frames are modified afterwards and must not be slices of `static`.
is_league_match = static['league_is_cup'] == 0
static_leagues = static.loc[is_league_match, :].copy()
static_cups = static.loc[~is_league_match, :].copy()

# Consider for each match the standings before the game started (i.e. after the previous round)
static_leagues['ROUND'] = static_leagues['round_name'] - 1

Import tables

In [4]:
# Read the three standings outputs: full table, home-only table, away-only table
table_total = pd.read_csv('../../Data/From_Collection/Standings_output/tables_FULL.csv')
table_home = pd.read_csv('../../Data/From_Collection/Standings_output/tables_HOME.csv')
table_away = pd.read_csv('../../Data/From_Collection/Standings_output/tables_AWAY.csv')

# Values of the 'Type' column, in the order used by the unpacking below:
# Full Time, 1st Half, 2nd Half
RESULT_TYPES = ('FT', '1H', '2H')

# Total standings split by result type
table_FT_TO, table_1H_TO, table_2H_TO = [
    table_total[table_total['Type'] == result_type] for result_type in RESULT_TYPES
]
# Home-only standings split by result type
table_FT_OH, table_1H_OH, table_2H_OH = [
    table_home[table_home['Type'] == result_type] for result_type in RESULT_TYPES
]
# Away-only standings split by result type
table_FT_OA, table_1H_OA, table_2H_OA = [
    table_away[table_away['Type'] == result_type] for result_type in RESULT_TYPES
]

Merge static_leagues with multiple tables & create new features

In [5]:
# Dictionary connecting suffixes with tables.
# Suffix layout: _<result type>_<table: TO total / OH only-home / OA only-away>_<team side: H home / A away>
suffixes_tables = {
    '_FT_TO_H': table_FT_TO, '_1H_TO_H': table_1H_TO, '_2H_TO_H': table_2H_TO,
    '_FT_TO_A': table_FT_TO, '_1H_TO_A': table_1H_TO, '_2H_TO_A': table_2H_TO,
    '_FT_OH_H': table_FT_OH, '_1H_OH_H': table_1H_OH, '_2H_OH_H': table_2H_OH,
    '_FT_OA_A': table_FT_OA, '_1H_OA_A': table_1H_OA, '_2H_OA_A': table_2H_OA,
}
# Dictionary with new (per-game) columns' names as keys and old (absolute) columns' names as values
new_cols_names = {'W%': 'Won', 'D%': 'Drawn', 'L%': 'Lost', 'GFxGame': 'GF', 'GAxGame': 'GA', 'GDxGame': 'GD', 'PointsxGame': 'Points'}
# Filled below: absolute feature name -> per-game feature name (both suffixed), for later use
features_names_links = {}

# Loop over suffixes_tables items
for suffix, table in suffixes_tables.items():
    # A table is joined either on the home team or on the away team, depending on the suffix
    if suffix.endswith('_H'):
        team_key = 'home_name'
    elif suffix.endswith('_A'):
        team_key = 'away_name'
    else:
        raise ValueError('Suffix neither for home or away!')
    left_keys_merge = ['season_id', team_key, 'ROUND']

    # Cast the round to int on a derived frame: the source tables are shared between
    # several suffixes and must not be modified while looping. Then suffix every column.
    table = table.assign(round_name=table['round_name'].astype(int)).add_suffix(suffix)

    # Merge static data with table (left join: matches without a standings row keep NAs)
    static_leagues = static_leagues.merge(
        right=table, how='left', left_on=left_keys_merge,
        right_on=['season_id' + suffix, 'Team' + suffix, 'round_name' + suffix])

    # Create the per-game columns for this table: absolute value / games played
    for new_name, old_name in new_cols_names.items():
        static_leagues[new_name + suffix] = static_leagues[old_name + suffix] / static_leagues['Played' + suffix]
        # Create dictionary linking old and newly generated features for later use
        features_names_links[old_name + suffix] = new_name + suffix



Fill empty rounds data using previous observations

In [6]:
# Use the fixture id as index (guarded so that re-running the cell is harmless)
if static_leagues.index.name != 'id':
    static_leagues = static_leagues.set_index('id')

# Standings features: 'ROUND' and every column located after it
# (replaces the hard-coded "last 241 columns" selection).
fill_table_cols = static_leagues.loc[:, 'ROUND':].columns.tolist()

# Work on a chronologically ordered view so that a forward fill means
# "take the value from the team's most recent previous match".
# The stable sort keeps the current row order among matches played on the same date.
chronological = static_leagues.sort_values('time_starting_at_date', kind='mergesort')

# Columns ending in _H describe the home team, columns ending in _A the away team.
# Columns with neither ending (only 'ROUND') have no team to look up and are left untouched.
for side_suffix, team in (('_H', 'home_name'), ('_A', 'away_name')):
    side_cols = [col for col in fill_table_cols if col.endswith(side_suffix)]
    if not side_cols:
        continue
    # Last known value of every feature within the matches the team played in the same role
    carried = chronological.groupby(team)[side_cols].ffill()
    # Only cells that are missing get a value (aligned on the fixture id);
    # cells without any earlier observation stay NA.
    # NOTE(review): the previous scalar loop only looked at strictly earlier dates; the two
    # agree as long as a team does not play twice in the same role on one date.
    static_leagues[side_cols] = static_leagues[side_cols].fillna(carried)

  0%|          | 0/241 [00:00<?, ?it/s]

In [7]:
# Keep every league match except those of the 2015/2016 season
# NOTE(review): presumably dropped because it has no earlier matches to fill standings from - confirm
keep_season = static_leagues['season_name'] != '2015/2016'
static_leagues = static_leagues[keep_season]

Fill empty rounds data using quantiles (for newly promoted teams)

In [8]:
def fillna_random_quantiles_group(df, column, quantile_range_lower, quantile_range_upper,
                                  n_quantiles=20, random_state=None):
    """Fill the NAs of a dataframe's column with random values taken from a quantile range.

    `n_quantiles` evenly spaced quantiles between the lower and the upper bound are computed
    on the observed values of the column; every NA is then replaced by one of the distinct
    quantile values, chosen at random with equal probability.

    df: dataframe (modified in place and also returned)
    column: column with NAs to fill
    quantile_range_lower: lower bound for the quantile range (between 0 and 1)
    quantile_range_upper: upper bound for the quantile range (between 0 and 1)
    n_quantiles: number of quantiles computed inside the range (default 20)
    random_state: optional seed for a reproducible draw; when None the global
        numpy random state is used (so `np.random.seed` still controls the result)

    Returns the dataframe. It is returned untouched when the column has no NAs or
    has no observed value to compute quantiles from.
    """
    mask = df[column].isnull()
    if not mask.any():
        return df  # nothing to fill

    # Distinct quantile values inside the requested range (computed in one call)
    quantile_levels = np.linspace(quantile_range_lower, quantile_range_upper, n_quantiles)
    candidates = df[column].quantile(q=quantile_levels).dropna().unique()
    if len(candidates) == 0:
        return df  # column is entirely NA: there is no value to sample from

    # Fill NAs with a randomly chosen quantile value
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    df.loc[mask, column] = rng.choice(candidates, size=int(mask.sum()))
    return df

In [9]:
# The fill below draws random values: seed numpy's global generator so that
# Restart & Run All reproduces the same imputed data.
QUANTILE_FILL_SEED = 42
np.random.seed(QUANTILE_FILL_SEED)

# Dictionary of columns' filters and quantile range indications (giving always 20% worst values to newly promoted teams):
# 'l' -> values between the 0% and 20% quantiles, 'h' -> values between the 80% and 100% quantiles.
# Every filter is anchored at the start of the column name so that it cannot match unrelated columns.
reg_fill_by_quantile = {'^W%_': 'l', '^D%_': 'l', '^L%_': 'h', '^GFxGame_': 'l', '^GAxGame_': 'h', '^GDxGame_': 'l', '^PointsxGame_': 'l', '^rank_': 'h'}
for reg, value in reg_fill_by_quantile.items():
    quantile_low, quantile_up = (0, 0.20) if value == 'l' else (0.80, 1)  # translate from value to quantile range
    for col in static_leagues.filter(regex=reg).columns:  # filter columns using regex
        static_leagues = fillna_random_quantiles_group(
            df=static_leagues, column=col,
            quantile_range_lower=quantile_low, quantile_range_upper=quantile_up)

# Fill absolute features using already filled relative features (opposite of the relative features creation) - (Rel. value x Games played)
# NOTE(review): games played is approximated by (round_name - 1) for every table, including
# the home-only / away-only ones - confirm this is intended.
for key, value in features_names_links.items():
    estimated_absolute = (static_leagues[value] * (static_leagues['round_name'] - 1)).round()
    static_leagues[key] = static_leagues[key].fillna(estimated_absolute)

Drop unwanted tables' features & specify data types

In [10]:
# Remove the tables' helper columns: one name filter at a time
for reg in ['^Team_', '^Played_', '^season_id_', '^round_name_', '^Type_']:
    unwanted_cols = static_leagues.filter(regex=reg).columns
    static_leagues = static_leagues.drop(columns=unwanted_cols)
# Store the absolute standings features as nullable integers (Int64), again one name filter at a time
for feat_int in ['^Won_', '^Drawn_', '^Lost_', '^GF_', '^GA_', '^GD_', '^Points_', '^rank_']:
    integer_cols = static_leagues.filter(regex=feat_int).columns
    static_leagues = static_leagues.astype({name: 'Int64' for name in integer_cols})

In [11]:
# NAs test: count the missing values of every column from 'ROUND' onwards
table_features = static_leagues.loc[:, 'ROUND':]
na_per_column = table_features.isna().sum()
columns_with_NA = 0
for i, n_missing in na_per_column.items():
    if n_missing != 0:
        # Report each column that still contains NAs
        print('NAs in', i, 'column: ', n_missing)
        columns_with_NA += 1
print("N. of table's features with NAs: ", columns_with_NA, "\nN. of total table's features: ", \
    table_features.shape[1])

N. of table's features with NAs:  0 
N. of total table's features:  181


In [12]:
# Columns kept for the standings dataset:
# everything up to 'winner_team_id', the match date and season, then everything from 'result' onwards
leading_cols = static_leagues.loc[:, :'winner_team_id'].columns
trailing_cols = static_leagues.loc[:, 'result':].columns
stand_cols = [*leading_cols, 'time_starting_at_date', 'season_name', *trailing_cols]
standings_df = static_leagues[stand_cols]
standings_df.head(3)

Unnamed: 0_level_0,league_id,season_id,stage_id,venue_id,referee_id,home_id,away_id,winner_team_id,time_starting_at_date,season_name,result,goal_diff,ROUND,Won_FT_TO_H,Drawn_FT_TO_H,Lost_FT_TO_H,GF_FT_TO_H,GA_FT_TO_H,GD_FT_TO_H,Points_FT_TO_H,rank_FT_TO_H,W%_FT_TO_H,D%_FT_TO_H,L%_FT_TO_H,GFxGame_FT_TO_H,GAxGame_FT_TO_H,GDxGame_FT_TO_H,PointsxGame_FT_TO_H,Won_1H_TO_H,Drawn_1H_TO_H,Lost_1H_TO_H,GF_1H_TO_H,GA_1H_TO_H,GD_1H_TO_H,Points_1H_TO_H,rank_1H_TO_H,W%_1H_TO_H,D%_1H_TO_H,L%_1H_TO_H,GFxGame_1H_TO_H,GAxGame_1H_TO_H,GDxGame_1H_TO_H,PointsxGame_1H_TO_H,Won_2H_TO_H,Drawn_2H_TO_H,Lost_2H_TO_H,GF_2H_TO_H,GA_2H_TO_H,GD_2H_TO_H,Points_2H_TO_H,rank_2H_TO_H,W%_2H_TO_H,D%_2H_TO_H,L%_2H_TO_H,GFxGame_2H_TO_H,GAxGame_2H_TO_H,GDxGame_2H_TO_H,PointsxGame_2H_TO_H,Won_FT_TO_A,Drawn_FT_TO_A,Lost_FT_TO_A,GF_FT_TO_A,GA_FT_TO_A,GD_FT_TO_A,Points_FT_TO_A,rank_FT_TO_A,W%_FT_TO_A,D%_FT_TO_A,L%_FT_TO_A,GFxGame_FT_TO_A,GAxGame_FT_TO_A,GDxGame_FT_TO_A,PointsxGame_FT_TO_A,Won_1H_TO_A,Drawn_1H_TO_A,Lost_1H_TO_A,GF_1H_TO_A,GA_1H_TO_A,GD_1H_TO_A,Points_1H_TO_A,rank_1H_TO_A,W%_1H_TO_A,D%_1H_TO_A,L%_1H_TO_A,GFxGame_1H_TO_A,GAxGame_1H_TO_A,GDxGame_1H_TO_A,PointsxGame_1H_TO_A,Won_2H_TO_A,Drawn_2H_TO_A,Lost_2H_TO_A,GF_2H_TO_A,GA_2H_TO_A,GD_2H_TO_A,Points_2H_TO_A,rank_2H_TO_A,W%_2H_TO_A,D%_2H_TO_A,L%_2H_TO_A,GFxGame_2H_TO_A,GAxGame_2H_TO_A,GDxGame_2H_TO_A,PointsxGame_2H_TO_A,Won_FT_OH_H,Drawn_FT_OH_H,Lost_FT_OH_H,GF_FT_OH_H,GA_FT_OH_H,GD_FT_OH_H,Points_FT_OH_H,rank_FT_OH_H,W%_FT_OH_H,D%_FT_OH_H,L%_FT_OH_H,GFxGame_FT_OH_H,GAxGame_FT_OH_H,GDxGame_FT_OH_H,PointsxGame_FT_OH_H,Won_1H_OH_H,Drawn_1H_OH_H,Lost_1H_OH_H,GF_1H_OH_H,GA_1H_OH_H,GD_1H_OH_H,Points_1H_OH_H,rank_1H_OH_H,W%_1H_OH_H,D%_1H_OH_H,L%_1H_OH_H,GFxGame_1H_OH_H,GAxGame_1H_OH_H,GDxGame_1H_OH_H,PointsxGame_1H_OH_H,Won_2H_OH_H,Drawn_2H_OH_H,Lost_2H_OH_H,GF_2H_OH_H,GA_2H_OH_H,GD_2H_OH_H,Points_2H_OH_H,rank_2H_OH_H,W%_2H_OH_H,D%_2H_OH_H,L%_2H_OH_H,GFxGame_2H_OH_H,GAxGame_2H_OH_H,GDxGame_2H_OH_H,PointsxGame_2H_OH_H,Won_FT_OA_A,Drawn_FT_OA_A,Lost_FT_O
A_A,GF_FT_OA_A,GA_FT_OA_A,GD_FT_OA_A,Points_FT_OA_A,rank_FT_OA_A,W%_FT_OA_A,D%_FT_OA_A,L%_FT_OA_A,GFxGame_FT_OA_A,GAxGame_FT_OA_A,GDxGame_FT_OA_A,PointsxGame_FT_OA_A,Won_1H_OA_A,Drawn_1H_OA_A,Lost_1H_OA_A,GF_1H_OA_A,GA_1H_OA_A,GD_1H_OA_A,Points_1H_OA_A,rank_1H_OA_A,W%_1H_OA_A,D%_1H_OA_A,L%_1H_OA_A,GFxGame_1H_OA_A,GAxGame_1H_OA_A,GDxGame_1H_OA_A,PointsxGame_1H_OA_A,Won_2H_OA_A,Drawn_2H_OA_A,Lost_2H_OA_A,GF_2H_OA_A,GA_2H_OA_A,GD_2H_OA_A,Points_2H_OA_A,rank_2H_OA_A,W%_2H_OA_A,D%_2H_OA_A,L%_2H_OA_A,GFxGame_2H_OA_A,GAxGame_2H_OA_A,GDxGame_2H_OA_A,PointsxGame_2H_OA_A
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1
251728,301,765,1392,37,15478,430,591,591,2016-08-12,2016/2017,2,-1,0,12,8,16,33,41,-8,44,14,0.333333,0.222222,0.444444,0.916667,1.138889,-0.222222,1.222222,7,20,9,14,14,0,41,14,0.194444,0.555556,0.25,0.388889,0.388889,0.0,1.138889,10,12,14,19,27,-8,42,15,0.277778,0.333333,0.388889,0.527778,0.75,-0.222222,1.166667,27,5,2,89,18,71,86,1,0.794118,0.147059,0.058824,2.617647,0.529412,2.088235,2.529412,17,15,2,35,10,25,66,1,0.5,0.441176,0.058824,1.029412,0.294118,0.735294,1.941176,24,7,3,54,8,46,79,1,0.705882,0.205882,0.088235,1.588235,0.235294,1.352941,2.323529,10,2,6,22,14,8,32,7,0.555556,0.111111,0.333333,1.222222,0.777778,0.444444,1.777778,5,10,3,10,5,5,25,9,0.277778,0.555556,0.166667,0.555556,0.277778,0.277778,1.388889,8,5,5,12,9,3,29,5,0.444444,0.277778,0.277778,0.666667,0.5,0.166667,1.611111,14,2,1,38,6,32,44,1,0.823529,0.117647,0.058824,2.235294,0.352941,1.882353,2.588235,4,11,2,10,4,6,23,6,0.235294,0.647059,0.117647,0.588235,0.235294,0.352941,1.352941,14,2,1,28,2,26,44,1,0.823529,0.117647,0.058824,1.647059,0.117647,1.529412,2.588235
251738,301,765,1392,4451,11668,6789,2919,0,2016-08-12,2016/2017,0,0,0,16,14,7,55,50,5,62,3,0.432432,0.378378,0.189189,1.486486,1.351351,0.135135,1.675676,13,13,11,22,22,0,52,8,0.351351,0.351351,0.297297,0.594595,0.594595,0.0,1.405405,13,13,11,33,28,5,52,7,0.351351,0.351351,0.297297,0.891892,0.756757,0.135135,1.405405,11,10,15,45,53,-8,43,15,0.305556,0.277778,0.416667,1.25,1.472222,-0.222222,1.194444,8,21,7,19,19,0,45,9,0.222222,0.583333,0.194444,0.527778,0.527778,0.0,1.25,11,9,16,26,34,-8,42,14,0.305556,0.25,0.444444,0.722222,0.944444,-0.222222,1.166667,9,6,3,28,19,9,33,7,0.5,0.333333,0.166667,1.555556,1.055556,0.5,1.833333,8,6,4,13,7,6,30,7,0.444444,0.333333,0.222222,0.722222,0.388889,0.333333,1.666667,7,7,4,15,12,3,28,10,0.388889,0.388889,0.222222,0.833333,0.666667,0.166667,1.555556,5,3,10,16,28,-12,18,14,0.277778,0.166667,0.555556,0.888889,1.555556,-0.666667,1.0,4,10,4,6,9,-3,22,12,0.222222,0.555556,0.222222,0.333333,0.5,-0.166667,1.222222,4,5,9,10,19,-9,17,15,0.222222,0.277778,0.5,0.555556,1.055556,-0.5,0.944444
2188,8,13,11,199,15241,22,42,22,2016-08-13,2016/2017,1,1,0,0,0,0,0,0,0,0,17,0.107143,0.142857,0.619048,0.92,1.833333,-1.364179,0.857143,0,0,0,0,0,0,0,20,0.090909,0.2,0.571429,0.166667,1.25,-0.478261,0.666667,0,0,0,0,0,0,0,19,0.0,0.210526,0.46875,0.4,1.074074,-1.5,0.615385,23,11,3,67,35,32,80,1,0.621622,0.297297,0.081081,1.810811,0.945946,0.864865,2.162162,17,14,6,28,13,15,65,1,0.459459,0.378378,0.162162,0.756757,0.351351,0.405405,1.756757,18,11,8,39,22,17,65,2,0.486486,0.297297,0.216216,1.054054,0.594595,0.459459,1.756757,0,0,0,0,0,0,0,19,0.214286,0.076923,0.714286,0.25,1.666667,-0.642857,0.857143,0,0,0,0,0,0,0,20,0.133333,0.142857,0.428571,0.0,4.0,-0.666667,0.928571,0,0,0,0,0,0,0,18,0.166667,0.0625,0.666667,0.470588,1.0625,-0.666667,0.5,11,5,2,32,17,15,38,1,0.611111,0.277778,0.111111,1.777778,0.944444,0.833333,2.111111,8,6,4,12,8,4,30,1,0.444444,0.333333,0.222222,0.666667,0.444444,0.222222,1.666667,11,3,4,20,9,11,36,1,0.611111,0.166667,0.222222,1.111111,0.5,0.611111,2.0


In [13]:
# Write the standings dataset to disk (the fixture id index is written as the first CSV column)
standings_df.to_csv('../../Data/Modeling_Before_Preparation/Standings_v01.csv')

## *Weather Data*

Distance Function

In [14]:
def _parse_coordinates(raw):
    """Turn a '(lat, lon)' string (or 2-item tuple/list) into a tuple; return None when missing or malformed."""
    if isinstance(raw, (tuple, list)):
        parsed = tuple(raw)
    elif isinstance(raw, str):
        try:
            # literal_eval only accepts Python literals: no code is executed, unlike eval
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            return None  # e.g. '(nan, nan)' or an empty string
    else:
        return None  # NaN / None / any other non-coordinate value
    if not isinstance(parsed, tuple) or len(parsed) != 2:
        return None
    # Both components must be real numbers (v == v rules out NaN)
    if not all(isinstance(v, (int, float)) and v == v for v in parsed):
        return None
    return parsed


def create_travel_distance(df):
    """Add the away team's travel distance to each match.

    The coordinates of a team are taken from 'venue_coordinates' of the first row in which
    the team appears as 'home_id'. Two columns are added to `df` (in place):
      - 'coord_away': coordinates associated with the away team (NaN when the team never plays at home in `df`)
      - 'travel_dist(km)': distance in km, rounded to the unit, between the match venue and 'coord_away'

    df: dataframe with 'home_id', 'away_id' and 'venue_coordinates' ('(lat, lon)' strings) columns

    Returns the same dataframe. 'travel_dist(km)' is a (downcast) integer column when every
    distance could be computed; when coordinates are missing or malformed for some rows,
    those rows get <NA> and the column is stored as nullable 'Int64'.
    """
    # First venue seen for every home team
    home_coordinates = {}
    for team_id, coords in zip(df['home_id'], df['venue_coordinates']):
        home_coordinates.setdefault(team_id, coords)
    df['coord_away'] = df['away_id'].map(home_coordinates)

    # Distance between the match venue and the away team's venue, row by row
    distances_km = []
    for venue_raw, away_raw in zip(df['venue_coordinates'], df['coord_away']):
        venue = _parse_coordinates(venue_raw)
        away = _parse_coordinates(away_raw)
        if venue is None or away is None:
            distances_km.append(np.nan)
        else:
            distances_km.append(distance.distance(venue, away).km)

    travel = pd.Series(distances_km, index=df.index, dtype='float64').round(0)
    if travel.isna().any():
        # Missing distances cannot be stored in a plain integer column
        df['travel_dist(km)'] = travel.astype('Int64')
    else:
        df['travel_dist(km)'] = pd.to_numeric(travel.astype(int), downcast='integer')
    return df

Weather Data Preparation 

In [15]:
# Read the collected weather observations, indexed by fixture id
weather_data = pd.read_csv('../../Data/From_Collection/Weather_output/weather_output.csv')
weather_data = weather_data.set_index('id')
# Parse the ISO formatted 'time' strings into datetime objects
weather_data['time'] = weather_data['time'].apply(lambda x: datetime.fromisoformat(x))
# Derive the calendar year, the month and the time of day ('hour' holds a time object) from 'time'
parsed_time = pd.to_datetime(weather_data['time'])
weather_data = weather_data.assign(
    year=parsed_time.dt.year,
    month=parsed_time.dt.month,
    hour=parsed_time.dt.time,
)

Merge Static and Weather Data (using static columns to fill weather NAs)

In [16]:
# Merge static_leagues with weather data by Index (left join: every match is kept)
static_leagues = pd.merge(static_leagues, weather_data, how='left', left_index=True, right_index=True)
# Convert windspeed column from kph to m/s
static_leagues['windspeed'] = (static_leagues['windspeed'] / 3.6).round(2)
# Using static columns to fill weather columns: weather column -> static column used where the weather value is missing
fill_weather_cols = {
    'windspeed': 'weather_report_windspeed(m/s)',
    'winddir': 'weather_report_wind_degree',
    'cloudcover': 'weather_report_clouds(%)',
    'humidity': 'weather_report_humidity(%)',
    'temp': 'weather_report_temp_celsius',
    'pressure': 'weather_report_pressure',
}
for key, value in fill_weather_cols.items():
    # Assign the result back: an in-place fillna on a selected column is a chained
    # assignment that is not guaranteed to reach the dataframe.
    static_leagues[key] = static_leagues[key].fillna(static_leagues[value])

Fill Weather NAs & Convert Dtypes

In [17]:
# Fill weather NAs from columns' list: first with the mean of the same city and month,
# then (for what is still missing) with the mean of the same home country and month.
for col in ['temp', 'precip', 'cloudcover', 'humidity', 'pressure', 'winddir', 'windspeed', 'visibility']:
    for group_keys in (['venue_city', 'month'], ['home_country_id', 'month']):
        # transform keeps the original index, so the means line up row by row;
        # fillna only touches missing cells, observed values are never overwritten.
        group_mean = static_leagues.groupby(group_keys)[col].transform('mean')
        static_leagues[col] = static_leagues[col].fillna(group_mean)
# Change weather columns dtypes and round float columns
# (astype(int) truncates the decimals and raises if a value is still missing)
for col in ['winddir', 'cloudcover', 'humidity', 'pressure', 'visibility']:
    static_leagues[col] = static_leagues[col].astype(int)
static_leagues['temp'] = static_leagues['temp'].round(1)
static_leagues['windspeed'] = static_leagues['windspeed'].round(2)

Create night_game Column (using sunset feature)

In [18]:
# Convert sunset column from string to datetime & convert all times to Rome time-zone
# Strip a trailing UTC offset ('+hh:mm') or 'Z' so that every value matches the format parsed below
# (raw strings: '\+' and '\d' are regex escapes, not Python string escapes)
repl_dict = {re.compile(r'\+\d{2}:\d{2}$'): '', re.compile(r'Z$'): ''}
static_leagues['sunset'] = static_leagues['sunset'].replace(repl_dict, regex=True)
static_leagues['sunset'] = pd.to_datetime(static_leagues['sunset'], format='%Y-%m-%dT%H:%M:%S', errors='coerce')
# Premier League sunsets are shifted by one hour; every other league is left unchanged
static_leagues['sunset_rome_TZ'] = np.where(static_leagues['league_name'] == 'Premier League', \
    static_leagues['sunset'] + timedelta(hours=1), static_leagues['sunset'])
# Fill NAs for sunset column by city/country and month: within each group take the previous
# known value, or the next one when there is no previous value
for group_keys in (['venue_city', 'month'], ['home_country_id', 'month']):
    # transform keeps the original index; fillna only touches the missing cells
    group_filled = static_leagues.groupby(group_keys)['sunset_rome_TZ'].transform(lambda x: x.ffill().bfill())
    static_leagues['sunset_rome_TZ'] = static_leagues['sunset_rome_TZ'].fillna(group_filled)
# Create new variable night_game (games played after sunset for at least one half) comparing match time and sunset time
static_leagues['night_game'] = np.where(static_leagues['hour'] >= ((static_leagues['sunset_rome_TZ'] - timedelta(minutes=45)).dt.time), 1, 0)

  
  del sys.path[0]


Drop Valueless Columns

In [19]:
# Weather columns that are not kept: helper columns created above and the
# static weather_report_* columns that were used to fill the weather data
weather_cols_drop = [
    'precip', 'sunset', 'windgust', 'year', 'month', 'hour', 'sunset_rome_TZ',
    'weather_report_code', 'weather_report_type', 'weather_report_windspeed(m/s)',
    'weather_report_wind_degree', 'weather_report_clouds(%)', 'weather_report_humidity(%)',
    'weather_report_pressure', 'weather_lat_lon', 'weather_report_temp_celsius',
]
static_leagues = static_leagues.drop(weather_cols_drop, axis=1)
static_leagues.head()

Unnamed: 0_level_0,league_id,season_id,stage_id,venue_id,referee_id,home_id,away_id,winner_team_id,commentaries,attendance,formations_home_formation,formations_away_formation,scores_home_score,scores_away_score,scores_ht_score,scores_ft_score,scores_et_score,scores_ps_score,time_starting_at_date_time,time_starting_at_date,time_starting_at_timezone,time_minute,standings_home_position,standings_away_position,home_name,home_twitter,home_country_id,home_founded,home_venue_id,away_name,away_twitter,away_country_id,away_founded,away_venue_id,league_type,league_country_id,league_name,league_is_cup,season_name,round_name,round_start,round_end,venue_name,venue_surface_isgrass,venue_city,venue_capacity,venue_coordinates,referee_fullname,homecoach_coach_id,homecoach_country_id,homecoach_fullname,homecoach_nationality,homecoach_birthdate,homecoach_birthcountry,awaycoach_coach_id,awaycoach_country_id,awaycoach_fullname,awaycoach_nationality,awaycoach_birthdate,awaycoach_birthcountry,colors_home_color,colors_away_color,home_shots_total,home_shots_ongoal,home_shots_offgoal,home_shots_blocked,home_shots_insidebox,home_shots_outsidebox,home_fouls,home_corners,home_offsides,home_possessiontime,home_yellowcards,home_redcards,home_yellowredcards,home_saves,home_tackles,away_shots_total,away_shots_ongoal,away_shots_offgoal,away_shots_blocked,away_shots_insidebox,away_shots_outsidebox,away_fouls,away_corners,away_offsides,away_possessiontime,away_yellowcards,away_redcards,away_yellowredcards,away_saves,away_tackles,home_passes_total,home_passes_accurate,home_passes_percentage,away_passes_total,away_passes_accurate,away_passes_percentage,home_attacks_attacks,home_attacks_dangerous_attacks,away_attacks_attacks,away_attacks_dangerous_attacks,result,goal_diff,ROUND,Won_FT_TO_H,Drawn_FT_TO_H,Lost_FT_TO_H,GF_FT_TO_H,GA_FT_TO_H,GD_FT_TO_H,Points_FT_TO_H,rank_FT_TO_H,W%_FT_TO_H,D%_FT_TO_H,L%_FT_TO_H,GFxGame_FT_TO_H,GAxGame_FT_TO_H,GDxGame_FT_TO_H,PointsxGame_FT_TO_H,Won_1H_TO_H,Drawn_1H_TO_H,Los
t_1H_TO_H,GF_1H_TO_H,GA_1H_TO_H,GD_1H_TO_H,Points_1H_TO_H,rank_1H_TO_H,W%_1H_TO_H,D%_1H_TO_H,L%_1H_TO_H,GFxGame_1H_TO_H,GAxGame_1H_TO_H,GDxGame_1H_TO_H,PointsxGame_1H_TO_H,Won_2H_TO_H,Drawn_2H_TO_H,Lost_2H_TO_H,GF_2H_TO_H,GA_2H_TO_H,GD_2H_TO_H,Points_2H_TO_H,rank_2H_TO_H,W%_2H_TO_H,D%_2H_TO_H,L%_2H_TO_H,GFxGame_2H_TO_H,GAxGame_2H_TO_H,GDxGame_2H_TO_H,PointsxGame_2H_TO_H,Won_FT_TO_A,Drawn_FT_TO_A,Lost_FT_TO_A,GF_FT_TO_A,GA_FT_TO_A,GD_FT_TO_A,Points_FT_TO_A,rank_FT_TO_A,W%_FT_TO_A,D%_FT_TO_A,L%_FT_TO_A,GFxGame_FT_TO_A,GAxGame_FT_TO_A,GDxGame_FT_TO_A,PointsxGame_FT_TO_A,Won_1H_TO_A,Drawn_1H_TO_A,Lost_1H_TO_A,GF_1H_TO_A,GA_1H_TO_A,GD_1H_TO_A,Points_1H_TO_A,rank_1H_TO_A,W%_1H_TO_A,D%_1H_TO_A,L%_1H_TO_A,GFxGame_1H_TO_A,GAxGame_1H_TO_A,GDxGame_1H_TO_A,PointsxGame_1H_TO_A,Won_2H_TO_A,Drawn_2H_TO_A,Lost_2H_TO_A,GF_2H_TO_A,GA_2H_TO_A,GD_2H_TO_A,Points_2H_TO_A,rank_2H_TO_A,W%_2H_TO_A,D%_2H_TO_A,L%_2H_TO_A,GFxGame_2H_TO_A,GAxGame_2H_TO_A,GDxGame_2H_TO_A,PointsxGame_2H_TO_A,Won_FT_OH_H,Drawn_FT_OH_H,Lost_FT_OH_H,GF_FT_OH_H,GA_FT_OH_H,GD_FT_OH_H,Points_FT_OH_H,rank_FT_OH_H,W%_FT_OH_H,D%_FT_OH_H,L%_FT_OH_H,GFxGame_FT_OH_H,GAxGame_FT_OH_H,GDxGame_FT_OH_H,PointsxGame_FT_OH_H,Won_1H_OH_H,Drawn_1H_OH_H,Lost_1H_OH_H,GF_1H_OH_H,GA_1H_OH_H,GD_1H_OH_H,Points_1H_OH_H,rank_1H_OH_H,W%_1H_OH_H,D%_1H_OH_H,L%_1H_OH_H,GFxGame_1H_OH_H,GAxGame_1H_OH_H,GDxGame_1H_OH_H,PointsxGame_1H_OH_H,Won_2H_OH_H,Drawn_2H_OH_H,Lost_2H_OH_H,GF_2H_OH_H,GA_2H_OH_H,GD_2H_OH_H,Points_2H_OH_H,rank_2H_OH_H,W%_2H_OH_H,D%_2H_OH_H,L%_2H_OH_H,GFxGame_2H_OH_H,GAxGame_2H_OH_H,GDxGame_2H_OH_H,PointsxGame_2H_OH_H,Won_FT_OA_A,Drawn_FT_OA_A,Lost_FT_OA_A,GF_FT_OA_A,GA_FT_OA_A,GD_FT_OA_A,Points_FT_OA_A,rank_FT_OA_A,W%_FT_OA_A,D%_FT_OA_A,L%_FT_OA_A,GFxGame_FT_OA_A,GAxGame_FT_OA_A,GDxGame_FT_OA_A,PointsxGame_FT_OA_A,Won_1H_OA_A,Drawn_1H_OA_A,Lost_1H_OA_A,GF_1H_OA_A,GA_1H_OA_A,GD_1H_OA_A,Points_1H_OA_A,rank_1H_OA_A,W%_1H_OA_A,D%_1H_OA_A,L%_1H_OA_A,GFxGame_1H_OA_A,GAxGame_1H_OA_A,GDxGame_1H_OA_A,PointsxGame_1H_OA_A,Won_2H_OA_A,Drawn_2
H_OA_A,Lost_2H_OA_A,GF_2H_OA_A,GA_2H_OA_A,GD_2H_OA_A,Points_2H_OA_A,rank_2H_OA_A,W%_2H_OA_A,D%_2H_OA_A,L%_2H_OA_A,GFxGame_2H_OA_A,GAxGame_2H_OA_A,GDxGame_2H_OA_A,PointsxGame_2H_OA_A,time,lat,lon,temp,cloudcover,humidity,pressure,visibility,winddir,windspeed,night_game
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 
195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1,Unnamed: 257_level_1,Unnamed: 258_level_1,Unnamed: 259_level_1,Unnamed: 260_level_1,Unnamed: 261_level_1,Unnamed: 262_level_1,Unnamed: 263_level_1,Unnamed: 264_level_1,Unnamed: 265_level_1,Unnamed: 266_level_1,Unnamed: 267_level_1,Unnamed: 268_level_1,Unnamed: 269_level_1,Unnamed: 270_level_1,Unnamed: 271_level_1,Unnamed: 272_level_1,Unnamed: 273_level_1,Unnamed: 274_level_1,Unnamed: 275_level_1,Unnamed: 276_level_1,Unnamed: 277_level_1,Unnamed: 278_level_1,Unnamed: 279_level_1,Unnamed: 280_level_1,Unnamed: 281_level_1,Unnamed: 282_level_1,Unnamed: 283_level_1,Unnamed: 284_level_1,Unnamed: 285_level_1,Unnamed: 286_level_1,Unnamed: 287_level_1,Unnamed: 288_level_1,Unnamed: 289_level_1,Unnamed: 
290_level_1,Unnamed: 291_level_1,Unnamed: 292_level_1,Unnamed: 293_level_1,Unnamed: 294_level_1,Unnamed: 295_level_1,Unnamed: 296_level_1
251728,301,765,1392,37,15478,430,591,591,1,13122,4-3-2-1,4-2-3-1,0,1,0-0,0-1,,,2016-08-12 20:00:00,2016-08-12,Europe/Rome,90,,,Bastia,@SCBastia,17,1905,37,Paris Saint Germain,@PSG_inside,17,1970,131,domestic,17,Ligue 1,0,2016/2017,1,2016-08-12,2016-08-14,Stade Armand-Césari,1,Furiani,16078,"(42.651400,9.442619)",Frank Schneider,896573,17,François Ciccolini,France,1962-06-03,France,455907,32,Unai Emery Etxegoien,Spain,1971-11-03,Spain,#202A44,#F0F0F0,10,0,9,1,4,6,15,2,0,26,3,0,0,5,24,9,6,3,0,6,3,16,6,3,74,3,0,0,0,14,215,134,62.0,600,529,88.0,101,43,96,52,2,-1,0,12,8,16,33,41,-8,44,14,0.333333,0.222222,0.444444,0.916667,1.138889,-0.222222,1.222222,7,20,9,14,14,0,41,14,0.194444,0.555556,0.25,0.388889,0.388889,0.0,1.138889,10,12,14,19,27,-8,42,15,0.277778,0.333333,0.388889,0.527778,0.75,-0.222222,1.166667,27,5,2,89,18,71,86,1,0.794118,0.147059,0.058824,2.617647,0.529412,2.088235,2.529412,17,15,2,35,10,25,66,1,0.5,0.441176,0.058824,1.029412,0.294118,0.735294,1.941176,24,7,3,54,8,46,79,1,0.705882,0.205882,0.088235,1.588235,0.235294,1.352941,2.323529,10,2,6,22,14,8,32,7,0.555556,0.111111,0.333333,1.222222,0.777778,0.444444,1.777778,5,10,3,10,5,5,25,9,0.277778,0.555556,0.166667,0.555556,0.277778,0.277778,1.388889,8,5,5,12,9,3,29,5,0.444444,0.277778,0.277778,0.666667,0.5,0.166667,1.611111,14,2,1,38,6,32,44,1,0.823529,0.117647,0.058824,2.235294,0.352941,1.882353,2.588235,4,11,2,10,4,6,23,6,0.235294,0.647059,0.117647,0.588235,0.235294,0.352941,1.352941,14,2,1,28,2,26,44,1,0.823529,0.117647,0.058824,1.647059,0.117647,1.529412,2.588235,2016-08-12 20:00:00,42.6514,9.442619,23.9,27,48,1022,30,21,3.06,1
251738,301,765,1392,4451,11668,6789,2919,0,1,8019,4-4-2,4-2-2-2,2,2,0-2,2-2,,,2016-08-12 20:30:00,2016-08-12,Europe/Rome,90,,,Monaco,@AS_Monaco,75285,1919,4451,Guingamp,@EAGuingamp,17,1912,1715,domestic,17,Ligue 1,0,2016/2017,1,2016-08-12,2016-08-14,Stade Louis II.,1,Monaco,18523,"(43.727606,7.415614)",Jérôme Miguelgorry,896525,20,José Leonardo Nunes Alves Sousa Jardim,Portugal,1974-08-01,Venezuela,459054,17,Antoine Kombouare,France,1963-11-16,New Caledonia,#C40010,#C40010,18,4,8,6,12,6,14,8,1,70,3,0,0,3,13,9,5,3,1,4,5,15,2,1,30,4,0,0,2,22,513,446,87.0,223,124,56.0,115,53,96,49,0,0,0,16,14,7,55,50,5,62,3,0.432432,0.378378,0.189189,1.486486,1.351351,0.135135,1.675676,13,13,11,22,22,0,52,8,0.351351,0.351351,0.297297,0.594595,0.594595,0.0,1.405405,13,13,11,33,28,5,52,7,0.351351,0.351351,0.297297,0.891892,0.756757,0.135135,1.405405,11,10,15,45,53,-8,43,15,0.305556,0.277778,0.416667,1.25,1.472222,-0.222222,1.194444,8,21,7,19,19,0,45,9,0.222222,0.583333,0.194444,0.527778,0.527778,0.0,1.25,11,9,16,26,34,-8,42,14,0.305556,0.25,0.444444,0.722222,0.944444,-0.222222,1.166667,9,6,3,28,19,9,33,7,0.5,0.333333,0.166667,1.555556,1.055556,0.5,1.833333,8,6,4,13,7,6,30,7,0.444444,0.333333,0.222222,0.722222,0.388889,0.333333,1.666667,7,7,4,15,12,3,28,10,0.388889,0.388889,0.222222,0.833333,0.666667,0.166667,1.555556,5,3,10,16,28,-12,18,14,0.277778,0.166667,0.555556,0.888889,1.555556,-0.666667,1.0,4,10,4,6,9,-3,22,12,0.222222,0.555556,0.222222,0.333333,0.5,-0.166667,1.222222,4,5,9,10,19,-9,17,15,0.222222,0.277778,0.5,0.555556,1.055556,-0.5,0.944444,2016-08-12 20:30:00,43.727606,7.415614,17.9,25,73,1014,10,312,0.92,1
2188,8,13,11,199,15241,22,42,22,1,20137,4-3-3,4-4-2,2,1,1-0,2-1,,,2016-08-13 13:30:00,2016-08-13,Europe/Rome,90,,,Hull City,@HullCity,462,1904,199,Leicester City,@LCFC,462,1884,117,domestic,462,Premier League,0,2016/2017,1,2016-08-13,2016-08-15,KCOM Stadium,1,Hull,25400,"(53.746111,-0.367778)",Mike Dean,896472,462,Mike Phelan,England,1962-09-24,England,893654,251,Claudio Ranieri,Italy,1951-10-20,Italy,#FC7E00,#B0E8E6,14,5,7,2,7,7,8,5,1,49,2,0,0,4,20,18,5,9,4,9,9,17,3,0,51,2,0,0,3,14,449,343,76.0,453,352,78.0,110,55,113,49,1,1,0,0,0,0,0,0,0,0,17,0.107143,0.142857,0.619048,0.92,1.833333,-1.364179,0.857143,0,0,0,0,0,0,0,20,0.090909,0.2,0.571429,0.166667,1.25,-0.478261,0.666667,0,0,0,0,0,0,0,19,0.0,0.210526,0.46875,0.4,1.074074,-1.5,0.615385,23,11,3,67,35,32,80,1,0.621622,0.297297,0.081081,1.810811,0.945946,0.864865,2.162162,17,14,6,28,13,15,65,1,0.459459,0.378378,0.162162,0.756757,0.351351,0.405405,1.756757,18,11,8,39,22,17,65,2,0.486486,0.297297,0.216216,1.054054,0.594595,0.459459,1.756757,0,0,0,0,0,0,0,19,0.214286,0.076923,0.714286,0.25,1.666667,-0.642857,0.857143,0,0,0,0,0,0,0,20,0.133333,0.142857,0.428571,0.0,4.0,-0.666667,0.928571,0,0,0,0,0,0,0,18,0.166667,0.0625,0.666667,0.470588,1.0625,-0.666667,0.5,11,5,2,32,17,15,38,1,0.611111,0.277778,0.111111,1.777778,0.944444,0.833333,2.111111,8,6,4,12,8,4,30,1,0.444444,0.333333,0.222222,0.666667,0.444444,0.222222,1.666667,11,3,4,20,9,11,36,1,0.611111,0.166667,0.222222,1.111111,0.5,0.611111,2.0,2016-08-13 13:30:00,53.746111,-0.367778,20.2,54,62,1014,10,315,3.36,0
2208,8,13,11,201,15293,51,10,10,1,24490,4-2-3-1,4-4-2,0,1,0-0,0-1,,,2016-08-13 16:00:00,2016-08-13,Europe/Rome,90,,,Crystal Palace,@CPFC,462,1905,201,West Bromwich Albion,@WBA,462,1878,119,domestic,462,Premier League,0,2016/2017,1,2016-08-13,2016-08-15,Selhurst Park,1,London,25073,"(51.398333,-0.085556)",Craig Pawson,896470,462,Alan Pardew,England,1961-07-18,England,455360,515,Tony Pulis,Wales,1958-01-16,Wales,#0046A8,#025C17,14,4,7,3,7,7,12,3,0,62,2,0,0,2,28,13,3,8,2,8,5,15,6,2,38,2,0,0,4,18,414,298,72.0,245,143,58.0,110,53,106,45,2,-1,0,10,9,17,36,46,-10,39,16,0.277778,0.25,0.472222,1.0,1.277778,-0.277778,1.083333,5,22,9,13,19,-6,37,16,0.138889,0.611111,0.25,0.361111,0.527778,-0.166667,1.027778,13,8,15,23,27,-4,47,9,0.361111,0.222222,0.416667,0.638889,0.75,-0.111111,1.305556,10,11,15,32,46,-14,41,14,0.277778,0.305556,0.416667,0.888889,1.277778,-0.388889,1.138889,7,18,11,16,23,-7,39,14,0.194444,0.5,0.305556,0.444444,0.638889,-0.194444,1.083333,9,16,11,16,23,-7,43,14,0.25,0.444444,0.305556,0.444444,0.638889,-0.194444,1.194444,5,3,10,17,22,-5,18,19,0.277778,0.166667,0.555556,0.944444,1.222222,-0.277778,1.0,3,13,2,9,7,2,22,13,0.166667,0.722222,0.111111,0.5,0.388889,0.111111,1.222222,4,5,9,8,15,-7,17,20,0.222222,0.277778,0.5,0.444444,0.833333,-0.388889,0.944444,4,7,7,13,21,-8,19,15,0.222222,0.388889,0.388889,0.722222,1.166667,-0.444444,1.055556,3,8,7,7,13,-6,17,14,0.166667,0.444444,0.388889,0.388889,0.722222,-0.333333,0.944444,4,10,4,6,8,-2,22,10,0.222222,0.555556,0.222222,0.333333,0.444444,-0.111111,1.222222,2016-08-13 16:00:00,51.398333,-0.085556,22.2,38,62,1022,43,243,4.28,0
2225,8,13,11,203,15242,7,26,0,1,32110,4-2-3-1,4-2-3-1,1,1,1-0,1-1,,,2016-08-13 16:00:00,2016-08-13,Europe/Rome,90,,,Middlesbrough,@Boro,462,1876,203,Stoke City,@stokecity,462,1868,207,domestic,462,Premier League,0,2016/2017,1,2016-08-13,2016-08-15,Riverside Stadium,1,Middlesbrough,34988,"(54.578333,-1.216944)",Kevin Friend,896473,32,Aitor Karanka de la Hoz,Spain,1973-09-18,Spain,455458,515,Mark Hughes,Wales,1963-11-01,Wales,#C40010,#339063,12,2,6,4,8,4,18,9,1,45,3,0,0,1,17,12,1,9,2,7,5,14,6,2,55,5,0,0,1,16,349,262,75.0,420,330,79.0,106,48,103,40,0,0,0,0,0,0,0,0,0,0,16,0.125,0.142857,0.704736,0.5,2.071429,-1.285714,0.416667,0,0,0,0,0,0,0,18,0.117647,0.285714,0.6,0.214286,0.8125,-1.0,0.5,0,0,0,0,0,0,0,17,0.157895,0.166667,0.666667,0.0,1.117647,-0.517779,0.4,13,9,14,38,52,-14,48,10,0.361111,0.25,0.388889,1.055556,1.444444,-0.388889,1.333333,10,13,13,16,22,-6,43,13,0.277778,0.361111,0.361111,0.444444,0.611111,-0.166667,1.194444,9,15,12,22,30,-8,42,15,0.25,0.416667,0.333333,0.611111,0.833333,-0.222222,1.166667,0,0,0,0,0,0,0,20,0.214286,0.0,0.857143,0.9,1.818182,-1.666667,0.8,0,0,0,0,0,0,0,17,0.066667,0.125,0.6,0.3,1.0,-0.277778,0.5,0,0,0,0,0,0,0,16,0.117647,0.125,0.5,0.3,1.4,-0.444444,0.9,6,5,7,18,29,-11,23,10,0.333333,0.277778,0.388889,1.0,1.611111,-0.611111,1.277778,5,6,7,8,13,-5,21,10,0.277778,0.333333,0.388889,0.444444,0.722222,-0.277778,1.166667,4,7,7,10,16,-6,19,14,0.222222,0.388889,0.388889,0.555556,0.888889,-0.333333,1.055556,2016-08-13 16:00:00,54.578333,-1.216944,19.0,63,61,1019,39,279,7.39,0


Filter for venue data

In [20]:
# Columns kept for the venue dataset: everything from the first column up to and including
# 'colors_away_color', the two outcome columns, and everything from 'time' to the last column
columns_venue_data = [
    *static_leagues.loc[:, :'colors_away_color'].columns,
    'result',
    'goal_diff',
    *static_leagues.loc[:, 'time':].columns,
]
# Select those columns and let create_travel_distance() extend the selection
venue_data = create_travel_distance(static_leagues.loc[:, columns_venue_data])
venue_data.head()

Unnamed: 0_level_0,league_id,season_id,stage_id,venue_id,referee_id,home_id,away_id,winner_team_id,commentaries,attendance,formations_home_formation,formations_away_formation,scores_home_score,scores_away_score,scores_ht_score,scores_ft_score,scores_et_score,scores_ps_score,time_starting_at_date_time,time_starting_at_date,time_starting_at_timezone,time_minute,standings_home_position,standings_away_position,home_name,home_twitter,home_country_id,home_founded,home_venue_id,away_name,away_twitter,away_country_id,away_founded,away_venue_id,league_type,league_country_id,league_name,league_is_cup,season_name,round_name,round_start,round_end,venue_name,venue_surface_isgrass,venue_city,venue_capacity,venue_coordinates,referee_fullname,homecoach_coach_id,homecoach_country_id,homecoach_fullname,homecoach_nationality,homecoach_birthdate,homecoach_birthcountry,awaycoach_coach_id,awaycoach_country_id,awaycoach_fullname,awaycoach_nationality,awaycoach_birthdate,awaycoach_birthcountry,colors_home_color,colors_away_color,result,goal_diff,time,lat,lon,temp,cloudcover,humidity,pressure,visibility,winddir,windspeed,night_game,coord_away,travel_dist(km)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1
251728,301,765,1392,37,15478,430,591,591,1,13122,4-3-2-1,4-2-3-1,0,1,0-0,0-1,,,2016-08-12 20:00:00,2016-08-12,Europe/Rome,90,,,Bastia,@SCBastia,17,1905,37,Paris Saint Germain,@PSG_inside,17,1970,131,domestic,17,Ligue 1,0,2016/2017,1,2016-08-12,2016-08-14,Stade Armand-Césari,1,Furiani,16078,"(42.651400,9.442619)",Frank Schneider,896573,17,François Ciccolini,France,1962-06-03,France,455907,32,Unai Emery Etxegoien,Spain,1971-11-03,Spain,#202A44,#F0F0F0,2,-1,2016-08-12 20:00:00,42.6514,9.442619,23.9,27,48,1022,30,21,3.06,1,"(48.841389,2.253056)",886
251738,301,765,1392,4451,11668,6789,2919,0,1,8019,4-4-2,4-2-2-2,2,2,0-2,2-2,,,2016-08-12 20:30:00,2016-08-12,Europe/Rome,90,,,Monaco,@AS_Monaco,75285,1919,4451,Guingamp,@EAGuingamp,17,1912,1715,domestic,17,Ligue 1,0,2016/2017,1,2016-08-12,2016-08-14,Stade Louis II.,1,Monaco,18523,"(43.727606,7.415614)",Jérôme Miguelgorry,896525,20,José Leonardo Nunes Alves Sousa Jardim,Portugal,1974-08-01,Venezuela,459054,17,Antoine Kombouare,France,1963-11-16,New Caledonia,#C40010,#C40010,0,0,2016-08-12 20:30:00,43.727606,7.415614,17.9,25,73,1014,10,312,0.92,1,"(48.566285,-3.164599)",977
2188,8,13,11,199,15241,22,42,22,1,20137,4-3-3,4-4-2,2,1,1-0,2-1,,,2016-08-13 13:30:00,2016-08-13,Europe/Rome,90,,,Hull City,@HullCity,462,1904,199,Leicester City,@LCFC,462,1884,117,domestic,462,Premier League,0,2016/2017,1,2016-08-13,2016-08-15,KCOM Stadium,1,Hull,25400,"(53.746111,-0.367778)",Mike Dean,896472,462,Mike Phelan,England,1962-09-24,England,893654,251,Claudio Ranieri,Italy,1951-10-20,Italy,#FC7E00,#B0E8E6,1,1,2016-08-13 13:30:00,53.746111,-0.367778,20.2,54,62,1014,10,315,3.36,0,"(52.620278,-1.142222)",136
2208,8,13,11,201,15293,51,10,10,1,24490,4-2-3-1,4-4-2,0,1,0-0,0-1,,,2016-08-13 16:00:00,2016-08-13,Europe/Rome,90,,,Crystal Palace,@CPFC,462,1905,201,West Bromwich Albion,@WBA,462,1878,119,domestic,462,Premier League,0,2016/2017,1,2016-08-13,2016-08-15,Selhurst Park,1,London,25073,"(51.398333,-0.085556)",Craig Pawson,896470,462,Alan Pardew,England,1961-07-18,England,455360,515,Tony Pulis,Wales,1958-01-16,Wales,#0046A8,#025C17,2,-1,2016-08-13 16:00:00,51.398333,-0.085556,22.2,38,62,1022,43,243,4.28,0,"(52.509068,-1.963892)",179
2225,8,13,11,203,15242,7,26,0,1,32110,4-2-3-1,4-2-3-1,1,1,1-0,1-1,,,2016-08-13 16:00:00,2016-08-13,Europe/Rome,90,,,Middlesbrough,@Boro,462,1876,203,Stoke City,@stokecity,462,1868,207,domestic,462,Premier League,0,2016/2017,1,2016-08-13,2016-08-15,Riverside Stadium,1,Middlesbrough,34988,"(54.578333,-1.216944)",Kevin Friend,896473,32,Aitor Karanka de la Hoz,Spain,1973-09-18,Spain,455458,515,Mark Hughes,Wales,1963-11-01,Wales,#C40010,#339063,0,0,2016-08-13 16:00:00,54.578333,-1.216944,19.0,63,61,1019,39,279,7.39,0,"(52.988419,-2.175398)",188


In [21]:
# Persist the venue dataset (index included) for the modelling stage
venue_output_path = '../../Data/Modeling_Before_Preparation/Venue_data_v01.csv'
venue_data.to_csv(venue_output_path)

## *Team Attributes Data*

Import Teams' Attributes Data

In [22]:
# Load the weekly team-attributes scrape
tm_attr = pd.read_csv('../../Data/From_Collection/FIFA_scraped/team_weekly_complete.csv', low_memory=False)
# Normalise the month spellings so that every ObservationDate matches the '%b. %d, %Y' pattern
# parsed below. Keys are plain text, so they are replaced literally (regex=False), consistent
# with the team-name replacement further down.
date_replace = {'June': 'Jun.', 'Sept': 'Sep', 'April': 'Apr.', 'March': 'Mar.', 'May': 'May.', 'July': 'Jul.'}
for key, value in date_replace.items():
    tm_attr['ObservationDate'] = tm_attr['ObservationDate'].str.replace(key, value, regex=False)
tm_attr['ObservationDate'] = tm_attr['ObservationDate'].map(lambda x: datetime.strptime(x, '%b. %d, %Y'))
# Sort by observation date (then team name) and keep only observations after 2016-05-01
tm_attr = tm_attr.sort_values(by=['ObservationDate', 'TeamName'], ascending=True).reset_index(drop=True)
# .copy(): attr16 is modified below and must be an independent frame, not a slice of tm_attr
attr16 = tm_attr.loc[(tm_attr['ObservationDate'] > '2016-05-01'), :].copy()
# Fill missing values within each team: first from that team's next observation (bfill), then
# from its previous one (ffill). Both passes are grouped by TeamName so that a value is never
# carried over from a different team's row; a team with no value at all stays NaN.
columns_to_fill = ['Captain','Short Free Kick','Long Free Kick','Penalties','Left Corner','Right Corner']
for col in columns_to_fill:
    attr16[col] = attr16.groupby('TeamName')[col].bfill()
    attr16[col] = attr16.groupby('TeamName')[col].ffill()
# Build a single Width column: fall back to 'Team Width' where 'Width' is missing
attr16['Width'] = np.where(attr16['Width'].isna(), attr16['Team Width'], attr16['Width'])
# Values >= 10 are divided by 10 before rounding; smaller values are only rounded
# (presumably to put both sources on the same scale -- TODO confirm)
attr16['Width'] = np.where(attr16['Width'] >= 10, round(attr16['Width']/10), round(attr16['Width']))
# Keep only the columns used downstream
attr16 = attr16.loc[:, ['ObservationDate','TeamName','Attack','Midfield','Defence','TransferBudget','Width','is_major','RivalTeam','Captain','Short Free Kick','Long Free Kick','Penalties','Left Corner','Right Corner','TeamRoster','LoanedPlayers']]

Replace teams' names for consistency

In [23]:
# Mapping of text found in attr16 team names -> replacement text.
# NOTE: order matters. The entries are applied one after another, in dictionary order, as
# literal substring replacements (see the loop below), so an earlier entry can change what a
# later entry matches (e.g. 'RC Celta de Vigo' -> 'RC Celta' -> 'Celta de Vigo').
# The trailing 'DSC DSC ...' / 'AFC AFC ...' entries appear to clean up prefixes that get
# duplicated when an earlier entry is applied to a name that already carried the prefix.
teams_name_repl =  {'1. ': '', 'RC Deportivo de La Coruña': 'Deportivo La Coruña', 'RC Deportivo': 'Deportivo La Coruña', 'RC Celta de Vigo': 'RC Celta', 'RC Celta': 'Celta de Vigo','RC ': '', 'RCD ': '', ' Football Club SA': '', 'AS ': '', ' Football Club': '', ' Football': '', 'R.': 'Real', 'CD ': '', 'CF ': '', ' CF': '', 'CA ': '', ' FC': '', 'LOSC ': '', 'OGC ': '', 'AD ': '', 'Balompié': 'BP', ' BP': '', ' 63': '', 'SL ': '', 'Nancy-Lorraine': 'Nancy', 'Arminia Bielefeld': 'DSC Arminia Bielefeld', 'Athletic Club de Bilbao': 'Athletic Club', 'Atlético de Madrid': 'Atlético Madrid', 'Bergamo Calcio': 'Atalanta', "Borussia M'gladbach": 'Borussia Mönchengladbach', 'Bournemouth': 'AFC Bournemouth', 'Chievo Verona': 'Chievo', 'Clermont Foot': 'Clermont', 'D. Alavés': 'Deportivo Alavés', 'DijonO': 'Dijon', 'ES Troyes AC': 'Troyes', 'ESTAC Troyes': 'Troyes', 'En Avant de Guingamp': 'Guingamp', 'En Avant Guingamp': 'Guingamp', 'FC Bayern Munich': 'Bayern München', 'FC Bayern München': 'Bayern München','Bayern München': 'FC Bayern München', 'FC Girondins de Bordeaux': 'Bordeaux', 'Girondins de Bordeaux': 'Bordeaux', 'FC Ingolstadt 04': 'Ingolstadt', 'FC Lorient': 'Lorient', 'FC Metz': 'Metz', 'FC Nantes': 'Nantes', 'FC Nürnberg': 'Nürnberg', 'FC Schalke 04': 'Schalke 04',  'Football Club de Metz': 'Metz', 'Hertha BSC Berlin': 'Hertha BSC', 'Hertha Berlin': 'Hertha BSC', 'La Spezia': 'Spezia', 'Latium': 'Lazio', 'Levante UD': 'Levante', 'Montpellier HSC': 'Montpellier', 'Montpellier Hérault SC': 'Montpellier', 'Nîmes Olympique': 'Nîmes', 'Olympique de Marseille': 'Olympique Marseille', 'Paris Saint-Germain': 'Paris Saint Germain',  'Strasbourg Alsace': 'Strasbourg', 'Real Sporting de Gijón': 'Sporting Gijón', 'SC Bastia': 'Bastia', 'SC Paderborn 07': 'Paderborn', 'SD Huesca': 'Huesca', 'SM Caen': 'Caen', 'Stade Malherbe Caen': 'Caen', 'SV Darmstadt 98': 'Darmstadt 98', 'SV Werder Bremen': 'Werder Bremen', 'Spal': 'SPAL', 'Sport-Club Freiburg': 'SC Freiburg', 'Stade 
Brestois 29': 'Brest', 'Stade Rennais': 'Rennes', 'Stade de Reims': 'Reims', 'TSG 1899 Hoffenheim': 'TSG Hoffenheim', 'Toulouse Club': 'Toulouse', 'UD Las Palmas': 'Las Palmas', 'VfL Bochum': 'VfL Bochum 1848', 'Racing Club de Lens': 'Lens', 'Bilbao Athletic': 'Athletic Club', 'Salerno': 'Salernitana', 'Cádiz C.F.': 'Cádiz', 'Cremona': 'Cremonese', 'FC Porto': 'Porto', 'SD Ponferradina': 'Ponferradina', 'UD Almería': 'Almería', 'Reus': 'Reus Deportiu', 'Gimnàstic de Tarragona': 'Gimnàstic Tarragona', 'Vercelli': 'Pro Vercelli', 'Numancia de Soria': 'Numancia', 'Castellammare di Stabia': 'Juve Stabia', 'SD Amorebieta': 'Amorebieta', 'DSC DSC Arminia Bielefeld': 'DSC Arminia Bielefeld', 'AFC AFC Bournemouth': 'AFC Bournemouth', 'Nancy Lorraine': 'Nancy'}

# Apply every dictionary entry, in order, as a literal (non-regex) substring replacement
renamed_teams = attr16['TeamName']
for pattern, replacement in teams_name_repl.items():
    renamed_teams = renamed_teams.str.replace(pattern, replacement, regex=False)
attr16['TeamName'] = renamed_teams

# Name checks: each one prints an OK message when nothing is left over, otherwise the leftover names
st_mask = static.loc[static['league_is_cup'] == 0, 'home_name']
attr_mask = attr16.loc[(attr16['is_major'] == 1) & (attr16['ObservationDate'] > '2016-09-28'), 'TeamName']

# Home teams of non-cup fixtures that have no counterpart in the attributes data
static_only_names = set(st_mask) - set(attr16.loc[:, 'TeamName'])
if static_only_names:
    print(static_only_names)
else:
    print('Static test: OK!')

# Major-league attribute teams (observed after 2016-09-28) that never appear as a home team
attr_only_names = set(attr_mask) - set(st_mask)
if attr_only_names:
    print(attr_only_names)
else:
    print('Attributes test: OK!')

Static test: OK!
Attributes test: OK!


Functions - for data merge

In [24]:
def test_merge(df, homeORaway):
    """This function tests whether the merger between DataFrames occurred correctly or not by comparing both teams names and dates. It prints how many observations are not consistently merged.
    df: dataframe to test
    homeORaway: either 'home' or 'away' are tested 
    """
    print('****** TEST ', homeORaway.upper(), ' ******')
    print('Name Errors = ', (df[homeORaway + '_name'] != df[homeORaway + '_TeamName']).sum())
    print('Date Errors = ', (df[homeORaway + '_ObservationDate'].dt.date > df['time_starting_at_date_time'].dt.date).sum())

In [25]:
def merge_list_dicts(original_df, list_of_dicts, prefix):
    """Merge original_df with a dataframe built from list_of_dicts and test the result.

    The list of dictionaries is turned into a dataframe whose columns are prefixed with
    '<prefix>_'; it is then outer-merged with original_df on the index, so the i-th dictionary
    ends up on the row of original_df whose index label is i. test_merge() is called on the
    result to print the consistency counts.

    original_df: original dataframe
    list_of_dicts: list of dictionaries to merge with original_df
    prefix: prefix added to the columns created from list_of_dicts (either 'home' or 'away')

    Returns the merged dataframe, or None (after printing a message) when prefix is invalid.
    """
    if prefix not in ('home', 'away'):
        # Incorrectly specified prefix: report it and return None instead of raising
        print('Selected option is not possible!! (Must be home or away)')
        return None

    new_df = pd.DataFrame(list_of_dicts).add_prefix(prefix + '_')  # one row per dictionary
    merged_df = pd.merge(original_df, new_df, how='outer', left_index=True, right_index=True)
    test_merge(merged_df, prefix)  # print name/date consistency counts
    return merged_df

In [26]:
def get_binaryrivals(df):
    """Replace the rival-name columns with 0/1 rivalry flags.

    'isrival_home' is 1 where the away team's name equals 'home_RivalTeam', and
    'isrival_away' is 1 where the home team's name equals 'away_RivalTeam'; both are 0
    otherwise. The two '*_RivalTeam' columns are then dropped.

    df: dataframe with 'home_name', 'away_name', 'home_RivalTeam' and 'away_RivalTeam'

    The dataframe is modified in place and the same object is returned.
    """
    opponent_is_home_rival = df['away_name'] == df['home_RivalTeam']
    df['isrival_home'] = np.where(opponent_is_home_rival, 1, 0)

    opponent_is_away_rival = df['home_name'] == df['away_RivalTeam']
    df['isrival_away'] = np.where(opponent_is_away_rival, 1, 0)

    df.drop(columns=['home_RivalTeam', 'away_RivalTeam'], inplace=True)
    return df

Merge static and attr16

In [27]:
# Replace the current index with a default 0..n-1 one (the old index becomes a regular column).
# merge_list_dicts() joins on the index against a frame built from a plain list, so the row
# labels must line up with the order in which all_ordered is filled below.
static_leagues = static_leagues.reset_index()

# Run the matching twice: once for the home team and once for the away team
for h_a in ['home_name', 'away_name']:
    all_ordered = []  # one attributes dictionary per fixture, in row order
    for index, row in tqdm(static_leagues.iterrows(), total=static_leagues.shape[0]):
        # All attribute observations available for this fixture's team
        mask_temp = attr16.loc[attr16['TeamName'] == row[h_a], :]
        if mask_temp.shape[0] == 0: dict_append = {}  # no observations for this team -> empty record
        else:
            # Days between the match date and each of the team's observation dates.
            # Observations dated after the match get the sentinel 999 so they are never preferred.
            st_date = row['time_starting_at_date_time'].date()
            time_d = [(st_date - j.date()).days if (st_date - j.date()).days >= 0 else 999 for j in mask_temp.loc[:, 'ObservationDate']]
            # Position of the smallest gap, i.e. the latest observation not after the match (first one on ties)
            index_found = min(range(len(time_d)), key=time_d.__getitem__) # Get closest date's index from time_d
            # Fallback: the selected position is the team's first observation and its gap exceeds 90 days
            # (this includes the 999 sentinel, i.e. no observation on or before the match date).
            # In that case use the observation closest to the match date but in the future.
            if index_found == 0 and time_d[index_found] > 90:
                # Negative gaps for observations after the match, -999 for all the others, so that
                # max() selects the future observation nearest to the match date
                time_d_exrelegated = [(st_date - k.date()).days if (st_date - k.date()).days < 0 else -999 for k in mask_temp.loc[:, 'ObservationDate']]
                index_found = max(range(len(time_d_exrelegated)), key=time_d_exrelegated.__getitem__) # Get closest date index but posterior
            # Turn the selected observation (a positional row of mask_temp) into a column -> value dictionary
            dict_append = dict(mask_temp.iloc[index_found,:])

        # Append dictionary of attributes data to list
        all_ordered.append(dict_append) # Append the row to a list

    # Merge list of dictionaries to initial dataframe:
    # home attributes are merged onto static_leagues, away attributes onto the home-merged result
    if h_a == 'home_name':
        merge_home = merge_list_dicts(original_df=static_leagues, list_of_dicts=all_ordered, prefix='home')
    else:
        matchattr_l = merge_list_dicts(original_df=merge_home, list_of_dicts=all_ordered, prefix='away')

  """Entry point for launching an IPython kernel.


  0%|          | 0/10536 [00:00<?, ?it/s]

****** TEST  HOME  ******
Name Errors =  0
Date Errors =  2


  0%|          | 0/10536 [00:00<?, ?it/s]

****** TEST  AWAY  ******
Name Errors =  0
Date Errors =  2


Results and Checks

In [28]:
# (Disabled) Concatenate the leagues' matches with attributes and the cups' matches without attributes (would create NAs)
# matchattr_l = pd.concat([matchattr_l, static_cups], axis=0).sort_index()
# matchattr_l = matchattr_l.set_index('id')

# Check 1: the number of rows without a home team attribute record must equal the number without an away one
missing_home_attrs = matchattr_l['home_TeamName'].isna().sum()
missing_away_attrs = matchattr_l['away_TeamName'].isna().sum()
print('Concatenate step is OK?', missing_home_attrs == missing_away_attrs)

# Check 2: among rows with a non-missing observation date, do home and away dates coincide on every row?
home_obs_dates = matchattr_l.loc[matchattr_l['home_ObservationDate'].notna(), 'home_ObservationDate']
away_obs_dates = matchattr_l.loc[matchattr_l['away_ObservationDate'].notna(), 'away_ObservationDate']
print('Are home and away dates perfectly equal?', (home_obs_dates != away_obs_dates).sum() == 0)

Concatenate step is OK? True
Are home and away dates perfectly equal? False


###### **NOTES**: To avoid problems with teams coming from a minor league, we handle the case where index_min takes the value 0 and the team name from the match data does not match the team name in tm_weekly16: in this case we use the attributes of the right team, but taken from the closest observation in the future (instead of from the past). This problem could be avoided by also scraping data from the minor leagues! **SOLVED BY SCRAPING MINOR LEAGUES TOO**

Filter Team Attributes Data

In [29]:
# Keep the match columns up to 'colors_away_color', the two target columns, and every
# team-attribute column from 'home_ObservationDate' onwards.
match_info_cols = matchattr_l.loc[:, :'colors_away_color'].columns.tolist()
team_attribute_cols = matchattr_l.loc[:, 'home_ObservationDate':].columns.tolist()
columns_team_at = match_info_cols + ['result', 'goal_diff'] + team_attribute_cols
# Select those columns and pass the result through get_binaryrivals
team_at = get_binaryrivals(matchattr_l.loc[:, columns_team_at])
team_at.head()

Unnamed: 0,id,league_id,season_id,stage_id,venue_id,referee_id,home_id,away_id,winner_team_id,commentaries,attendance,formations_home_formation,formations_away_formation,scores_home_score,scores_away_score,scores_ht_score,scores_ft_score,scores_et_score,scores_ps_score,time_starting_at_date_time,time_starting_at_date,time_starting_at_timezone,time_minute,standings_home_position,standings_away_position,home_name,home_twitter,home_country_id,home_founded,home_venue_id,away_name,away_twitter,away_country_id,away_founded,away_venue_id,league_type,league_country_id,league_name,league_is_cup,season_name,round_name,round_start,round_end,venue_name,venue_surface_isgrass,venue_city,venue_capacity,venue_coordinates,referee_fullname,homecoach_coach_id,homecoach_country_id,homecoach_fullname,homecoach_nationality,homecoach_birthdate,homecoach_birthcountry,awaycoach_coach_id,awaycoach_country_id,awaycoach_fullname,awaycoach_nationality,awaycoach_birthdate,awaycoach_birthcountry,colors_home_color,colors_away_color,result,goal_diff,home_ObservationDate,home_TeamName,home_Attack,home_Midfield,home_Defence,home_TransferBudget,home_Width,home_is_major,home_Captain,home_Short Free Kick,home_Long Free Kick,home_Penalties,home_Left Corner,home_Right Corner,home_TeamRoster,home_LoanedPlayers,away_ObservationDate,away_TeamName,away_Attack,away_Midfield,away_Defence,away_TransferBudget,away_Width,away_is_major,away_Captain,away_Short Free Kick,away_Long Free Kick,away_Penalties,away_Left Corner,away_Right Corner,away_TeamRoster,away_LoanedPlayers,isrival_home,isrival_away
0,251728,301,765,1392,37,15478,430,591,591,1,13122,4-3-2-1,4-2-3-1,0,1,0-0,0-1,,,2016-08-12 20:00:00,2016-08-12,Europe/Rome,90,,,Bastia,@SCBastia,17,1905,37,Paris Saint Germain,@PSG_inside,17,1970,131,domestic,17,Ligue 1,0,2016/2017,1,2016-08-12,2016-08-14,Stade Armand-Césari,1,Furiani,16078,"(42.651400,9.442619)",Frank Schneider,896573,17,François Ciccolini,France,1962-06-03,France,455907,32,Unai Emery Etxegoien,Spain,1971-11-03,Spain,#202A44,#F0F0F0,2,-1,2016-06-09,Bastia,70,71,71,2800000,5.0,1,Cahuzac,Ayité,Danic,Ayité,Danic,Danic,"['Jean-Louis Leca', 'Gilles Cioni', 'Sébastien...",[],2016-06-09,Paris Saint Germain,84,82,82,100000000,6.0,1,Thiago Silva,Ibrahimović,Ibrahimović,Ibrahimović,Di María,Di María,"['Kevin Trapp', 'Serge Aurier', 'Thiago Silva'...","['Alphonse Aréola', 'Lucas Digne', 'Youssouf S...",0,0
1,251738,301,765,1392,4451,11668,6789,2919,0,1,8019,4-4-2,4-2-2-2,2,2,0-2,2-2,,,2016-08-12 20:30:00,2016-08-12,Europe/Rome,90,,,Monaco,@AS_Monaco,75285,1919,4451,Guingamp,@EAGuingamp,17,1912,1715,domestic,17,Ligue 1,0,2016/2017,1,2016-08-12,2016-08-14,Stade Louis II.,1,Monaco,18523,"(43.727606,7.415614)",Jérôme Miguelgorry,896525,20,José Leonardo Nunes Alves Sousa Jardim,Portugal,1974-08-01,Venezuela,459054,17,Antoine Kombouare,France,1963-11-16,New Caledonia,#C40010,#C40010,0,0,2016-06-09,Monaco,77,77,77,55000000,5.0,1,Toulalan,João Moutinho,João Moutinho,Fabinho,Bernardo Silva,Lemar,"['Danijel Subašić', 'Fabinho', 'WallaceL', 'Ri...","['Falcao', 'Valère Germain', 'Marcos Lopes', ""...",2016-06-09,Guingamp,73,73,72,3200000,5.0,1,Giresse,Salibur,Salibur,Briand,Salibur,Salibur,"['Jonas Lössl', 'Jonathan Martins Pereira', 'C...","['Ronnie Schwartz', 'Rachid Alioui', 'Baïssama...",0,0
2,2188,8,13,11,199,15241,22,42,22,1,20137,4-3-3,4-4-2,2,1,1-0,2-1,,,2016-08-13 13:30:00,2016-08-13,Europe/Rome,90,,,Hull City,@HullCity,462,1904,199,Leicester City,@LCFC,462,1884,117,domestic,462,Premier League,0,2016/2017,1,2016-08-13,2016-08-15,KCOM Stadium,1,Hull,25400,"(53.746111,-0.367778)",Mike Dean,896472,462,Mike Phelan,England,1962-09-24,England,893654,251,Claudio Ranieri,Italy,1951-10-20,Italy,#FC7E00,#B0E8E6,1,1,2016-06-09,Hull City,73,73,73,15000000,5.0,0,Dawson,Snodgrass,Huddlestone,Snodgrass,Snodgrass,Snodgrass,"['Eldin Jakupović', 'Moses Odubajo', 'Michael ...",[],2016-06-09,Leicester City,76,77,75,27000000,4.0,1,Morgan,Mahrez,Mahrez,Vardy,Mahrez,Fuchs,"['Kasper Schmeichel', 'Danny Simpson', 'Wes Mo...",['Yohan Benalouane'],0,0
3,2208,8,13,11,201,15293,51,10,10,1,24490,4-2-3-1,4-4-2,0,1,0-0,0-1,,,2016-08-13 16:00:00,2016-08-13,Europe/Rome,90,,,Crystal Palace,@CPFC,462,1905,201,West Bromwich Albion,@WBA,462,1878,119,domestic,462,Premier League,0,2016/2017,1,2016-08-13,2016-08-15,Selhurst Park,1,London,25073,"(51.398333,-0.085556)",Craig Pawson,896470,462,Alan Pardew,England,1961-07-18,England,455360,515,Tony Pulis,Wales,1958-01-16,Wales,#0046A8,#025C17,2,-1,2016-06-09,Crystal Palace,75,76,75,23500000,4.0,1,Delaney,Puncheon,Puncheon,Wickham,Puncheon,Puncheon,"['Wayne Hennessey', 'Joel Ward', 'Scott Dann',...",[],2016-06-09,West Bromwich Albion,76,73,73,22000000,5.0,1,Fletcher,McClean,McClean,Gardner,McClean,McClean,"['Ben Foster', 'Craig Dawson', 'Gareth McAuley...",[],0,0
4,2225,8,13,11,203,15242,7,26,0,1,32110,4-2-3-1,4-2-3-1,1,1,1-0,1-1,,,2016-08-13 16:00:00,2016-08-13,Europe/Rome,90,,,Middlesbrough,@Boro,462,1876,203,Stoke City,@stokecity,462,1868,207,domestic,462,Premier League,0,2016/2017,1,2016-08-13,2016-08-15,Riverside Stadium,1,Middlesbrough,34988,"(54.578333,-1.216944)",Kevin Friend,896473,32,Aitor Karanka de la Hoz,Spain,1973-09-18,Spain,455458,515,Mark Hughes,Wales,1963-11-01,Wales,#C40010,#339063,0,0,2016-06-09,Middlesbrough,74,73,71,8000000,6.0,0,Leadbitter,Ramírez,Clayton,Leadbitter,Leadbitter,Ramírez,"['Dimitrios Konstantopoulos', 'Nsue', 'Daniel ...",[],2016-06-09,Stoke City,76,78,76,30000000,5.0,1,Shawcross,Arnautović,Arnautović,Bojan,Arnautović,Arnautović,"['Jakob Haugaard', 'Phil Bardsley', 'Ryan Shaw...",[],0,0


## *Rest Data*

#### Functions

Number of observations in the last n days

In [30]:
def number_observation_last_n_days(items, pivot, n):
    """Count the dates in items that fall strictly before pivot and at most n days before it.

    items: iterable containing datetime elements
    pivot: datetime value used as reference
    n: maximum number of days in the past (an element is counted when (pivot - element).days <= n)
    """
    count = 0
    for when in items:
        # Only strictly earlier dates are eligible (the pivot itself and later dates are ignored)
        if not (when < pivot):
            continue
        if (pivot - when).days <= n:
            count += 1
    return count

def games_last_n_days(df, num_days_list=(60, 30, 15, 7)):
    """Add, for both the home and the away team, the number of games played in the last n days.

    For every combination of side ('home_', 'away_') and n in num_days_list a new column named
    '<side>n_games_last<n>d' is written into df. For each row, the team's matches are all rows
    of df where that team appears as either home_id or away_id; they are counted with
    number_observation_last_n_days, i.e. only matches dated strictly before the row's
    'time_starting_at_date' and at most n days earlier.

    The DataFrame is modified in place and also returned.

    df: DataFrame with 'home_id', 'away_id' and 'time_starting_at_date' columns
    num_days_list (Default: (60, 30, 15, 7)): iterable with the numbers of past days to consider
        (an immutable tuple is used as default so the default can never be mutated by accident)
    """
    combinations = list(product(['home_', 'away_'], num_days_list))
    for side, n_days in tqdm(combinations, desc='New columns in games_last_n_days()', unit='New col.'):
        tqdm.pandas(desc=('Games over last ' + str(n_days) + ' days for ' + side))
        id_column = side + 'id'
        # The row-wise apply rebuilds the team's match mask over the whole frame for every row,
        # so the cost grows quadratically with the number of matches.
        df.loc[:, side + 'n_games_last' + str(n_days) + 'd'] = df.progress_apply(
            lambda x: number_observation_last_n_days(
                items=df.loc[((x[id_column] == df['home_id']) | (x[id_column] == df['away_id'])), 'time_starting_at_date'],
                pivot=x['time_starting_at_date'],
                n=n_days),
            axis=1)
    return df

Rest days

In [31]:
def days_differential_nearest_previous_date(items, pivot):
    """Return the smallest distance in days between pivot and any element of items that precedes it.

    When no element of items is strictly earlier than pivot, the sentinel value 999 is returned.

    items: iterable containing datetime elements
    pivot: datetime value used as reference
    """
    # Note: the distance is abs((earlier - pivot).days); for values carrying a time component the
    # negative difference is floored, so a partial day counts as a full one (e.g. 12 hours -> 1).
    day_gaps = (abs((earlier - pivot).days) for earlier in items if earlier < pivot)
    return min(day_gaps, default=999)

def rest_days_columns(df):
    """Add the columns 'home_rest_days' and 'away_rest_days' to df.

    For each row and each side, the value is the number of days between the row's
    'time_starting_at_date' and the closest earlier match in df in which the same team took
    part (as home or away team), as computed by days_differential_nearest_previous_date.

    The DataFrame is modified in place and also returned.

    df: DataFrame with 'home_id', 'away_id' and 'time_starting_at_date' columns
    """
    sides = ['home_', 'away_']
    for team in tqdm(sides, desc='New columns in rest_days_columns()', unit='New col.'):
        tqdm.pandas(desc=('Rest days for ' + team))
        id_column = team + 'id'

        def _rest_days_for(match):
            # All matches of this team, regardless of the side it played on
            involves_team = (match[id_column] == df['home_id']) | (match[id_column] == df['away_id'])
            return days_differential_nearest_previous_date(
                items=df.loc[involves_team, 'time_starting_at_date'],
                pivot=match['time_starting_at_date'])

        df.loc[:, team + 'rest_days'] = df.progress_apply(_rest_days_for, axis=1)
    return df

#### New Features 

In [32]:
# Mask DataFrame
# Make 'id' the index only when it is still a regular column, so that re-running this cell
# (when 'id' is already the index) does not fail with a KeyError.
if 'id' in static.columns:
    static = static.set_index('id')
elif static.index.name != 'id':
    raise KeyError("'id' is neither a column nor the index of static")
musk = static.loc[:, ['home_id', 'away_id', 'home_name', 'away_name', 'time_starting_at_date', 'league_is_cup']]
# Store text columns as categoricals
musk[musk.select_dtypes(['object']).columns] = musk.select_dtypes(['object']).apply(lambda x: x.astype('category'))
# Games over previous n days
musk = games_last_n_days(musk)
# Rest days
musk = rest_days_columns(musk)
# Cap rest days: values of MAX_REST_DAYS or more are set to MAX_REST_DAYS. This also absorbs the
# 999 sentinel returned by days_differential_nearest_previous_date for teams with no earlier match.
MAX_REST_DAYS = 30
musk.loc[:, 'home_rest_days'] = np.where(musk.loc[:, 'home_rest_days'] < MAX_REST_DAYS, musk.loc[:, 'home_rest_days'], MAX_REST_DAYS)
musk.loc[:, 'away_rest_days'] = np.where(musk.loc[:, 'away_rest_days'] < MAX_REST_DAYS, musk.loc[:, 'away_rest_days'], MAX_REST_DAYS)
# DataFrame datatypes: cast every numeric column to plain int
musk[musk.select_dtypes(np.number).columns] = musk.select_dtypes(np.number).apply(lambda x: x.astype(int))
musk.head(2)

New columns in games_last_n_days():   0%|          | 0/8 [00:00<?, ?New col./s]

Games over last 60 days for home_:   0%|          | 0/23483 [00:00<?, ?it/s]

Games over last 30 days for home_:   0%|          | 0/23483 [00:00<?, ?it/s]

Games over last 15 days for home_:   0%|          | 0/23483 [00:00<?, ?it/s]

Games over last 7 days for home_:   0%|          | 0/23483 [00:00<?, ?it/s]

Games over last 60 days for away_:   0%|          | 0/23483 [00:00<?, ?it/s]

Games over last 30 days for away_:   0%|          | 0/23483 [00:00<?, ?it/s]

Games over last 15 days for away_:   0%|          | 0/23483 [00:00<?, ?it/s]

Games over last 7 days for away_:   0%|          | 0/23483 [00:00<?, ?it/s]

New columns in rest_days_columns():   0%|          | 0/2 [00:00<?, ?New col./s]

Rest days for home_:   0%|          | 0/23483 [00:00<?, ?it/s]

Rest days for away_:   0%|          | 0/23483 [00:00<?, ?it/s]

Unnamed: 0_level_0,home_id,away_id,home_name,away_name,time_starting_at_date,league_is_cup,home_n_games_last60d,home_n_games_last30d,home_n_games_last15d,home_n_games_last7d,away_n_games_last60d,away_n_games_last30d,away_n_games_last15d,away_n_games_last7d,home_rest_days,away_rest_days
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1062005,8643,579,B36,The New Saints,2015-07-01,1,0,0,0,0,0,0,0,0,30,30
1067032,7345,589,Ordabasy,Beitar Jerusalem,2015-07-02,1,0,0,0,0,0,0,0,0,30,30


In [33]:
# Left-join the rest features onto the team-attribute table by match id (index on both sides).
# Columns that exist in both frames keep the left-hand version; the right-hand duplicates get
# the '_extra' suffix and are dropped right after.
team_attr_rest = team_at.set_index('id').merge(
    musk, how='left', left_index=True, right_index=True, suffixes=[None, '_extra'])
duplicated_cols = team_attr_rest.filter(regex='_extra$').columns.tolist()
team_attr_rest = team_attr_rest.drop(columns=duplicated_cols)
print(team_attr_rest.shape)
team_attr_rest.head(2)

(10536, 108)


Unnamed: 0_level_0,league_id,season_id,stage_id,venue_id,referee_id,home_id,away_id,winner_team_id,commentaries,attendance,formations_home_formation,formations_away_formation,scores_home_score,scores_away_score,scores_ht_score,scores_ft_score,scores_et_score,scores_ps_score,time_starting_at_date_time,time_starting_at_date,time_starting_at_timezone,time_minute,standings_home_position,standings_away_position,home_name,home_twitter,home_country_id,home_founded,home_venue_id,away_name,away_twitter,away_country_id,away_founded,away_venue_id,league_type,league_country_id,league_name,league_is_cup,season_name,round_name,round_start,round_end,venue_name,venue_surface_isgrass,venue_city,venue_capacity,venue_coordinates,referee_fullname,homecoach_coach_id,homecoach_country_id,homecoach_fullname,homecoach_nationality,homecoach_birthdate,homecoach_birthcountry,awaycoach_coach_id,awaycoach_country_id,awaycoach_fullname,awaycoach_nationality,awaycoach_birthdate,awaycoach_birthcountry,colors_home_color,colors_away_color,result,goal_diff,home_ObservationDate,home_TeamName,home_Attack,home_Midfield,home_Defence,home_TransferBudget,home_Width,home_is_major,home_Captain,home_Short Free Kick,home_Long Free Kick,home_Penalties,home_Left Corner,home_Right Corner,home_TeamRoster,home_LoanedPlayers,away_ObservationDate,away_TeamName,away_Attack,away_Midfield,away_Defence,away_TransferBudget,away_Width,away_is_major,away_Captain,away_Short Free Kick,away_Long Free Kick,away_Penalties,away_Left Corner,away_Right Corner,away_TeamRoster,away_LoanedPlayers,isrival_home,isrival_away,home_n_games_last60d,home_n_games_last30d,home_n_games_last15d,home_n_games_last7d,away_n_games_last60d,away_n_games_last30d,away_n_games_last15d,away_n_games_last7d,home_rest_days,away_rest_days
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1
251728,301,765,1392,37,15478,430,591,591,1,13122,4-3-2-1,4-2-3-1,0,1,0-0,0-1,,,2016-08-12 20:00:00,2016-08-12,Europe/Rome,90,,,Bastia,@SCBastia,17,1905,37,Paris Saint Germain,@PSG_inside,17,1970,131,domestic,17,Ligue 1,0,2016/2017,1,2016-08-12,2016-08-14,Stade Armand-Césari,1,Furiani,16078,"(42.651400,9.442619)",Frank Schneider,896573,17,François Ciccolini,France,1962-06-03,France,455907,32,Unai Emery Etxegoien,Spain,1971-11-03,Spain,#202A44,#F0F0F0,2,-1,2016-06-09,Bastia,70,71,71,2800000,5.0,1,Cahuzac,Ayité,Danic,Ayité,Danic,Danic,"['Jean-Louis Leca', 'Gilles Cioni', 'Sébastien...",[],2016-06-09,Paris Saint Germain,84,82,82,100000000,6.0,1,Thiago Silva,Ibrahimović,Ibrahimović,Ibrahimović,Di María,Di María,"['Kevin Trapp', 'Serge Aurier', 'Thiago Silva'...","['Alphonse Aréola', 'Lucas Digne', 'Youssouf S...",0,0,0,0,0,0,0,0,0,0,30,30
251738,301,765,1392,4451,11668,6789,2919,0,1,8019,4-4-2,4-2-2-2,2,2,0-2,2-2,,,2016-08-12 20:30:00,2016-08-12,Europe/Rome,90,,,Monaco,@AS_Monaco,75285,1919,4451,Guingamp,@EAGuingamp,17,1912,1715,domestic,17,Ligue 1,0,2016/2017,1,2016-08-12,2016-08-14,Stade Louis II.,1,Monaco,18523,"(43.727606,7.415614)",Jérôme Miguelgorry,896525,20,José Leonardo Nunes Alves Sousa Jardim,Portugal,1974-08-01,Venezuela,459054,17,Antoine Kombouare,France,1963-11-16,New Caledonia,#C40010,#C40010,0,0,2016-06-09,Monaco,77,77,77,55000000,5.0,1,Toulalan,João Moutinho,João Moutinho,Fabinho,Bernardo Silva,Lemar,"['Danijel Subašić', 'Fabinho', 'WallaceL', 'Ri...","['Falcao', 'Valère Germain', 'Marcos Lopes', ""...",2016-06-09,Guingamp,73,73,72,3200000,5.0,1,Giresse,Salibur,Salibur,Briand,Salibur,Salibur,"['Jonas Lössl', 'Jonathan Martins Pereira', 'C...","['Ronnie Schwartz', 'Rachid Alioui', 'Baïssama...",0,0,2,2,1,0,0,0,0,0,9,30


In [34]:
# Persist the merged team-attributes + rest-features table (the 'id' index is written as the first CSV column)
team_attr_rest.to_csv('../../Data/Modeling_Before_Preparation/Team_attributes_and_Rest_data_v01.csv')

## *Form Data*

Functions

In [35]:
def rename_columns_to_team(df):
    """Return a copy of df whose column names have every 'home' / 'away' substring replaced by 'team'.

    The replacement applies to each occurrence anywhere in a name, not only to prefixes.
    The input DataFrame is left unchanged.

    df: DataFrame
    """
    side_pattern = re.compile(r'(home|away)')
    return df.rename(columns=lambda column_name: side_pattern.sub('team', column_name))

In [36]:
def avg_value_n_closest_dates(df, date_col, value_col, n_size):
    """Average value_col over the n_size rows of df that have the most recent dates in date_col.

    If df holds fewer than n_size rows, all of them are averaged.

    df: DataFrame (callers are expected to pass only the rows preceding the match of interest)
    date_col: column in df which contains date values
    value_col: column of values to average
    n_size: number of most recent rows to average over
    """
    # Index labels of the n_size latest dates (on ties the last occurrences are kept)
    latest_labels = df[date_col].nlargest(n=n_size, keep='last').index
    latest_values = df.loc[latest_labels, value_col]
    return np.mean(latest_values)
    
def _team_id_column_for(feature):
    """Return the id column ('home_id', 'away_id' or 'team_id') matching the side named in feature."""
    for side in ('home', 'away', 'team'):
        if side in feature:
            return side + '_id'
    raise ValueError("Feature name must contain 'home', 'away' or 'team': " + repr(feature))


def avg_new_features(df, list_features, list_n_games, suffix_new_columns=''):
    """Add to df, for every feature and every n, the feature averaged over the team's last n matches.

    For each combination (feature, n) a new column named
    '<feature>_last<n>_games<suffix_new_columns>' is created. For every row, the average is taken
    over the rows of df that share the same value in the feature's id column ('home_id' for
    'home' features, 'away_id' for 'away' features, 'team_id' for 'team' features) and have a
    strictly earlier 'time_starting_at_date'; among those, the n most recent ones are used.
    Rows without any earlier match get NaN.

    The DataFrame is modified in place and also returned.

    df: DataFrame with 'time_starting_at_date', the id column(s) and the columns in list_features
    list_features: list of features for new form features
    list_n_games: list for numbers of last matches to consider from
    suffix_new_columns: string appended to the name of every new column (default: '')

    Raises ValueError if a feature name contains none of 'home', 'away' or 'team'.
    """
    list_features = list(list_features)
    # Resolve (and validate) the id column of every feature before starting the expensive loop
    id_column_of = {feature: _team_id_column_for(feature) for feature in list_features}

    for feature, n_games in tqdm(list(product(list_features, list_n_games))):
        team_id = id_column_of[feature]
        new_column = feature + '_last' + str(n_games) + '_games' + suffix_new_columns
        # Apply avg_value_n_closest_dates by row, on the earlier matches of the same team
        df.loc[:, new_column] = df.apply(
            lambda x: avg_value_n_closest_dates(
                df=df.loc[((x[team_id] == df[team_id]) & (x['time_starting_at_date'] > df['time_starting_at_date'])), :],
                date_col='time_starting_at_date', value_col=feature, n_size=n_games),
            axis=1)

    return df

Data Preparation

In [37]:
# Columns needed to build the form features
form_source_cols = ['home_id','away_id','time_starting_at_date','league_is_cup','result','scores_home_score','scores_away_score']
mask_form = static.loc[:, form_source_cols]
# Rename score columns
mask_form = mask_form.rename(columns={'scores_home_score': 'home_goals_scored', 'scores_away_score': 'away_goals_scored'})
# Points for both sides derived from the result column (per the maps: 0 -> 1 point each, 1 -> 3 points to home, 2 -> 3 points to away)
home_points_by_result = {0:1, 1:3, 2:0}
away_points_by_result = {0:1, 1:0, 2:3}
mask_form['home_points'] = mask_form['result'].map(home_points_by_result)
mask_form['away_points'] = mask_form['result'].map(away_points_by_result)
# Goals conceded by one side are the goals scored by the other
mask_form['home_goals_conceded'] = mask_form['away_goals_scored']
mask_form['away_goals_conceded'] = mask_form['home_goals_scored']
# Goal differential seen from each side
mask_form['home_goals_diff'] = mask_form['home_goals_scored'] - mask_form['home_goals_conceded']
mask_form['away_goals_diff'] = mask_form['away_goals_scored'] - mask_form['away_goals_conceded']
# Downcast numeric columns to the smallest integer type that fits
numeric_form_cols = mask_form.select_dtypes(np.number).columns
mask_form[numeric_form_cols] = mask_form[numeric_form_cols].apply(pd.to_numeric, errors='coerce', downcast='integer')
mask_form.tail(2)

Unnamed: 0_level_0,home_id,away_id,time_starting_at_date,league_is_cup,result,home_goals_scored,away_goals_scored,home_points,away_points,home_goals_conceded,away_goals_conceded,home_goals_diff,away_goals_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
18165743,594,485,2022-04-15,0,0,0,0,1,1,0,0,0,0
18157344,598,6789,2022-04-15,0,2,2,3,0,3,3,2,-1,1


In [38]:
# One view per side, with the side prefix replaced by 'team' so both views share the same columns
home_view_cols = ['home_id', 'time_starting_at_date', 'league_is_cup', 'home_points', 'home_goals_scored', 'home_goals_conceded', 'home_goals_diff']
away_view_cols = ['away_id', 'time_starting_at_date', 'league_is_cup', 'away_points', 'away_goals_scored', 'away_goals_conceded', 'away_goals_diff']
mask_home = rename_columns_to_team(mask_form.loc[:, home_view_cols])
mask_away = rename_columns_to_team(mask_form.loc[:, away_view_cols])
# Stack both views vertically: every match contributes one row per team, identified by a single
# team_id (this avoids having to deal with separate home and away IDs), then order by date
stacked_team_rows = pd.concat([mask_home, mask_away], ignore_index=True)
mask_form_teams = stacked_team_rows.sort_values('time_starting_at_date').reset_index(drop=True)
print(mask_form_teams.shape)
mask_form_teams.head(2)

(46966, 7)


Unnamed: 0,team_id,time_starting_at_date,league_is_cup,team_points,team_goals_scored,team_goals_conceded,team_goals_diff
0,8643,2015-07-01,1,0,1,2,-1
1,579,2015-07-01,1,3,2,1,1


For only home & only away 

In [39]:
# Window sizes for the form features: each feature is averaged over the last n matches, for every n in this list
list_number_games = [1, 3, 5, 10, 20] # List of n. games to consider

In [40]:
# Features to average (home-only and away-only versions)
feats = [
    'home_points','away_points',
    'home_goals_scored','away_goals_scored',
    'home_goals_conceded','away_goals_conceded',
    'home_goals_diff','away_goals_diff',
]
# Only home and away WITH CUPS: averages computed over all matches (cups included)
mask_form = avg_new_features(df=mask_form, list_features=feats, list_n_games=list_number_games, suffix_new_columns='_withcups')
# Only home and away NO CUPS: keep the non-cup matches (league_is_cup == 0) and recompute the
# averages on them. The '_withcups' columns were already computed above on the full data, so
# filtering the cup rows out here is fine and no separate DataFrames need to be merged back.
# NOTE(review): the suffix '_nopcups' looks like a typo of '_nocups'; it is kept because it
# becomes part of the column names that later cells may rely on.
is_not_cup = mask_form['league_is_cup'] == 0
mask_form_nocups = avg_new_features(
    df=mask_form.loc[is_not_cup, :],
    list_features=feats,
    list_n_games=list_number_games,
    suffix_new_columns='_nopcups')
# Print some info
print(mask_form_nocups.shape)
mask_form_nocups.head(3)

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

(12362, 93)


Unnamed: 0_level_0,home_id,away_id,time_starting_at_date,league_is_cup,result,home_goals_scored,away_goals_scored,home_points,away_points,home_goals_conceded,away_goals_conceded,home_goals_diff,away_goals_diff,home_points_last1_games_withcups,home_points_last3_games_withcups,home_points_last5_games_withcups,home_points_last10_games_withcups,home_points_last20_games_withcups,away_points_last1_games_withcups,away_points_last3_games_withcups,away_points_last5_games_withcups,away_points_last10_games_withcups,away_points_last20_games_withcups,home_goals_scored_last1_games_withcups,home_goals_scored_last3_games_withcups,home_goals_scored_last5_games_withcups,home_goals_scored_last10_games_withcups,home_goals_scored_last20_games_withcups,away_goals_scored_last1_games_withcups,away_goals_scored_last3_games_withcups,away_goals_scored_last5_games_withcups,away_goals_scored_last10_games_withcups,away_goals_scored_last20_games_withcups,home_goals_conceded_last1_games_withcups,home_goals_conceded_last3_games_withcups,home_goals_conceded_last5_games_withcups,home_goals_conceded_last10_games_withcups,home_goals_conceded_last20_games_withcups,away_goals_conceded_last1_games_withcups,away_goals_conceded_last3_games_withcups,away_goals_conceded_last5_games_withcups,away_goals_conceded_last10_games_withcups,away_goals_conceded_last20_games_withcups,home_goals_diff_last1_games_withcups,home_goals_diff_last3_games_withcups,home_goals_diff_last5_games_withcups,home_goals_diff_last10_games_withcups,home_goals_diff_last20_games_withcups,away_goals_diff_last1_games_withcups,away_goals_diff_last3_games_withcups,away_goals_diff_last5_games_withcups,away_goals_diff_last10_games_withcups,away_goals_diff_last20_games_withcups,home_points_last1_games_nopcups,home_points_last3_games_nopcups,home_points_last5_games_nopcups,home_points_last10_games_nopcups,home_points_last20_games_nopcups,away_points_last1_games_nopcups,away_points_last3_games_nopcups,away_points_last5_games_nopcups,away_points_last
10_games_nopcups,away_points_last20_games_nopcups,home_goals_scored_last1_games_nopcups,home_goals_scored_last3_games_nopcups,home_goals_scored_last5_games_nopcups,home_goals_scored_last10_games_nopcups,home_goals_scored_last20_games_nopcups,away_goals_scored_last1_games_nopcups,away_goals_scored_last3_games_nopcups,away_goals_scored_last5_games_nopcups,away_goals_scored_last10_games_nopcups,away_goals_scored_last20_games_nopcups,home_goals_conceded_last1_games_nopcups,home_goals_conceded_last3_games_nopcups,home_goals_conceded_last5_games_nopcups,home_goals_conceded_last10_games_nopcups,home_goals_conceded_last20_games_nopcups,away_goals_conceded_last1_games_nopcups,away_goals_conceded_last3_games_nopcups,away_goals_conceded_last5_games_nopcups,away_goals_conceded_last10_games_nopcups,away_goals_conceded_last20_games_nopcups,home_goals_diff_last1_games_nopcups,home_goals_diff_last3_games_nopcups,home_goals_diff_last5_games_nopcups,home_goals_diff_last10_games_nopcups,home_goals_diff_last20_games_nopcups,away_goals_diff_last1_games_nopcups,away_goals_diff_last3_games_nopcups,away_goals_diff_last5_games_nopcups,away_goals_diff_last10_games_nopcups,away_goals_diff_last20_games_nopcups
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1
251711,690,591,2015-08-07,0,2,0,1,0,3,1,0,-1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
849,14,6,2015-08-08,0,1,1,0,3,0,0,1,1,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
864,33,51,2015-08-08,0,2,1,3,0,3,3,1,-2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Both home and away games 

In [41]:
# Features to average over all matches of a team (home and away games together)
list_feats2 = ['team_points', 'team_goals_scored', 'team_goals_conceded', 'team_goals_diff']
# Both home and away WITH CUPS
mask_form_teams = avg_new_features(
    df=mask_form_teams,
    list_features=list_feats2,
    list_n_games=list_number_games,
    suffix_new_columns='_withcups')
# Both home and away NO CUPS: same computation restricted to non-cup matches (league_is_cup == 0)
# NOTE(review): the suffix '_nopcups' looks like a typo of '_nocups'; kept because it becomes
# part of the column names.
is_not_cup_team_row = mask_form_teams['league_is_cup'] == 0
mask_form_teams_nocups = avg_new_features(
    df=mask_form_teams.loc[is_not_cup_team_row, :],
    list_features=list_feats2,
    list_n_games=list_number_games,
    suffix_new_columns='_nopcups')
# Print some info
print(mask_form_teams_nocups.shape)
mask_form_teams_nocups.tail(3)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

(24724, 47)


Unnamed: 0,team_id,time_starting_at_date,league_is_cup,team_points,team_goals_scored,team_goals_conceded,team_goals_diff,team_points_last1_games_withcups,team_points_last3_games_withcups,team_points_last5_games_withcups,team_points_last10_games_withcups,team_points_last20_games_withcups,team_goals_scored_last1_games_withcups,team_goals_scored_last3_games_withcups,team_goals_scored_last5_games_withcups,team_goals_scored_last10_games_withcups,team_goals_scored_last20_games_withcups,team_goals_conceded_last1_games_withcups,team_goals_conceded_last3_games_withcups,team_goals_conceded_last5_games_withcups,team_goals_conceded_last10_games_withcups,team_goals_conceded_last20_games_withcups,team_goals_diff_last1_games_withcups,team_goals_diff_last3_games_withcups,team_goals_diff_last5_games_withcups,team_goals_diff_last10_games_withcups,team_goals_diff_last20_games_withcups,team_points_last1_games_nopcups,team_points_last3_games_nopcups,team_points_last5_games_nopcups,team_points_last10_games_nopcups,team_points_last20_games_nopcups,team_goals_scored_last1_games_nopcups,team_goals_scored_last3_games_nopcups,team_goals_scored_last5_games_nopcups,team_goals_scored_last10_games_nopcups,team_goals_scored_last20_games_nopcups,team_goals_conceded_last1_games_nopcups,team_goals_conceded_last3_games_nopcups,team_goals_conceded_last5_games_nopcups,team_goals_conceded_last10_games_nopcups,team_goals_conceded_last20_games_nopcups,team_goals_diff_last1_games_nopcups,team_goals_diff_last3_games_nopcups,team_goals_diff_last5_games_nopcups,team_goals_diff_last10_games_nopcups,team_goals_diff_last20_games_nopcups
46963,345,2022-04-15,0,0,1,3,-2,1.0,1.333333,1.4,1.1,1.1,0.0,0.666667,0.8,0.9,0.85,0.0,1.333333,1.0,1.2,1.25,0.0,-0.666667,-0.2,-0.3,-0.4,1.0,1.333333,1.4,1.1,1.1,0.0,0.666667,0.8,0.9,0.95,0.0,1.333333,1.0,1.2,1.4,0.0,-0.666667,-0.2,-0.3,-0.45
46964,485,2022-04-15,0,1,0,0,0,3.0,2.333333,2.0,1.5,2.0,2.0,2.0,1.6,1.3,2.0,1.0,0.666667,0.8,1.1,1.05,1.0,1.333333,0.8,0.2,0.95,3.0,2.333333,2.0,1.9,1.75,2.0,2.0,1.6,1.9,1.85,1.0,0.666667,1.0,1.3,1.25,1.0,1.333333,0.6,0.6,0.6
46965,6789,2022-04-15,0,3,3,2,1,3.0,3.0,2.0,1.5,1.65,2.0,2.333333,1.6,1.1,1.5,1.0,0.666667,0.8,0.9,0.9,1.0,1.666667,0.8,0.2,0.6,3.0,3.0,2.4,1.7,1.65,2.0,2.333333,1.6,1.4,1.5,1.0,0.666667,0.6,0.9,0.9,1.0,1.666667,1.0,0.5,0.6


Merge only home & away data with both home & away data

In [42]:
# Move the current index back into a regular column so it is available as a column in the merges of the next cell.
# NOTE(review): the next cell calls reset_index() on this frame again, which adds an extra 'index' column - confirm whether both calls are needed.
mask_form_nocups = mask_form_nocups.reset_index()

In [43]:
# Merge mask_form_nocups (home & away features already in separate columns) with
# mask_form_teams_nocups (one row per team and match, not yet split into home and away).
# NOTE(review): reset_index() was already applied to this frame in the previous cell; this second
# call adds a positional 'index' column to the result. Kept as is because later cells may
# reference that column - confirm and remove one of the two calls.
mask_form_nocups = mask_form_nocups.reset_index()
# Home side: prefix the team-level columns with 'home_' and join on (home_id, date)
home_team_form = mask_form_teams_nocups.add_prefix('home_')
mask_form_with_only_home_teams = mask_form_nocups.merge(
    home_team_form, how='left',
    left_on=['home_id', 'time_starting_at_date'],
    right_on=['home_team_id', 'home_time_starting_at_date'])
# Away side: prefix the team-level columns with 'away_' and join on (away_id, date)
away_team_form = mask_form_teams_nocups.add_prefix('away_')
mask_form_complete = mask_form_with_only_home_teams.merge(
    away_team_form, how='left',
    left_on=['away_id', 'time_starting_at_date'],
    right_on=['away_team_id', 'away_time_starting_at_date'])
# Keep only matches played after 2016-06-06 and index the result by match id
played_after_cutoff = mask_form_complete['time_starting_at_date'] > '2016-06-06'
mask_form_complete = mask_form_complete[played_after_cutoff].set_index('id')
mask_form_complete.head()

Unnamed: 0_level_0,index,home_id,away_id,time_starting_at_date,league_is_cup,result,home_goals_scored,away_goals_scored,home_points,away_points,home_goals_conceded,away_goals_conceded,home_goals_diff,away_goals_diff,home_points_last1_games_withcups,home_points_last3_games_withcups,home_points_last5_games_withcups,home_points_last10_games_withcups,home_points_last20_games_withcups,away_points_last1_games_withcups,away_points_last3_games_withcups,away_points_last5_games_withcups,away_points_last10_games_withcups,away_points_last20_games_withcups,home_goals_scored_last1_games_withcups,home_goals_scored_last3_games_withcups,home_goals_scored_last5_games_withcups,home_goals_scored_last10_games_withcups,home_goals_scored_last20_games_withcups,away_goals_scored_last1_games_withcups,away_goals_scored_last3_games_withcups,away_goals_scored_last5_games_withcups,away_goals_scored_last10_games_withcups,away_goals_scored_last20_games_withcups,home_goals_conceded_last1_games_withcups,home_goals_conceded_last3_games_withcups,home_goals_conceded_last5_games_withcups,home_goals_conceded_last10_games_withcups,home_goals_conceded_last20_games_withcups,away_goals_conceded_last1_games_withcups,away_goals_conceded_last3_games_withcups,away_goals_conceded_last5_games_withcups,away_goals_conceded_last10_games_withcups,away_goals_conceded_last20_games_withcups,home_goals_diff_last1_games_withcups,home_goals_diff_last3_games_withcups,home_goals_diff_last5_games_withcups,home_goals_diff_last10_games_withcups,home_goals_diff_last20_games_withcups,away_goals_diff_last1_games_withcups,away_goals_diff_last3_games_withcups,away_goals_diff_last5_games_withcups,away_goals_diff_last10_games_withcups,away_goals_diff_last20_games_withcups,home_points_last1_games_nopcups,home_points_last3_games_nopcups,home_points_last5_games_nopcups,home_points_last10_games_nopcups,home_points_last20_games_nopcups,away_points_last1_games_nopcups,away_points_last3_games_nopcups,away_points_last5_games_nopcups,away_point
s_last10_games_nopcups,away_points_last20_games_nopcups,home_goals_scored_last1_games_nopcups,home_goals_scored_last3_games_nopcups,home_goals_scored_last5_games_nopcups,home_goals_scored_last10_games_nopcups,home_goals_scored_last20_games_nopcups,away_goals_scored_last1_games_nopcups,away_goals_scored_last3_games_nopcups,away_goals_scored_last5_games_nopcups,away_goals_scored_last10_games_nopcups,away_goals_scored_last20_games_nopcups,home_goals_conceded_last1_games_nopcups,home_goals_conceded_last3_games_nopcups,home_goals_conceded_last5_games_nopcups,home_goals_conceded_last10_games_nopcups,home_goals_conceded_last20_games_nopcups,away_goals_conceded_last1_games_nopcups,away_goals_conceded_last3_games_nopcups,away_goals_conceded_last5_games_nopcups,away_goals_conceded_last10_games_nopcups,away_goals_conceded_last20_games_nopcups,home_goals_diff_last1_games_nopcups,home_goals_diff_last3_games_nopcups,home_goals_diff_last5_games_nopcups,home_goals_diff_last10_games_nopcups,home_goals_diff_last20_games_nopcups,away_goals_diff_last1_games_nopcups,away_goals_diff_last3_games_nopcups,away_goals_diff_last5_games_nopcups,away_goals_diff_last10_games_nopcups,away_goals_diff_last20_games_nopcups,home_team_id,home_time_starting_at_date,home_league_is_cup,home_team_points,home_team_goals_scored,home_team_goals_conceded,home_team_goals_diff,home_team_points_last1_games_withcups,home_team_points_last3_games_withcups,home_team_points_last5_games_withcups,home_team_points_last10_games_withcups,home_team_points_last20_games_withcups,home_team_goals_scored_last1_games_withcups,home_team_goals_scored_last3_games_withcups,home_team_goals_scored_last5_games_withcups,home_team_goals_scored_last10_games_withcups,home_team_goals_scored_last20_games_withcups,home_team_goals_conceded_last1_games_withcups,home_team_goals_conceded_last3_games_withcups,home_team_goals_conceded_last5_games_withcups,home_team_goals_conceded_last10_games_withcups,home_team_goals_conceded_last20_games_withcups,h
ome_team_goals_diff_last1_games_withcups,home_team_goals_diff_last3_games_withcups,home_team_goals_diff_last5_games_withcups,home_team_goals_diff_last10_games_withcups,home_team_goals_diff_last20_games_withcups,home_team_points_last1_games_nopcups,home_team_points_last3_games_nopcups,home_team_points_last5_games_nopcups,home_team_points_last10_games_nopcups,home_team_points_last20_games_nopcups,home_team_goals_scored_last1_games_nopcups,home_team_goals_scored_last3_games_nopcups,home_team_goals_scored_last5_games_nopcups,home_team_goals_scored_last10_games_nopcups,home_team_goals_scored_last20_games_nopcups,home_team_goals_conceded_last1_games_nopcups,home_team_goals_conceded_last3_games_nopcups,home_team_goals_conceded_last5_games_nopcups,home_team_goals_conceded_last10_games_nopcups,home_team_goals_conceded_last20_games_nopcups,home_team_goals_diff_last1_games_nopcups,home_team_goals_diff_last3_games_nopcups,home_team_goals_diff_last5_games_nopcups,home_team_goals_diff_last10_games_nopcups,home_team_goals_diff_last20_games_nopcups,away_team_id,away_time_starting_at_date,away_league_is_cup,away_team_points,away_team_goals_scored,away_team_goals_conceded,away_team_goals_diff,away_team_points_last1_games_withcups,away_team_points_last3_games_withcups,away_team_points_last5_games_withcups,away_team_points_last10_games_withcups,away_team_points_last20_games_withcups,away_team_goals_scored_last1_games_withcups,away_team_goals_scored_last3_games_withcups,away_team_goals_scored_last5_games_withcups,away_team_goals_scored_last10_games_withcups,away_team_goals_scored_last20_games_withcups,away_team_goals_conceded_last1_games_withcups,away_team_goals_conceded_last3_games_withcups,away_team_goals_conceded_last5_games_withcups,away_team_goals_conceded_last10_games_withcups,away_team_goals_conceded_last20_games_withcups,away_team_goals_diff_last1_games_withcups,away_team_goals_diff_last3_games_withcups,away_team_goals_diff_last5_games_withcups,away_team_goals_diff_last10_games_
withcups,away_team_goals_diff_last20_games_withcups,away_team_points_last1_games_nopcups,away_team_points_last3_games_nopcups,away_team_points_last5_games_nopcups,away_team_points_last10_games_nopcups,away_team_points_last20_games_nopcups,away_team_goals_scored_last1_games_nopcups,away_team_goals_scored_last3_games_nopcups,away_team_goals_scored_last5_games_nopcups,away_team_goals_scored_last10_games_nopcups,away_team_goals_scored_last20_games_nopcups,away_team_goals_conceded_last1_games_nopcups,away_team_goals_conceded_last3_games_nopcups,away_team_goals_conceded_last5_games_nopcups,away_team_goals_conceded_last10_games_nopcups,away_team_goals_conceded_last20_games_nopcups,away_team_goals_diff_last1_games_nopcups,away_team_goals_diff_last3_games_nopcups,away_team_goals_diff_last5_games_nopcups,away_team_goals_diff_last10_games_nopcups,away_team_goals_diff_last20_games_nopcups
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1
251728,1826,430,591,2016-08-12,0,2,0,1,0,3,1,0,-1,1,3.0,2.0,1.4,2.0,1.842105,1.0,1.333333,2.0,2.2,2.35,1.0,1.0,0.8,1.0,1.210526,1.0,1.666667,3.2,2.4,2.35,0.0,0.666667,0.8,0.4,0.736842,1.0,0.666667,0.4,0.6,0.45,1.0,0.333333,0.0,0.6,0.473684,0.0,1.0,2.8,1.8,1.9,3.0,2.0,1.4,2.0,1.842105,1.0,2.333333,2.0,2.5,2.526316,1.0,1.0,0.8,1.0,1.210526,1.0,2.333333,3.4,2.8,2.263158,0.0,0.666667,0.8,0.4,0.736842,1.0,0.333333,0.6,0.4,0.368421,1.0,0.333333,0.0,0.6,0.473684,0.0,2.0,2.8,2.4,1.894737,430,2016-08-12,0,0,0,1,-1,3.0,2.333333,1.4,1.2,1.55,2.0,1.0,1.0,0.9,0.9,1.0,0.333333,1.0,1.3,0.9,1.0,0.666667,0.0,-0.4,0.0,3.0,2.333333,1.4,1.2,1.55,2.0,1.0,1.0,0.9,0.9,1.0,0.333333,1.0,1.3,0.9,1.0,0.666667,0.0,-0.4,0.0,591,2016-08-12,0,3,1,0,1,3.0,2.333333,2.6,2.0,2.15,4.0,3.0,3.8,2.7,2.6,0.0,0.333333,0.2,0.7,0.7,4.0,2.666667,3.6,2.0,1.9,3.0,2.333333,2.6,2.3,2.4,4.0,3.0,3.8,3.4,2.85,0.0,0.333333,0.2,0.4,0.5,4.0,2.666667,3.6,3.0,2.35
251738,1827,6789,2919,2016-08-12,0,0,2,2,1,1,2,2,0,0,3.0,3.0,2.4,2.5,2.1,1.0,1.333333,1.4,1.2,1.0,3.0,2.666667,2.2,2.3,1.65,0.0,1.666667,1.4,1.3,0.842105,1.0,1.0,1.2,0.9,0.85,0.0,1.0,1.6,1.5,1.473684,2.0,1.666667,1.0,1.4,0.8,0.0,0.666667,-0.2,-0.2,-0.631579,3.0,3.0,2.0,2.3,1.894737,1.0,1.333333,1.4,1.2,1.0,2.0,2.333333,2.0,2.2,1.578947,0.0,1.666667,1.4,1.3,0.842105,0.0,1.0,1.4,1.0,1.0,0.0,1.0,1.6,1.5,1.473684,2.0,1.333333,0.6,1.2,0.578947,0.0,0.666667,-0.2,-0.2,-0.631579,6789,2016-08-12,0,1,2,2,0,3.0,2.0,1.8,1.6,1.75,3.0,2.0,2.0,1.7,1.7,1.0,1.0,2.2,1.9,1.4,2.0,1.0,-0.2,-0.2,0.3,3.0,2.0,2.0,1.5,1.7,2.0,2.0,1.8,1.7,1.6,0.0,2.666667,2.0,2.0,1.35,2.0,-0.666667,-0.2,-0.3,0.25,2919,2016-08-12,0,1,2,2,0,0.0,0.333333,1.0,1.2,1.25,2.0,1.333333,1.6,1.4,1.5,3.0,2.0,1.4,1.6,1.55,-1.0,-0.666667,0.2,-0.2,-0.05,0.0,0.333333,1.0,1.2,1.25,2.0,1.333333,1.6,1.4,1.5,3.0,2.0,1.4,1.6,1.55,-1.0,-0.666667,0.2,-0.2,-0.05
2188,1828,22,42,2016-08-13,0,1,2,1,3,0,1,2,1,-1,0.0,1.5,1.5,1.5,1.5,1.0,1.666667,2.2,1.9,2.0,0.0,0.5,0.5,0.5,0.5,1.0,1.333333,1.2,1.4,1.75,4.0,2.0,2.0,2.0,2.0,1.0,0.666667,0.4,0.8,1.0,-4.0,-1.5,-1.5,-1.5,-1.5,0.0,0.666667,0.8,0.6,0.75,,,,,,1.0,1.666667,2.2,1.8,2.052632,,,,,,1.0,1.333333,1.2,1.2,1.736842,,,,,,1.0,0.666667,0.4,0.7,0.947368,,,,,,0.0,0.666667,0.8,0.5,0.789474,22,2016-08-13,0,3,2,1,1,0.0,1.333333,1.75,1.75,1.75,0.0,1.0,1.0,1.0,1.0,4.0,1.666667,1.25,1.25,1.25,-4.0,-0.666667,-0.25,-0.25,-0.25,,,,,,,,,,,,,,,,,,,,,42,2016-08-13,0,0,1,2,-1,1.0,1.666667,1.8,2.4,2.1,1.0,1.666667,2.2,1.7,1.65,1.0,1.0,1.0,0.5,0.75,0.0,0.666667,1.2,1.2,0.9,1.0,1.666667,1.8,2.4,2.15,1.0,1.666667,2.2,1.7,1.55,1.0,1.0,1.0,0.5,0.55,0.0,0.666667,1.2,1.2,1.0
2208,1829,51,10,2016-08-13,0,2,0,1,0,3,1,0,-1,1,3.0,2.333333,2.0,1.3,1.35,1.0,0.666667,0.6,1.0,1.1,2.0,1.333333,1.0,1.0,1.05,1.0,0.666667,0.6,0.8,0.8,1.0,0.666667,0.6,1.2,1.1,1.0,1.333333,1.2,1.2,1.25,1.0,0.666667,0.4,-0.2,-0.05,0.0,-0.666667,-0.6,-0.4,-0.45,3.0,2.333333,1.4,0.8,1.105263,1.0,0.666667,0.6,0.8,1.052632,2.0,1.0,0.8,0.7,1.0,1.0,0.666667,0.6,0.8,0.736842,1.0,0.333333,0.8,1.4,1.210526,1.0,1.333333,1.2,1.4,1.157895,1.0,0.666667,0.0,-0.7,-0.210526,0.0,-0.666667,-0.6,-0.6,-0.421053,51,2016-08-13,0,0,0,1,-1,0.0,1.0,1.2,1.2,1.15,1.0,1.0,1.0,0.9,1.1,4.0,2.0,1.8,1.3,1.4,-3.0,-1.0,-0.8,-0.4,-0.3,0.0,1.0,0.8,0.9,0.6,1.0,1.0,0.8,0.8,0.8,4.0,2.0,1.8,1.4,1.75,-3.0,-1.0,-1.0,-0.6,-0.95,10,2016-08-13,0,3,1,0,1,1.0,0.666667,0.6,0.7,1.0,1.0,0.666667,0.6,0.5,0.8,1.0,1.666667,1.6,1.2,1.3,0.0,-1.0,-1.0,-0.7,-0.5,1.0,0.666667,0.6,0.7,1.15,1.0,0.666667,0.6,0.5,0.85,1.0,1.666667,1.6,1.2,1.2,0.0,-1.0,-1.0,-0.7,-0.35
2225,1830,7,26,2016-08-13,0,0,1,1,1,1,1,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,1.0,1.25,1.0,1.0,1.0,1.0,1.0,1.0,0.666667,1.0,1.0,0.95,2.0,2.0,2.0,2.0,2.0,2.0,3.333333,2.4,2.1,1.55,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-2.666667,-1.4,-1.1,-0.6,,,,,,0.0,0.0,0.8,1.0,1.210526,,,,,,1.0,0.666667,1.0,1.3,1.0,,,,,,2.0,3.333333,2.4,2.4,1.631579,,,,,,-1.0,-2.666667,-1.4,-1.1,-0.631579,7,2016-08-13,0,1,1,1,0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,26,2016-08-13,0,1,1,1,0,3.0,1.333333,0.8,0.9,1.25,2.0,1.333333,0.8,1.1,1.1,1.0,1.333333,2.4,2.2,1.8,1.0,0.0,-1.6,-1.1,-0.7,3.0,1.333333,0.8,0.9,1.25,2.0,1.333333,0.8,1.1,1.25,1.0,1.333333,2.4,2.2,1.95,1.0,0.0,-1.6,-1.1,-0.7


In [44]:
# Persist the merged form features to CSV (the `id` index is written as the first column)
mask_form_complete.to_csv('../../Data/Modeling_Before_Preparation/Form_data_v01.csv')

## *Stats Data*

In [45]:
def avg_value_nan_closest_dates(df, date_col, value_col, n_size):
    """Average `value_col` over the `n_size` rows of `df` that have the most recent dates.

    The rows are selected with ``Series.nlargest`` on `date_col` (ties are resolved with
    ``keep='last'``), so `df` does not have to be sorted beforehand.

    df: DataFrame
    date_col: column in df which contains dates values
    value_col: column of values to average from
    n_size: number of most recent rows (by date) to average over

    Returns the mean of the selected values, or NaN if computing it emits a RuntimeWarning
    (for instance NumPy's "Mean of empty slice").
    """
    with warnings.catch_warnings():
        # Escalate RuntimeWarning only. Turning every warning category into an error would let an unrelated
        # DeprecationWarning/FutureWarning from pandas or NumPy escape the `except` below and abort the caller.
        warnings.filterwarnings('error', category=RuntimeWarning)
        try:
            latest_rows = df[date_col].nlargest(n=n_size, keep='last').index
            x = np.mean(df.loc[latest_rows, value_col])
        except RuntimeWarning:
            # np.nan instead of the np.NaN alias, which was removed in NumPy 2.0
            x = np.nan
    return x
    
def avg_nan_new_features(df, list_features, list_n_games, suffix_new_columns=''):
    """Add, for every (feature, n) pair, a column with the feature averaged over a team's previous n matches.

    For each row, the rows of the same team (same value in the team id column) with a strictly earlier
    'time_starting_at_date' are collected and passed to avg_value_nan_closest_dates, which averages the
    feature over the n most recent of them. The result is stored in a new column named
    '<feature>_last<n>_games<suffix_new_columns>'.

    The team id column is inferred from the feature name: 'home_id' if it contains 'home',
    otherwise 'away_id' if it contains 'away', otherwise 'team_id' if it contains 'team'.

    df: DataFrame (modified in place: the new columns are added to it)
    list_features: list of features for new form features
    list_n_games: list for numbers of last matches to consider from
    suffix_new_columns: optional suffix appended to the name of every new column

    Returns the same DataFrame object that was passed in.
    Raises ValueError if a feature name contains none of 'home', 'away' or 'team'.
    """
    date_col = 'time_starting_at_date'
    for feature, n_games in tqdm(list(product(list_features, list_n_games))):
        if 'home' in feature:
            team_id = 'home_id'
        elif 'away' in feature:
            team_id = 'away_id'
        elif 'team' in feature:
            team_id = 'team_id'
        else:
            # Fail loudly here; without this the bad feature name only surfaced later as an obscure KeyError
            raise ValueError(
                "Cannot infer the team id column for feature '" + str(feature) +
                "': its name must contain 'home', 'away' or 'team'.")

        # Partition the rows per team once, instead of re-filtering the whole frame by team id for every row.
        # groupby keeps the original row order inside each group and drops missing ids, which matches the
        # row-wise equality filter (a missing id never compares equal, i.e. it has no previous matches).
        rows_by_team = {key: rows for key, rows in df.groupby(team_id, sort=False)}
        no_rows = df.iloc[0:0]

        def _past_average(row):
            # Average of the feature over the team's matches strictly before this row's date
            team_rows = rows_by_team.get(row[team_id], no_rows)
            earlier_rows = team_rows.loc[team_rows[date_col] < row[date_col], :]
            return avg_value_nan_closest_dates(
                df=earlier_rows, date_col=date_col, value_col=feature, n_size=n_games)

        df.loc[:, feature + '_last' + str(n_games) + '_games' + suffix_new_columns] = df.apply(_past_average, axis=1)

    return df

In [46]:
# Columns to keep: the match identifiers plus every statistics column of `static`
# (label slice from home_shots_total to away_attacks_dangerous_attacks, both ends included)
cols = ['home_id', 'away_id', 'time_starting_at_date', 'league_is_cup']
cols = cols + static.loc[:, 'home_shots_total':'away_attacks_dangerous_attacks'].columns.tolist()
# Restrict to the selected columns and to non-cup matches
mask_stats = static.loc[static['league_is_cup'] == 0, cols]
# Statistics for which a differential is computed
col_names_for_differentials = [
    'shots_total', 'shots_ongoal', 'shots_offgoal', 'shots_blocked', 'shots_insidebox', 'shots_outsidebox',
    'fouls', 'corners', 'offsides', 'possessiontime', 'yellowcards', 'redcards', 'yellowredcards',
    'saves', 'tackles', 'passes_total', 'passes_accurate', 'passes_percentage',
    'attacks_attacks', 'attacks_dangerous_attacks',
]
# For each statistic add <side>_diff_<stat> = own value - opponent value (home column first, then away)
for col_type in col_names_for_differentials:
    for side, opponent in (('home', 'away'), ('away', 'home')):
        own_values = mask_stats.loc[:, side + '_' + col_type]
        opponent_values = mask_stats.loc[:, opponent + '_' + col_type]
        mask_stats.loc[:, side + '_diff_' + col_type] = own_values - opponent_values
# Downcast the columns selected by select_dtypes(int) to the smallest integer type that fits
int_columns = mask_stats.select_dtypes(int).columns
mask_stats[int_columns] = mask_stats[int_columns].apply(pd.to_numeric, errors='coerce', downcast='integer')
# Print some info
print(mask_stats.shape)
mask_stats.head(2)

(12362, 84)


Unnamed: 0_level_0,home_id,away_id,time_starting_at_date,league_is_cup,home_shots_total,home_shots_ongoal,home_shots_offgoal,home_shots_blocked,home_shots_insidebox,home_shots_outsidebox,home_fouls,home_corners,home_offsides,home_possessiontime,home_yellowcards,home_redcards,home_yellowredcards,home_saves,home_tackles,away_shots_total,away_shots_ongoal,away_shots_offgoal,away_shots_blocked,away_shots_insidebox,away_shots_outsidebox,away_fouls,away_corners,away_offsides,away_possessiontime,away_yellowcards,away_redcards,away_yellowredcards,away_saves,away_tackles,home_passes_total,home_passes_accurate,home_passes_percentage,away_passes_total,away_passes_accurate,away_passes_percentage,home_attacks_attacks,home_attacks_dangerous_attacks,away_attacks_attacks,away_attacks_dangerous_attacks,home_diff_shots_total,away_diff_shots_total,home_diff_shots_ongoal,away_diff_shots_ongoal,home_diff_shots_offgoal,away_diff_shots_offgoal,home_diff_shots_blocked,away_diff_shots_blocked,home_diff_shots_insidebox,away_diff_shots_insidebox,home_diff_shots_outsidebox,away_diff_shots_outsidebox,home_diff_fouls,away_diff_fouls,home_diff_corners,away_diff_corners,home_diff_offsides,away_diff_offsides,home_diff_possessiontime,away_diff_possessiontime,home_diff_yellowcards,away_diff_yellowcards,home_diff_redcards,away_diff_redcards,home_diff_yellowredcards,away_diff_yellowredcards,home_diff_saves,away_diff_saves,home_diff_tackles,away_diff_tackles,home_diff_passes_total,away_diff_passes_total,home_diff_passes_accurate,away_diff_passes_accurate,home_diff_passes_percentage,away_diff_passes_percentage,home_diff_attacks_attacks,away_diff_attacks_attacks,home_diff_attacks_dangerous_attacks,away_diff_attacks_dangerous_attacks
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1
251711,690,591,2015-08-07,0,12,2,0,0,0,0,20,3,0,52,3,0,0,1,16,7,2,0,0,0,0,17,2,2,48,2,1,0,2,17,,,,,,,106,48,93,42,5,-5,0,0,0,0,0,0,0,0,0,0,3,-3,1,-1,-2,2,4,-4,1,-1,-1,1,0,0,-1,1,-1,1,,,,,,,13,-13,6,-6
849,14,6,2015-08-08,0,9,1,0,0,0,0,12,1,1,50,2,0,0,4,17,9,4,0,0,0,0,12,2,2,50,3,0,0,1,17,,,,,,,100,45,96,48,0,0,-3,3,0,0,0,0,0,0,0,0,0,0,-1,1,-1,1,0,0,-1,1,0,0,0,0,3,-3,0,0,,,,,,,4,-4,-3,3


In [47]:
# Mask home teams 
stats_home = rename_columns_to_team(mask_stats.loc[:, ['time_starting_at_date', 'league_is_cup'] + mask_stats.filter(regex='^home').columns.tolist()])
# Mask away teams 
stats_away = rename_columns_to_team(mask_stats.loc[:, ['time_starting_at_date', 'league_is_cup'] + mask_stats.filter(regex='^away').columns.tolist()])
# Concatenate vertically stats_home and stats_away (avoided problem of using separate home and away IDs by using a team ID)
mask_stats_team = pd.concat([stats_home, stats_away], ignore_index=True).sort_values('time_starting_at_date').reset_index(drop=True)
print(mask_stats_team.shape)
mask_stats_team.tail(2)

(24724, 43)


Unnamed: 0,time_starting_at_date,league_is_cup,team_id,team_shots_total,team_shots_ongoal,team_shots_offgoal,team_shots_blocked,team_shots_insidebox,team_shots_outsidebox,team_fouls,team_corners,team_offsides,team_possessiontime,team_yellowcards,team_redcards,team_yellowredcards,team_saves,team_tackles,team_passes_total,team_passes_accurate,team_passes_percentage,team_attacks_attacks,team_attacks_dangerous_attacks,team_diff_shots_total,team_diff_shots_ongoal,team_diff_shots_offgoal,team_diff_shots_blocked,team_diff_shots_insidebox,team_diff_shots_outsidebox,team_diff_fouls,team_diff_corners,team_diff_offsides,team_diff_possessiontime,team_diff_yellowcards,team_diff_redcards,team_diff_yellowredcards,team_diff_saves,team_diff_tackles,team_diff_passes_total,team_diff_passes_accurate,team_diff_passes_percentage,team_diff_attacks_attacks,team_diff_attacks_dangerous_attacks
24722,2022-04-15,0,485,9,2,7,1,3,6,7,1,3,43,4,0,0,2,26,377,305,80.9,95,37,-1,-1,0,0,-4,4,-10,-7,-4,-14,1,-1,-1,0,10,-100,-103,-4.63,-13,-41
24723,2022-04-15,0,6789,12,4,8,1,9,3,15,0,1,36,3,0,0,2,13,383,293,76.5,74,33,-2,0,-2,-2,2,-3,9,-6,1,-28,2,0,0,1,4,-285,-276,-8.68,-51,-25


Only Home & Only Away data

In [48]:
# Window sizes: for each n in this list, features are averaged over a team's last n games.
# NOTE(review): the saved outputs of the next cells (18 and 120 iterations, only _last1/_last3/_last5 columns)
# look like they were produced with [1, 3, 5] - re-run the cells to refresh them.
list_number_games = [1, 3, 5, 10, 15, 20] # List of n. games to consider

In [49]:
# Features to average: the columns from home_shots_total to home_shots_outsidebox (label slice, ends included)
l_features = list(mask_stats.loc[:, 'home_shots_total':'home_shots_outsidebox'].columns)
# Unlike the form data there is no with-cups / without-cups distinction here: cups were already excluded
mask_stats = avg_nan_new_features(
    df=mask_stats,
    list_features=l_features,
    list_n_games=list_number_games,
)
# Print some info
print(mask_stats.shape)
mask_stats.head(3)

  0%|          | 0/18 [00:00<?, ?it/s]

(12362, 102)


Unnamed: 0_level_0,home_id,away_id,time_starting_at_date,league_is_cup,home_shots_total,home_shots_ongoal,home_shots_offgoal,home_shots_blocked,home_shots_insidebox,home_shots_outsidebox,home_fouls,home_corners,home_offsides,home_possessiontime,home_yellowcards,home_redcards,home_yellowredcards,home_saves,home_tackles,away_shots_total,away_shots_ongoal,away_shots_offgoal,away_shots_blocked,away_shots_insidebox,away_shots_outsidebox,away_fouls,away_corners,away_offsides,away_possessiontime,away_yellowcards,away_redcards,away_yellowredcards,away_saves,away_tackles,home_passes_total,home_passes_accurate,home_passes_percentage,away_passes_total,away_passes_accurate,away_passes_percentage,home_attacks_attacks,home_attacks_dangerous_attacks,away_attacks_attacks,away_attacks_dangerous_attacks,home_diff_shots_total,away_diff_shots_total,home_diff_shots_ongoal,away_diff_shots_ongoal,home_diff_shots_offgoal,away_diff_shots_offgoal,home_diff_shots_blocked,away_diff_shots_blocked,home_diff_shots_insidebox,away_diff_shots_insidebox,home_diff_shots_outsidebox,away_diff_shots_outsidebox,home_diff_fouls,away_diff_fouls,home_diff_corners,away_diff_corners,home_diff_offsides,away_diff_offsides,home_diff_possessiontime,away_diff_possessiontime,home_diff_yellowcards,away_diff_yellowcards,home_diff_redcards,away_diff_redcards,home_diff_yellowredcards,away_diff_yellowredcards,home_diff_saves,away_diff_saves,home_diff_tackles,away_diff_tackles,home_diff_passes_total,away_diff_passes_total,home_diff_passes_accurate,away_diff_passes_accurate,home_diff_passes_percentage,away_diff_passes_percentage,home_diff_attacks_attacks,away_diff_attacks_attacks,home_diff_attacks_dangerous_attacks,away_diff_attacks_dangerous_attacks,home_shots_total_last1_games,home_shots_total_last3_games,home_shots_total_last5_games,home_shots_ongoal_last1_games,home_shots_ongoal_last3_games,home_shots_ongoal_last5_games,home_shots_offgoal_last1_games,home_shots_offgoal_last3_games,home_shots_offgoal_last5_games,home_sh
ots_blocked_last1_games,home_shots_blocked_last3_games,home_shots_blocked_last5_games,home_shots_insidebox_last1_games,home_shots_insidebox_last3_games,home_shots_insidebox_last5_games,home_shots_outsidebox_last1_games,home_shots_outsidebox_last3_games,home_shots_outsidebox_last5_games
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1
251711,690,591,2015-08-07,0,12,2,0,0,0,0,20,3,0,52,3,0,0,1,16,7,2,0,0,0,0,17,2,2,48,2,1,0,2,17,,,,,,,106,48,93,42,5,-5,0,0,0,0,0,0,0,0,0,0,3,-3,1,-1,-2,2,4,-4,1,-1,-1,1,0,0,-1,1,-1,1,,,,,,,13,-13,6,-6,,,,,,,,,,,,,,,,,,
849,14,6,2015-08-08,0,9,1,0,0,0,0,12,1,1,50,2,0,0,4,17,9,4,0,0,0,0,12,2,2,50,3,0,0,1,17,,,,,,,100,45,96,48,0,0,-3,3,0,0,0,0,0,0,0,0,0,0,-1,1,-1,1,0,0,-1,1,0,0,0,0,3,-3,0,0,,,,,,,4,-4,-3,3,,,,,,,,,,,,,,,,,,
864,33,51,2015-08-08,0,17,6,0,0,0,0,14,1,4,63,1,0,0,4,15,11,7,0,0,0,0,20,4,2,37,0,0,0,5,18,,,,,,,114,58,100,54,6,-6,-1,1,0,0,0,0,0,0,0,0,-6,6,-3,3,2,-2,26,-26,1,-1,0,0,0,0,-1,1,-3,3,,,,,,,14,-14,4,-4,,,,,,,,,,,,,,,,,,


Both Home & Away data

In [50]:
# Features to average: every column from team_shots_total onwards.
# avg_nan_new_features adds its result columns to mask_stats_team in place, so after a re-run (or a run that
# was interrupted half-way) the open-ended slice would also contain previously generated *_last<n>_games
# columns. They are filtered out so that averages are never computed on top of earlier averages.
l_features2 = [
    col for col in mask_stats_team.loc[:, 'team_shots_total':].columns.tolist()
    if not re.search(r'_last\d+_games', col)
]
# Both home and away. Contrary to the form case, there is no distinction between with cups data and without (cups are excluded)
mask_stats_team = avg_nan_new_features(df=mask_stats_team, list_features=l_features2, list_n_games=list_number_games)
# Print some info
print(mask_stats_team.shape)
mask_stats_team.head(3)

  0%|          | 0/120 [00:00<?, ?it/s]

KeyboardInterrupt: 

Merge only home & away data with both home & away data

In [None]:
# Combine mask_stats (match-level, already split into home & away columns) with mask_stats_team
# (team-level, not yet partitioned into home and away).
# Move the fixture id out of the index so that it survives the merges
stats_matches = mask_stats.reset_index()
home_team_stats = mask_stats_team.add_prefix('home_')
away_team_stats = mask_stats_team.add_prefix('away_')
# Left-merge on (home_id, date): appends the team-level columns for the home side (home_ prefix)
mask_stats_with_only_home_teams = stats_matches.merge(
    home_team_stats,
    how='left',
    left_on=['home_id', 'time_starting_at_date'],
    right_on=['home_team_id', 'home_time_starting_at_date'],
)
# Left-merge on (away_id, date): appends the team-level columns for the away side (away_ prefix)
mask_stats_complete = mask_stats_with_only_home_teams.merge(
    away_team_stats,
    how='left',
    left_on=['away_id', 'time_starting_at_date'],
    right_on=['away_team_id', 'away_time_starting_at_date'],
)
# Keep only the matches played after 2016-06-06 and index the result by fixture id
played_after_cutoff = mask_stats_complete['time_starting_at_date'] > '2016-06-06'
mask_stats_complete = mask_stats_complete[played_after_cutoff].set_index('id')
mask_stats_complete.head()

In [None]:
# Persist the merged stats features to CSV (the `id` index is written as the first column)
mask_stats_complete.to_csv('../../Data/Modeling_Before_Preparation/Stats_data_v01.csv')