#### Imports

In [1]:
# Packages
import numpy as np
import pandas as pd
from datetime import datetime
import os
# Pandas' options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## **PREPARE TEAMS' ATTRIBUTES DATA**

#### Import Different Data Sources

In [2]:
def concatenate_csv_in_directory(path_IN, path_OUT, write_OUTFILE):
    if os.path.isfile(path_OUT):
        appended_data = pd.read_csv(path_OUT)
    else:    
        appended_data = [pd.read_csv(os.path.join(path_IN, file)) for file in os.listdir(path_IN) if file.endswith('.csv')]
        appended_data = pd.concat(appended_data)
    if write_OUTFILE:
        appended_data.to_csv(path_OUT)
    return appended_data

###### **concatenate_csv_in_directory()**: Import chunks of in the same directory and concatenates them. ***path_IN*** is the path of the directory where the data can be found. ***path_OUT*** is the path (including filename) of the output file. ***write_OUTFILE*** if True it saves the data into a CSV file, if False it only creates a dataframe in pandas

In [3]:
# Import CSV of weekly data (major leagues)
tm_w_MAJOR = pd.read_csv('../Data/Scraping FIFA/Dirty data/Teamdata_weekly.csv')
tm_w_MAJOR['is_major'] = 1
# Import CSV of weekly data (Minor leagues)
tm_MINOR = pd.read_csv('../Data/Scraping FIFA/Dirty data/TeamData_weekly_MINOR.csv')
tm_MINOR['is_major'] = 0
# Import CSV of weekly data (new observations)
tm_w_new = pd.read_csv('../Data/Scraping FIFA/Dirty data/Teamdata_weekly_NEW.csv')
tm_w_new['is_major'] = 1
tm_15_MAJ =  pd.read_csv('../Data/Scraping FIFA/Dirty data/Teamdata_weekly_NEW_15_MAJ.csv')
tm_15_MAJ['is_major'] = 1
tm_15_MIN =  pd.read_csv('../Data/Scraping FIFA/Dirty data/Teamdata_weekly_NEW_15_MIN.csv')
tm_15_MIN['is_major'] = 0
# Concatenate the previous dataframes
tm_weekly = pd.concat([tm_w_MAJOR, tm_MINOR, tm_w_new, tm_15_MAJ, tm_15_MIN])

#### Cleaning of Team Attributes Data

In [4]:
# Exclude data from FIFA10 and FIFA11
tm_weekly = tm_weekly[(tm_weekly['ObservationDate'] != 'Sept. 1, 2009') & (tm_weekly['ObservationDate'] != 'Sept. 1, 2010')] 
# Change ObservationDate to Date Format
tm_weekly['ObservationDate'] = tm_weekly['ObservationDate'].str.replace('Sept', 'Sep')
tm_weekly['ObservationDate'] = tm_weekly['ObservationDate'].str.replace('April', 'Apr.')
tm_weekly['ObservationDate'] = tm_weekly['ObservationDate'].str.replace('June', 'Jun.')
tm_weekly['ObservationDate'] = tm_weekly['ObservationDate'].str.replace('March', 'Mar.')
tm_weekly['ObservationDate'] = tm_weekly['ObservationDate'].str.replace('May', 'May.')
tm_weekly['ObservationDate'] = tm_weekly['ObservationDate'].str.replace('July', 'Jul.')
tm_weekly['ObservationDate'] = tm_weekly['ObservationDate'].map(lambda x: datetime.strptime(x, '%b. %d, %Y'))
# Keep only some columns (most columns dropped have too many NA) + Sort by Date and TeamName
tm_weekly = tm_weekly[['ObservationDate', 'TeamName', 'Attack', 'Midfield', 'Defence', 'TransferBudget', 'RivalTeam', 'TeamRoster', 'is_major']]
tm_weekly = tm_weekly.sort_values(by = ['ObservationDate', 'TeamName']).reset_index(drop = True)
tm_weekly16 = tm_weekly[tm_weekly['ObservationDate'] > '2015-05-01']
tm_weekly16 = tm_weekly16.sort_values(by="ObservationDate")
tm_weekly16.head()

Unnamed: 0,ObservationDate,TeamName,Attack,Midfield,Defence,TransferBudget,RivalTeam,TeamRoster,is_major
595,2015-07-10,1. FC Heidenheim,65,63,64,500000,VfR Aalen,"['Jan Zimmermann', 'Robert Strauß', 'Tim Göhle...",0
725,2015-07-10,Olympique Lyonnais,78,75,74,13000000,AS Saint-Etienne,"['Anthony Lopes', 'Mouhamadou Dabo', 'Lindsay ...",1
726,2015-07-10,Olympique de Marseille,77,75,75,15000000,Paris Saint-Germain,"['Steve Mandanda', 'Brice Dja Djédjé', 'Rod Fa...",1
727,2015-07-10,Palermo,73,72,72,9000000,Catania,"['Samir Ujkani', 'Roberto Vitiello', 'Giancarl...",1
728,2015-07-10,Paris Saint-Germain,82,81,80,120000000,Olympique de Marseille,"['Nicolas Douchez', 'Marquinhos', 'Thiago Silv...",1


In [5]:
teams_name_repl = {'1. ':'','AS Monaco Football Club SA':'Monaco','Monaco Football Club SA':'AS Monaco Football Club SA','AS Monaco':'Monaco','AS Nancy-Lorraine':'Nancy', 'AS Saint-Étienne':'Saint-Étienne','Amiens SC Football': 'Amiens SC',  'Arminia Bielefeld':'DSC Arminia Bielefeld', 'Athletic Club de Bilbao':'Athletic Club', 'Atlético de Madrid': 'Atlético Madrid', 'Bergamo Calcio':'Atalanta', "Borussia M'gladbach": 'Borussia Mönchengladbach', 'Bournemouth':'AFC Bournemouth','CA Osasuna':'Osasuna', 'CD Leganés':'Leganés', 'Chievo Verona':'Chievo', 'Clermont Foot':'Clermont', 'Cádiz CF':'Cádiz', 'D. Alavés':'Deportivo Alavés', 'Dijon FCO': 'Dijon', 'ES Troyes AC':'Troyes', 'ESTAC Troyes':'Troyes', 'Elche CF':'Elche', 'En Avant de Guingamp':'Guingamp', 'FC Bayern Munich':'FC Bayern München', 'Bayern München':'FC Bayern München', 'FC Girondins de Bordeaux':'Bordeaux', 'FC Ingolstadt 04':'Ingolstadt', 'FC Lorient':'Lorient', 'FC Metz':'Metz', 'FC Nantes':'Nantes', 'FC Nürnberg':'Nürnberg', 'FC Schalke 04':'Schalke 04', 'Football Club de Metz':'Metz', 'GFC Ajaccio':'Gazélec Ajaccio', 'Getafe CF':'Getafe', 'Girona CF':'Girona', 'Girona FC':'Girona', 'Girondins de Bordeaux':'Bordeaux', 'Granada CF':'Granada', 'Hertha BSC Berlin':'Hertha BSC', 'Hertha Berlin':'Hertha BSC', 'LOSC Lille':'Lille', 'La Spezia':'Spezia', 'Latium':'Lazio', 'Levante UD':'Levante', 'Montpellier HSC':'Montpellier', 'Montpellier Hérault SC':'Montpellier', 'Málaga CF':'Málaga', 'Nîmes Olympique':'Nîmes', 'OGC Nice':'Nice', 'Olympique de Marseille':'Olympique Marseille', 'Paris Saint-Germain':'Paris Saint Germain', 'R. Valladolid CF':'Real Valladolid', 
'Celta de Vigo de Vigo':'Celta de Vigo', 'RC Celta de Vigo':'Celta de Vigo', 'RC Celta':'Celta de Vigo', 'RC Deportivo de La Coruña':'Deportivo La Coruña', 'RC Lens':'Lens', 'RC Strasbourg Alsace':'Strasbourg','Strasbourg Alsace':'Strasbourg','RC Strasbourg':'Strasbourg', 'RCD Espanyol':'Espanyol', 'RCD Mallorca':'Mallorca', 'Real Betis Balompié':'Real Betis', 'Real Madrid CF':'Real Madrid', 'Real Sporting de Gijón':'Sporting Gijón', 'SC Bastia':'Bastia', 'SC Paderborn 07':'Paderborn', 'SD Huesca':'Huesca', 'SM Caen':'Caen', 'SV Darmstadt 98':'Darmstadt 98', 'SV Werder Bremen':'Werder Bremen', 'Sevilla FC':'Sevilla', 'Spal':'SPAL', 'Sport-Club Freiburg':'SC Freiburg', 'Stade Brestois 29':'Brest', 'Stade Malherbe Caen':'Caen', 'Stade Rennais FC':'Rennes','Stade Rennais':'Rennes','Rennes FC':'Rennes', 'Stade de Reims':'Reims', 'TSG 1899 Hoffenheim':'TSG Hoffenheim', 'Toulouse FC':'Toulouse', 'Toulouse Football Club':'Toulouse', 'UD Las Palmas':'Las Palmas', 'Valencia CF':'Valencia', 'VfL Bochum':'VfL Bochum 1848', 'Villarreal CF':'Villarreal', 'AFC AFC Bournemouth':'AFC Bournemouth', 'DSC DSC Arminia Bielefeld':'DSC Arminia Bielefeld', 'FC FC Bayern München':'FC Bayern München', '1899 Hoffenheim':'TSG Hoffenheim', 'ACF Fiorentina':'Fiorentina', 'AS Saint-Etienne':'Saint-Étienne', 'Club Atlético Madrid':'Atlético Madrid', 'Elche Club de Fútbol':'Elche', 'Espanyol de Barcelona':'Espanyol', 'Getafe Club de Fútbol':'Getafe', 'Granada Club de Fútbol':'Granada', 'Levante Unión Deportiva':'Levante', 'Lorient Bretagne Sud':'Lorient', 'Monaco FC':'Monaco', 'Montpellier Hérault Sport Club':'Montpellier', 'Málaga Club de Fútbol':'Málaga', 'Queens Park Rangers':'Crystal Palace', 'Racing Club de Lens':'Lens', 'Rayo Vallecano de Madrid':'Rayo Vallecano', 'Real Club Celta de Vigo':'Celta de Vigo', 'Real Club Deportivo de La Coruña':'Deportivo La Coruña', 'Real Madrid Club de Fútbol':'Real Madrid', 'Real Sociedad de Fútbol':'Real Sociedad', 'Sevilla Fútbol Club':'Sevilla', 'Sporting Club Bastia':'Bastia', 'VfL Bochum 1848 1848':'VfL Bochum 1848', 'Valencia Club de Fútbol':'Valencia', 'Villarreal Club de Fútbol':'Villarreal', 'AC Ajaccio':'Gazélec Ajaccio', 'AS Nancy':'Nancy', 'AS Nancy Lorraine':'Nancy', 'Athlétic Club Ajaccio':'Gazélec Ajaccio',  'Bilbao Athletic':'Athletic Club', 'Clermont 63':'Clermont', 'Clermont Auvergne 63':'Clermont', 'Club Atlético Osasuna':'Osasuna', 'Club Deportivo Leganés':'Leganés'}

for old, new in teams_name_repl.items():
    tm_weekly16['TeamName'] = tm_weekly16['TeamName'].str.replace(old, new, regex=False)

### Prepare MATCH DATA 

In [6]:
# Import CSV
match_data = pd.read_csv('../Data/Match&Odds Data/matchdata_2015_out.csv', low_memory=False)
# Set Fixture id as index
match_data.set_index('id', inplace=True)
#### Change data types ####
float_columns = ['weather_report_pressure', 'weather_report_temperature_celcius_temp', 'weather_report_wind_degree', 'weather_windspeed(m/s)', 'home_passes_percentage', 'away_passes_percentage'] # columns that will be floats
# Convert all numeric columns to int except those that are expressed in floats
match_data[['weather_windspeed(m/s)', 'weather_clouds(%)', 'weather_humidity(%)']] = match_data[['weather_windspeed(m/s)', 'weather_clouds(%)', 'weather_humidity(%)']].astype('float64')       
m = match_data.select_dtypes(np.number).loc[:, ~match_data.select_dtypes(np.number).columns.isin(float_columns)]
match_data[m.columns]= m.round().astype('Int64')
# Convert dates to datetime format
match_data['time_starting_at_date_time'] = pd.to_datetime(match_data['time_starting_at_date_time'], infer_datetime_format=True)
match_data['round_start'] = pd.to_datetime(match_data['round_start'], format = '%Y-%m-%d')
match_data['round_end'] = pd.to_datetime(match_data['round_end'], format = '%Y-%m-%d')
match_data = match_data[match_data['league_is_cup'] == 0]
match_data = match_data.sort_values(by='time_starting_at_date_time')

In [7]:
match_data.head()

Unnamed: 0_level_0,league_id,season_id,stage_id,round_id,venue_id,referee_id,home_id,away_id,winner_team_id,attendance,formations_home_formation,formations_away_formation,scores_home_score,scores_away_score,scores_ht_score,scores_ft_score,time_starting_at_date_time,time_minute,time_injury_time,coaches_home_coach_id,coaches_away_coach_id,standings_home_position,standings_away_position,assistants_first_assistant_id,assistants_second_assistant_id,assistants_fourth_official_id,home_name,home_short_code,home_twitter,home_country_id,home_founded,home_venue_id,away_name,away_short_code,away_twitter,away_country_id,away_founded,away_venue_id,league_country_id,league_name,league_is_cup,season_name,season_league_id,current_season,round_name,round_league_id,round_season_id,round_stage_id,round_start,round_end,venue_name,venue_grass_surface,venue_city,venue_capacity,venue_coordinates,referee_fullname,homecoach_coach_id,homecoach_team_id,homecoach_country_id,homecoach_fullname,homecoach_nationality,homecoach_birthdate,homecoach_birthcountry,homecoach_birthplace,Awaycoach_coach_id,Awaycoach_team_id,Awaycoach_country_id,Awaycoach_fullname,Awaycoach_nationality,Awaycoach_birthdate,Awaycoach_birthcountry,Awaycoach_birthplace,weather_report_code,weather_report_type,weather_windspeed(m/s),weather_report_wind_degree,weather_clouds(%),weather_humidity(%),colors_home_color,colors_away_color,weather_report_pressure,weather_report_temperature_celcius_temp,home_team_id,home_fixture_id,home_shots_total,home_shots_ongoal,home_shots_offgoal,home_shots_blocked,home_shots_insidebox,home_shots_outsidebox,home_passes,home_attacks,home_fouls,home_corners,home_offsides,home_possessiontime,home_yellowcards,home_redcards,home_yellowredcards,home_saves,home_substitutions,home_goal_kick,home_goal_attempts,home_free_kick,home_throw_in,home_ball_safe,home_goals,home_penalties,home_injuries,home_tackles,away_team_id,away_fixture_id,away_shots_total,away_shots_ongoal,away_shots_offgoal,away_shots_blocked,away_shots_insidebox,away_shots_outsidebox,away_passes,away_attacks,away_fouls,away_corners,away_offsides,away_possessiontime,away_yellowcards,away_redcards,away_yellowredcards,away_saves,away_substitutions,away_goal_kick,away_goal_attempts,away_free_kick,away_throw_in,away_ball_safe,away_goals,away_penalties,away_injuries,away_tackles,home_passes_total,home_passes_accurate,home_passes_percentage,away_passes_total,away_passes_accurate,away_passes_percentage,home_attacks_attacks,home_attacks_dangerous_attacks,away_attacks_attacks,away_attacks_dangerous_attacks,home_shots,away_shots,weather_lat_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1
251711,301,1390,2497,37918,184,48975,690,591,591.0,,,,0,1,0-0,0-1,2015-08-07 20:30:00,90,,474402,37438919,,,12852,18518,15934,Lille,LIL,@losclive,17,1944,184,Paris Saint Germain,PSG,@PSG_inside,17,1970,131,17,Ligue 1,0,2015/2016,301,0,1,301,1390,2497,2015-08-07,2015-08-09,Stade Pierre-Mauroy,1,Villeneuve d'Ascq,50186,"(50.631111,3.137500)",Fredy Fautrel,474402,18562.0,17,Hervé Renard,France,30/09/1968,France,Aix-les-Bains,37438919,13260.0,17,Laurent Blanc,France,19/11/1965,France,Ales,,,,,,,,,,,690,251711,12,2,0,0,0,0,,,20,3,,52,3,,,1,,,,,,,,,,,591,251711,7,2,0,0,0,0,,,17,2,2,48,2.0,1.0,,2,,,,,,,,,,,,,,,,,,,,,,,"(nan, nan)"
849,8,10,8,85,206,13533,14,6,14.0,75261.0,,,1,0,1-0,1-0,2015-08-08 13:45:00,90,,37523072,455358,,,13561,12245,14533,Manchester United,MUN,@ManUtd,462,1878,206,Tottenham Hotspur,TOT,@SpursOfficial,462,1882,281313,462,Premier League,0,2015/2016,8,0,1,8,10,8,2015-08-08,2015-08-10,Old Trafford,1,Manchester,75635,"(53.463150,-2.291444)",Jonathan Moss,37523072,18694.0,38,Louis van Gaal,Netherlands,08/08/1951,Netherlands,Amsterdam,455358,591.0,44,Mauricio Roberto Pochettino Trossero,Argentina,02/03/1972,Argentina,Murphy,,,,,,,,,,,14,849,9,1,0,0,0,0,,,12,1,1.0,50,2,,,4,,,,,,,,,,,6,849,9,4,0,0,0,0,,,12,2,2,50,3.0,,,1,,,,,,,,,,,,,,,,,,,,,,,"(nan, nan)"
864,8,10,8,85,489,14663,33,51,51.0,27036.0,,,1,3,0-1,1-3,2015-08-08 16:00:00,90,,173023,896470,,,12088,12147,15271,Norwich City,NOR,@NorwichCityFC,462,1902,489,Crystal Palace,CRY,@CPFC,462,1905,201,462,Premier League,0,2015/2016,8,0,1,8,10,8,2015-08-08,2015-08-10,Carrow Road,1,Norwich,27244,"(52.622116,1.30914)",Simon Hooper,173023,3.0,1161,Alex Neil,Scotland,09/06/1981,Scotland,Belshill,896470,,462,Alan Pardew,England,18/07/1961,England,London,,,,,,,,,,,33,864,17,6,0,0,0,0,,,14,1,4.0,63,1,,,4,,,,,,,,,,,51,864,11,7,0,0,0,0,,,20,4,2,37,,,,5,,,,,,,,,,,,,,,,,,,,,,,"(nan, nan)"
871,8,10,8,85,117,15270,42,3,42.0,32242.0,,,4,2,3-0,4-2,2015-08-08 16:00:00,90,,893654,523970,,,12146,12090,15241,Leicester City,LEI,@LCFC,462,1884,117,Sunderland,SUN,,462,1879,212,462,Premier League,0,2015/2016,8,0,1,8,10,8,2015-08-08,2015-08-10,King Power Stadium,1,Leicester,32312,"(52.620278,-1.142222)",Lee Mason,893654,,251,Claudio Ranieri,Italy,20/10/1951,Italy,Roma,523970,18600.0,38,Dick Advocaat,Netherlands,27/09/1947,Netherlands,Den Haag,,,,,,,,,,,42,871,19,8,0,0,0,0,,,13,6,,44,2,,,3,,,,,,,,,,,3,871,11,5,0,0,0,0,,,17,3,1,56,4.0,,,4,,,,,,,,,,,,,,,,,,,,,,,"(nan, nan)"
879,8,10,8,85,12,15272,13,25,,39063.0,,,2,2,0-1,2-2,2015-08-08 16:00:00,90,,474796,455821,,,12018,12091,14532,Everton,EVE,@Everton,462,1878,12,Watford,WAT,@WatfordFC,462,1881,19,462,Premier League,0,2015/2016,8,0,1,8,10,8,2015-08-08,2015-08-10,Goodison Park,1,Liverpool,40569,"(53.438801,-2.966328)",Mike Jones,474796,18743.0,32,Roberto Martínez Montoliú,Spain,13/07/1973,Spain,Balaguer,455821,106.0,32,Enrique Sánchez Flores,Spain,05/02/1965,Spain,Madrid,,,,,,,,,,,13,879,10,5,0,0,0,0,,,7,8,4.0,67,1,,,3,,,,,,,,,,,25,879,11,5,0,0,0,0,,,13,2,1,33,2.0,,,3,,,,,,,,,,,,,,,,,,,,,,,"(nan, nan)"


# **Create New Team Attributes Dataframe**

In [8]:
from tqdm.notebook import tqdm

In [9]:
def test_merge(df, homeORaway):
    if homeORaway == 'home':
        print('****** TEST ', homeORaway.upper(), ' ******')
        print('Name Errors = ', (df['home_name'] != df['Home_TeamName']).sum())
        print('Date Errors = ', (df['Home_ObservationDate'] > df['time_starting_at_date_time']).sum())
    elif homeORaway == 'away': 
        print('****** TEST ', homeORaway.upper(), ' ******')
        print('Name Errors = ', (df['away_name'] != df['Away_TeamName']).sum())
        print('Date Errors = ', (df['Away_ObservationDate'] > df['time_starting_at_date_time']).sum())
    else:
        print('BAD TEST!!')

In [10]:
match_data = match_data.reset_index()
for h_a in ['home_name', 'away_name']:
    all_ordered = []
    for index, i in enumerate(tqdm(match_data.time_starting_at_date_time)):
        time_d = [(i.date() - j.date()).days if (i.date() - j.date()).days >= 1 and match_data[h_a].iloc[index] == tm_weekly16.TeamName.iloc[ind] else 999 for ind, j in enumerate(tm_weekly16.ObservationDate)]
        index_min = min(range(len(time_d)), key=time_d.__getitem__)

        if index_min == 0 and match_data[h_a].iloc[index_min] != tm_weekly16.TeamName.iloc[0]:
            time_d_exrelegated = [(i.date() - j.date()).days if (i.date() - j.date()).days < 0 and match_data[h_a].iloc[index] == tm_weekly16.TeamName.iloc[ind] else -999 for ind, j in enumerate(tm_weekly16.ObservationDate)]
            index_max = max(range(len(time_d_exrelegated)), key=time_d_exrelegated.__getitem__)
            all_ordered.append(dict(tm_weekly16.iloc[index_max,:]))
        else:
            all_ordered.append(dict(tm_weekly16.iloc[index_min,:]))
            
    if h_a == 'home_name':
        new_df_home = pd.DataFrame(all_ordered).add_prefix('Home_')
        merge1 = pd.merge(match_data, new_df_home, how='outer', left_index=True, right_index=True)
        test_merge(merge1, 'home')
    elif h_a == 'away_name':
        new_df_away = pd.DataFrame(all_ordered).add_prefix('Away_')
        final_merged = pd.merge(merge1, new_df_away, how='outer', left_index=True, right_index=True)
        test_merge(final_merged, 'home')
        test_merge(final_merged, 'away')

  0%|          | 0/12362 [00:00<?, ?it/s]

****** TEST  HOME  ******
Name Errors =  0
Date Errors =  3


  0%|          | 0/12362 [00:00<?, ?it/s]

****** TEST  HOME  ******
Name Errors =  0
Date Errors =  3
****** TEST  AWAY  ******
Name Errors =  0
Date Errors =  4


**NOTES:** To avoid problems with teams coming from a minor league we consider the case of the index_min taking the value 0 and the team name from match data not matching the team name on tm_weekly16, in this case we consider the attributes of the right team but coming from the closest observation in the future (instead that from the past). This problem could be avoided through scraping also data from the minor leagues!

In [11]:
final_merged.tail()

Unnamed: 0,id,league_id,season_id,stage_id,round_id,venue_id,referee_id,home_id,away_id,winner_team_id,attendance,formations_home_formation,formations_away_formation,scores_home_score,scores_away_score,scores_ht_score,scores_ft_score,time_starting_at_date_time,time_minute,time_injury_time,coaches_home_coach_id,coaches_away_coach_id,standings_home_position,standings_away_position,assistants_first_assistant_id,assistants_second_assistant_id,assistants_fourth_official_id,home_name,home_short_code,home_twitter,home_country_id,home_founded,home_venue_id,away_name,away_short_code,away_twitter,away_country_id,away_founded,away_venue_id,league_country_id,league_name,league_is_cup,season_name,season_league_id,current_season,round_name,round_league_id,round_season_id,round_stage_id,round_start,round_end,venue_name,venue_grass_surface,venue_city,venue_capacity,venue_coordinates,referee_fullname,homecoach_coach_id,homecoach_team_id,homecoach_country_id,homecoach_fullname,homecoach_nationality,homecoach_birthdate,homecoach_birthcountry,homecoach_birthplace,Awaycoach_coach_id,Awaycoach_team_id,Awaycoach_country_id,Awaycoach_fullname,Awaycoach_nationality,Awaycoach_birthdate,Awaycoach_birthcountry,Awaycoach_birthplace,weather_report_code,weather_report_type,weather_windspeed(m/s),weather_report_wind_degree,weather_clouds(%),weather_humidity(%),colors_home_color,colors_away_color,weather_report_pressure,weather_report_temperature_celcius_temp,home_team_id,home_fixture_id,home_shots_total,home_shots_ongoal,home_shots_offgoal,home_shots_blocked,home_shots_insidebox,home_shots_outsidebox,home_passes,home_attacks,home_fouls,home_corners,home_offsides,home_possessiontime,home_yellowcards,home_redcards,home_yellowredcards,home_saves,home_substitutions,home_goal_kick,home_goal_attempts,home_free_kick,home_throw_in,home_ball_safe,home_goals,home_penalties,home_injuries,home_tackles,away_team_id,away_fixture_id,away_shots_total,away_shots_ongoal,away_shots_offgoal,away_shots_blocked,away_shots_insidebox,away_shots_outsidebox,away_passes,away_attacks,away_fouls,away_corners,away_offsides,away_possessiontime,away_yellowcards,away_redcards,away_yellowredcards,away_saves,away_substitutions,away_goal_kick,away_goal_attempts,away_free_kick,away_throw_in,away_ball_safe,away_goals,away_penalties,away_injuries,away_tackles,home_passes_total,home_passes_accurate,home_passes_percentage,away_passes_total,away_passes_accurate,away_passes_percentage,home_attacks_attacks,home_attacks_dangerous_attacks,away_attacks_attacks,away_attacks_dangerous_attacks,home_shots,away_shots,weather_lat_lon,Home_ObservationDate,Home_TeamName,Home_Attack,Home_Midfield,Home_Defence,Home_TransferBudget,Home_RivalTeam,Home_TeamRoster,Home_is_major,Away_ObservationDate,Away_TeamName,Away_Attack,Away_Midfield,Away_Defence,Away_TransferBudget,Away_RivalTeam,Away_TeamRoster,Away_is_major
12357,18165757,564,18462,77454016,250046,304396,18637,377,214,,,4-2-3-1,4-1-4-1,1,1,0-0,1-1,2022-04-11 21:00:00,90,,19960388,524009,13,9,14349,12225,50793,Rayo Vallecano,RAY,,32,1924,304396,Valencia,VAL,@valenciacf,32,1919,9240,32,La Liga,0,2021/2022,564,1,31,564,18462,77454016,2022-04-08,2022-04-11,Estadio de Vallecas,1,Madrid,14708,"(40.391808,-3.658611)",Ricardo De Burgos Bengoetxea,19960388,377,32,Andoni Iraola Sagarna,Spain,22/06/1982,Spain,Usurbil,524009,214,32,José Bordalás Jiménez,Spain,05/03/1964,Spain,Alicante,clouds,scattered clouds,17.27,240.0,40,70,#F0F0F0,#940014,1007.0,14.5,377,18165757,12,5,7,0,5,6,,,13,5,4,56,3,0,0,1,4,,6,,,103,1,0,0.0,16,214,18165757,7,2,5,1,4,2,,,17,4,0,44,4,0,0,3,5,,4,,,108,1,0,3.0,20,432,314,72.69,344,224,65.12,115,52,107,38,,,"(40.4165, -3.7026)",2022-03-07,Rayo Vallecano,78,74,73,3300000,CD Leganés,"['Stole Dimitrievski', 'Iván Balliu', 'Esteban...",1,2022-03-07,Valencia,78,77,78,60000000,Levante UD,"['Giorgi Mamardashvili', 'Thierry Correia', 'G...",1
12358,18220188,384,18576,77454372,252815,7305,15969,345,2930,2930.0,,4-2-3-1,3-5-2,1,3,0-1,1-3,2022-04-15 19:00:00,90,,95726,128160,15,2,13402,12205,13669,Spezia,SPE,@acspezia,251,1906,7305,Inter,INT,@Inter,251,1908,1721,251,Serie A,0,2021/2022,384,1,33,384,18576,77454372,2022-04-15,2022-04-18,Stadio Alberto Picco,0,La Spezia,10336,"(44.101711,9.808218)",Fabio Maresca,95726,345,251,Thiago Motta,Italy,28/08/1982,Brazil,São Bernardo do Campo,128160,2930,251,Simone Inzaghi,Italy,05/04/1976,Italy,Piacenza,clear,clear sky,4.27,350.0,3,51,#F0F0F0,#002B87,1013.0,17.1,345,18220188,11,3,8,4,6,4,,,9,6,1,33,2,0,0,4,5,,7,,,114,1,0,2.0,11,2930,18220188,23,7,16,5,18,5,,,11,7,1,67,0,0,0,1,5,,17,,,139,3,0,0.0,13,275,190,69.09,600,511,85.17,99,33,114,69,,,"(44.1105, 9.8434)",2022-04-11,Spezia,71,69,70,2700000,Genoa,"['Ivan Provedel', 'Kelvin Amian', 'Martin Erli...",1,2022-04-11,Inter,82,82,83,99500000,Milan,"['Samir Handanovič', 'Milan J. Plada', 'Stefan...",1
12359,18220185,384,18576,77454372,252815,1721,13970,113,102,113.0,,4-3-3,4-3-2-1,2,0,1-0,2-0,2022-04-15 21:00:00,90,,459100,35571,2,18,12816,12059,14082,Milan,MIL,@acmilan,251,1899,1721,Genoa,GEN,@GenoaCFC,251,1893,86,251,Serie A,0,2021/2022,384,1,33,384,18576,77454372,2022-04-15,2022-04-18,Stadio Giuseppe Meazza,1,Milano,80018,"(45.478025,9.124206)",Daniele Chiffi,459100,113,251,Stefano Pioli,Italy,20/10/1965,Italy,Parma,35571,102,11,Alexander Blessin,Germany,28/05/1973,Germany,,clear,clear sky,6.91,120.0,0,49,#C40010,#CCCCCC,1018.0,18.5,113,18220185,9,4,5,1,7,3,,,10,3,3,59,1,0,0,2,5,,7,,,108,2,0,3.0,17,102,18220185,8,2,6,1,3,4,,,19,1,1,41,2,0,0,3,5,,5,,,102,0,0,2.0,30,498,412,82.73,342,250,73.1,103,46,101,30,,,"(45.4643, 9.1895)",2022-04-11,Milan,81,80,81,37000000,Inter,"['Mike Maignan', 'Davide Calabria', 'Fikayo To...",1,2022-04-11,Genoa,72,72,72,8500000,Sampdoria,"['Salvatore Sirigu', 'Silvan Hefti', 'Nikola M...",1
12360,18157344,301,18441,77453967,249532,135,15484,598,6789,6789.0,,4-2-3-1,4-2-3-1,2,3,1-1,2-3,2022-04-15 21:00:00,90,,523953,2158189,3,6,12253,15840,17292,Rennes,REN,@staderennais,17,1901,135,Monaco,AMO,@AS_Monaco,75285,1919,4451,17,Ligue 1,0,2021/2022,301,1,32,301,18441,77453967,2022-04-15,2022-04-17,Roazhon Park,1,Rennes,29778,"(48.107458,-1.712839)",Stéphanie Frappart,523953,598,17,Bruno Génésio,France,01/09/1966,France,Lyon,2158189,6789,556,Philippe Clement,Belgium,22/03/1974,Belgium,Antwerpen,clear,clear sky,8.05,310.0,0,82,#C40010,#F0F0F0,1026.0,13.6,598,18157344,14,4,10,3,7,6,,,6,6,0,64,1,0,0,1,4,,9,,,103,2,1,1.0,9,6789,18157344,12,4,8,1,9,3,,,15,0,1,36,3,0,0,2,3,,11,,,99,3,0,0.0,13,668,569,85.18,383,293,76.5,125,58,74,33,,,"(48.1667, -1.6667)",2022-04-11,Rennes,78,76,76,14000000,FC Nantes,"['Alfred Gomis', 'Hamari Traoré', 'Warmed Omar...",1,2022-04-11,Monaco,83,78,76,26000000,OGC Nice,"['Alexander NübelL', 'Axel Disasi', 'Guillermo...",1
12361,18165743,564,18462,77454016,250047,133,13963,594,485,,,4-3-1-2,4-2-3-1,0,0,0-0,0-0,2022-04-15 21:00:00,90,,530755,523898,6,5,12257,13964,18344,Real Sociedad,SOC,@RealSociedad,32,1909,133,Real Betis,BET,@RealBetis,32,1907,68,32,La Liga,0,2021/2022,564,1,32,564,18462,77454016,2022-04-15,2022-04-18,Reale Arena,1,San Sebastián,32076,"(43.301376,-1.973602)",Isidro Díaz de Mera Escuderos,530755,594,32,Imanol Alguacil Barrenetxea,Spain,04/07/1971,Spain,Orio,523898,485,80,Manuel Luis Pellegrini Ripamonti,Chile,16/09/1953,Chile,Santiago de Chile,clouds,overcast clouds,3.0,207.0,100,85,#2B72DE,#EA9C08,1026.0,13.0,594,18165743,10,3,7,1,7,2,,,17,8,7,57,3,1,1,2,6,,6,,,77,0,0,,16,485,18165743,9,2,7,1,3,6,,,7,1,3,43,4,0,0,2,5,,7,,,94,0,0,,26,477,408,85.53,377,305,80.9,108,78,95,37,,,"(43.3128, -1.975)",2022-04-11,Real Sociedad,82,81,78,15500000,Athletic Club de Bilbao,"['Congo', 'Gorosabel', 'Aritz Elustondo', 'Rob...",1,2022-04-11,Real Betis,79,80,78,16000000,Sevilla FC,"['Rui Silva', 'Héctor Bellerín MorunoL', 'Germ...",1


In [12]:
print((final_merged['Home_ObservationDate'] != final_merged['Away_ObservationDate']).sum())
# if any print is different from 0, there is a problem with team coming from a minor league!

11


In [None]:
final_merged.to_csv('finalmerge_MATCHES.csv')