#### Creating the dataset with matches, results and odds

In [16]:
# Data package holding the La Liga season files.
package = Package('https://datahub.io/sports-data/spanish-la-liga/datapackage.json')

# Names assigned, in order, to the first 25 columns of every season file.
columns_package = [
    'Div', 'Date', 'HomeTeam', 'AwayTeam',
    'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR',
    'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR',
    'B365H', 'B365D', 'B365A',
]

# Human-readable description of the variables kept in the dataset.
dictionary_variables = {
    'Div': 'League Division',
    'Date': 'Match Date (dd/mm/yy)',
    'Time': 'Match Time',
    'HomeTeam': 'Home Team',
    'AwayTeam': 'Away Team',
    'FTHG': 'Full Time Home Team Goals',
    'FTAG': 'Full Time Away Team Goals',
    'FTR': 'Full Time Result (H Home Win, D Draw, A Away Win)',
    'HS': 'Home Team Shots',
    'AS': 'Away Team Shots',
    'HST': 'Home Team Shots on Target',
    'AST': 'Away Team Shots on Target',
    'B365H': 'Bet365 home win odds',
    'B365D': 'Bet365 draw odds',
    'B365A': 'Bet365 away win odds',
    'season': 'number of season',
}

# Create historical data : season 2009-2010 to 2018-2019

# Columns that are named above but not kept in the dataset.
unused_columns = ['HTHG', 'HTAG', 'HTR', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR']

# Collect one frame per season and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and appending inside a loop
# re-copies the accumulated frame on every iteration.
season_frames = []
for resource in package.resources:
    if resource.descriptor['datahub']['type'] != 'derived/csv':
        continue
    season_frame = pd.DataFrame(data=resource.read())
    # Keep only the leading columns that columns_package gives names to.
    season_frame = season_frame.iloc[:, :len(columns_package)]
    season_frame.columns = columns_package
    season_frame = season_frame.drop(columns=unused_columns)
    # Characters 7..10 of the resource name are used as the season code
    # (presumably names look like 'season-1819' -- TODO confirm).
    season_frame['season'] = resource.name[7:11]
    season_frames.append(season_frame)

# Fall back to an empty frame when the package has no 'derived/csv' resource.
historical_data = pd.concat(season_frames) if season_frames else pd.DataFrame()

In [17]:
# Append season 2019-2020

# Only the columns that the historical dataset keeps are read from the CSV.
season_1920 = pd.read_csv(
    'https://www.football-data.co.uk/mmz4281/1920/SP1.csv',
    header=0,
    usecols=['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HS', 'AS', 'HST', 'AST', 'B365H', 'B365D', 'B365A'],
)
season_1920['season'] = '1920'

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Row labels of both frames are kept as they are, exactly as append did.
historical_data = pd.concat([historical_data, season_1920])

In [18]:
# Current season (2020-2021)

# Same host and column selection as the 2019-2020 download; fetched over
# https so the file cannot be tampered with in transit.
season_2021 = pd.read_csv(
    'https://www.football-data.co.uk/mmz4281/2021/SP1.csv',
    header=0,
    usecols=['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HS', 'AS', 'HST', 'AST', 'B365H', 'B365D', 'B365A'],
)
season_2021['season'] = '2021'

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
historical_data = pd.concat([historical_data, season_2021])

# Changing Date type

# dayfirst=True: ambiguous dates are parsed as day/month, not month/day.
historical_data['Date'] = pd.to_datetime(historical_data['Date'], dayfirst=True)

In [19]:
# Preview the last rows of the combined dataset (tail() returns 5 rows by default).
historical_data.tail()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,B365H,B365D,B365A,season
365,SP1,2021-05-16,Cadiz,Elche,1,3,A,7,20,3,5,3.2,3.1,2.35,2021
366,SP1,2021-05-16,Getafe,Levante,2,1,H,9,5,3,1,1.61,3.8,5.75,2021
367,SP1,2021-05-16,Sociedad,Valladolid,4,1,H,13,19,8,5,1.61,3.75,6.0,2021
368,SP1,2021-05-16,Valencia,Eibar,4,1,H,9,20,5,8,2.87,3.4,2.37,2021
369,SP1,2021-05-16,Villarreal,Sevilla,4,0,H,12,17,5,5,2.37,3.4,2.87,2021


Now a classification by season/team, date by date, is needed to calculate the trends in terms of points, winning/losing streaks, goals, shots, etc.

In [20]:
def _team_match_dates(matches, team_column, venue):
    """Return one row per (season, team, date) found in ``matches``.

    Parameters
    ----------
    matches : pandas.DataFrame
        Match table with 'season', 'Date', 'Div' and ``team_column`` columns.
    team_column : str
        Column holding the team name ('HomeTeam' or 'AwayTeam').
    venue : str
        Value stored in the 'Home/Away' column of the result ('H' or 'A').

    Returns
    -------
    pandas.DataFrame
        Columns: 'season', 'Team', 'Date', 'Home/Away'.
    """
    return (
        # The count itself is not needed; grouping is only used to obtain
        # the distinct season/team/date combinations.
        matches.groupby(['season', team_column, 'Date'])['Div'].count()
        .reset_index()
        .drop(columns=['Div'])
        .rename(columns={team_column: 'Team'})
        .assign(**{'Home/Away': venue})
    )


# Table by season|HomeTeam|Date
aux_table_1 = _team_match_dates(historical_data, 'HomeTeam', 'H')

# Table by season|AwayTeam|Date
aux_table_2 = _team_match_dates(historical_data, 'AwayTeam', 'A')

# Table by season|Team|Date
# pd.concat replaces DataFrame.append (removed in pandas 2.0);
# ignore_index=True renumbers the rows 0..n-1 across both tables.
aux_table = pd.concat([aux_table_1, aux_table_2], ignore_index=True)

# Assigning number of match
# Rows must be in chronological order per season/team before numbering.
aux_table = aux_table.sort_values(by=['season', 'Team', 'Date'], ascending=True)
aux_table['match'] = aux_table.groupby(['season', 'Team']).cumcount() + 1

In [21]:
# Inspect the per-team match table built above.
aux_table

Unnamed: 0,season,Team,Date,Home/Away,match
0,0910,Almeria,2009-08-30,H,1
4550,0910,Almeria,2009-09-13,A,2
1,0910,Almeria,2009-09-20,H,3
4551,0910,Almeria,2009-09-23,A,4
2,0910,Almeria,2009-09-27,H,5
...,...,...,...,...,...
4546,2021,Villarreal,2021-04-25,H,33
4547,2021,Villarreal,2021-05-02,H,34
4548,2021,Villarreal,2021-05-09,H,35
9099,2021,Villarreal,2021-05-13,A,36


Now I will add the information from the historical dataset

In [22]:
# Adding the variables from historical dataset

# Each team/date row is matched with its fixture twice: once looking for the
# team as home side, once as away side. Only one of the two merges can match.
historical_acc_1 = aux_table.merge(historical_data, how='inner', left_on=['Date', 'Team'], right_on=['Date', 'HomeTeam'])
historical_acc_2 = aux_table.merge(historical_data, how='inner', left_on=['Date', 'Team'], right_on=['Date', 'AwayTeam'])

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Row labels are kept as they are (they repeat across the two halves).
historical_acc = pd.concat([historical_acc_1, historical_acc_2])

# Rearranging

# 'season' exists in both merged tables, so pandas suffixes it with _x / _y.
historical_acc.sort_values(by=['season_x', 'Team', 'Date'], inplace=True)

# Calculation of the points

# A team wins when the full-time result letter equals its own venue letter
# ('H' at home, 'A' away); 3 points for a win, 1 for a draw, 0 otherwise.
won = historical_acc['Home/Away'] == historical_acc['FTR']
drew = historical_acc['FTR'] == 'D'
historical_acc['points'] = won * 3 + drew

# Removing useless columns

historical_acc.drop(labels=['HomeTeam', 'AwayTeam', 'season_y'], axis=1, inplace=True)
historical_acc.rename(columns={'season_x': 'season'}, inplace=True)

In [23]:
# Inspect the per-team table after adding match data and points.
historical_acc

Unnamed: 0,season,Team,Date,Home/Away,match,Div,FTHG,FTAG,FTR,HS,AS,HST,AST,B365H,B365D,B365A,points
0,0910,Almeria,2009-08-30,H,1,SP1,0,0,D,20,7,5,1,2.1,3.3,3.5,1
0,0910,Almeria,2009-09-13,A,2,SP1,1,0,H,16,7,4,0,2.38,3.25,3,0
1,0910,Almeria,2009-09-20,H,3,SP1,1,0,H,11,23,3,11,2.5,3.25,2.8,3
1,0910,Almeria,2009-09-23,A,4,SP1,2,2,D,24,12,9,7,1.44,4.33,7,1
2,0910,Almeria,2009-09-27,H,5,SP1,2,2,D,13,13,4,6,2.25,3.25,3.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4546,2021,Villarreal,2021-04-25,H,33,SP1,1,2,A,10,15,4,5,4.5,4.2,1.66,0
4547,2021,Villarreal,2021-05-02,H,34,SP1,1,0,H,4,15,3,3,2.1,2.9,4.1,3
4548,2021,Villarreal,2021-05-09,H,35,SP1,2,4,A,16,11,7,6,1.95,3.6,3.8,0
4549,2021,Villarreal,2021-05-13,A,36,SP1,0,2,A,11,9,2,3,3.8,3.5,1.95,3


In the same way I calculated the points, I will calculate the goals, shots and shots on target

In [24]:
# Calculation of the goals/shots/shots on target

# Boolean masks telling whether the row's team played at home or away.
is_home = historical_acc['Home/Away'] == 'H'
is_away = historical_acc['Home/Away'] == 'A'

# (statistic name, home-side source column, away-side source column)
stat_sources = [
    ('goals', 'FTHG', 'FTAG'),
    ('shots', 'HS', 'AS'),
    ('shots_target', 'HST', 'AST'),
]

for stat, home_col, away_col in stat_sources:
    # For: the team's own figure (home column when at home, away column otherwise).
    historical_acc[stat + '_for'] = (is_home * historical_acc[home_col]) + (is_away * historical_acc[away_col])
    # Against: the opponent's figure, i.e. the opposite columns.
    historical_acc[stat + '_against'] = (is_home * historical_acc[away_col]) + (is_away * historical_acc[home_col])

# Cleansing

final_columns = ['season', 'Date', 'match', 'Team', 'Div', 'goals_for', 'goals_against', 'shots_for', 'shots_against', 'shots_target_for', 'shots_target_against', 'points']

# Renumber the rows 0..n-1 and keep only the final columns. The explicit
# .copy() makes the result an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning.
historical_acc = historical_acc.reset_index(drop=True)[final_columns].copy()

Calculating accumulated numbers

In [26]:
# Statistics to accumulate, in the order their '<name>_acc' columns are added.
stats_to_accumulate = [
    'goals_for', 'goals_against',
    'shots_for', 'shots_against',
    'shots_target_for', 'shots_target_against',
    'points',
]

# Running totals per season and team, in the current row order.
running_totals = historical_acc.groupby(['season', 'Team'])[stats_to_accumulate].cumsum()

for stat in stats_to_accumulate:
    # Subtracting the row's own value leaves the total accumulated *before*
    # the match, so the first match of every season/team starts at 0.
    historical_acc[stat + '_acc'] = running_totals[stat] - historical_acc[stat]

Calculating ranking by round for each season

In [28]:
# Rank the teams inside every (season, match number) group by the points
# accumulated before that match. ascending=True gives rank 1 to the team with
# the FEWEST points; method='first' breaks ties by row order.
points_by_round = historical_acc.groupby(['season', 'match'])['points_acc']
round_rank = points_by_round.rank(method='first', ascending=True)
historical_acc['ranking'] = round_rank.astype('int64')

To simplify the following steps, the column 'match' will be added to historical_data

In [31]:
# Match number of every team on every date.
match_numbers = historical_acc[['Date', 'Team', 'match']]

# First pass: match number of the home team (adds 'Team' and 'match').
with_home_match = historical_data.merge(
    match_numbers,
    how='left',
    left_on=['Date', 'HomeTeam'],
    right_on=['Date', 'Team'],
)

# Second pass: match number of the away team. The clashing 'Team'/'match'
# columns are told apart with the _home / _away suffixes.
with_both_matches = with_home_match.merge(
    match_numbers,
    how='left',
    left_on=['Date', 'AwayTeam'],
    right_on=['Date', 'Team'],
    suffixes=('_home', '_away'),
)

# Keep the fixture description, both match numbers, the odds and the result.
kept_columns = ['Div', 'Date', 'season', 'HomeTeam', 'AwayTeam', 'match_home', 'match_away', 'B365H', 'B365D', 'B365A', 'FTR']
historical_data = with_both_matches[kept_columns]

In [32]:
columns_to_append = ['Date', 'Team', 'goals_for_acc', 'goals_against_acc', 'shots_for_acc', 'shots_against_acc', 'shots_target_for_acc', 'shots_target_against_acc', 'points_acc', 'ranking']

# The accumulated statistics themselves ('Date' and 'Team' are merge keys only).
stat_columns = [name for name in columns_to_append if name not in ('Date', 'Team')]


def _accumulated_stats_for(team_column, suffix):
    """Return the pre-match accumulated statistics of one side of every fixture.

    Parameters
    ----------
    team_column : str
        Column of ``historical_data`` naming the team ('HomeTeam' or 'AwayTeam').
    suffix : str
        Suffix appended to every statistic column ('_home' or '_away').

    Returns
    -------
    pandas.DataFrame
        One row per row of ``historical_data``, carrying the same index, with
        the columns of ``stat_columns`` renamed with ``suffix``.
    """
    merged = historical_data[['Date', team_column]].merge(
        historical_acc[columns_to_append],
        # A left merge keeps exactly one output row per fixture, in fixture
        # order, even if a fixture has no statistics (values are then NaN).
        how='left',
        left_on=['Date', team_column],
        right_on=['Date', 'Team'],
        # Fail loudly if a team has more than one statistics row per date.
        validate='many_to_one',
    )
    # Select by name rather than by position so that a change in the column
    # layout of historical_data cannot silently pick the wrong columns.
    stats = merged[stat_columns].add_suffix(suffix)
    # Carry the fixtures' index so the column-wise concat below aligns row by row.
    stats.index = historical_data.index
    return stats


df_to_be_trained_home = _accumulated_stats_for('HomeTeam', '_home')
df_to_be_trained_away = _accumulated_stats_for('AwayTeam', '_away')

# Fixture data followed by the home-side and the away-side statistics.
df_to_be_trained_aux = pd.concat([historical_data, df_to_be_trained_home, df_to_be_trained_away], axis=1)
df_to_be_trained_aux = df_to_be_trained_aux.rename(columns={'FTR': 'result', 'match_home': 'match'})

The final training dataset will contain, for every match, the difference between the two teams' accumulated figures at the time of the match

In [33]:
# Snapshot of the column names before any column is added or removed.
original_columns = list(df_to_be_trained_aux.columns)

# Home-side statistics (any '*_home' column whose name does not start with 'match').
home_columns = [
    name for name in original_columns
    if name.endswith('_home') and not name.startswith('match')
]
# Away-side columns to discard (any '*_away' column whose name does not start with 'Team').
away_columns = [
    name for name in original_columns
    if name.endswith('_away') and not name.startswith('Team')
]

# Each '<stat>_home' column is replaced by '<stat>' = home value - away value.
for home_column in home_columns:
    stat_name = home_column[:-len('_home')]
    df_to_be_trained_aux[stat_name] = df_to_be_trained_aux[home_column] - df_to_be_trained_aux[stat_name + '_away']

# The per-side columns are no longer needed once the differences exist.
df_to_be_trained_aux.drop(columns=home_columns + away_columns, inplace=True)

# Setting the difference in the classification.
# The sign flip below is disabled: it is kept as a comment and is NOT executed.
# df_to_be_trained_aux['ranking'] = df_to_be_trained_aux['ranking'] * - 1

In [34]:
# Columns of the final dataset: fixture identification, the home-minus-away
# differences, the match result and the Bet365 odds.
columns = [
    'Div', 'season', 'match', 'HomeTeam', 'AwayTeam',
    'goals_for_acc', 'goals_against_acc',
    'shots_for_acc', 'shots_against_acc',
    'shots_target_for_acc', 'shots_target_against_acc',
    'points_acc', 'ranking',
    'result',
    'B365H', 'B365D', 'B365A',
]
final_dataset = df_to_be_trained_aux.loc[:, columns]

**VARIABLES TO ADD QUALITY DATA**

So far this is enough to move on to the next stages of the process.

Variables to add once a first draft is done:

    Potential of the team : budget ? average classification of the last 3-5-10 seasons ?
    Trends = goals / shots in the last 3-5-10 matches?
    Resting days = add matches from other competitions to see how many days the team rested and how many days they have for the next match
    Importance of the match = have they already won the league? Been relegated? Is there anything at stake? [important above all for the last 5 
    matches]
    Potential_AtHome_Away = % games won/lost/drawn by team

In [37]:
# Preview the last rows of the final dataset (tail() returns 5 rows by default).
final_dataset.tail()

Unnamed: 0,Div,season,match,HomeTeam,AwayTeam,goals_for_acc,goals_against_acc,shots_for_acc,shots_against_acc,shots_target_for_acc,shots_target_against_acc,points_acc,ranking,result,B365H,B365D,B365A
4545,SP1,2021,37,Cadiz,Elche,4,-1,52,-47,15,-15,13,7,A,3.2,3.1,2.35
4546,SP1,2021,37,Getafe,Levante,-17,-11,-22,-140,-24,-52,-6,-3,H,1.61,3.8,5.75
4547,SP1,2021,37,Sociedad,Valladolid,22,-14,66,-135,35,-49,25,13,H,1.61,3.75,6.0
4548,SP1,2021,37,Valencia,Eibar,18,5,-40,171,7,31,9,6,H,2.87,3.4,2.37
4549,SP1,2021,37,Villarreal,Sevilla,3,13,-45,52,19,2,-19,-2,H,2.37,3.4,2.87


In [36]:
# Inspect the intermediate table the final dataset is selected from.
df_to_be_trained_aux

Unnamed: 0,Div,Date,season,HomeTeam,AwayTeam,match,B365H,B365D,B365A,result,goals_for_acc,goals_against_acc,shots_for_acc,shots_against_acc,shots_target_for_acc,shots_target_against_acc,points_acc,ranking
0,SP1,2018-08-17,1819,Betis,Levante,1,1.66,4,5,A,0,0,0,0,0,0,0,-8
1,SP1,2018-08-17,1819,Girona,Valladolid,1,1.75,3.6,5,D,0,0,0,0,0,0,0,-8
2,SP1,2018-08-18,1819,Barcelona,Alaves,1,1.11,10,21,H,0,0,0,0,0,0,0,3
3,SP1,2018-08-18,1819,Celta,Espanol,1,1.85,3.5,4.5,D,0,0,0,0,0,0,0,-2
4,SP1,2018-08-18,1819,Villarreal,Sociedad,1,2.04,3.4,3.8,A,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4545,SP1,2021-05-16,2021,Cadiz,Elche,37,3.2,3.1,2.35,A,4,-1,52,-47,15,-15,13,7
4546,SP1,2021-05-16,2021,Getafe,Levante,37,1.61,3.8,5.75,H,-17,-11,-22,-140,-24,-52,-6,-3
4547,SP1,2021-05-16,2021,Sociedad,Valladolid,37,1.61,3.75,6,H,22,-14,66,-135,35,-49,25,13
4548,SP1,2021-05-16,2021,Valencia,Eibar,37,2.87,3.4,2.37,H,18,5,-40,171,7,31,9,6
