In [63]:
# Importing modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [64]:
# Importing datasets from AWS
# All five season files sit under the same S3 prefix; only the file name differs.
aws_data_url = 'https://ids-storage-football-prediction.s3-eu-west-1.amazonaws.com/data/'
season_1516, season_1617, season_1718, season_1819, season_1920 = (
    pd.read_csv(aws_data_url + csv_name)
    for csv_name in ('2015_2016.csv', '2016_2017.csv', '2017_2018.csv',
                     '2018_2019.csv', '2019_2020.csv')
)

In [65]:
# Importing xG datasets
# Files are read from <path>/season_<year>/<file name>; each frame is stored
# in `dct` under its file name (e.g. 'Liverpool_2015.csv').
path = '../data'
years = [2015, 2016, 2017, 2018, 2019]
all_seasons = os.listdir(path)  # kept for inspection only; the loop is driven by `years`
all_teams = []
dct = {}

# Iterate over the known years instead of over whatever is inside `path`:
# a stray entry there (.DS_Store, .ipynb_checkpoints, ...) used to push the
# counter past the end of `years` and raise IndexError, and a missing entry
# silently skipped the last season(s).
for year in years:
    new_path = os.path.join(path, f'season_{year}')
    # Sorted for a deterministic load order; non-CSV entries are skipped so
    # hidden/system files are never handed to read_csv.
    all_teams = sorted(name for name in os.listdir(new_path) if name.endswith('.csv'))

    for team in all_teams:
        path_to_file = os.path.join(new_path, team)
        # index_col=0: the first CSV column is used as the frame's index.
        dct[team] = pd.read_csv(path_to_file, index_col=0)

In [66]:
# Example of usage: frames in `dct` are keyed by the CSV file name they were read from
dct['Liverpool_2015.csv']

Unnamed: 0,home_team,away_team,xG_home,xG_away
0,Liverpool,Bournemouth,2.15062,0.460805
1,Liverpool,West Ham,0.561737,0.97217
2,Liverpool,Norwich,1.90447,0.556271
3,Liverpool,Aston Villa,1.78331,0.892054
4,Liverpool,Southampton,0.682823,1.56383
5,Liverpool,Crystal Palace,2.15673,0.849281
6,Liverpool,Swansea,1.19358,0.367517
7,Liverpool,West Bromwich Albion,2.12583,0.835885
8,Liverpool,Leicester,1.37933,0.505167
9,Liverpool,Arsenal,1.16222,2.22912


In [67]:
# Preprocessing datasets from AWS
# Work on copies (names with a trailing underscore) so the frames loaded
# from AWS stay untouched.
season_1516_, season_1617_, season_1718_, season_1819_, season_1920_ = (
    raw_season.copy()
    for raw_season in (season_1516, season_1617, season_1718, season_1819, season_1920)
)

In [68]:
# Keep only the 'HomeTeam'..'AR' column range and drop 'Referee'
# (per the original note this removes the bet, Referee, Div and Date columns).
def _keep_match_stats(season_df):
    """Return the 'HomeTeam'..'AR' column slice of ``season_df`` without 'Referee'.

    ``errors='ignore'`` makes the drop a no-op when 'Referee' is already gone,
    so re-running this cell on the already-trimmed frames no longer raises
    ``KeyError``.
    """
    return season_df.loc[:, 'HomeTeam':'AR'].drop(columns=['Referee'], errors='ignore')


season_1516_ = _keep_match_stats(season_1516_)
season_1617_ = _keep_match_stats(season_1617_)
season_1718_ = _keep_match_stats(season_1718_)
season_1819_ = _keep_match_stats(season_1819_)
season_1920_ = _keep_match_stats(season_1920_)
season_1516_.info()  # 20 columns left

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 20 columns):
HomeTeam    380 non-null object
AwayTeam    380 non-null object
FTHG        380 non-null int64
FTAG        380 non-null int64
FTR         380 non-null object
HTHG        380 non-null int64
HTAG        380 non-null int64
HTR         380 non-null object
HS          380 non-null int64
AS          380 non-null int64
HST         380 non-null int64
AST         380 non-null int64
HF          380 non-null int64
AF          380 non-null int64
HC          380 non-null int64
AC          380 non-null int64
HY          380 non-null int64
AY          380 non-null int64
HR          380 non-null int64
AR          380 non-null int64
dtypes: int64(16), object(4)
memory usage: 59.5+ KB


In [69]:
# Changing team names to three-letter shortcuts, e.g. Leicester : LEI
# One map per season, keyed by the team spelling used in the season CSVs.
season_1516_teams_names_map = {'Arsenal' : 'ARS', 'Aston Villa' : 'AVA', 'Bournemouth' : 'BOU',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Man City' : 'MCI',
                               'Man United' : 'MUN', 'Newcastle' : 'NEW', 'Norwich' : 'NOR',
                               'Southampton' : 'SOU', 'Stoke' : 'STO', 'Sunderland' : 'SUN',
                               'Swansea' : 'SWA', 'Tottenham' : 'TOT', 'Watford' : 'WAT',
                               'West Brom' : 'WBA', 'West Ham' : 'WHU'}

season_1617_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Hull' : 'HUL','Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Man City' : 'MCI',
                               'Man United' : 'MUN',  'Middlesbrough' : 'MID',
                               'Southampton' : 'SOU', 'Stoke' : 'STO', 'Sunderland' : 'SUN',
                               'Swansea' : 'SWA', 'Tottenham' : 'TOT', 'Watford' : 'WAT',
                               'West Brom' : 'WBA', 'West Ham' : 'WHU'}

season_1718_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Man City' : 'MCI',
                               'Man United' : 'MUN',  'Newcastle' : 'NEW', 'Brighton' : 'BRI',
                               'Southampton' : 'SOU', 'Stoke' : 'STO',  'Huddersfield' : 'HUD',
                               'Swansea' : 'SWA', 'Tottenham' : 'TOT', 'Watford' : 'WAT',
                               'West Brom' : 'WBA', 'West Ham' : 'WHU'}

season_1819_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Man City' : 'MCI',
                               'Man United' : 'MUN',  'Newcastle' : 'NEW', 'Brighton' : 'BRI',
                               'Southampton' : 'SOU', 'Cardiff' : 'CAR',  'Huddersfield' : 'HUD',
                               'Fulham' : 'FUL', 'Tottenham' : 'TOT', 'Watford' : 'WAT',
                               'Wolves' : 'WLV', 'West Ham' : 'WHU'}

season_1920_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Man City' : 'MCI',
                               'Man United' : 'MUN',  'Newcastle' : 'NEW', 'Brighton' : 'BRI',
                               'Southampton' : 'SOU', 'Norwich' : 'NOR',  'Sheffield United' : 'SHU',
                               'Aston Villa' : 'AVA', 'Tottenham' : 'TOT', 'Watford' : 'WAT',
                               'Wolves' : 'WLV', 'West Ham' : 'WHU'}

# Both lists are in the same season order, so position i of one belongs to
# position i of the other.
season_map_list = [season_1516_teams_names_map, season_1617_teams_names_map ,season_1718_teams_names_map,
                  season_1819_teams_names_map,season_1920_teams_names_map]
season_list = [season_1516_ ,season_1617_ ,season_1718_ ,season_1819_ ,season_1920_ ]

# now change...
# Each frame is paired with its own map via zip (no manual index counter).
# .replace() only rewrites values that appear as keys in the map, so running
# this cell a second time leaves the already-shortened codes intact; .map()
# would turn every value that is not a key (including those codes) into NaN.
# A name missing from the map therefore stays as the full name, not NaN.
for season, names_map in zip(season_list, season_map_list):
    season['HomeTeam'] = season['HomeTeam'].replace(names_map)
    season['AwayTeam'] = season['AwayTeam'].replace(names_map)

season_1516_.head(10)

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,BOU,AVA,0,1,A,0,0,D,11,7,2,3,13,13,6,3,3,4,0,0
1,CHE,SWA,2,2,D,2,1,H,11,18,3,10,15,16,4,8,1,3,1,0
2,EVE,WAT,2,2,D,0,1,A,10,11,5,5,7,13,8,2,1,2,0,0
3,LEI,SUN,4,2,H,3,0,H,19,10,8,5,13,17,6,3,2,4,0,0
4,MUN,TOT,1,0,H,1,0,H,9,9,1,4,12,12,1,2,2,3,0,0
5,NOR,CRY,1,3,A,0,1,A,17,11,6,7,14,20,1,4,1,0,0,0
6,ARS,WHU,0,2,A,0,1,A,22,8,6,4,12,9,5,4,1,3,0,0
7,NEW,SOU,2,2,D,1,1,D,9,15,4,5,9,12,6,6,2,4,0,0
8,STO,LIV,0,1,A,0,0,D,7,8,1,3,9,16,3,5,2,4,0,0
9,WBA,MCI,0,3,A,0,2,A,9,19,2,7,12,9,6,6,4,1,0,0


In [70]:
# Spot-check the last rows of the 2019/20 frame after the team-name mapping
season_1920_.tail(10)

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
320,NOR,BRI,0,1,A,0,1,A,12,8,1,2,11,13,0,2,2,2,0,0
321,LEI,CRY,3,0,H,0,0,D,15,7,4,1,18,8,3,7,2,1,0,0
322,MUN,BOU,5,2,H,3,1,H,19,7,10,3,13,12,8,3,0,1,0,0
323,WLV,ARS,0,2,A,0,1,A,10,8,1,5,6,11,3,5,2,4,0,0
324,CHE,WAT,3,0,H,2,0,H,21,7,9,3,9,12,5,3,0,2,0,0
325,BUR,SHU,1,1,D,1,0,H,9,10,4,3,8,14,3,9,0,0,0,0
326,NEW,WHU,2,2,D,1,1,D,11,17,4,7,9,11,6,4,0,0,0,0
327,LIV,AVA,2,0,H,0,0,D,6,9,4,3,18,8,7,5,1,1,0,0
328,SOU,MCI,1,0,H,1,0,H,8,26,4,6,6,7,2,13,1,2,0,0
329,TOT,EVE,1,0,H,1,0,H,12,11,2,3,14,18,5,6,3,2,0,0


In [78]:
# Time to do the same with the DataFrames stored in the dictionary
from copy import deepcopy
# Every frame is deep-copied into a new dict under the same key, so edits to
# `dct_` never reach the frames held in `dct`.
dct_ = {file_name: deepcopy(frame) for file_name, frame in dct.items()}

In [80]:
# Team-name maps for the xG frames. These deliberately overwrite the maps
# defined for the season CSVs: the xG files spell some clubs differently
# ('Manchester City' instead of 'Man City', 'West Bromwich Albion' instead of
# 'West Brom', ...).
season_1516_teams_names_map = {'Arsenal' : 'ARS', 'Aston Villa' : 'AVA', 'Bournemouth' : 'BOU',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Manchester City' : 'MCI',
                               'Manchester United' : 'MUN', 'Newcastle United' : 'NEW', 'Norwich' : 'NOR',
                               'Southampton' : 'SOU', 'Stoke' : 'STO', 'Sunderland' : 'SUN',
                               'Swansea' : 'SWA', 'Tottenham' : 'TOT', 'Watford' : 'WAT',
                               'West Bromwich Albion' : 'WBA', 'West Ham' : 'WHU'}

season_1617_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Manchester City' : 'MCI',
                               'Manchester United' : 'MUN',  'Middlesbrough' : 'MID',
                               'Southampton' : 'SOU', 'Stoke' : 'STO', 'Sunderland' : 'SUN',
                               'Swansea' : 'SWA', 'Tottenham' : 'TOT', 'Watford' : 'WAT',
                               'West Bromwich Albion' : 'WBA', 'West Ham' : 'WHU',  'Hull' : 'HUL',}

# Fixed: the key was 'West Brom', which never matches the xG spelling
# 'West Bromwich Albion', so that club was mapped to NaN in the 2017 frames.
season_1718_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Manchester City' : 'MCI',
                               'Manchester United' : 'MUN',  'Newcastle United' : 'NEW', 'Brighton' : 'BRI',
                               'Southampton' : 'SOU', 'Stoke' : 'STO',  'Huddersfield' : 'HUD',
                               'Swansea' : 'SWA', 'Tottenham' : 'TOT', 'Watford' : 'WAT',
                               'West Bromwich Albion' : 'WBA', 'West Ham' : 'WHU'}

season_1819_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Manchester City' : 'MCI',
                               'Manchester United' : 'MUN',  'Newcastle United' : 'NEW', 'Brighton' : 'BRI',
                               'Southampton' : 'SOU', 'Cardiff' : 'CAR',  'Huddersfield' : 'HUD',
                               'Fulham' : 'FUL', 'Tottenham' : 'TOT', 'Watford' : 'WAT',
                               'Wolverhampton Wanderers' : 'WLV', 'West Ham' : 'WHU'}

season_1920_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Manchester City' : 'MCI',
                               'Manchester United' : 'MUN',  'Newcastle United' : 'NEW', 'Brighton' : 'BRI',
                               'Southampton' : 'SOU', 'Norwich' : 'NOR',  'Sheffield United' : 'SHU',
                               'Aston Villa' : 'AVA', 'Tottenham' : 'TOT', 'Watford' : 'WAT',
                               'Wolverhampton Wanderers' : 'WLV', 'West Ham' : 'WHU'}

# Same order as `years`: position i holds the map for years[i].
season_map_list = [season_1516_teams_names_map, season_1617_teams_names_map ,season_1718_teams_names_map,
                  season_1819_teams_names_map,season_1920_teams_names_map]

# Choose the map for each frame from the year contained in its file name
# (e.g. 'Liverpool_2015.csv'), independent of the order in which the files
# were loaded. The previous counter-based loop moved on to the next year when
# it met the first file of a new season but never mapped that file, so one
# frame per season after the first was left with full team names.
for key, value in dct_.items():
    names_map = next(
        (season_map for year, season_map in zip(years, season_map_list) if f'{year}' in key),
        None,
    )
    if names_map is None:
        raise ValueError(f'No season year from {years} found in file name {key!r}')
    # .replace() leaves values that are not keys untouched, so re-running this
    # cell keeps the already-shortened codes instead of turning them into NaN.
    value['home_team'] = value['home_team'].replace(names_map)
    value['away_team'] = value['away_team'].replace(names_map)


In [81]:
# Distinct away-team names in the untouched Liverpool 2018 frame, in sorted order
(
    dct['Liverpool_2018.csv']['away_team']
    .sort_values()
    .unique()
)

array(['Arsenal', 'Bournemouth', 'Brighton', 'Burnley', 'Cardiff',
       'Chelsea', 'Crystal Palace', 'Everton', 'Fulham', 'Huddersfield',
       'Leicester', 'Manchester City', 'Manchester United',
       'Newcastle United', 'Southampton', 'Tottenham', 'Watford',
       'West Ham', 'Wolverhampton Wanderers'], dtype=object)

In [87]:
# Same check on the mapped copy: distinct away-team values, in sorted order
(
    dct_['Liverpool_2018.csv']['away_team']
    .sort_values()
    .unique()
)

array(['ARS', 'BOU', 'BRI', 'BUR', 'CAR', 'CHE', 'CRY', 'EVE', 'FUL',
       'HUD', 'LEI', 'MCI', 'MUN', 'NEW', 'SOU', 'TOT', 'WAT', 'WHU',
       'WLV'], dtype=object)

In [83]:
# Display the Liverpool 2019 frame from the mapped copy `dct_`
dct_['Liverpool_2019.csv']

Unnamed: 0,home_team,away_team,xG_home,xG_away
0,LIV,NOR,2.23456,0.842407
1,LIV,ARS,2.78821,0.985542
2,LIV,NEW,2.93281,0.291073
3,LIV,LEI,3.74867,0.098211
4,LIV,TOT,2.18851,1.35143
5,LIV,MCI,1.3319,1.48104
6,LIV,BRI,1.36832,0.867601
7,LIV,EVE,2.41397,1.76453
8,LIV,WAT,1.56674,1.01166
9,LIV,WLV,1.49124,0.363764
