In [95]:
# Importing modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [96]:
# Importing datasets from AWS
season_1516 = pd.read_csv('https://ids-storage-football-prediction.s3-eu-west-1.amazonaws.com/data/2015_2016.csv')
season_1617 = pd.read_csv('https://ids-storage-football-prediction.s3-eu-west-1.amazonaws.com/data/2016_2017.csv')
season_1718 = pd.read_csv('https://ids-storage-football-prediction.s3-eu-west-1.amazonaws.com/data/2017_2018.csv')
season_1819 = pd.read_csv('https://ids-storage-football-prediction.s3-eu-west-1.amazonaws.com/data/2018_2019.csv')
season_1920 = pd.read_csv('https://ids-storage-football-prediction.s3-eu-west-1.amazonaws.com/data/2019_2020.csv')

In [97]:
# Importing xG datasets
path = '../data'
years = [2015, 2016, 2017, 2018, 2019]
all_seasons = os.listdir(path)
all_teams = []
dct = {}
cnt = 0

for season in all_seasons:
    
    new_path = path + '/season_' + f'{years[cnt]}'
    all_teams = os.listdir(new_path)
    
    for team in all_teams:
        path_to_file = new_path + '/' + f'{team}'
        dct[team] = pd.read_csv(path_to_file, index_col=0)
    
    cnt+=1

In [98]:
# Example of usage
dct['Liverpool_2015.csv']

Unnamed: 0,home_team,away_team,xG_home,xG_away
0,Liverpool,Bournemouth,2.15062,0.460805
1,Liverpool,West Ham,0.561737,0.97217
2,Liverpool,Norwich,1.90447,0.556271
3,Liverpool,Aston Villa,1.78331,0.892054
4,Liverpool,Southampton,0.682823,1.56383
5,Liverpool,Crystal Palace,2.15673,0.849281
6,Liverpool,Swansea,1.19358,0.367517
7,Liverpool,West Bromwich Albion,2.12583,0.835885
8,Liverpool,Leicester,1.37933,0.505167
9,Liverpool,Arsenal,1.16222,2.22912


In [99]:
#Preprocessing datasets from AWS
#need to make a copy instead of working on original data
season_1516_ = season_1516.copy()
season_1617_ = season_1617.copy()
season_1718_ = season_1718.copy()
season_1819_ = season_1819.copy()
season_1920_ = season_1920.copy()

In [100]:
#Removing bet, referee and div, date columns from data
season_1516_ = season_1516_.loc[:, 'Date':'AR'].drop(['Referee'], axis = 1)
season_1617_ = season_1617_.loc[:, 'Date':'AR'].drop(['Referee'], axis = 1)
season_1718_ = season_1718_.loc[:, 'Date':'AR'].drop(['Referee'], axis = 1)
season_1819_ = season_1819_.loc[:, 'Date':'AR'].drop(['Referee'], axis = 1)
season_1920_ = season_1920_.loc[:, 'Date':'AR'].drop(['Referee'], axis = 1)
season_1516_.info() #20 columns left 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 21 columns):
Date        380 non-null object
HomeTeam    380 non-null object
AwayTeam    380 non-null object
FTHG        380 non-null int64
FTAG        380 non-null int64
FTR         380 non-null object
HTHG        380 non-null int64
HTAG        380 non-null int64
HTR         380 non-null object
HS          380 non-null int64
AS          380 non-null int64
HST         380 non-null int64
AST         380 non-null int64
HF          380 non-null int64
AF          380 non-null int64
HC          380 non-null int64
AC          380 non-null int64
HY          380 non-null int64
AY          380 non-null int64
HR          380 non-null int64
AR          380 non-null int64
dtypes: int64(16), object(5)
memory usage: 62.5+ KB


In [101]:
season_1516_['Date'] = pd.to_datetime(season_1516_['Date']) 
season_1617_['Date'] = pd.to_datetime(season_1617_['Date'])  
season_1718_['Date'] = pd.to_datetime(season_1718_['Date'])  
season_1819_['Date'] = pd.to_datetime(season_1819_['Date']) 
season_1920_['Date'] = pd.to_datetime(season_1920_['Date']) 

In [102]:
season_1516_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 21 columns):
Date        380 non-null datetime64[ns]
HomeTeam    380 non-null object
AwayTeam    380 non-null object
FTHG        380 non-null int64
FTAG        380 non-null int64
FTR         380 non-null object
HTHG        380 non-null int64
HTAG        380 non-null int64
HTR         380 non-null object
HS          380 non-null int64
AS          380 non-null int64
HST         380 non-null int64
AST         380 non-null int64
HF          380 non-null int64
AF          380 non-null int64
HC          380 non-null int64
AC          380 non-null int64
HY          380 non-null int64
AY          380 non-null int64
HR          380 non-null int64
AR          380 non-null int64
dtypes: datetime64[ns](1), int64(16), object(4)
memory usage: 62.5+ KB


In [103]:
#Changing team's names to shortcuts e.g. Leicester : LEI 
season_1516_teams_names_map = {'Arsenal' : 'ARS', 'Aston Villa' : 'AVA', 'Bournemouth' : 'BOU',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Man City' : 'MCI',
                               'Man United' : 'MUN', 'Newcastle' : 'NEW', 'Norwich' : 'NOR',
                               'Southampton' : 'SOU', 'Stoke' : 'STO', 'Sunderland' : 'SUN',
                               'Swansea' : 'SWA', 'Tottenham' : 'TOT', 'Watford' : 'WAT', 
                               'West Brom' : 'WBA', 'West Ham' : 'WHU'}

season_1617_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Hull' : 'HUL','Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Man City' : 'MCI',
                               'Man United' : 'MUN',  'Middlesbrough' : 'MID',
                               'Southampton' : 'SOU', 'Stoke' : 'STO', 'Sunderland' : 'SUN',
                               'Swansea' : 'SWA', 'Tottenham' : 'TOT', 'Watford' : 'WAT', 
                               'West Brom' : 'WBA', 'West Ham' : 'WHU'}

season_1718_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Man City' : 'MCI',
                               'Man United' : 'MUN',  'Newcastle' : 'NEW', 'Brighton' : 'BRI',
                               'Southampton' : 'SOU', 'Stoke' : 'STO',  'Huddersfield' : 'HUD',
                               'Swansea' : 'SWA', 'Tottenham' : 'TOT', 'Watford' : 'WAT', 
                               'West Brom' : 'WBA', 'West Ham' : 'WHU'}

season_1819_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Man City' : 'MCI',
                               'Man United' : 'MUN',  'Newcastle' : 'NEW', 'Brighton' : 'BRI',
                               'Southampton' : 'SOU', 'Cardiff' : 'CAR',  'Huddersfield' : 'HUD',
                               'Fulham' : 'FUL', 'Tottenham' : 'TOT', 'Watford' : 'WAT', 
                               'Wolves' : 'WLV', 'West Ham' : 'WHU'}

season_1920_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Man City' : 'MCI',
                               'Man United' : 'MUN',  'Newcastle' : 'NEW', 'Brighton' : 'BRI',
                               'Southampton' : 'SOU', 'Norwich' : 'NOR',  'Sheffield United' : 'SHU',
                               'Aston Villa' : 'AVA', 'Tottenham' : 'TOT', 'Watford' : 'WAT', 
                               'Wolves' : 'WLV', 'West Ham' : 'WHU'}

season_map_list = [season_1516_teams_names_map, season_1617_teams_names_map ,season_1718_teams_names_map,
                  season_1819_teams_names_map,season_1920_teams_names_map]
season_list = [season_1516_ ,season_1617_ ,season_1718_ ,season_1819_ ,season_1920_ ]

#now change...
idx = 0
for season in season_list:
    season['HomeTeam'] = season['HomeTeam'].map(season_map_list[idx])
    season['AwayTeam'] = season['AwayTeam'].map(season_map_list[idx])
    idx += 1

season_1516_.head(10)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,2015-08-08,BOU,AVA,0,1,A,0,0,D,11,...,2,3,13,13,6,3,3,4,0,0
1,2015-08-08,CHE,SWA,2,2,D,2,1,H,11,...,3,10,15,16,4,8,1,3,1,0
2,2015-08-08,EVE,WAT,2,2,D,0,1,A,10,...,5,5,7,13,8,2,1,2,0,0
3,2015-08-08,LEI,SUN,4,2,H,3,0,H,19,...,8,5,13,17,6,3,2,4,0,0
4,2015-08-08,MUN,TOT,1,0,H,1,0,H,9,...,1,4,12,12,1,2,2,3,0,0
5,2015-08-08,NOR,CRY,1,3,A,0,1,A,17,...,6,7,14,20,1,4,1,0,0,0
6,2015-09-08,ARS,WHU,0,2,A,0,1,A,22,...,6,4,12,9,5,4,1,3,0,0
7,2015-09-08,NEW,SOU,2,2,D,1,1,D,9,...,4,5,9,12,6,6,2,4,0,0
8,2015-09-08,STO,LIV,0,1,A,0,0,D,7,...,1,3,9,16,3,5,2,4,0,0
9,2015-10-08,WBA,MCI,0,3,A,0,2,A,9,...,2,7,12,9,6,6,4,1,0,0


In [104]:
season_1920_.tail(10)

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
320,2020-04-07,12:30,NOR,BRI,0,1,A,0,1,A,...,1,2,11,13,0,2,2,2,0,0
321,2020-04-07,15:00,LEI,CRY,3,0,H,0,0,D,...,4,1,18,8,3,7,2,1,0,0
322,2020-04-07,15:00,MUN,BOU,5,2,H,3,1,H,...,10,3,13,12,8,3,0,1,0,0
323,2020-04-07,17:30,WLV,ARS,0,2,A,0,1,A,...,1,5,6,11,3,5,2,4,0,0
324,2020-04-07,20:00,CHE,WAT,3,0,H,2,0,H,...,9,3,9,12,5,3,0,2,0,0
325,2020-05-07,12:00,BUR,SHU,1,1,D,1,0,H,...,4,3,8,14,3,9,0,0,0,0
326,2020-05-07,14:15,NEW,WHU,2,2,D,1,1,D,...,4,7,9,11,6,4,0,0,0,0
327,2020-05-07,16:30,LIV,AVA,2,0,H,0,0,D,...,4,3,18,8,7,5,1,1,0,0
328,2020-05-07,19:00,SOU,MCI,1,0,H,1,0,H,...,4,6,6,7,2,13,1,2,0,0
329,2020-06-07,20:00,TOT,EVE,1,0,H,1,0,H,...,2,3,14,18,5,6,3,2,0,0


In [105]:
#Time to do the same with DataFrames in dictionary
from copy import deepcopy
dct_ = deepcopy(dct) # working on copied dict with DataFrames

In [106]:
season_1516_teams_names_map = {'Arsenal' : 'ARS', 'Aston Villa' : 'AVA', 'Bournemouth' : 'BOU',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Manchester City' : 'MCI',
                               'Manchester United' : 'MUN', 'Newcastle United' : 'NEW', 'Norwich' : 'NOR',
                               'Southampton' : 'SOU', 'Stoke' : 'STO', 'Sunderland' : 'SUN',
                               'Swansea' : 'SWA', 'Tottenham' : 'TOT', 'Watford' : 'WAT', 
                               'West Bromwich Albion' : 'WBA', 'West Ham' : 'WHU'}

season_1617_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR', 
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Manchester City' : 'MCI',
                               'Manchester United' : 'MUN',  'Middlesbrough' : 'MID',
                               'Southampton' : 'SOU', 'Stoke' : 'STO', 'Sunderland' : 'SUN',
                               'Swansea' : 'SWA', 'Tottenham' : 'TOT', 'Watford' : 'WAT', 
                               'West Bromwich Albion' : 'WBA', 'West Ham' : 'WHU',  'Hull' : 'HUL',}

season_1718_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Manchester City' : 'MCI',
                               'Manchester United' : 'MUN',  'Newcastle United' : 'NEW', 'Brighton' : 'BRI',
                               'Southampton' : 'SOU', 'Stoke' : 'STO',  'Huddersfield' : 'HUD',
                               'Swansea' : 'SWA', 'Tottenham' : 'TOT', 'Watford' : 'WAT', 
                               'West Bromwich Albion' : 'WBA', 'West Ham' : 'WHU'}

season_1819_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Manchester City' : 'MCI',
                               'Manchester United' : 'MUN',  'Newcastle United' : 'NEW', 'Brighton' : 'BRI',
                               'Southampton' : 'SOU', 'Cardiff' : 'CAR',  'Huddersfield' : 'HUD',
                               'Fulham' : 'FUL', 'Tottenham' : 'TOT', 'Watford' : 'WAT', 
                               'Wolverhampton Wanderers' : 'WLV', 'West Ham' : 'WHU'}

season_1920_teams_names_map = {'Arsenal' : 'ARS',  'Bournemouth' : 'BOU', 'Burnley' : 'BUR',
                               'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY', 'Everton' : 'EVE',
                               'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Manchester City' : 'MCI',
                               'Manchester United' : 'MUN',  'Newcastle United' : 'NEW', 'Brighton' : 'BRI',
                               'Southampton' : 'SOU', 'Norwich' : 'NOR',  'Sheffield United' : 'SHU',
                               'Aston Villa' : 'AVA', 'Tottenham' : 'TOT', 'Watford' : 'WAT', 
                               'Wolverhampton Wanderers' : 'WLV', 'West Ham' : 'WHU'}

season_map_list = [season_1516_teams_names_map, season_1617_teams_names_map ,season_1718_teams_names_map,
                  season_1819_teams_names_map,season_1920_teams_names_map]

idx = 0
for key, value in dct_.items():
    if f'{years[idx]}' in key:
        value['home_team'] = value['home_team'].map(season_map_list[idx])
        value['away_team'] = value['away_team'].map(season_map_list[idx])
        continue
    else:
        idx += 1
        value['home_team'] = value['home_team'].map(season_map_list[idx])
        value['away_team'] = value['away_team'].map(season_map_list[idx])
        
   


In [107]:
dct['Arsenal_2018.csv']['away_team'].sort_values().unique()

array(['Bournemouth', 'Brighton', 'Burnley', 'Cardiff', 'Chelsea',
       'Crystal Palace', 'Everton', 'Fulham', 'Huddersfield', 'Leicester',
       'Liverpool', 'Manchester City', 'Manchester United',
       'Newcastle United', 'Southampton', 'Tottenham', 'Watford',
       'West Ham', 'Wolverhampton Wanderers'], dtype=object)

In [108]:
dct_['Arsenal_2018.csv']['away_team'].sort_values().unique()

array(['BOU', 'BRI', 'BUR', 'CAR', 'CHE', 'CRY', 'EVE', 'FUL', 'HUD',
       'LEI', 'LIV', 'MCI', 'MUN', 'NEW', 'SOU', 'TOT', 'WAT', 'WHU',
       'WLV'], dtype=object)

In [109]:
dct_

{'Arsenal_2015.csv':    home_team away_team   xG_home   xG_away
 0        ARS       WHU  1.331660  0.535961
 1        ARS       LIV  1.837550  1.292060
 2        ARS       STO  4.426840  0.191277
 3        ARS       MUN  2.169520  1.039530
 4        ARS       EVE  2.557540  0.373025
 5        ARS       TOT  2.017670  0.905850
 6        ARS       SUN  2.295590  1.522800
 7        ARS       MCI  1.501090  0.769378
 8        ARS       BOU  3.129580  0.343752
 9        ARS       NEW  1.930250  1.153390
 10       ARS       CHE  1.109070  1.487780
 11       ARS       SOU  3.384870  0.934050
 12       ARS       LEI  2.363190  1.312460
 13       ARS       SWA  1.492800  1.697880
 14       ARS       WAT  3.007420  0.290619
 15       ARS       CRY  1.734610  0.275806
 16       ARS       WBA  1.373480  0.839742
 17       ARS       NOR  0.781508  0.578781
 18       ARS       AVA  3.402070  0.222731,
 'Aston Villa_2015.csv':    home_team away_team   xG_home   xG_away
 0        AVA       MUN  0.2191

In [116]:
for index, row in season_1920_.iterrows():
    print(row['HomeTeam'])
    #for key, file in dct_.items():
        #if file['home_team']
season_1920_.shape[0] 

LIV
WHU
BOU
BUR
CRY
WAT
TOT
LEI
NEW
MUN
ARS
AVA
BRI
EVE
NOR
SOU
MCI
SHU
CHE
WLV
AVA
NOR
BRI
MUN
SHU
WAT
LIV
BOU
TOT
WLV
SOU
CHE
CRY
LEI
MCI
NEW
WHU
BUR
EVE
ARS
LIV
BRI
MUN
SHU
TOT
WLV
NOR
BOU
WAT
AVA
SOU
LEI
BUR
EVE
MCI
NEW
CRY
WHU
ARS
CHE
SHU
AVA
BOU
CHE
CRY
TOT
WLV
EVE
LEI
MUN
BRI
BUR
LIV
NOR
WAT
WHU
ARS
MCI
SOU
NEW
EVE
AVA
BOU
CHE
LEI
TOT
WLV
CRY
MUN
SHU
SOU
MCI
BRI
WAT
WHU
BUR
NEW
ARS
LIV
NOR
BOU
ARS
AVA
BRI
MCI
SHU
WHU
WAT
CRY
EVE
NOR
CHE
BUR
NEW
SOU
TOT
LEI
MUN
WLV
LIV
WHU
ARS
BOU
BRI
CRY
EVE
WAT
MCI
SHU
AVA
NEW
BUR
CHE
LIV
TOT
SOU
NOR
WLV
LEI
MUN
CRY
BUR
CHE
LEI
MUN
SOU
WLV
LIV
SHU
ARS
EVE
BOU
TOT
WAT
MCI
AVA
NEW
NOR
BRI
WHU
LIV
BUR
CHE
LEI
SHU
SOU
MUN
WLV
ARS
CRY
EVE
AVA
BOU
BRI
NEW
NOR
MCI
WAT
TOT
TOT
AVA
BOU
CHE
CRY
EVE
SHU
MUN
LEI
WLV
BRI
NEW
SOU
WAT
NOR
WHU
BUR
ARS
LIV
MCI
BRI
BUR
NEW
SOU
WAT
MCI
NOR
WHU
ARS
LIV
SHU
CRY
CHE
EVE
LEI
MUN
WLV
TOT
BOU
AVA
WAT
ARS
BRI
MCI
NOR
SOU
WHU
NEW
BUR
LIV
AVA
BOU
CRY
EVE
SHU
CHE
LEI
TOT
MUN
WLV
WHU
LEI
BOU
CRY
LIV
NEW
WAT
WHU
MUN
BUR
TOT


22

In [111]:
for key, file in dct_.items():
    for i in range (0,19):
        if file['home_team'][i] == row['HomeTeam'] && file['away_team'][i] == row['AwayTeam']



ARS WHU
ARS LIV
ARS STO
ARS MUN
ARS EVE
ARS TOT
ARS SUN
ARS MCI
ARS BOU
ARS NEW
ARS CHE
ARS SOU
ARS LEI
ARS SWA
ARS WAT
ARS CRY
ARS WBA
ARS NOR
ARS AVA
AVA MUN
AVA SUN
AVA WBA
AVA STO
AVA SWA
AVA MCI
AVA WAT
AVA ARS
AVA WHU
AVA CRY
AVA LEI
AVA NOR
AVA LIV
AVA EVE
AVA TOT
AVA CHE
AVA BOU
AVA SOU
AVA NEW
BOU AVA
BOU LEI
BOU SUN
BOU WAT
BOU TOT
BOU NEW
BOU EVE
BOU MUN
BOU CRY
BOU WHU
BOU NOR
BOU ARS
BOU STO
BOU SOU
BOU SWA
BOU MCI
BOU LIV
BOU CHE
BOU WBA
CHE SWA
CHE CRY
CHE ARS
CHE SOU
CHE AVA
CHE LIV
CHE NOR
CHE BOU
CHE SUN
CHE WAT
CHE WBA
CHE EVE
CHE MUN
CHE NEW
CHE STO
CHE WHU
CHE MCI
CHE TOT
CHE LEI
CRY ARS
CRY AVA
CRY MCI
CRY WBA
CRY WHU
CRY MUN
CRY SUN
CRY NEW
CRY SOU
CRY SWA
CRY CHE
CRY TOT
CRY BOU
CRY WAT
CRY LIV
CRY LEI
CRY NOR
CRY EVE
CRY STO
EVE WAT
EVE MCI
EVE CHE
EVE LIV
EVE MUN
EVE SUN
EVE AVA
EVE CRY
EVE LEI
EVE STO
EVE TOT
EVE SWA
EVE NEW
EVE WBA
EVE WHU
EVE ARS
EVE SOU
EVE BOU
EVE NOR
LEI SUN
LEI TOT
LEI AVA
LEI ARS
LEI CRY
LEI WAT
LEI MUN
LEI CHE
LEI MCI
LEI BOU
LEI STO
