In [2]:
# Importing modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from pathlib import Path
%matplotlib inline

In [3]:
# Importing main data
def import_main_data(path: Path = Path("../main_data")) -> dict:
    """Load every season CSV found in ``path`` into a dict of DataFrames.

    Parameters
    ----------
    path : Path, optional
        Directory holding the season CSV files. Defaults to ``../main_data``,
        the location this notebook has always read from.

    Returns
    -------
    dict
        Maps each file name (e.g. ``'season_2019.csv'``) to the DataFrame
        that ``pd.read_csv`` returns for it, in sorted file-name order.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    """
    # A raw directory listing also returns sub-directories and stray files
    # (e.g. .ipynb_checkpoints), which pd.read_csv cannot parse, and its order
    # is arbitrary - so keep only regular *.csv files and sort them.
    season_files = sorted(
        entry for entry in Path(path).iterdir()
        if entry.is_file() and entry.suffix.lower() == ".csv"
    )
    return {season_file.name: pd.read_csv(season_file) for season_file in season_files}

In [4]:
# Read every season file into a dict of DataFrames keyed by file name
main_data = import_main_data()

In [5]:
# Example of usage: the dict is keyed by file name, so this displays the
# table that was read from 'season_2019.csv'
main_data['season_2019.csv']

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E0,09/08/2019,20:00,Liverpool,Norwich,4,1,H,4,0,...,3.43,-2.25,1.91,1.99,1.94,1.98,1.99,2.07,1.90,1.99
1,E0,10/08/2019,12:30,West Ham,Man City,0,5,A,0,1,...,2.91,1.75,1.95,1.95,1.96,1.97,2.07,1.98,1.97,1.92
2,E0,10/08/2019,15:00,Bournemouth,Sheffield United,1,1,D,0,0,...,1.92,-0.50,1.95,1.95,1.98,1.95,2.00,1.96,1.96,1.92
3,E0,10/08/2019,15:00,Burnley,Southampton,3,0,H,0,0,...,1.71,0.00,1.87,2.03,1.89,2.03,1.90,2.07,1.86,2.02
4,E0,10/08/2019,15:00,Crystal Palace,Everton,0,0,D,0,0,...,1.71,0.25,1.82,2.08,1.97,1.96,2.03,2.08,1.96,1.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,E0,26/07/2020,16:00,Leicester,Man United,0,2,A,0,0,...,2.03,0.50,1.89,2.01,1.88,2.05,1.94,2.05,1.86,2.02
376,E0,26/07/2020,16:00,Man City,Norwich,5,0,H,2,0,...,5.77,-3.75,2.03,1.87,2.01,1.88,2.06,1.88,2.02,1.84
377,E0,26/07/2020,16:00,Newcastle,Liverpool,1,3,A,1,1,...,2.40,1.00,1.94,1.96,1.95,1.97,2.03,2.00,1.95,1.92
378,E0,26/07/2020,16:00,Southampton,Sheffield United,3,1,H,0,1,...,2.01,-0.50,1.97,1.93,2.00,1.93,2.03,1.96,1.98,1.89


In [6]:
# Importing xG datasets
def import_xG_data(path: Path = Path("../data"),
                   years=(2015, 2016, 2017, 2018, 2019, 2020)) -> dict:
    """Load the per-team xG CSV files of every requested season.

    The files are expected in one sub-directory per season, named
    ``season_<year>`` (e.g. ``../data/season_2019/Liverpool_2019.csv``).

    Parameters
    ----------
    path : Path, optional
        Directory containing the ``season_<year>`` sub-directories.
        Defaults to ``../data``.
    years : iterable of int, optional
        Seasons to load, identified by the year used in the directory name.
        Defaults to 2015 through 2020.

    Returns
    -------
    dict
        Maps each file name (e.g. ``'Liverpool_2019.csv'``) to its DataFrame,
        read with the first CSV column as the index. Entries are inserted
        season by season in the order of ``years``, files sorted by name.

    Raises
    ------
    FileNotFoundError
        If a ``season_<year>`` directory does not exist.
    """
    dct = {}
    # Drive the loop by the requested years rather than by the directory
    # listing: an extra or missing entry under `path` can then neither
    # overrun the year list nor silently drop a season.
    for year in years:
        season_dir = Path(path) / f"season_{year}"
        team_files = sorted(
            entry for entry in season_dir.iterdir()
            if entry.is_file() and entry.suffix.lower() == ".csv"
        )
        for team_file in team_files:
            dct[team_file.name] = pd.read_csv(team_file, index_col=0)

    return dct

In [7]:
# Read every per-team xG file into a dict of DataFrames keyed by file name
xG = import_xG_data()

In [8]:
# Example of usage: keys are the file names, so this displays the table
# that was read from 'Liverpool_2019.csv'
xG['Liverpool_2019.csv']

Unnamed: 0,home_team,away_team,xG_home,xG_away
0,Liverpool,Norwich,2.23456,0.842407
1,Liverpool,Arsenal,2.78821,0.985542
2,Liverpool,Newcastle United,2.93281,0.291073
3,Liverpool,Leicester,3.74867,0.098211
4,Liverpool,Tottenham,2.18851,1.35143
5,Liverpool,Manchester City,1.3319,1.48104
6,Liverpool,Brighton,1.36832,0.867601
7,Liverpool,Everton,2.41397,1.76453
8,Liverpool,Watford,1.56674,1.01166
9,Liverpool,Wolverhampton Wanderers,1.49124,0.363764


In [24]:
# Preprocessing main data
from copy import deepcopy
def clear_data(data: dict) -> dict:
    """Trim every season table to the match columns and parse its dates.

    For each DataFrame in ``data`` the columns from ``'Date'`` through
    ``'AR'`` (inclusive) are kept, ``'Referee'`` is removed, and ``'Date'``
    is converted to datetimes, reading the strings day-first.

    Parameters
    ----------
    data : dict
        Maps a key (the season file name) to that season's raw DataFrame.

    Returns
    -------
    dict
        A new dict with the same keys and newly built DataFrames; neither
        ``data`` nor the frames it holds are modified.

    Raises
    ------
    KeyError
        If a frame lacks the ``'Date'``, ``'AR'`` or ``'Referee'`` column.
    """
    # .loc / .drop / .assign each return a new frame, so the inputs stay
    # untouched without first deep-copying every DataFrame.
    return {
        key: (
            frame.loc[:, 'Date':'AR']
            .drop(columns=['Referee'])
            .assign(Date=lambda trimmed: pd.to_datetime(trimmed['Date'], dayfirst=True))
        )
        for key, frame in data.items()
    }

In [25]:
# Keep only the Date..AR columns (without Referee) of every season and parse the dates
cleared_data = clear_data(main_data)

In [26]:
# Example of usage: same keys as main_data, now holding the trimmed table
cleared_data['season_2019.csv']

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,2019-08-09,20:00,Liverpool,Norwich,4,1,H,4,0,H,...,7,5,9,9,11,2,0,2,0,0
1,2019-08-10,12:30,West Ham,Man City,0,5,A,0,1,A,...,3,9,6,13,1,1,2,2,0,0
2,2019-08-10,15:00,Bournemouth,Sheffield United,1,1,D,0,0,D,...,3,3,10,19,3,4,2,1,0,0
3,2019-08-10,15:00,Burnley,Southampton,3,0,H,0,0,D,...,4,3,6,12,2,7,0,0,0,0
4,2019-08-10,15:00,Crystal Palace,Everton,0,0,D,0,0,D,...,2,3,16,14,6,2,2,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,2020-07-26,16:00,Leicester,Man United,0,2,A,0,0,D,...,3,3,12,11,3,3,1,4,1,0
376,2020-07-26,16:00,Man City,Norwich,5,0,H,2,0,H,...,10,4,7,4,9,0,1,1,0,0
377,2020-07-26,16:00,Newcastle,Liverpool,1,3,A,1,1,D,...,2,6,11,5,2,4,1,0,0,0
378,2020-07-26,16:00,Southampton,Sheffield United,3,1,H,0,1,A,...,4,3,9,16,9,1,0,1,0,0


In [23]:
# Rename the teams in every xG DataFrame to the spelling used in the season data.
# Work on a deep copy so the originally loaded `xG` dict stays untouched.

dct_ = deepcopy(xG) 

# xG-source team names that are spelled differently in the main season data;
# every other team keeps its name unchanged.
_XG_NAME_OVERRIDES = {'Manchester City' : 'Man City', 'Manchester United' : 'Man United',
                      'Newcastle United' : 'Newcastle', 'West Bromwich Albion' : 'West Brom',
                      'Wolverhampton Wanderers' : 'Wolves'}


def _xg_to_main_names(teams) -> dict:
    """Map each xG team name in `teams` to its spelling in the main data."""
    return {team: _XG_NAME_OVERRIDES.get(team, team) for team in teams}


# One map per season, listing the 20 teams of that season under their xG names.
season_1516_teams_names_map = _xg_to_main_names([
    'Arsenal', 'Aston Villa', 'Bournemouth', 'Chelsea', 'Crystal Palace', 'Everton',
    'Leicester', 'Liverpool', 'Manchester City', 'Manchester United', 'Newcastle United',
    'Norwich', 'Southampton', 'Stoke', 'Sunderland', 'Swansea', 'Tottenham', 'Watford',
    'West Bromwich Albion', 'West Ham'])

season_1617_teams_names_map = _xg_to_main_names([
    'Arsenal', 'Bournemouth', 'Burnley', 'Chelsea', 'Crystal Palace', 'Everton',
    'Hull', 'Leicester', 'Liverpool', 'Manchester City', 'Manchester United',
    'Middlesbrough', 'Southampton', 'Stoke', 'Sunderland', 'Swansea', 'Tottenham',
    'Watford', 'West Bromwich Albion', 'West Ham'])

season_1718_teams_names_map = _xg_to_main_names([
    'Arsenal', 'Bournemouth', 'Brighton', 'Burnley', 'Chelsea', 'Crystal Palace',
    'Everton', 'Huddersfield', 'Leicester', 'Liverpool', 'Manchester City',
    'Manchester United', 'Newcastle United', 'Southampton', 'Stoke', 'Swansea',
    'Tottenham', 'Watford', 'West Bromwich Albion', 'West Ham'])

season_1819_teams_names_map = _xg_to_main_names([
    'Arsenal', 'Bournemouth', 'Brighton', 'Burnley', 'Cardiff', 'Chelsea',
    'Crystal Palace', 'Everton', 'Fulham', 'Huddersfield', 'Leicester', 'Liverpool',
    'Manchester City', 'Manchester United', 'Newcastle United', 'Southampton',
    'Tottenham', 'Watford', 'West Ham', 'Wolverhampton Wanderers'])

# 2019/20: Norwich and Watford belong here (both appear in the 2019 data);
# West Bromwich Albion does not.
season_1920_teams_names_map = _xg_to_main_names([
    'Arsenal', 'Aston Villa', 'Bournemouth', 'Brighton', 'Burnley', 'Chelsea',
    'Crystal Palace', 'Everton', 'Leicester', 'Liverpool', 'Manchester City',
    'Manchester United', 'Newcastle United', 'Norwich', 'Sheffield United',
    'Southampton', 'Tottenham', 'Watford', 'West Ham', 'Wolverhampton Wanderers'])

# 2020/21: West Bromwich Albion belongs here (there is a 2020 xG file for it);
# Norwich and Watford do not.
season_2021_teams_names_map = _xg_to_main_names([
    'Arsenal', 'Aston Villa', 'Brighton', 'Burnley', 'Chelsea', 'Crystal Palace',
    'Everton', 'Fulham', 'Leeds', 'Leicester', 'Liverpool', 'Manchester City',
    'Manchester United', 'Newcastle United', 'Sheffield United', 'Southampton',
    'Tottenham', 'West Bromwich Albion', 'West Ham', 'Wolverhampton Wanderers'])

# Ordered by season start year, 2015 first.
season_map_list = [season_1516_teams_names_map, season_1617_teams_names_map ,season_1718_teams_names_map,
                  season_1819_teams_names_map,season_1920_teams_names_map, season_2021_teams_names_map]

# Translate the team names inside every xG table to the season-data spelling.
years = [2015, 2016, 2017, 2018, 2019, 2020]
for key, value in dct_.items():
    # Keys look like '<team>_<year>.csv'. Reading the season from the key
    # itself keeps the lookup correct whatever order the dict is in.
    season_year = int(key.rsplit('_', 1)[-1].split('.')[0])
    names_map = season_map_list[years.index(season_year)]
    for column in ('home_team', 'away_team'):
        # Series.map yields NaN for names missing from the map; fall back to
        # the original spelling so no team is silently blanked out.
        value[column] = value[column].map(names_map).fillna(value[column])

# Rename the dict keys of the teams whose xG file name differs from the
# season-data spelling, e.g. 'Manchester City_2015.csv' -> 'Man City_2015.csv'.
renamed_teams = {'Manchester City': 'Man City', 'Manchester United': 'Man United',
                 'Newcastle United': 'Newcastle', 'West Bromwich Albion': 'West Brom',
                 'Wolverhampton Wanderers': 'Wolves'}
for old_name, new_name in renamed_teams.items():
    for year in range(2015, 2021):
        old_key = f'{old_name}_{year}.csv'
        # Not every team has a file for every season, so rename only the
        # keys that are present.
        if old_key in dct_:
            dct_[f'{new_name}_{year}.csv'] = dct_.pop(old_key)

In [10]:
# Attach the xG columns to the match data.
# For every season and every home team: take that team's home matches, look up
# the team's xG table for the season in `dct_`, and join xG_home / xG_away onto
# the matches by away-team name. The result is one list of per-team DataFrames
# (indexed by Date) per season.
#
# NOTE(review): season_1516_ ... season_1920_ are not defined in any cell above
# this one, so the cell fails on a fresh kernel. Presumably they are the cleaned
# per-season tables from `cleared_data` - confirm and define them explicitly.

list_of_lists_containing_data_frames = []
season_list = [season_1516_ ,season_1617_ ,season_1718_ ,season_1819_ ,season_1920_ ]
# `years` comes from the renaming cell; zip pairs each season with its start year.
for season, year in zip(season_list, years):
    list_of_data_frames_for_each_team_in_particular_season = []
    for h_team in season['HomeTeam'].sort_values().unique():
        home_matches = season[season['HomeTeam'] == h_team].sort_values(by = ['AwayTeam'])
        xG_home_away_df = dct_[f'{h_team}_{year}.csv'][['away_team', 'xG_home', 'xG_away']]

        # Join on the opponent's name instead of relying on both tables having
        # the same row order: a fixture without a matching xG row gets NaN
        # rather than another fixture's values.
        df_sorted = (
            home_matches.reset_index(drop = True)
            .merge(xG_home_away_df, how = 'left', left_on = 'AwayTeam', right_on = 'away_team')
            .drop(columns = 'away_team')
        )
        df_ = df_sorted.set_index('Date')
        list_of_data_frames_for_each_team_in_particular_season.append(df_)
    list_of_lists_containing_data_frames.append(list_of_data_frames_for_each_team_in_particular_season)


In [11]:
# Stack the per-team frames of each season into a single DataFrame per season,
# sorted by its index.
(season_1516_,
 season_1617_,
 season_1718_,
 season_1819_,
 season_1920_) = (
    pd.concat(list_of_lists_containing_data_frames[position]).sort_index()
    for position in range(5)
)

In [12]:
# Three-letter codes for the team names, e.g. Leicester : LEI
_TEAM_ABBREVIATIONS = {
    'Arsenal' : 'ARS', 'Aston Villa' : 'AVA', 'Bournemouth' : 'BOU', 'Brighton' : 'BRI',
    'Burnley' : 'BUR', 'Cardiff' : 'CAR', 'Chelsea' : 'CHE', 'Crystal Palace' : 'CRY',
    'Everton' : 'EVE', 'Fulham' : 'FUL', 'Huddersfield' : 'HUD', 'Hull' : 'HUL',
    'Leicester' : 'LEI', 'Liverpool' : 'LIV', 'Man City' : 'MCI', 'Man United' : 'MUN',
    'Middlesbrough' : 'MID', 'Newcastle' : 'NEW', 'Norwich' : 'NOR',
    'Sheffield United' : 'SHU', 'Southampton' : 'SOU', 'Stoke' : 'STO',
    'Sunderland' : 'SUN', 'Swansea' : 'SWA', 'Tottenham' : 'TOT', 'Watford' : 'WAT',
    'West Brom' : 'WBA', 'West Ham' : 'WHU', 'Wolves' : 'WLV',
}


def _abbreviations_for(teams) -> dict:
    """Build the name -> code map for the given teams, in the given order."""
    return {team: _TEAM_ABBREVIATIONS[team] for team in teams}


# One map per season, each holding exactly that season's 20 teams.
season_1516_teams_names_map = _abbreviations_for([
    'Arsenal', 'Aston Villa', 'Bournemouth', 'Chelsea', 'Crystal Palace', 'Everton',
    'Leicester', 'Liverpool', 'Man City', 'Man United', 'Newcastle', 'Norwich',
    'Southampton', 'Stoke', 'Sunderland', 'Swansea', 'Tottenham', 'Watford',
    'West Brom', 'West Ham'])

season_1617_teams_names_map = _abbreviations_for([
    'Arsenal', 'Bournemouth', 'Burnley', 'Chelsea', 'Crystal Palace', 'Everton',
    'Hull', 'Leicester', 'Liverpool', 'Man City', 'Man United', 'Middlesbrough',
    'Southampton', 'Stoke', 'Sunderland', 'Swansea', 'Tottenham', 'Watford',
    'West Brom', 'West Ham'])

season_1718_teams_names_map = _abbreviations_for([
    'Arsenal', 'Bournemouth', 'Burnley', 'Chelsea', 'Crystal Palace', 'Everton',
    'Leicester', 'Liverpool', 'Man City', 'Man United', 'Newcastle', 'Brighton',
    'Southampton', 'Stoke', 'Huddersfield', 'Swansea', 'Tottenham', 'Watford',
    'West Brom', 'West Ham'])

season_1819_teams_names_map = _abbreviations_for([
    'Arsenal', 'Bournemouth', 'Burnley', 'Chelsea', 'Crystal Palace', 'Everton',
    'Leicester', 'Liverpool', 'Man City', 'Man United', 'Newcastle', 'Brighton',
    'Southampton', 'Cardiff', 'Huddersfield', 'Fulham', 'Tottenham', 'Watford',
    'Wolves', 'West Ham'])

season_1920_teams_names_map = _abbreviations_for([
    'Arsenal', 'Bournemouth', 'Burnley', 'Chelsea', 'Crystal Palace', 'Everton',
    'Leicester', 'Liverpool', 'Man City', 'Man United', 'Newcastle', 'Brighton',
    'Southampton', 'Norwich', 'Sheffield United', 'Aston Villa', 'Tottenham', 'Watford',
    'Wolves', 'West Ham'])

season_map_list = [
    season_1516_teams_names_map,
    season_1617_teams_names_map,
    season_1718_teams_names_map,
    season_1819_teams_names_map,
    season_1920_teams_names_map,
]
season_list = [season_1516_ ,season_1617_ ,season_1718_ ,season_1819_ ,season_1920_ ]

# Replace the team names in each season table (in place) with their codes.
for season, abbreviations in zip(season_list, season_map_list):
    for column in ('HomeTeam', 'AwayTeam'):
        # Series.map yields NaN for values missing from the map. Falling back
        # to the current value keeps unknown names, and makes re-running this
        # cell harmless: already abbreviated names are left as they are
        # instead of turning into NaN.
        season[column] = season[column].map(abbreviations).fillna(season[column])

In [13]:
# Drop the Date index (replaced by a plain 0..n-1 index) because it will not
# help in the further analysis.
season_1516_, season_1617_, season_1718_, season_1819_, season_1920_ = (
    season_frame.reset_index(drop = True)
    for season_frame in (season_1516_, season_1617_, season_1718_, season_1819_, season_1920_)
)

In [14]:
# Preview the first ten rows of season_1516_ (team codes plus the xG columns)
season_1516_.head(10)

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,...,HF,AF,HC,AC,HY,AY,HR,AR,xG_home,xG_away
0,MUN,TOT,1,0,H,1,0,H,9,9,...,12,12,1,2,2,3,0,0,0.627539,0.6746
1,CHE,SWA,2,2,D,2,1,H,11,18,...,15,16,4,8,1,3,1,0,0.64396,2.59203
2,LEI,SUN,4,2,H,3,0,H,19,10,...,13,17,6,3,2,4,0,0,2.56803,1.45946
3,NOR,CRY,1,3,A,0,1,A,17,11,...,14,20,1,4,1,0,0,0,1.13076,2.10975
4,EVE,WAT,2,2,D,0,1,A,10,11,...,7,13,8,2,1,2,0,0,0.604226,0.557892
5,BOU,AVA,0,1,A,0,0,D,11,7,...,13,13,6,3,3,4,0,0,0.876106,0.782253
6,ARS,WHU,0,2,A,0,1,A,22,8,...,12,9,5,4,1,3,0,0,1.33166,0.535961
7,NEW,SOU,2,2,D,1,1,D,9,15,...,9,12,6,6,2,4,0,0,1.54613,1.2529
8,STO,LIV,0,1,A,0,0,D,7,8,...,9,16,3,5,2,4,0,0,0.381274,0.329873
9,WBA,MCI,0,3,A,0,2,A,9,19,...,12,9,6,6,4,1,0,0,0.435238,1.9242
