In [1]:
# Importing modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from pathlib import Path
from copy import deepcopy
%matplotlib inline

In [2]:
# Importing main data
def import_main_data() -> dict:
    """Load every season CSV found in ``../main_data``.

    Returns
    -------
    dict
        Maps each CSV file name (e.g. ``'season_2019.csv'``) to the DataFrame
        read from it. Keys are inserted in sorted file-name order.
    """
    path = Path("../main_data")
    dct = {}
    # os.listdir() returns entries in arbitrary order; sort them so callers
    # that rely on the key order see the seasons in a stable sequence.
    for season in sorted(os.listdir(path)):
        new_path = os.path.join(path, season)
        # Ignore anything that is not a CSV file (e.g. ``.ipynb_checkpoints``
        # folders or OS metadata files) instead of handing it to read_csv.
        if not (season.lower().endswith('.csv') and os.path.isfile(new_path)):
            continue
        dct[season] = pd.read_csv(new_path)
    return dct

In [3]:
# Importing xG datasets
def import_xG_data(years=(2015, 2016, 2017, 2018, 2019, 2020)) -> dict:
    """Load the per-team xG CSV files stored under ``../data/season_<year>/``.

    Parameters
    ----------
    years : iterable of int, optional
        Seasons to load, in the order they should appear in the result.
        Defaults to 2015-2020.

    Returns
    -------
    dict
        Maps each CSV file name found in the season folders to its DataFrame
        (the first CSV column is used as the index). Files are inserted
        season by season, following the order of ``years``.

    Raises
    ------
    FileNotFoundError
        If the ``../data`` directory does not exist.
    """
    path = Path("../data")
    if not os.path.isdir(path):
        raise FileNotFoundError(f"xG data directory not found: {path}")

    dct = {}
    # Iterate over the seasons themselves rather than over the directory
    # listing: an extra entry in ``../data`` must not shift or overflow the
    # year lookup.
    for year in years:
        new_path = os.path.join(path, f'season_{year}')
        if not os.path.isdir(new_path):
            # Season folder not present on disk: nothing to load for it.
            continue

        for team in sorted(os.listdir(new_path)):
            path_to_file = os.path.join(new_path, team)
            # Only real CSV files are data; skip checkpoints/metadata.
            if not (team.lower().endswith('.csv') and os.path.isfile(path_to_file)):
                continue
            dct[team] = pd.read_csv(path_to_file, index_col=0)

    return dct

In [4]:
# Preprocessing main data
def clear_data(data: dict) -> dict:
    """Return a cleaned deep copy of the season frames; the input is untouched.

    For every season only the columns from 'Date' through 'AR' are kept,
    'Referee' and 'Time' are removed when present, the row index is renumbered
    from zero and 'Date' is parsed as a day-first datetime.
    """
    cleaned = deepcopy(data)
    for season_key in cleaned:
        frame = cleaned[season_key].loc[:, 'Date':'AR']
        # errors='ignore' because not every season file has both columns.
        frame = frame.drop(['Referee', 'Time'], axis=1, errors='ignore')
        frame = frame.reset_index(drop=True)
        frame['Date'] = pd.to_datetime(frame['Date'], dayfirst=True)
        cleaned[season_key] = frame
    return cleaned

In [5]:
# Per-season translation of the long club names used in the xG files into the
# short club names used in the main match data. One dict per season, collected
# in chronological order in ``season_map_list``.
season_1516_teams_names_map = {'Arsenal' : 'Arsenal', 'Aston Villa' : 'Aston Villa', 'Bournemouth' : 'Bournemouth',
                               'Chelsea' : 'Chelsea', 'Crystal Palace' : 'Crystal Palace', 'Everton' : 'Everton',
                               'Leicester' : 'Leicester', 'Liverpool' : 'Liverpool', 'Manchester City' : 'Man City',
                               'Manchester United' : 'Man United', 'Newcastle United' : 'Newcastle', 'Norwich' : 'Norwich',
                               'Southampton' : 'Southampton', 'Stoke' : 'Stoke', 'Sunderland' : 'Sunderland',
                               'Swansea' : 'Swansea', 'Tottenham' : 'Tottenham', 'Watford' : 'Watford',
                               'West Bromwich Albion' : 'West Brom', 'West Ham' : 'West Ham'}

season_1617_teams_names_map = {'Arsenal' : 'Arsenal',  'Bournemouth' : 'Bournemouth', 'Burnley' : 'Burnley',
                               'Chelsea' : 'Chelsea', 'Crystal Palace' : 'Crystal Palace', 'Everton' : 'Everton',
                               'Leicester' : 'Leicester', 'Liverpool' : 'Liverpool', 'Manchester City' : 'Man City',
                               'Manchester United' : 'Man United',  'Middlesbrough' : 'Middlesbrough',
                               'Southampton' : 'Southampton', 'Stoke' : 'Stoke', 'Sunderland' : 'Sunderland',
                               'Swansea' : 'Swansea', 'Tottenham' : 'Tottenham', 'Watford' : 'Watford',
                               'West Bromwich Albion' : 'West Brom', 'West Ham' : 'West Ham',  'Hull' : 'Hull'}

season_1718_teams_names_map = {'Arsenal' : 'Arsenal',  'Bournemouth' : 'Bournemouth', 'Burnley' : 'Burnley',
                               'Chelsea' : 'Chelsea', 'Crystal Palace' : 'Crystal Palace', 'Everton' : 'Everton',
                               'Leicester' : 'Leicester', 'Liverpool' : 'Liverpool', 'Manchester City' : 'Man City',
                               'Manchester United' : 'Man United',  'Newcastle United' : 'Newcastle', 'Brighton' : 'Brighton',
                               'Southampton' : 'Southampton', 'Stoke' : 'Stoke',  'Huddersfield' : 'Huddersfield',
                               'Swansea' : 'Swansea', 'Tottenham' : 'Tottenham', 'Watford' : 'Watford',
                               'West Bromwich Albion' : 'West Brom', 'West Ham' : 'West Ham'}

season_1819_teams_names_map = {'Arsenal' : 'Arsenal',  'Bournemouth' : 'Bournemouth', 'Burnley' : 'Burnley',
                               'Chelsea' : 'Chelsea', 'Crystal Palace' : 'Crystal Palace', 'Everton' : 'Everton',
                               'Leicester' : 'Leicester', 'Liverpool' : 'Liverpool', 'Manchester City' : 'Man City',
                               'Manchester United' : 'Man United',  'Newcastle United' : 'Newcastle', 'Brighton' : 'Brighton',
                               'Southampton' : 'Southampton', 'Cardiff' : 'Cardiff',  'Huddersfield' : 'Huddersfield',
                               'Fulham' : 'Fulham', 'Tottenham' : 'Tottenham', 'Watford' : 'Watford',
                               'Wolverhampton Wanderers' : 'Wolves', 'West Ham' : 'West Ham'}

season_1920_teams_names_map = {'Arsenal' : 'Arsenal',  'Bournemouth' : 'Bournemouth', 'Burnley' : 'Burnley',
                               'Chelsea' : 'Chelsea', 'Crystal Palace' : 'Crystal Palace', 'Everton' : 'Everton',
                               'Leicester' : 'Leicester', 'Liverpool' : 'Liverpool', 'Manchester City' : 'Man City',
                               'Manchester United' : 'Man United',  'Newcastle United' : 'Newcastle', 'Brighton' : 'Brighton',
                               'Southampton' : 'Southampton', 'Norwich' : 'Norwich',  'Sheffield United' : 'Sheffield United',
                               'Aston Villa' : 'Aston Villa', 'Tottenham' : 'Tottenham', 'Watford' : 'Watford',
                               'Wolverhampton Wanderers' : 'Wolves', 'West Ham' : 'West Ham'}

# 'West Bromwich Albion' must be listed here: the club played in this season,
# and a name missing from the dict is turned into NaN by Series.map(), which
# breaks the name-sorted alignment of xG values with matches.
# The 'Norwich' and 'Watford' entries are kept as they were; unused keys are
# harmless for .map().
season_2021_teams_names_map = {'Arsenal' : 'Arsenal',  'Burnley' : 'Burnley', 'Chelsea' : 'Chelsea',
                               'Crystal Palace' : 'Crystal Palace', 'Everton' : 'Everton', 'Fulham' : 'Fulham', 'Leeds' : 'Leeds',
                               'Leicester' : 'Leicester', 'Liverpool' : 'Liverpool', 'Manchester City' : 'Man City',
                               'Manchester United' : 'Man United',  'Newcastle United' : 'Newcastle', 'Brighton' : 'Brighton',
                               'Southampton' : 'Southampton', 'Norwich' : 'Norwich',  'Sheffield United' : 'Sheffield United',
                               'Aston Villa' : 'Aston Villa', 'Tottenham' : 'Tottenham', 'Watford' : 'Watford',
                               'West Bromwich Albion' : 'West Brom',
                               'Wolverhampton Wanderers' : 'Wolves', 'West Ham' : 'West Ham'}

# Index 0 is season 2015/16, index 5 is season 2020/21.
season_map_list = [season_1516_teams_names_map, season_1617_teams_names_map, season_1718_teams_names_map,
                  season_1819_teams_names_map,season_1920_teams_names_map, season_2021_teams_names_map]

# Three-letter code for every club that appears in at least one season, keyed
# by the short club name used in the main match data.
_CLUB_CODES = {
    'Arsenal': 'ARS', 'Aston Villa': 'AVA', 'Bournemouth': 'BOU',
    'Brighton': 'BRI', 'Burnley': 'BUR', 'Cardiff': 'CAR',
    'Chelsea': 'CHE', 'Crystal Palace': 'CRY', 'Everton': 'EVE',
    'Fulham': 'FUL', 'Huddersfield': 'HUD', 'Hull': 'HUL',
    'Leeds': 'LEE', 'Leicester': 'LEI', 'Liverpool': 'LIV',
    'Man City': 'MCI', 'Man United': 'MUN', 'Middlesbrough': 'MID',
    'Newcastle': 'NEW', 'Norwich': 'NOR', 'Sheffield United': 'SHU',
    'Southampton': 'SOU', 'Stoke': 'STO', 'Sunderland': 'SUN',
    'Swansea': 'SWA', 'Tottenham': 'TOT', 'Watford': 'WAT',
    'West Brom': 'WBA', 'West Ham': 'WHU', 'Wolves': 'WLV',
}


def _codes_for(*clubs):
    """Return a {club: code} dict for the given clubs, in the given order."""
    return {club: _CLUB_CODES[club] for club in clubs}


# NOTE: these assignments rebind the ``season_*_teams_names_map`` names; from
# here on the names refer to the short-name -> code dicts below.
season_1516_teams_names_map = _codes_for(
    'Arsenal', 'Aston Villa', 'Bournemouth', 'Chelsea', 'Crystal Palace',
    'Everton', 'Leicester', 'Liverpool', 'Man City', 'Man United',
    'Newcastle', 'Norwich', 'Southampton', 'Stoke', 'Sunderland',
    'Swansea', 'Tottenham', 'Watford', 'West Brom', 'West Ham',
)

season_1617_teams_names_map = _codes_for(
    'Arsenal', 'Bournemouth', 'Burnley', 'Chelsea', 'Crystal Palace',
    'Everton', 'Hull', 'Leicester', 'Liverpool', 'Man City',
    'Man United', 'Middlesbrough', 'Southampton', 'Stoke', 'Sunderland',
    'Swansea', 'Tottenham', 'Watford', 'West Brom', 'West Ham',
)

season_1718_teams_names_map = _codes_for(
    'Arsenal', 'Bournemouth', 'Burnley', 'Chelsea', 'Crystal Palace',
    'Everton', 'Leicester', 'Liverpool', 'Man City', 'Man United',
    'Newcastle', 'Brighton', 'Southampton', 'Stoke', 'Huddersfield',
    'Swansea', 'Tottenham', 'Watford', 'West Brom', 'West Ham',
)

season_1819_teams_names_map = _codes_for(
    'Arsenal', 'Bournemouth', 'Burnley', 'Chelsea', 'Crystal Palace',
    'Everton', 'Leicester', 'Liverpool', 'Man City', 'Man United',
    'Newcastle', 'Brighton', 'Southampton', 'Cardiff', 'Huddersfield',
    'Fulham', 'Tottenham', 'Watford', 'Wolves', 'West Ham',
)

season_1920_teams_names_map = _codes_for(
    'Arsenal', 'Bournemouth', 'Burnley', 'Chelsea', 'Crystal Palace',
    'Everton', 'Leicester', 'Liverpool', 'Man City', 'Man United',
    'Newcastle', 'Brighton', 'Southampton', 'Norwich', 'Sheffield United',
    'Aston Villa', 'Tottenham', 'Watford', 'Wolves', 'West Ham',
)

season_2021_teams_names_map = _codes_for(
    'Arsenal', 'Burnley', 'Leeds', 'Fulham', 'Chelsea',
    'Crystal Palace', 'Everton', 'Leicester', 'Liverpool', 'Man City',
    'Man United', 'Newcastle', 'Brighton', 'Southampton', 'Sheffield United',
    'Aston Villa', 'Tottenham', 'West Brom', 'Wolves', 'West Ham',
)

# Chronological list of the code maps: index 0 is 2015/16, index 5 is 2020/21.
shortcut_map_list = [
    season_1516_teams_names_map,
    season_1617_teams_names_map,
    season_1718_teams_names_map,
    season_1819_teams_names_map,
    season_1920_teams_names_map,
    season_2021_teams_names_map,
]

In [6]:
# Merging datasets into one
def merge_datasets(dct: dict, data: dict, seasons_map: list, shortcuts: list,
                   years=(2015, 2016, 2017, 2018, 2019, 2020)) -> dict:
    """Attach xG values to the match data and abbreviate the club names.

    Parameters
    ----------
    dct : dict
        xG frames keyed by file name ``'<team>_<year>.csv'``. Each frame needs
        the columns 'home_team', 'away_team', 'xG_home' and 'xG_away'.
        The input is deep-copied and never modified.
    data : dict
        Season match frames with the columns 'Date', 'HomeTeam' and
        'AwayTeam'. This dict is updated IN PLACE and returned.
    seasons_map : list
        One dict per season translating xG club names into the names used in
        ``data``; position ``i`` belongs to ``years[i]``.
    shortcuts : list
        One dict per season translating the club names in ``data`` into short
        codes; position ``i`` belongs to ``years[i]``.
    years : iterable of int, optional
        Season years, in the order matching ``seasons_map`` / ``shortcuts``.

    Returns
    -------
    dict
        ``data`` itself. Every season frame is ordered by date, has the extra
        columns 'xG_home' and 'xG_away', coded 'HomeTeam' / 'AwayTeam' values
        and a fresh 0..n-1 index. 'Date' is used only for ordering and is not
        kept as a column.

    Raises
    ------
    KeyError
        If no xG frame exists for a home team of a season.
    """
    years = list(years)
    # Long xG club names whose file names must be renamed to the short name
    # used in ``data`` so that ``'<HomeTeam>_<year>.csv'`` lookups succeed.
    aliases = {
        'Manchester City': 'Man City',
        'Manchester United': 'Man United',
        'Newcastle United': 'Newcastle',
        'West Bromwich Albion': 'West Brom',
        'Wolverhampton Wanderers': 'Wolves',
    }

    def season_position(key, default=None):
        """Return the index in ``years`` of the first year named in ``key``."""
        for position, year in enumerate(years):
            if str(year) in key:
                return position
        return default

    # Step 1: normalise club names inside every xG frame and in the keys.
    # The season is read from each key, so the result does not depend on the
    # order of ``dct``.
    xg_frames = {}
    for key, frame in deepcopy(dct).items():
        position = season_position(key)
        if position is not None:
            mapping = seasons_map[position]

            def to_main_name(name, mapping=mapping):
                # Keep names missing from the season map (alias-normalised)
                # rather than turning them into NaN.
                return mapping.get(name, aliases.get(name, name))

            frame['home_team'] = frame['home_team'].map(to_main_name)
            frame['away_team'] = frame['away_team'].map(to_main_name)

        team, separator, suffix = key.rpartition('_')
        if separator and team in aliases:
            key = f'{aliases[team]}_{suffix}'
        xg_frames[key] = frame

    # Step 2: per season, attach xG to each home team's matches, restore the
    # chronological order and replace club names by their codes.
    for order, key in enumerate(data):
        # Prefer the year named in the key; fall back to the position of the
        # season inside ``data`` when the key names no known year.
        position = season_position(key, default=order)
        year = years[position]
        season = data[key]

        per_home_team = []
        for h_team in season['HomeTeam'].sort_values().unique():
            home_rows = (season[season['HomeTeam'] == h_team]
                         .sort_values(by=['AwayTeam'])
                         .reset_index(drop=True))
            # Match on the opponent's name, not on row position, so that a
            # missing or unmapped row cannot shift values onto other matches.
            xg_by_opponent = (xg_frames[f'{h_team}_{year}.csv']
                              .drop_duplicates(subset='away_team')
                              .set_index('away_team'))
            home_rows['xG_home'] = home_rows['AwayTeam'].map(xg_by_opponent['xG_home'])
            home_rows['xG_away'] = home_rows['AwayTeam'].map(xg_by_opponent['xG_away'])
            per_home_team.append(home_rows.set_index('Date'))

        merged = pd.concat(per_home_team).sort_index()
        merged['HomeTeam'] = merged['HomeTeam'].map(shortcuts[position])
        merged['AwayTeam'] = merged['AwayTeam'].map(shortcuts[position])
        # Write into the ``data`` argument, not into a notebook global.
        data[key] = merged.reset_index(drop=True)

    return data

In [7]:
# Working on data
# Load both raw sources, then tidy the match data.
main_data = import_main_data()
xG_data = import_xG_data()
cleared_data = clear_data(main_data)
# merge_datasets updates and returns the dict passed as `data`, so `merged`
# and `cleared_data` refer to the same object after this call.
merged = merge_datasets(
    dct=xG_data,
    data=cleared_data,
    seasons_map=season_map_list,
    shortcuts=shortcut_map_list,
)

In [9]:
# Preview the first 20 rows of the merged frame stored under 'season_2019.csv'.
merged['season_2019.csv'].head(20)

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,...,HF,AF,HC,AC,HY,AY,HR,AR,xG_home,xG_away
0,LIV,NOR,4,1,H,4,0,H,15,12,...,9,9,11,2,0,2,0,0,2.23456,0.842407
1,TOT,AVA,3,1,H,0,1,A,31,7,...,13,9,14,0,1,0,0,0,2.57262,0.639599
2,WAT,BRI,0,3,A,0,1,A,11,5,...,15,11,5,2,0,1,0,0,0.670022,0.855516
3,BOU,SHU,1,1,D,0,0,D,13,8,...,10,19,3,4,2,1,0,0,1.34099,1.59864
4,CRY,EVE,0,0,D,0,0,D,6,10,...,16,14,6,2,2,1,0,1,0.87159,1.2246
5,WHU,MCI,0,5,A,0,1,A,5,14,...,6,13,1,1,2,2,0,0,1.2003,3.18377
6,BUR,SOU,3,0,H,0,0,D,10,11,...,6,12,2,7,0,0,0,0,0.909241,1.08752
7,NEW,ARS,0,1,A,0,0,D,9,8,...,12,7,5,3,1,3,0,0,0.380551,1.13309
8,LEI,WLV,0,0,D,0,0,D,15,8,...,3,13,12,3,0,2,0,0,0.543759,0.812949
9,MUN,CHE,4,0,H,1,0,H,11,18,...,15,13,3,5,3,4,0,0,2.37442,1.09534
