This notebook deals with creating a dataset that combines the player wage and valuations data which will be the predictor variables for our model.|

In [1]:
import sys
from pathlib import Path
nb_path = Path.cwd().parent.parent 
sys.path.insert(0, str(nb_path))

import pandas as pd

# google cloud storage
from processing.gcp.storage import gcp 

pd.set_option('display.max_columns', None)

## Wages

In [2]:
def load_wages(league: str) -> pd.DataFrame:
    return gcp.read_df_from_bucket(
        bucket_name="processed_fbref_db", blob_name=f"processed_{league}-wages.csv"
    )

In [3]:
prem_wages = load_wages("Premier-League")

In [4]:
prem_wages.head()

Unnamed: 0,rk,player,nation,pos,squad,age,weekly_wages_euros,annual_wages_euros,notes,season,general_pos,age_range,country,continent,player_id
0,1,Alexis Sánchez,CHI,"MF,FW",Arsenal,28,402897,20950637,Unverified estimation,2017-2018,Midfielder,25-29,Chile,South America,2111.0
1,2,Mesut Özil,GER,"MF,FW",Arsenal,28,402897,20950637,Unverified estimation,2017-2018,Midfielder,25-29,Germany,Europe,1796.0
2,3,Eden Hazard,BEL,"FW,MF",Chelsea,26,256548,13340491,Unverified estimation,2017-2018,Forward,25-29,Belgium,Europe,1003.0
3,4,Cesc Fàbregas,ESP,MF,Chelsea,30,236735,12310226,Unverified estimation,2017-2018,Midfielder,30-34,Spain,Europe,744.0
4,5,Henrikh Mkhitaryan,ARM,"FW,MF",Arsenal,28,230227,11971793,Unverified estimation,2017-2018,Forward,25-29,Armenia,Asia,1608.0


In [5]:
prem_wages.shape

(4048, 15)

In [6]:
prem_wages.nunique()

rk                     638
player                1564
nation                  97
pos                     22
squad                   30
age                     26
weekly_wages_euros     642
annual_wages_euros     644
notes                    1
season                   7
general_pos              5
age_range                6
country                 96
continent                7
player_id             1332
dtype: int64

In [7]:
prem_wages['pos'].unique()

array(['MF,FW', 'FW,MF', 'MF', 'GK', 'FW', 'DF', nan, 'DF,MF', 'DF,FW',
       'FW,DF', 'MF,DF', 'CB', 'RW', 'CM', 'RB', 'DF,GK', 'SS', 'LB',
       'LW', 'CF', 'AM', 'DM', 'LM'], dtype=object)

## Values

In [8]:
def load_valuations(league: str) -> pd.DataFrame:
    return gcp.read_df_from_bucket(
        bucket_name="processed_transfermarkt_db", blob_name=f"processed_{league}_player_valuations.csv"
    )

In [180]:
prem_values = load_valuations("premier_league")

In [10]:
prem_values.head()

Unnamed: 0,tm_id,tm_name,player,squad_num,position,age,country,current_club,height,foot,signed_date,signed_from,signing_fee_euro_mill,contract_expiry,market_value_euro_mill,season,league,team,signed_year,player_id
0,238223,ederson,Ederson,31,Goalkeeper,24,Brazil,Manchester City,188.0,left,"Jul 1, 2017",SL Benfica,40.0,,50.0,2017,premier_league,manchester-city,2017.0,709.0
1,40204,joe-hart,Joe Hart,0,Goalkeeper,31,England,Celtic FC,196.0,right,"Jul 1, 2006",Shrewsbury Town,0.9,,10.0,2017,premier_league,manchester-city,2006.0,998.0
2,40423,claudio-bravo,Claudio Bravo,1,Goalkeeper,35,Chile,Real Betis Balompié,184.0,right,"Aug 25, 2016",FC Barcelona,18.0,,3.5,2017,premier_league,manchester-city,2016.0,339.0
3,201574,angus-gunn,Angus Gunn,54,Goalkeeper,22,Scotland,Norwich City,196.0,right,"Jul 1, 2016",Manchester City U23,,,2.0,2017,premier_league,manchester-city,2016.0,2853.0
4,186590,john-stones,John Stones,5,Centre-Back,24,England,Manchester City,188.0,right,"Aug 9, 2016",Everton FC,55.6,,50.0,2017,premier_league,manchester-city,2016.0,2274.0


In [11]:
prem_values['position'].unique()

array(['Goalkeeper', 'Centre-Back', 'Left-Back', 'Right-Back',
       'Defensive-Midfield', 'Central-Midfield', 'Attacking-Midfield',
       'Left-Winger', 'Right-Winger', 'Centre-Forward', 'Right-Midfield',
       'Left-Midfield', 'Second-Striker'], dtype=object)

In [182]:
prem_values['team'].str.replace('^(a?fc)-', '', regex=True).unique()

array(['manchester-city', 'chelsea', 'liverpool', 'manchester-united',
       'tottenham-hotspur', 'arsenal', 'everton', 'leicester-city',
       'southampton', 'west-ham-united', 'crystal-palace',
       'newcastle-united', 'watford', 'stoke-city', 'burnley',
       'west-bromwich-albion', 'swansea-city', 'brighton-amp-hove-albion',
       'bournemouth', 'huddersfield-town', 'wolverhampton-wanderers',
       'fulham', 'cardiff-city', 'aston-villa', 'norwich-city',
       'sheffield-united', 'leeds-united', 'brentford',
       'nottingham-forest', 'luton-town'], dtype=object)

In [12]:
prem_values.loc[:, 'team'] = prem_values['team'].str.replace('-', ' ').str.replace('a?fc ', '', regex=True)

In [13]:
# compare team names
val_teams = prem_values['team'].unique()
len(val_teams)

30

In [14]:
wage_teams = prem_wages['squad'].unique()
len(wage_teams)

30

In [15]:
val_teams.sort()
val_teams

array(['arsenal', 'aston villa', 'bournemouth', 'brentford',
       'brighton amp hove albion', 'burnley', 'cardiff city', 'chelsea',
       'crystal palace', 'everton', 'fulham', 'huddersfield town',
       'leeds united', 'leicester city', 'liverpool', 'luton town',
       'manchester city', 'manchester united', 'newcastle united',
       'norwich city', 'nottingham forest', 'sheffield united',
       'southampton', 'stoke city', 'swansea city', 'tottenham hotspur',
       'watford', 'west bromwich albion', 'west ham united',
       'wolverhampton wanderers'], dtype=object)

In [16]:
wage_teams.sort()
wage_teams

array(['Arsenal', 'Aston Villa', 'Bournemouth', 'Brentford', 'Brighton',
       'Burnley', 'Cardiff City', 'Chelsea', 'Crystal Palace', 'Everton',
       'Fulham', 'Huddersfield', 'Leeds United', 'Leicester City',
       'Liverpool', 'Luton Town', 'Manchester City', 'Manchester Utd',
       'Newcastle Utd', 'Norwich City', "Nott'ham Forest",
       'Sheffield Utd', 'Southampton', 'Stoke City', 'Swansea City',
       'Tottenham', 'Watford', 'West Brom', 'West Ham', 'Wolves'],
      dtype=object)

In [17]:
team_dict = dict(zip(val_teams, wage_teams))

In [18]:
# change team names in valuations df
def change_team_names(df: pd.DataFrame, team_dict: dict) -> pd.DataFrame:
    df.loc[:, 'team'] = df['team'].map(team_dict)
    return df

In [19]:
prem_values = change_team_names(prem_values, team_dict)

In [20]:
prem_values.head()

Unnamed: 0,tm_id,tm_name,player,squad_num,position,age,country,current_club,height,foot,signed_date,signed_from,signing_fee_euro_mill,contract_expiry,market_value_euro_mill,season,league,team,signed_year,player_id
0,238223,ederson,Ederson,31,Goalkeeper,24,Brazil,Manchester City,188.0,left,"Jul 1, 2017",SL Benfica,40.0,,50.0,2017,premier_league,Manchester City,2017.0,709.0
1,40204,joe-hart,Joe Hart,0,Goalkeeper,31,England,Celtic FC,196.0,right,"Jul 1, 2006",Shrewsbury Town,0.9,,10.0,2017,premier_league,Manchester City,2006.0,998.0
2,40423,claudio-bravo,Claudio Bravo,1,Goalkeeper,35,Chile,Real Betis Balompié,184.0,right,"Aug 25, 2016",FC Barcelona,18.0,,3.5,2017,premier_league,Manchester City,2016.0,339.0
3,201574,angus-gunn,Angus Gunn,54,Goalkeeper,22,Scotland,Norwich City,196.0,right,"Jul 1, 2016",Manchester City U23,,,2.0,2017,premier_league,Manchester City,2016.0,2853.0
4,186590,john-stones,John Stones,5,Centre-Back,24,England,Manchester City,188.0,right,"Aug 9, 2016",Everton FC,55.6,,50.0,2017,premier_league,Manchester City,2016.0,2274.0


## Join wages and valuations

## Premier League

In [21]:
# redefine season variable for wages to match values season
def redefine_season(df: pd.DataFrame) -> pd.DataFrame:
    df.loc[:, 'season'] = df['season'].apply(lambda x: x.split('-')[0]).astype(int)
    return df

In [22]:
prem_wages = redefine_season(prem_wages)

In [23]:
prem_wages.head()

Unnamed: 0,rk,player,nation,pos,squad,age,weekly_wages_euros,annual_wages_euros,notes,season,general_pos,age_range,country,continent,player_id
0,1,Alexis Sánchez,CHI,"MF,FW",Arsenal,28,402897,20950637,Unverified estimation,2017,Midfielder,25-29,Chile,South America,2111.0
1,2,Mesut Özil,GER,"MF,FW",Arsenal,28,402897,20950637,Unverified estimation,2017,Midfielder,25-29,Germany,Europe,1796.0
2,3,Eden Hazard,BEL,"FW,MF",Chelsea,26,256548,13340491,Unverified estimation,2017,Forward,25-29,Belgium,Europe,1003.0
3,4,Cesc Fàbregas,ESP,MF,Chelsea,30,236735,12310226,Unverified estimation,2017,Midfielder,30-34,Spain,Europe,744.0
4,5,Henrikh Mkhitaryan,ARM,"FW,MF",Arsenal,28,230227,11971793,Unverified estimation,2017,Forward,25-29,Armenia,Asia,1608.0


In [24]:
# remove columns that are not needed
def drop_wage_columns(df: pd.DataFrame) -> pd.DataFrame:
    wage_cols_to_drop = ["nation", "pos", "notes", "rk", "general_pos", "country"]
    return df.drop(columns=wage_cols_to_drop)

In [25]:
prem_wages = drop_wage_columns(prem_wages)

In [26]:
# same for values
prem_values.head()

Unnamed: 0,tm_id,tm_name,player,squad_num,position,age,country,current_club,height,foot,signed_date,signed_from,signing_fee_euro_mill,contract_expiry,market_value_euro_mill,season,league,team,signed_year,player_id
0,238223,ederson,Ederson,31,Goalkeeper,24,Brazil,Manchester City,188.0,left,"Jul 1, 2017",SL Benfica,40.0,,50.0,2017,premier_league,Manchester City,2017.0,709.0
1,40204,joe-hart,Joe Hart,0,Goalkeeper,31,England,Celtic FC,196.0,right,"Jul 1, 2006",Shrewsbury Town,0.9,,10.0,2017,premier_league,Manchester City,2006.0,998.0
2,40423,claudio-bravo,Claudio Bravo,1,Goalkeeper,35,Chile,Real Betis Balompié,184.0,right,"Aug 25, 2016",FC Barcelona,18.0,,3.5,2017,premier_league,Manchester City,2016.0,339.0
3,201574,angus-gunn,Angus Gunn,54,Goalkeeper,22,Scotland,Norwich City,196.0,right,"Jul 1, 2016",Manchester City U23,,,2.0,2017,premier_league,Manchester City,2016.0,2853.0
4,186590,john-stones,John Stones,5,Centre-Back,24,England,Manchester City,188.0,right,"Aug 9, 2016",Everton FC,55.6,,50.0,2017,premier_league,Manchester City,2016.0,2274.0


In [27]:
def drop_value_columns(df: pd.DataFrame) -> pd.DataFrame:
    val_cols_to_drop = ['tm_id', 'tm_name', 'squad_num', 'contract_expiry', 'current_club', 'signed_date']
    return df.drop(columns=val_cols_to_drop)

In [28]:
prem_values = drop_value_columns(prem_values)

In [29]:
prem_values.head()

Unnamed: 0,player,position,age,country,height,foot,signed_from,signing_fee_euro_mill,market_value_euro_mill,season,league,team,signed_year,player_id
0,Ederson,Goalkeeper,24,Brazil,188.0,left,SL Benfica,40.0,50.0,2017,premier_league,Manchester City,2017.0,709.0
1,Joe Hart,Goalkeeper,31,England,196.0,right,Shrewsbury Town,0.9,10.0,2017,premier_league,Manchester City,2006.0,998.0
2,Claudio Bravo,Goalkeeper,35,Chile,184.0,right,FC Barcelona,18.0,3.5,2017,premier_league,Manchester City,2016.0,339.0
3,Angus Gunn,Goalkeeper,22,Scotland,196.0,right,Manchester City U23,,2.0,2017,premier_league,Manchester City,2016.0,2853.0
4,John Stones,Centre-Back,24,England,188.0,right,Everton FC,55.6,50.0,2017,premier_league,Manchester City,2016.0,2274.0


In [30]:
def join_wages_values(wages_df: pd.DataFrame, values_df: pd.DataFrame) -> pd.DataFrame:
    return pd.merge(
        left=wages_df,
        right=values_df,
        how="inner",
        left_on=["player", "season", "squad"],
        right_on=["player", "season", "team"],
        suffixes=("_wage_df", "_value_df"),
    )

In [31]:
prem_join = join_wages_values(prem_wages, prem_values)

In [32]:
prem_join.head()

Unnamed: 0,player,squad,age_wage_df,weekly_wages_euros,annual_wages_euros,season,age_range,continent,player_id_wage_df,position,age_value_df,country,height,foot,signed_from,signing_fee_euro_mill,market_value_euro_mill,league,team,signed_year,player_id_value_df
0,Alexis Sánchez,Arsenal,28,402897,20950637,2017,25-29,South America,2111.0,Centre-Forward,29,Chile,169.0,right,FC Barcelona,42.5,70.0,premier_league,Arsenal,2014.0,2111.0
1,Mesut Özil,Arsenal,28,402897,20950637,2017,25-29,Europe,1796.0,Attacking-Midfield,29,Germany,180.0,left,Real Madrid,47.0,50.0,premier_league,Arsenal,2013.0,1796.0
2,Eden Hazard,Chelsea,26,256548,13340491,2017,25-29,Europe,1003.0,Left-Winger,27,Belgium,175.0,right,LOSC Lille,35.0,110.0,premier_league,Chelsea,2012.0,1003.0
3,Cesc Fàbregas,Chelsea,30,236735,12310226,2017,30-34,Europe,744.0,Central-Midfield,31,Spain,179.0,right,FC Barcelona,33.0,30.0,premier_league,Chelsea,2014.0,744.0
4,Henrikh Mkhitaryan,Arsenal,28,230227,11971793,2017,25-29,Asia,1608.0,Central-Midfield,29,Armenia,177.0,both,Manchester United,34.0,35.0,premier_league,Arsenal,2018.0,1608.0


In [33]:
# check join
prem_join.loc[prem_join['player'] == 'Mesut Özil']

Unnamed: 0,player,squad,age_wage_df,weekly_wages_euros,annual_wages_euros,season,age_range,continent,player_id_wage_df,position,age_value_df,country,height,foot,signed_from,signing_fee_euro_mill,market_value_euro_mill,league,team,signed_year,player_id_value_df
1,Mesut Özil,Arsenal,28,402897,20950637,2017,25-29,Europe,1796.0,Attacking-Midfield,29,Germany,180.0,left,Real Madrid,47.0,50.0,premier_league,Arsenal,2013.0,1796.0
550,Mesut Özil,Arsenal,29,407470,21188462,2018,25-29,Europe,1796.0,Attacking-Midfield,30,Germany,180.0,left,Real Madrid,47.0,25.0,premier_league,Arsenal,2013.0,1796.0
1138,Mesut Özil,Arsenal,30,417398,21704678,2019,30-34,Europe,1796.0,Attacking-Midfield,31,Germany,180.0,left,Real Madrid,47.0,17.5,premier_league,Arsenal,2013.0,1796.0


In [39]:
def sort_columns(df: pd.DataFrame) -> pd.DataFrame:
    # remove duplicate cols
    cols = [col for col in df.columns if '_value_df' not in col]
    df = df[cols]
    
    # rename columns
    df.columns = [col.replace('_wage_df', '') for col in df.columns]
    
    # drop team column
    return df.drop(columns=['team'])
    

In [40]:
prem_join = sort_columns(prem_join)

In [41]:
prem_join.head()

Unnamed: 0,player,squad,age,weekly_wages_euros,annual_wages_euros,season,age_range,continent,player_id,position,country,height,foot,signed_from,signing_fee_euro_mill,market_value_euro_mill,league,signed_year
0,Alexis Sánchez,Arsenal,28,402897,20950637,2017,25-29,South America,2111.0,Centre-Forward,Chile,169.0,right,FC Barcelona,42.5,70.0,premier_league,2014.0
1,Mesut Özil,Arsenal,28,402897,20950637,2017,25-29,Europe,1796.0,Attacking-Midfield,Germany,180.0,left,Real Madrid,47.0,50.0,premier_league,2013.0
2,Eden Hazard,Chelsea,26,256548,13340491,2017,25-29,Europe,1003.0,Left-Winger,Belgium,175.0,right,LOSC Lille,35.0,110.0,premier_league,2012.0
3,Cesc Fàbregas,Chelsea,30,236735,12310226,2017,30-34,Europe,744.0,Central-Midfield,Spain,179.0,right,FC Barcelona,33.0,30.0,premier_league,2014.0
4,Henrikh Mkhitaryan,Arsenal,28,230227,11971793,2017,25-29,Asia,1608.0,Central-Midfield,Armenia,177.0,both,Manchester United,34.0,35.0,premier_league,2018.0


In [183]:
prem_join

Unnamed: 0,player,squad,age,weekly_wages_euros,annual_wages_euros,season,age_range,continent,player_id,position,country,height,foot,signed_from,signing_fee_euro_mill,market_value_euro_mill,league,signed_year
0,Alexis Sánchez,Arsenal,28,402897,20950637,2017,25-29,South America,2111.0,Centre-Forward,Chile,169.0,right,FC Barcelona,42.50,70.00,premier_league,2014.0
1,Mesut Özil,Arsenal,28,402897,20950637,2017,25-29,Europe,1796.0,Attacking-Midfield,Germany,180.0,left,Real Madrid,47.00,50.00,premier_league,2013.0
2,Eden Hazard,Chelsea,26,256548,13340491,2017,25-29,Europe,1003.0,Left-Winger,Belgium,175.0,right,LOSC Lille,35.00,110.00,premier_league,2012.0
3,Cesc Fàbregas,Chelsea,30,236735,12310226,2017,30-34,Europe,744.0,Central-Midfield,Spain,179.0,right,FC Barcelona,33.00,30.00,premier_league,2014.0
4,Henrikh Mkhitaryan,Arsenal,28,230227,11971793,2017,25-29,Asia,1608.0,Central-Midfield,Armenia,177.0,both,Manchester United,34.00,35.00,premier_league,2018.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3679,Joseph Anang,West Ham,23,4654,241990,2023,20-24,Unknown,,Goalkeeper,England,190.0,right,West Ham United U21,,0.30,premier_league,2023.0
3680,Enock Agyei,Burnley,18,4654,241990,2023,Under 20,Unknown,,Right-Winger,Belgium,172.0,left,RSCA Futures,0.35,0.80,premier_league,2023.0
3681,Denis Franchi,Burnley,20,3490,181492,2023,20-24,Europe,,Goalkeeper,Italy,189.0,right,Paris Saint-Germain,,0.25,premier_league,2022.0
3682,Jordan Beyer,Burnley,23,3490,181492,2023,20-24,Europe,2636.0,Centre-Back,Germany,187.0,right,Borussia Mönchengladbach,15.00,16.00,premier_league,2023.0


## La liga Join

In [42]:
# load wages
la_liga_wages = load_wages("La-Liga")

In [43]:
la_liga_wages.head()

Unnamed: 0,rk,player,nation,pos,squad,age,weekly_wages_euros,annual_wages_euros,notes,season,general_pos,age_range,country,continent,player_id
0,1,Lionel Messi,ARG,FW,Barcelona,30,761923,39620000,Unverified estimation,2017-2018,Forward,30-34,Argentina,South America,1580.0
1,2,Cristiano Ronaldo,POR,FW,Real Madrid,32,670192,34850000,Unverified estimation,2017-2018,Forward,30-34,Portugal,Europe,2040.0
2,3,Samuel Umtiti,FRA,DF,Barcelona,23,615385,32000000,Unverified estimation,2017-2018,Defender,20-24,France,Europe,2395.0
3,4,Gerard Piqué,ESP,DF,Barcelona,30,546154,28400000,Unverified estimation,2017-2018,Defender,30-34,Spain,Europe,1883.0
4,5,Gareth Bale,WAL,"FW,MF",Real Madrid,28,532308,27680000,Unverified estimation,2017-2018,Forward,25-29,Wales,Europe,168.0


In [44]:
# load values
la_liga_values = load_valuations("la_liga")

In [45]:
la_liga_values.head()

Unnamed: 0,tm_id,tm_name,player,squad_num,position,age,country,current_club,height,foot,signed_date,signed_from,signing_fee_euro_mill,contract_expiry,market_value_euro_mill,season,league,team,signed_year,player_id
0,74857,marc-andre-ter-stegen,Marc-André ter Stegen,1,Goalkeeper,26,Germany,FC Barcelona,187.0,right,"Jul 1, 2014",Borussia Mönchengladbach,12.0,,60.0,2017,la_liga,fc-barcelona,2014.0,2325.0
1,146227,jasper-cillessen,Jasper Cillessen,13,Goalkeeper,29,Netherlands,NEC Nijmegen,187.0,right,"Aug 25, 2016",Ajax Amsterdam,13.0,,9.0,2017,la_liga,fc-barcelona,2016.0,489.0
2,142033,adrian-ortola,Adrián Ortolá,31,Goalkeeper,24,Spain,CE Sabadell FC,187.0,left,"Jul 1, 2018",FC Barcelona B,,,1.5,2017,la_liga,fc-barcelona,2018.0,
3,126540,samuel-umtiti,Samuel Umtiti,23,Centre-Back,24,France,LOSC Lille,182.0,left,"Jul 14, 2016",Olympique Lyon,25.0,,60.0,2017,la_liga,fc-barcelona,2016.0,2395.0
4,18944,gerard-pique,Gerard Piqué,3,Centre-Back,31,Spain,Retired,194.0,right,"Jul 1, 2008",Manchester United,5.0,,50.0,2017,la_liga,fc-barcelona,2008.0,1883.0


In [46]:
# rename betis to real betis
la_liga_wages.loc[:, 'squad'] = la_liga_wages['squad'].replace('Betis', 'Real Betis')
la_liga_wages.loc[:, 'squad'] = la_liga_wages['squad'].replace('Valladolid', 'Real Valladolid')
la_liga_wages.loc[:, 'squad'] = la_liga_wages['squad'].replace('Málaga', 'Malaga')
la_liga_wages.loc[:, 'squad'] = la_liga_wages['squad'].replace('Cádiz', 'Cadiz')


In [47]:
# change team names in values df with names from wages df
liga_wage_teams = la_liga_wages['squad'].unique()
liga_wage_teams.sort()

In [48]:
la_liga_values.loc[:, 'team'] = la_liga_values['team'].str.replace('^(fc|sd|rcd|ca|ud|cd|deportivo)-', '', regex=True)

In [49]:
liga_values_teams = la_liga_values['team'].unique()
liga_values_teams.sort()

In [50]:
liga_team_dict = dict(zip(liga_values_teams, liga_wage_teams))

In [51]:
la_liga_values = change_team_names(la_liga_values, liga_team_dict)

In [52]:
la_liga_values.head()

Unnamed: 0,tm_id,tm_name,player,squad_num,position,age,country,current_club,height,foot,signed_date,signed_from,signing_fee_euro_mill,contract_expiry,market_value_euro_mill,season,league,team,signed_year,player_id
0,74857,marc-andre-ter-stegen,Marc-André ter Stegen,1,Goalkeeper,26,Germany,FC Barcelona,187.0,right,"Jul 1, 2014",Borussia Mönchengladbach,12.0,,60.0,2017,la_liga,Barcelona,2014.0,2325.0
1,146227,jasper-cillessen,Jasper Cillessen,13,Goalkeeper,29,Netherlands,NEC Nijmegen,187.0,right,"Aug 25, 2016",Ajax Amsterdam,13.0,,9.0,2017,la_liga,Barcelona,2016.0,489.0
2,142033,adrian-ortola,Adrián Ortolá,31,Goalkeeper,24,Spain,CE Sabadell FC,187.0,left,"Jul 1, 2018",FC Barcelona B,,,1.5,2017,la_liga,Barcelona,2018.0,
3,126540,samuel-umtiti,Samuel Umtiti,23,Centre-Back,24,France,LOSC Lille,182.0,left,"Jul 14, 2016",Olympique Lyon,25.0,,60.0,2017,la_liga,Barcelona,2016.0,2395.0
4,18944,gerard-pique,Gerard Piqué,3,Centre-Back,31,Spain,Retired,194.0,right,"Jul 1, 2008",Manchester United,5.0,,50.0,2017,la_liga,Barcelona,2008.0,1883.0


## Join la liga wages and values

In [53]:
la_liga_wages = la_liga_wages.pipe(redefine_season).pipe(drop_wage_columns)

In [54]:
la_liga_values = drop_value_columns(la_liga_values)

In [55]:
la_liga_join = join_wages_values(la_liga_wages, la_liga_values)

In [57]:
la_liga_join = sort_columns(la_liga_join)

In [58]:
la_liga_join.head()

Unnamed: 0,player,squad,age,weekly_wages_euros,annual_wages_euros,season,age_range,continent,player_id,position,country,height,foot,signed_from,signing_fee_euro_mill,market_value_euro_mill,league,signed_year
0,Lionel Messi,Barcelona,30,761923,39620000,2017,30-34,South America,1580.0,Right-Winger,Argentina,170.0,left,FC Barcelona B,,180.0,la_liga,2005.0
1,Cristiano Ronaldo,Real Madrid,32,670192,34850000,2017,30-34,Europe,2040.0,Centre-Forward,Portugal,187.0,right,Manchester United,94.0,100.0,la_liga,2009.0
2,Samuel Umtiti,Barcelona,23,615385,32000000,2017,20-24,Europe,2395.0,Centre-Back,France,182.0,left,Olympique Lyon,25.0,60.0,la_liga,2016.0
3,Gerard Piqué,Barcelona,30,546154,28400000,2017,30-34,Europe,1883.0,Centre-Back,Spain,194.0,right,Manchester United,5.0,50.0,la_liga,2008.0
4,Gareth Bale,Real Madrid,28,532308,27680000,2017,25-29,Europe,168.0,Right-Winger,Wales,186.0,left,Tottenham Hotspur,101.0,90.0,la_liga,2013.0


- Can do the same for the remaining datasets, but need to map teams correctly
- After mapping teams I will create python scripts that deal with joining and loading joined wages and valuations data to gcp bucket

## Bundesliga

In [59]:
bundesliga_wages = load_wages("Bundesliga")

In [61]:
bundesliga_values = load_valuations("bundesliga")

In [68]:
# map teams 
bundes_wage_teams = bundesliga_wages['squad'].unique()
bundes_wage_teams.sort()

In [69]:
bundes_values_teams = bundesliga_values['team'].unique()
bundes_values_teams.sort()

In [86]:
bundes_wage_teams
len(bundes_wage_teams)

26

In [87]:
len(bundes_values_teams)

27

In [81]:
bundesliga_values.loc[:, 'team'] = bundesliga_values['team'].str.replace('^(1-fc|fc|1-fsv|sv|vfb|sc|vfl|spvgg|tsg-1899|borussia|bayer-04|fortuna)-', '', regex=True)

In [82]:
bundes_values_teams = bundesliga_values['team'].unique()
bundes_values_teams.sort()

In [84]:
bundes_wage_teams

array(['Arminia', 'Augsburg', 'Bayern Munich', 'Bochum', 'Darmstadt 98',
       'Dortmund', 'Düsseldorf', 'Eint Frankfurt', 'Freiburg',
       'Greuther Fürth', 'Hamburger SV', 'Heidenheim', 'Hertha BSC',
       'Hoffenheim', 'Köln', 'Leverkusen', "M'Gladbach", 'Mainz 05',
       'Nürnberg', 'Paderborn 07', 'RB Leipzig', 'Schalke 04',
       'Stuttgart', 'Union Berlin', 'Werder Bremen', 'Wolfsburg'],
      dtype=object)

In [83]:
bundes_values_teams

array(['arminia-bielefeld', 'augsburg', 'bayern-munchen', 'bochum',
       'darmstadt-98', 'dortmund', 'dusseldorf', 'eintracht-frankfurt',
       'freiburg', 'greuther-furth', 'hamburger-sv', 'hannover-96',
       'heidenheim-1846', 'hertha-bsc', 'hoffenheim', 'koln',
       'leverkusen', 'mainz-05', 'monchengladbach', 'nurnberg',
       'paderborn-07', 'rasenballsport-leipzig', 'schalke-04',
       'stuttgart', 'union-berlin', 'werder-bremen', 'wolfsburg'],
      dtype=object)

In [85]:
dict(zip(bundes_values_teams, bundes_wage_teams))

{'arminia-bielefeld': 'Arminia',
 'augsburg': 'Augsburg',
 'bayern-munchen': 'Bayern Munich',
 'bochum': 'Bochum',
 'darmstadt-98': 'Darmstadt 98',
 'dortmund': 'Dortmund',
 'dusseldorf': 'Düsseldorf',
 'eintracht-frankfurt': 'Eint Frankfurt',
 'freiburg': 'Freiburg',
 'greuther-furth': 'Greuther Fürth',
 'hamburger-sv': 'Hamburger SV',
 'hannover-96': 'Heidenheim',
 'heidenheim-1846': 'Hertha BSC',
 'hertha-bsc': 'Hoffenheim',
 'hoffenheim': 'Köln',
 'koln': 'Leverkusen',
 'leverkusen': "M'Gladbach",
 'mainz-05': 'Mainz 05',
 'monchengladbach': 'Nürnberg',
 'nurnberg': 'Paderborn 07',
 'paderborn-07': 'RB Leipzig',
 'rasenballsport-leipzig': 'Schalke 04',
 'schalke-04': 'Stuttgart',
 'stuttgart': 'Union Berlin',
 'union-berlin': 'Werder Bremen',
 'werder-bremen': 'Wolfsburg'}

In [94]:
# doesn't seem to be hannover in the wage df
bundesliga_wages.loc[bundesliga_wages['season'] == '2017-2018']['squad'].nunique()

17

There is no wage data for Hannover 96 - remove them from the valuations data too

In [97]:
bundesliga_values = bundesliga_values.loc[bundesliga_values['team'] != 'hannover-96']

In [105]:
# rename monchengladbach in wage df
bundesliga_wages.loc[:, 'squad'] = bundesliga_wages['squad'].replace("M'Gladbach", 'Monchengladbach')

In [106]:
bundes_wage_teams = bundesliga_wages['squad'].unique()
bundes_wage_teams.sort()

In [107]:
bundes_values_teams = bundesliga_values['team'].unique()
bundes_values_teams.sort()

In [109]:
bundes_team_map = dict(zip(bundes_values_teams, bundes_wage_teams))

## Serie A

In [110]:
seriea_wages = load_wages("Serie-A")

In [111]:
seriea_wages.head()

Unnamed: 0,rk,player,nation,pos,squad,age,weekly_wages_euros,annual_wages_euros,notes,season,general_pos,age_range,country,continent,player_id
0,1,Leonardo Bonucci,ITA,DF,Milan,30,264231,13740000,Unverified estimation,2017-2018,Defender,30-34,Italy,Europe,318.0
1,2,Gonzalo Higuaín,ARG,FW,Juventus,29,264231,13740000,Unverified estimation,2017-2018,Forward,25-29,Argentina,South America,1036.0
2,3,Paulo Dybala,ARG,"MF,FW",Juventus,23,246731,12830000,Unverified estimation,2017-2018,Midfielder,20-24,Argentina,South America,702.0
3,4,Douglas Costa,BRA,"FW,MF",Juventus,26,211538,11000000,Unverified estimation,2017-2018,Forward,25-29,Brazil,South America,536.0
4,5,Gianluigi Donnarumma,ITA,GK,Milan,18,199808,10390000,Unverified estimation,2017-2018,Goalkeeper,Under 20,Italy,Europe,671.0


In [112]:
seriea_values = load_valuations("serie_a")

In [113]:
seriea_values.head()

Unnamed: 0,tm_id,tm_name,player,squad_num,position,age,country,current_club,height,foot,signed_date,signed_from,signing_fee_euro_mill,contract_expiry,market_value_euro_mill,season,league,team,signed_year,player_id
0,44058,wojciech-szczesny,Wojciech Szczesny,23,Goalkeeper,28,Poland,Juventus FC,196.0,right,"Jul 19, 2017",Arsenal FC,18.0,,20.0,2017,serie_a,juventus-turin,2017.0,
1,5023,gianluigi-buffon,Gianluigi Buffon,1,Goalkeeper,40,Italy,Retired,192.0,right,"Jul 4, 2019",Paris Saint-Germain,0.0,,2.0,2017,serie_a,juventus-turin,2019.0,359.0
2,75411,carlo-pinsoglio,Carlo Pinsoglio,16,Goalkeeper,28,Italy,Juventus FC,194.0,left,"Jul 1, 2014",Vicenza Calcio,0.8,,0.5,2017,serie_a,juventus-turin,2014.0,1882.0
3,265079,mattia-del-favero,Mattia Del Favero,32,Goalkeeper,20,Italy,SPAL,190.0,right,-,,0.0,,0.1,2017,serie_a,juventus-turin,,
4,386567,leonardo-loria,Leonardo Loria,35,Goalkeeper,19,Italy,Pisa Sporting Club,195.0,right,-,,0.0,,0.1,2017,serie_a,juventus-turin,,


### map teams

In [129]:
seriea_wage_teams = seriea_wages['squad'].unique()
seriea_wage_teams.sort()
seriea_wage_teams

array(['Atalanta', 'Benevento', 'Bologna', 'Brescia', 'Cagliari',
       'Chievo', 'Cremonese', 'Crotone', 'Empoli', 'Fiorentina',
       'Frosinone', 'Genoa', 'Hellas Verona', 'Inter', 'Juventus',
       'Lazio', 'Lecce', 'Milan', 'Monza', 'Napoli', 'Parma', 'Roma',
       'SPAL', 'Salernitana', 'Sampdoria', 'Sassuolo', 'Spezia', 'Torino',
       'Udinese', 'Venezia'], dtype=object)

In [115]:
seriea_value_teams = seriea_values['team'].unique()
seriea_value_teams.sort()
seriea_value_teams

array(['ac-florenz', 'ac-mailand', 'ac-monza', 'as-rom',
       'atalanta-bergamo', 'benevento-calcio', 'brescia-calcio',
       'cagliari-calcio', 'chievo-verona', 'fc-bologna', 'fc-crotone',
       'fc-empoli', 'fc-turin', 'frosinone-calcio', 'genua-cfc',
       'hellas-verona', 'inter-mailand', 'juventus-turin', 'lazio-rom',
       'parma-calcio-1913', 'sampdoria-genua', 'spal', 'spezia-calcio',
       'ssc-neapel', 'udinese-calcio', 'us-cremonese', 'us-lecce',
       'us-salernitana-1919', 'us-sassuolo', 'venezia-fc'], dtype=object)

In [117]:
len(seriea_wage_teams), len(seriea_value_teams)

(30, 30)

In [118]:
seriea_values.loc[:, 'team'] = seriea_values['team'].str.replace('^(ac|as|fc|ssc|us)-', '', regex=True)

In [119]:
seriea_value_teams = seriea_values['team'].unique()
seriea_value_teams.sort()
seriea_value_teams

array(['atalanta-bergamo', 'benevento-calcio', 'bologna',
       'brescia-calcio', 'cagliari-calcio', 'chievo-verona', 'cremonese',
       'crotone', 'empoli', 'florenz', 'frosinone-calcio', 'genua-cfc',
       'hellas-verona', 'inter-mailand', 'juventus-turin', 'lazio-rom',
       'lecce', 'mailand', 'monza', 'neapel', 'parma-calcio-1913', 'rom',
       'salernitana-1919', 'sampdoria-genua', 'sassuolo', 'spal',
       'spezia-calcio', 'turin', 'udinese-calcio', 'venezia-fc'],
      dtype=object)

In [130]:
seriea_wage_teams = [val.replace('SPAL', 'Spal') for val in seriea_wage_teams]
seriea_wage_teams.sort()

['Atalanta',
 'Benevento',
 'Bologna',
 'Brescia',
 'Cagliari',
 'Chievo',
 'Cremonese',
 'Crotone',
 'Empoli',
 'Fiorentina',
 'Frosinone',
 'Genoa',
 'Hellas Verona',
 'Inter',
 'Juventus',
 'Lazio',
 'Lecce',
 'Milan',
 'Monza',
 'Napoli',
 'Parma',
 'Roma',
 'Salernitana',
 'Sampdoria',
 'Sassuolo',
 'Spal',
 'Spezia',
 'Torino',
 'Udinese',
 'Venezia']

In [132]:
serie_a_team_map = dict(zip(seriea_value_teams, seriea_wage_teams))

## Ligue 1

In [133]:
ligue1_wages = load_wages("Ligue-1")

In [135]:
ligue1_wages.head()

Unnamed: 0,rk,player,nation,pos,squad,age,weekly_wages_euros,annual_wages_euros,notes,season,general_pos,age_range,country,continent,player_id
0,1,Neymar,BRA,FW,Paris S-G,25,832692,43300000,Unverified estimation,2017-2018,Forward,25-29,Brazil,South America,1715.0
1,2,Edinson Cavani,URU,FW,Paris S-G,30,350769,18240000,Unverified estimation,2017-2018,Forward,30-34,Uruguay,South America,451.0
2,3,Kylian Mbappé,FRA,"FW,MF",Monaco,18,341731,17770000,Unverified estimation,2017-2018,Forward,Under 20,France,Europe,1536.0
3,4,Lassana Diarra,FRA,MF,Paris S-G,32,330192,17170000,Unverified estimation,2017-2018,Midfielder,30-34,France,Europe,641.0
4,5,Thiago Silva,BRA,DF,Paris S-G,32,303077,15760000,Unverified estimation,2017-2018,Defender,30-34,Brazil,South America,2203.0


In [156]:
ligue1_values = load_valuations("ligue_1")

In [157]:
ligue1_values.head()

Unnamed: 0,tm_id,tm_name,player,squad_num,position,age,country,current_club,height,foot,signed_date,signed_from,signing_fee_euro_mill,contract_expiry,market_value_euro_mill,season,league,team,signed_year,player_id
0,120629,alphonse-areola,Alphonse Areola,16,Goalkeeper,25,France,West Ham United,195.0,right,"Jul 1, 2010",Paris Saint-Germain U19,,,15.0,2017,ligue_1,fc-paris-saint-germain,2010.0,112.0
1,45672,kevin-trapp,Kevin Trapp,1,Goalkeeper,27,Germany,Eintracht Frankfurt,189.0,right,"Jul 8, 2015",Eintracht Frankfurt,9.5,,8.0,2017,ligue_1,fc-paris-saint-germain,2015.0,2381.0
2,282028,remy-descamps,Rémy Descamps,0,Goalkeeper,22,France,FC Nantes,196.0,left,"Jul 1, 2017",Paris Saint-Germain B,,,0.6,2017,ligue_1,fc-paris-saint-germain,2017.0,5463.0
3,395251,sebastien-cibois,Sébastien Cibois,40,Goalkeeper,20,France,Rodez AF,192.0,right,"Jul 1, 2018",Paris Saint-Germain B,,,,2017,ligue_1,fc-paris-saint-germain,2018.0,4131.0
4,181767,marquinhos,Marquinhos,5,Centre-Back,24,Brazil,Paris Saint-Germain,183.0,right,"Jul 19, 2013",AS Roma,31.4,,55.0,2017,ligue_1,fc-paris-saint-germain,2013.0,1492.0


### Map teams

In [140]:
ligue1_wage_teams = ligue1_wages['squad'].unique()
ligue1_wage_teams.sort()
ligue1_wage_teams

array(['Ajaccio', 'Amiens', 'Angers', 'Auxerre', 'Bordeaux', 'Brest',
       'Caen', 'Clermont Foot', 'Dijon', 'Guingamp', 'Le Havre', 'Lens',
       'Lille', 'Lorient', 'Lyon', 'Marseille', 'Metz', 'Monaco',
       'Montpellier', 'Nantes', 'Nice', 'Nîmes', 'Paris S-G', 'Reims',
       'Rennes', 'Saint-Étienne', 'Strasbourg', 'Toulouse', 'Troyes'],
      dtype=object)

In [158]:
ligue1_values['team'].unique()

array(['fc-paris-saint-germain', 'as-monaco', 'olympique-lyon',
       'olympique-marseille', 'ogc-nizza', 'fc-girondins-bordeaux',
       'fc-stade-rennes', 'fc-toulouse', 'as-saint-etienne', 'fc-nantes',
       'losc-lille', 'montpellier-hsc', 'dijon-fco', 'amiens-sc',
       'sco-angers', 'sm-caen', 'ea-guingamp', 'fc-metz',
       'rc-strassburg-alsace', 'es-troyes-ac', 'nimes-olympique',
       'stade-reims', 'stade-brest-29', 'rc-lens', 'fc-lorient',
       'clermont-foot-63', 'aj-auxerre', 'ac-ajaccio', 'ac-le-havre'],
      dtype=object)

In [175]:
ligue1_values.loc[:, 'team'] = ligue1_values['team'].str.replace('^(fc-stade|stade|as|ogc|es|aj|ac|sm|ea|rc|fc-girondins|fc|sco|olympique|losc)-', '', regex=True)

In [176]:
ligue1_values_teams = ligue1_values['team'].unique()
ligue1_values_teams.sort()
ligue1_values_teams

array(['ajaccio', 'amiens-sc', 'angers', 'auxerre', 'bordeaux',
       'brest-29', 'caen', 'clermont-foot-63', 'dijon-fco', 'guingamp',
       'le-havre', 'lens', 'lille', 'lorient', 'lyon', 'marseille',
       'metz', 'monaco', 'montpellier-hsc', 'nantes', 'nimes-olympique',
       'nizza', 'paris-saint-germain', 'reims', 'rennes', 'saint-etienne',
       'strassburg-alsace', 'toulouse', 'troyes-ac'], dtype=object)

In [177]:
ligue1_values_teams = [val.replace('nizza', 'nice') for val in ligue1_values_teams]
ligue1_values_teams.sort()
ligue1_values_teams

['ajaccio',
 'amiens-sc',
 'angers',
 'auxerre',
 'bordeaux',
 'brest-29',
 'caen',
 'clermont-foot-63',
 'dijon-fco',
 'guingamp',
 'le-havre',
 'lens',
 'lille',
 'lorient',
 'lyon',
 'marseille',
 'metz',
 'monaco',
 'montpellier-hsc',
 'nantes',
 'nice',
 'nimes-olympique',
 'paris-saint-germain',
 'reims',
 'rennes',
 'saint-etienne',
 'strassburg-alsace',
 'toulouse',
 'troyes-ac']

In [179]:
ligue_1_team_map = dict(zip(ligue1_values_teams, ligue1_wage_teams))

The teams names have been mapped now we can make successful joins between the wage and valuations dfs for each league