Importing transfermarkt data from Data World - David Cereijo

In [273]:
import pandas as pd
import os

repo_dir = os.getcwd()  # Directory of the script
transferMKT_dir = os.path.join(repo_dir, 'transferMKT-data')

Global Functions and Variables:

In [274]:
# Global Variables:
my_leagues = ['ES1', 'IT1', 'GR1', 'GB1', 'FR1']
player_cols = ['date_y', 'player_id', 'player_name', 'player_current_club_id', 'market_value_in_eur', 'player_club_domestic_competition_id']

In [275]:
# Function that creates a CSV from a pandas df:
def make_csv(df, dir, file_name):
    file_path = os.path.join(dir, f'{file_name}.csv')
    return df.to_csv(file_path, index=True)


# Function that converts a datetime column to soccer season format (ex: 1819):
def calculate_season(date):
    year = date.year
    month = date.month
    if month in range(7,12):
        return ((year - 2000) * 100) + (year - 1999)
    else:
        return ((year - 2001) * 100) + (year - 2000)

Import and clean data using links from data world (DW):

In [276]:
# Create a pandas df for the appearances file:
appearances_df = pd.read_csv('https://query.data.world/s/2t4a5mgcrt7xb32ifpci2wijahs7fq?dws=00000')

# Create a pandas dataframe for the appearances file:
player_val_df = pd.read_csv('https://query.data.world/s/bxh6i5g3kll34aqabzszjbecgdzabm?dws=00000')

# Create a pandas for the clubs file:
clubs_df = pd.read_csv('https://query.data.world/s/4iac2yo5mskcbmy6xnsvahtxe5eakd?dws=00000')

In [None]:
# Use this cell for viewing the DW dataframes:
appearances_df

In [302]:
# Clean the 3 DW dataframes:

# Date and league filtering:
filt_appearances_df = appearances_df[(appearances_df['competition_id'] == 'GB1') & (appearances_df['date'] >= '2017-07-01')]    # EPL and 17/18 season+ only

# Slice by necessary columns only:
app_cols = ['player_id', 'player_club_id', 'date', 'player_name', 'competition_id']
filt_appearances_df = filt_appearances_df[app_cols]


# Date and league filtering:
filt_player_val_df = player_val_df[(player_val_df['player_club_domestic_competition_id'] == 'GB1') & (player_val_df['date'] >= '2017-07-01')]    # EPL and 17/18 season+ only

# Slice by necessary columns only:
val_cols = ['player_id', 'date', 'current_club_id', 'market_value_in_eur', 'player_club_domestic_competition_id']
filt_player_val_df = filt_player_val_df[val_cols]


# Date and league filtering:
filt_clubs_df = clubs_df[clubs_df['domestic_competition_id'] == 'GB1']    # EPL only

# Slice by necessary columns only:
clubs_cols = ['club_id', 'name', 'domestic_competition_id']
filt_clubs_df = filt_clubs_df[clubs_cols]


# Standardize common DW column names and sort on date column before merging:
filt_appearances_df = filt_appearances_df.rename(columns={'competition_id' : 'league_id',
                                                          'player_club_id' : 'team_id'}).sort_values('date', ascending=True)

filt_player_val_df = filt_player_val_df.rename(columns={'current_club_id' : 'team_id',
                                                        'player_club_domestic_competition_id' : 'league_id'}).sort_values('date', ascending=True)

filt_clubs_df = filt_clubs_df.rename(columns={'club_id' : 'team_id',
                                              'name' : 'team',
                                              'domestic_competition_id' : 'league_id'})

In [None]:
filt_appearances_df

In [303]:
# Create a season column in the appearances and player_val DFs:
dw_dfs = [filt_appearances_df, filt_player_val_df]
for df in dw_dfs:

    # Convert the current date column to date format:
    df['date'] = pd.to_datetime(df['date'])

    # Add the season column by calling the calculate_season() function:
    df['season'] = df['date'].apply(lambda x: calculate_season(x))

    # Drop the date column for testing:
    df.drop('date', axis=1, inplace=True)

# Aggregate the filtered appearances DF to the player-season level and reset index:
agg_appearances_df = (filt_appearances_df
                      .groupby(['season', 'team_id', 'player_name']).agg({'player_id' : 'first', 'league_id' : 'first'})
                      .reset_index()
                      )

# Aggregate the player_val DF to the player-season level:
agg_player_val_df = (filt_player_val_df
                     .groupby(['season', 'team_id', 'player_id',]).agg({'market_value_in_eur' : 'mean', 'league_id' : 'first'})
                     .reset_index()
                     )

In [None]:
# Use this cell to view the agg DFs:
agg_player_val_df

Attempt an EPL merge of the three DW files:

In [308]:
# Try to merge the two DW dataframes on player_id and season:
first_merge = pd.merge(agg_appearances_df, agg_player_val_df, on=['league_id', 'season', 'team_id', 'player_id'])
# make_csv(first_merge, repo_dir, "test_merge")

In [309]:
first_merge

Unnamed: 0,season,team_id,player_name,player_id,league_id,market_value_in_eur
0,1718,11,Granit Xhaka,111455,GB1,4.166667e+07
1,1718,11,Mohamed Elneny,160438,GB1,1.000000e+07
2,1718,11,Per Mertesacker,6710,GB1,1.750000e+06
3,1718,11,Petr Cech,5658,GB1,3.833333e+06
4,1718,11,Reiss Nelson,340325,GB1,1.700000e+07
...,...,...,...,...,...,...
2346,2223,1237,Pervis Estupiñán,349599,GB1,2.000000e+07
2347,2223,1237,Robert Sánchez,403151,GB1,2.850000e+07
2348,2223,1237,Solly March,209212,GB1,1.400000e+07
2349,2223,1237,Tariq Lamptey,504148,GB1,1.500000e+07


In [310]:
# Add team name column to the first merge using the filt_club_df:
second_merge = pd.merge(first_merge, filt_clubs_df, on=['team_id', 'league_id'])
make_csv(second_merge, repo_dir, "test_merge")
second_merge

Unnamed: 0,season,team_id,player_name,player_id,league_id,market_value_in_eur,team
0,1718,11,Granit Xhaka,111455,GB1,4.166667e+07,Arsenal FC
1,1718,11,Mohamed Elneny,160438,GB1,1.000000e+07,Arsenal FC
2,1718,11,Per Mertesacker,6710,GB1,1.750000e+06,Arsenal FC
3,1718,11,Petr Cech,5658,GB1,3.833333e+06,Arsenal FC
4,1718,11,Reiss Nelson,340325,GB1,1.700000e+07,Arsenal FC
...,...,...,...,...,...,...,...
2346,2223,703,Serge Aurier,127032,GB1,7.000000e+06,Nottingham Forest
2347,2223,703,Steve Cook,90836,GB1,3.000000e+06,Nottingham Forest
2348,2223,703,Taiwo Awoniyi,295313,GB1,2.000000e+07,Nottingham Forest
2349,2223,703,Wayne Hennessey,45494,GB1,5.000000e+05,Nottingham Forest


Attempt to merge the DW EPL merge to the FBref EPL full merge file:

In [311]:
# Import the fbref EPL data:
fbref_epl_merge = pd.read_excel(os.path.join(repo_dir, "Merged Data/ENG-Premier League_full_merge.xlsx"))

# Rename the fbref player column to player_name for merging:
fbref_epl_merge = fbref_epl_merge.rename(columns={'player' : 'player_name'})
# fbref_epl_merge.shape[0]

In [312]:
# Filter the second_merge to only 1718 season+:
dw_epl_merge = second_merge[second_merge['season'] >= 1718]

# Merge the two dataframes on the 'season' and 'player' columns:
dw_fbref_epl_merge = pd.merge(fbref_epl_merge, dw_epl_merge, on=['season', 'player_name'])
dw_fbref_epl_merge

Unnamed: 0,league,season,team_x,player_name,nationality,position,age,YOB,MP,Starts,...,onxG,onxGA,xG+/-,xG+/-90,On-Off.1,team_id,player_id,league_id,market_value_in_eur,team_y
0,ENG-Premier League,1718,Arsenal,Alex Oxlade-Chamberlain,ENG,DF,23.0,1993.0,3.0,3.0,...,4.2,4.6,-0.4,-0.14,-0.74,31,143424,GB1,3.500000e+07,Liverpool FC
1,ENG-Premier League,1718,Liverpool,Alex Oxlade-Chamberlain,ENG,"MF,FW",23.0,1993.0,32.0,14.0,...,30.7,14.9,15.8,0.95,-0.14,31,143424,GB1,3.500000e+07,Liverpool FC
2,ENG-Premier League,1718,Arsenal,Granit Xhaka,SUI,MF,24.0,1992.0,38.0,37.0,...,64.4,46.9,17.5,0.48,-1.20,11,111455,GB1,4.166667e+07,Arsenal FC
3,ENG-Premier League,1718,Arsenal,Mohamed Elneny,EGY,MF,25.0,1992.0,13.0,11.0,...,16.8,12.2,4.7,0.49,-0.07,11,160438,GB1,1.000000e+07,Arsenal FC
4,ENG-Premier League,1718,Arsenal,Per Mertesacker,GER,DF,32.0,1984.0,6.0,4.0,...,7.3,5.1,2.1,0.53,-0.01,11,6710,GB1,1.750000e+06,Arsenal FC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1841,ENG-Premier League,2223,Wolves,Pablo Sarabia,ESP,"FW,MF",30.0,1992.0,13.0,9.0,...,9.9,13.3,-3.4,-0.40,0.27,543,74230,GB1,1.933333e+07,Wolverhampton Wanderers
1842,ENG-Premier League,2223,Wolves,Pedro Neto,POR,"FW,MF",22.0,2000.0,18.0,13.0,...,10.9,14.2,-3.2,-0.30,0.42,543,487465,GB1,3.500000e+07,Wolverhampton Wanderers
1843,ENG-Premier League,2223,Wolves,Raúl Jiménez,MEX,FW,31.0,1991.0,15.0,8.0,...,10.6,16.6,-6.0,-0.64,-0.04,543,206040,GB1,1.200000e+07,Wolverhampton Wanderers
1844,ENG-Premier League,2223,Wolves,Rúben Neves,POR,MF,25.0,1997.0,35.0,33.0,...,32.5,51.7,-19.2,-0.57,0.29,543,225161,GB1,4.000000e+07,Wolverhampton Wanderers


In [313]:
# If it runs:
make_csv(dw_fbref_epl_merge, repo_dir, "final_epl_test")

In [None]:
# Check the number of overlapping player names between DW and FBref to see if it matches the merge unique name count:
fbref_epl_player_names = list(fbref_epl_merge['player_name'].unique())
dw_epl_merge_player_names = list(dw_epl_merge['player_name'].unique())

common_players = [player for player in fbref_epl_player_names if player in dw_epl_merge_player_names]
# len(common_players)

# Check number of unique names in the merged data:
full_merge_names = list(dw_fbref_epl_merge['player_name'].unique())
# len(full_merge_names)

# IN CONCLUSION: THE NUMBER OF OVERLAPPING UNIQUE NAMES BETWEEN THE FBREF DATA AND THE DW WAS 759
# THE NUMBER OF UNIQUE NAMES IN THE TWO MERGED TOGETHER WAS 749: CLOSE ENOUGH