Importing Transfermarkt data from data.world (David Cereijo)

In [134]:
import pandas as pd
import os

repo_dir = os.getcwd()  # Current working directory of the kernel (presumably the repo root; confirm how the notebook is launched)
transferMKT_dir = os.path.join(repo_dir, 'transferMKT-data')  # 'transferMKT-data' sub-folder under repo_dir

Global Functions and Variables:

In [135]:
# Global Variables:
# Transfermarkt competition ids, in the order the per-league lists are indexed later:
# LaLiga, Serie A, Bundesliga, Premier League, Ligue 1.
# The Bundesliga id is 'L1' ('GR1' is the Greek Super League).
my_leagues = ['ES1', 'IT1', 'L1', 'GB1', 'FR1']
# Columns kept from the appearances/valuations merge ('date_y' is the valuation date after the merge suffixing).
player_cols = ['date_y', 'player_id', 'player_name', 'player_current_club_id', 'market_value_in_eur', 'player_club_domestic_competition_id']

In [136]:
# Write a pandas df to "<dir>/<file_name>.csv":
def make_csv(df, dir, file_name):
    """Save ``df`` as a CSV file called ``file_name`` (the '.csv' suffix is added here) in ``dir``.

    The dataframe index is written as the first column. The return value is
    whatever ``DataFrame.to_csv`` returns for a path target.
    """
    csv_name = '{}.csv'.format(file_name)
    destination = os.path.join(dir, csv_name)
    outcome = df.to_csv(destination, index=True)
    return outcome


# Function that converts a datetime column to soccer season format (ex: 1819):
def calculate_season(date):
    """Return the season a date falls in, encoded as an integer such as 1819.

    A season is taken to run from July through June of the following year, so
    July-December dates belong to the season starting that calendar year and
    January-June dates to the season that started the year before.

    Parameters
    ----------
    date : object exposing ``.year`` and ``.month`` (e.g. ``datetime.date``, ``pd.Timestamp``)

    Returns
    -------
    int
        ``(start_year - 2000) * 100 + (start_year - 1999)``, e.g. 1819 for the
        2018/19 season. Being an integer, it carries no leading zero
        (2009/10 -> 910).
    """
    year = date.year
    # July through December (inclusive) open a new season; the previous
    # `range(7, 12)` check left December out and pushed it into the prior season.
    start_year = year if date.month >= 7 else year - 1
    return ((start_year - 2000) * 100) + (start_year - 1999)

Import and clean data using links from data world (DW):

In [137]:
# data.world query links for the three CSV files used in this notebook:
appearances_url = 'https://query.data.world/s/2t4a5mgcrt7xb32ifpci2wijahs7fq?dws=00000'
player_val_url = 'https://query.data.world/s/bxh6i5g3kll34aqabzszjbecgdzabm?dws=00000'
clubs_url = 'https://query.data.world/s/4iac2yo5mskcbmy6xnsvahtxe5eakd?dws=00000'

# Appearances file -> pandas dataframe:
appearances_df = pd.read_csv(appearances_url)

# Player valuations file -> pandas dataframe:
player_val_df = pd.read_csv(player_val_url)

# Clubs file -> pandas dataframe:
clubs_df = pd.read_csv(clubs_url)

In [None]:
# Use this cell for viewing the DW dataframes
# (swap in appearances_df or clubs_df to inspect the other two):
player_val_df

In [160]:
# Trim the 3 DW dataframes down to the rows and columns used later on.
# The date filters compare against a 'YYYY-MM-DD' string.

# --- Appearances: EPL ('GB1') only, 17/18 season onward ---
app_cols = ['player_id', 'date', 'player_name']
app_is_epl = appearances_df['competition_id'] == 'GB1'
app_is_recent = appearances_df['date'] >= '2017-07-01'
filt_appearances_df = appearances_df[app_is_epl & app_is_recent][app_cols]


# --- Player valuations: EPL ('GB1') only, 17/18 season onward ---
val_cols = ['player_id', 'date', 'current_club_id', 'market_value_in_eur', 'player_club_domestic_competition_id']
val_is_epl = player_val_df['player_club_domestic_competition_id'] == 'GB1'
val_is_recent = player_val_df['date'] >= '2017-07-01'
filt_player_val_df = player_val_df[val_is_epl & val_is_recent][val_cols]


# --- Clubs: EPL ('GB1') only (no date filter) ---
clubs_cols = ['club_id', 'name', 'domestic_competition_id']
club_is_epl = clubs_df['domestic_competition_id'] == 'GB1'
filt_clubs_df = clubs_df[club_is_epl][clubs_cols]

In [161]:
# Give the DW frames shared column names (team_id / team / league_id) and
# put the two dated frames in chronological order before merging.
appearance_renames = {'competition_id' : 'league_id'}
valuation_renames = {'current_club_id' : 'team_id',
                     'player_club_domestic_competition_id' : 'league_id'}
club_renames = {'club_id' : 'team_id',
                'name' : 'team',
                'domestic_competition_id' : 'league_id'}

# rename() skips labels that are not present in the frame.
filt_appearances_df = filt_appearances_df.rename(columns=appearance_renames)
filt_appearances_df = filt_appearances_df.sort_values('date', ascending=True)

filt_player_val_df = filt_player_val_df.rename(columns=valuation_renames)
filt_player_val_df = filt_player_val_df.sort_values('date', ascending=True)

# The clubs frame has no date column, so it is only renamed.
filt_clubs_df = filt_clubs_df.rename(columns=club_renames)

In [162]:
# Inspect the filtered valuations frame after the rename and date sort:
filt_player_val_df

Unnamed: 0,player_id,date,team_id,market_value_in_eur,league_id
208430,63824,2017-07-01,405,700000,GB1
208434,77812,2017-07-01,1123,250000,GB1
208540,339340,2017-07-01,985,200000,GB1
208600,369567,2017-07-02,543,75000,GB1
208602,490307,2017-07-02,1237,175000,GB1
...,...,...,...,...,...
424294,120241,2023-05-15,603,50000,GB1
424296,223790,2023-05-15,1010,25000,GB1
424298,339789,2023-05-15,379,75000,GB1
424302,486100,2023-05-15,405,75000,GB1


In [163]:
# Create a season column in the appearances and player_val DFs:
dw_dfs = [filt_appearances_df, filt_player_val_df]
for df in dw_dfs:

    # Skip frames already converted so that re-running this cell does not
    # fail on the missing 'date' column:
    if 'date' not in df.columns:
        continue

    # The frames are modified in place on purpose: rebinding `df` inside the
    # loop would not change filt_appearances_df / filt_player_val_df.

    # Convert the current date column to date format:
    df['date'] = pd.to_datetime(df['date'])

    # Add the season column by calling the calculate_season() function:
    df['season'] = df['date'].apply(calculate_season)

    # Drop the date column for testing:
    df.drop('date', axis=1, inplace=True)

# Aggregate the filtered appearances DF to the player-season level.
# reset_index() must be called without inplace=True: the in-place form
# returns None, which left both aggregated names bound to None.
# NOTE(review): grouping is by player_name, so two players sharing a name in
# the same season collapse into one row - confirm this is intended.
agg_appearances_df = (
    filt_appearances_df
    .groupby(['player_name', 'season'])
    .agg({'player_id' : 'first'})
    .reset_index()
)

# Aggregate the player_val DF to the player-season level
# (first team/league seen, mean market value over the season):
agg_player_val_df = (
    filt_player_val_df
    .groupby(['player_id', 'season'])
    .agg({'team_id' : 'first',
          'market_value_in_eur' : 'mean',
          'league_id' : 'first'})
    .reset_index()
)

In [164]:
# Use this cell to view the agg DFs (agg_appearances_df or agg_player_val_df):
agg_appearances_df

Merge the three DW dataframes:

In [None]:
# Join the raw appearances and player valuation frames on player_id.
# Columns present in both frames get the default _x / _y suffixes.
merge1 = appearances_df.merge(player_val_df, on='player_id')

In [None]:
# Work on an independent (deep) copy of merge1, then keep only the columns
# listed in player_cols:
merge2 = merge1.copy()

merge3 = merge2.loc[:, player_cols]
merge3

Filter and split up the merged data for each league:

In [None]:
# Keep only rows for the leagues in my_leagues dated on/after the cutoff:
filter_date = '2017-05-31'

in_my_leagues = merge3['player_club_domestic_competition_id'].isin(my_leagues)
on_or_after_cutoff = merge3['date_y'] >= filter_date
filtered_df = merge3[in_my_leagues & on_or_after_cutoff]

In [None]:
# Build one list of unique player names per league, in my_leagues order:
league_players_lists = []
for league in my_leagues:
    in_league = filtered_df['player_club_domestic_competition_id'] == league
    league_players_lists.append(list(filtered_df[in_league]['player_name'].unique()))

# Name each list after its league (positions follow my_leagues):
laliga_players = league_players_lists[0]
serieA_players = league_players_lists[1]
bundesliga_players = league_players_lists[2]
epl_players = league_players_lists[3]
ligue1_players = league_players_lists[4]

In [None]:
# Build one date-sorted dataframe per league, in my_leagues order:
league_player_dfs = []
for league in my_leagues:
    league_rows = filtered_df[filtered_df['player_club_domestic_competition_id'] == league]
    league_player_dfs.append(league_rows.sort_values('date_y'))

In [None]:
# Inspect the frame for the first league in my_leagues:
league_player_dfs[0]

Bring in the EPL main merged data file for comparison with the DW file:

In [None]:
# Load the full EPL merge workbook from the "Merged Data" directory under repo_dir:
fbref_epl_path = os.path.join(repo_dir, "Merged Data/ENG-Premier League_full_merge.xlsx")
fbref_epl_merge = pd.read_excel(fbref_epl_path)

In [None]:
# Generate a list of unique player names from the EPL merge:
fbref_epl_player_names = list(fbref_epl_merge['player'].unique())
print(f'Number of players in fbref EPL player list: {len(fbref_epl_player_names)}\n')

# Generate a list of overlapping player names between the fbref list and DW list.
# Membership is checked against a set so each lookup is constant-time instead
# of scanning the whole DW list for every fbref name; the result keeps the
# fbref ordering.
epl_players_lookup = set(epl_players)
common_epl_players = [player for player in fbref_epl_player_names if player in epl_players_lookup]
print(f'Number of overlapping players in fbref/DW EPL player listS: {len(common_epl_players)}')

Shrink down the DW data by creating a season variable with format "Y1Y2":

Create a merged dataframe for the overlapping EPL players: 

In [None]:
# Filter the fbref data to the overlapping player names, season 2122 onward:
fbref_epl_merge1 = fbref_epl_merge[fbref_epl_merge['player'].isin(common_epl_players)]
fbref_epl_merge1 = fbref_epl_merge1[fbref_epl_merge1['season'] >= 2122].sort_values(by='player', ascending=True)

# Shrinking down the data world dataframe.
# league_player_dfs follows the order of my_leagues, so look the EPL frame up
# by its league id rather than by a hard-coded position:
dw_epl_df = league_player_dfs[my_leagues.index('GB1')]

# Common players only, with column names aligned to the fbref frame:
dw_epl_df = dw_epl_df[dw_epl_df['player_name'].isin(common_epl_players)].rename(columns={'player_name' : 'player', 'date_y' : 'date'})
# Valuations dated 2022-05-31 or later.
# NOTE(review): the fbref side keeps every row with season >= 2122 while this side starts
# at 2022-05-31 - confirm the two cutoffs are meant to differ.
dw_epl_df = dw_epl_df[dw_epl_df['date'] >= '2022-05-31']

# Convert the dates and derive the season column.
# assign() builds a new frame, which avoids setting columns on a filtered
# slice (SettingWithCopyWarning). The former 'year' and 'month' helper columns
# were never used and did not survive the groupby below, so they are not created.
dw_epl_df = dw_epl_df.assign(date=pd.to_datetime(dw_epl_df['date']))
dw_epl_df = dw_epl_df.assign(season=dw_epl_df['date'].apply(calculate_season))

# Group by season-player (first club/league seen, mean market value):
dw_epl_df = (dw_epl_df
             .groupby(['player', 'season']).agg({'player_current_club_id':'first', 'market_value_in_eur':'mean', 'player_club_domestic_competition_id':'first'})
             .reset_index()
             .sort_values(by='player', ascending=True)
             )

dw_epl_df

In [None]:
# Join the trimmed fbref and DW frames on the (player, season) pair:
merge_keys = ['player', 'season']
dw_fbref_epl_merge = fbref_epl_merge1.merge(dw_epl_df, on=merge_keys)
dw_fbref_epl_merge