Python notebook that scrapes player in-game statistics from fbref.com for 10 professional European soccer leagues (2022-2023 season) and merges them with player market valuations loaded from a Transfermarkt data file.

In [1]:
# libraries
from io import StringIO
import random
import time

import numpy as np
import pandas as pd
import requests

# --- Scrape configuration ---------------------------------------------------
# Seasons to download, written the way they appear in the fbref URL.
years_special = ['2022-2023']
# Competition ids placed after '/comps/' in the URL. numbers[i], leagues[i] and
# league_names[i] all describe the same league.
numbers = [11,20,9,12,30,13,40,37,23,32]
# Stat pages requested for every league/season.
stats = ['passing','defense','shooting','keepers']
# URL suffixes, one per league.
leagues = ['-Serie-A-Stats','-Bundesliga-Stats',
           '-Premier-League-Stats','-La-Liga-Stats','-Russian-Premier-League-Stats',
           '-Ligue-1-Stats','-Scottish-Premiership-Stats','-Belgian-Pro-League-Stats','-Eredivisie-Stats',
           '-Primeira-Liga-Stats']

# League labels written to the 'Based' column (later used as a merge key).
league_names = ['Serie A','Bundesliga','Premier League','La Liga',
               'Premier Liga', 'Ligue 1', 'Scottish Premiership',
               'Jupiler Pro League', 'Eredivisie', 'Liga Nos']

# The three lists are walked in lockstep; fail loudly instead of silently
# truncating if they ever get out of sync.
assert len(numbers) == len(leagues) == len(league_names), 'league config lists differ in length'

REQUEST_TIMEOUT_SECONDS = 30   # abort a hung request instead of blocking the kernel forever
REQUEST_DELAY_SECONDS = 3      # pause between requests

# --- Scrape -------------------------------------------------------------------
# One DataFrame per (league, season) is appended to the list for its stat type,
# in league-major / season-minor order.
defense_list = []
passing_list = []
shooting_list = []
keepers_list = []

# Any stat type not listed here falls through to keepers_list.
_tables_by_stat = {
    'passing': passing_list,
    'defense': defense_list,
    'shooting': shooting_list,
}

for comp_id, league_slug, league_name in zip(numbers, leagues, league_names):
    for t in stats:
        for z in years_special:
            data = 'https://fbref.com/en/comps/' + str(comp_id) + '/' + z + '/' + t + '/' + z + league_slug
            page = requests.get(data, timeout=REQUEST_TIMEOUT_SECONDS)
            # Stop on HTTP errors (e.g. rate limiting) rather than trying to
            # parse an error page as if it were a stats table.
            page.raise_for_status()
            # Strip HTML comment markers so tables wrapped in comments become
            # visible to the parser.
            response = page.text.replace('<!--', '').replace('-->', '')
            # Wrap the markup in StringIO: passing literal HTML to read_html is
            # deprecated in recent pandas. header=1 takes the second header row
            # as column names; [2] picks the third table on the page
            # (presumably the per-player table - TODO confirm for 'keepers').
            df = pd.read_html(StringIO(response), header=1)[2]
            df['Based'] = league_name
            _tables_by_stat.get(t, keepers_list).append(df)
            time.sleep(REQUEST_DELAY_SECONDS)
            print('Imported ' + t + ' data for ' + league_name)

Imported passing data for Serie A
Imported defense data for Serie A
Imported shooting data for Serie A
Imported passing data for Bundesliga
Imported defense data for Bundesliga
Imported shooting data for Bundesliga
Imported passing data for Premier League
Imported defense data for Premier League
Imported shooting data for Premier League
Imported passing data for La Liga
Imported defense data for La Liga
Imported shooting data for La Liga
Imported passing data for Premier Liga
Imported defense data for Premier Liga
Imported shooting data for Premier Liga
Imported passing data for Ligue 1
Imported defense data for Ligue 1
Imported shooting data for Ligue 1
Imported passing data for Scottish Premiership
Imported defense data for Scottish Premiership
Imported shooting data for Scottish Premiership
Imported passing data for Jupiler Pro League
Imported defense data for Jupiler Pro League
Imported shooting data for Jupiler Pro League
Imported passing data for Eredivisie
Imported defense data 

In [2]:
# Regroup the flat per-stat lists by season: element i of each result holds the
# tables of season years_special[i], one per league. The scraped lists store the
# seasons of a league next to each other, so a stride of len(years_special)
# starting at offset i picks out exactly season i.
season_count = len(years_special)
shot = [shooting_list[offset::season_count] for offset in range(season_count)]
defense = [defense_list[offset::season_count] for offset in range(season_count)]
passing = [passing_list[offset::season_count] for offset in range(season_count)]

In [3]:
# Clean the per-league tables of the first season, stack them per stat type and
# merge shooting + defense + passing into one row per player/squad (`final`).
#
# NOTE: the cleaned frames replace the raw ones in passing[0] / defense[0] /
# shot[0]. Re-running this cell without first re-running the regrouping cell
# fails, because the columns dropped below are already gone.

# Columns that identify a player row; all three tables are joined on them.
MERGE_KEYS = ['Player', 'Squad', 'Born', 'Based', 'Age', '90s']
# Two-letter position codes -> readable names.
POSITION_NAMES = {'MF': 'Midfielder', 'DF': 'Defender', 'FW': 'Forward', 'GK': 'Goalkeeper'}
# Columns removed from every table before merging.
UNUSED_COLUMNS = ['Rk', 'Nation', 'Pos', 'Matches']


def _prepare_player_table(raw):
    """Return a cleaned copy of one scraped table; `raw` itself is not modified.

    - missing values are replaced by 0
    - rows that merely repeat the column header are removed
    - 'Age' is cut to its first two characters

    'Age' is a merge key, so it has to be normalised identically in every
    table; otherwise the inner merges below silently lose rows.
    """
    table = raw.fillna(0)
    # Scraped tables appear to repeat the header row inside the body (TODO
    # confirm); such rows have the literal text 'Player' in the Player column.
    # The filter is a no-op when there are none.
    table = table[table['Player'] != 'Player'].copy()
    table['Age'] = table['Age'].str[:2]
    return table


def _clean_passing(raw):
    """Clean a passing table and give the per-distance columns explicit names."""
    table = _prepare_player_table(raw).drop(columns=UNUSED_COLUMNS)
    return table.rename(columns={
        'Cmp': 'Total Cmp', 'Att': 'Total Att', 'Cmp%': 'Total Cmp%',
        'Cmp.1': 'Short Cmp', 'Att.1': 'Short Att', 'Cmp%.1': 'Short Cmp%',
        'Cmp.2': 'Medium Cmp', 'Att.2': 'Medium Att', 'Cmp%.2': 'Medium Cmp%',
        'Cmp.3': 'Long Cmp', 'Att.3': 'Long Att', 'Cmp%.3': 'Long Cmp%',
    })


def _clean_defense(raw):
    """Clean a defense table; this is the only table that contributes 'Position'."""
    table = _prepare_player_table(raw)
    # First two characters of 'Pos' are the primary position code.
    table['Position'] = table['Pos'].str[:2].replace(POSITION_NAMES)
    table = table.drop(columns=UNUSED_COLUMNS)
    return table.rename(columns={'Sh': 'Blocks Sh', 'Pass': 'Blocks Pass'})


def _clean_shooting(raw):
    """Clean a shooting table (no position column, it comes from the defense table)."""
    return _prepare_player_table(raw).drop(columns=UNUSED_COLUMNS)


passing[0] = [_clean_passing(table) for table in passing[0]]
defense[0] = [_clean_defense(table) for table in defense[0]]
shot[0] = [_clean_shooting(table) for table in shot[0]]

df_shot = pd.concat(shot[0])
df_pass = pd.concat(passing[0])
df_def = pd.concat(defense[0])

# Inner joins: only players present in all three tables are kept.
final = pd.merge(pd.merge(df_shot, df_def, on=MERGE_KEYS), df_pass, on=MERGE_KEYS)
# Join key for the valuation data: lower-case name with hyphens instead of spaces.
final['player_code'] = final['Player'].str.lower().str.replace(' ', '-', regex=False)
# Keep only the last two characters of the birth year.
final['Born'] = final['Born'].astype(str).str[-2:]

# Single-character transliteration table applied to player_code.
replacements = {
    'à': 'a', 'á': 'a', 'â': 'a', 'ä': 'a', 'ǎ': 'a', 'æ': 'a', 'ã': 'a', 'å': 'a', 'ā': 'a',
    'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e', 'ě': 'e', 'ẽ': 'e', 'ē': 'e', 'ė': 'e', 'ę': 'e',
    'ì': 'i', 'í': 'i', 'î': 'i', 'ï': 'i', 'ǐ': 'i', 'ĩ': 'i', 'ī': 'i', 'ı': 'i', 'į': 'i',
    'ò': 'o', 'ó': 'o', 'ô': 'o', 'ö': 'o', 'ǒ': 'o', 'œ': 'o', 'ø': 'o', 'õ': 'o', 'ō': 'o',
    'ù': 'u', 'ú': 'u', 'û': 'u', 'ü': 'u', 'ǔ': 'u', 'ũ': 'u', 'ū': 'u', 'ű': 'u', 'ů': 'u',
    'ğ':'g','ġ':'g','Ğ':'G','Ġ':'G','ç':'c','ć':'c','č':'c','ċ':'c','ł':'l','ļ':'l','ľ':'l',
    'ß':'s','ş':'s','ș':'s','ś':'s','š':'s','ķ':'k','ñ':'n','ń':'n','ņ':'n','ň':'n','ŵ':'w',
    'ź':'z','ž':'z','ż':'z','þ':'b','đ':'d',"'":'',"’":''
}

# Every key is a single character and no replacement value is itself a key, so
# one translate pass gives the same result as replacing key by key.
final['player_code'] = final['player_code'].str.translate(str.maketrans(replacements))

In [4]:
# Persist the merged fbref statistics table. The DataFrame index is written as
# the first (unnamed) CSV column because to_csv keeps the index by default.
final.to_csv('player_data_2023.csv')

In [6]:
# Load the player valuation data and attach it to the scraped statistics.
pv = pd.read_csv('transfermarkt_data.csv')

# Build the same 'Born' key as in `final`: the last two characters of the
# date-of-birth text.
pv['Born'] = pv['date_of_birth'].str[-2:].astype(str)

# Valuation-side columns that are not wanted in the merged result.
unused_pv_columns = [
    'current_club_domestic_competition_id',
    'current_club_name',
    'last_season',
    'first_name',
    'last_name',
    'player_club_domestic_competition_id',
]

# Inner join on name code + league + birth key, then discard the unused columns.
merged = final.merge(pv, on=['player_code', 'Based', 'Born']).drop(columns=unused_pv_columns)

In [7]:
#handle special case duplicate players
# When one player_id matches several rows, keep the row with the most 90s played.
# '90s' is never converted to a number anywhere in this notebook; if it is held
# as text, sorting it directly is lexicographic ('9.9' sorts after '10.1') and
# keep='last' can retain the wrong row. Sort on a numeric copy instead;
# unparseable values are ordered first so they never win over a real number.
merged = (
    merged
    .assign(_90s_numeric=pd.to_numeric(merged['90s'], errors='coerce'))
    .sort_values(['player_id', '_90s_numeric'], na_position='first')
    .drop_duplicates('player_id', keep='last')
    .drop(columns='_90s_numeric')
)
# Sanity check: lists any player_id that still occurs more than once
# (an empty frame is the expected result).
merged[merged.duplicated(['player_id'], keep=False)]

Unnamed: 0,Player,Squad,Age,Born,90s,Gls,Sh,SoT,SoT%,Sh/90,...,player_id,name,country_of_birth,date_of_birth,sub_position,foot,height_in_cm,contract_expiration_date,date,market_value_in_eur


In [8]:
# Display the de-duplicated table of fbref statistics joined with the valuation data.
merged

Unnamed: 0,Player,Squad,Age,Born,90s,Gls,Sh,SoT,SoT%,Sh/90,...,player_id,name,country_of_birth,date_of_birth,sub_position,foot,height_in_cm,contract_expiration_date,date,market_value_in_eur
1265,James Milner,Liverpool,36,86,10.1,0,7,1,14.3,0.69,...,3333,James Milner,England,1/4/86,Central Midfield,right,175.0,6/30/23 0:00,2022-11-03,2000000
3260,Christophe Lepoint,R.F.C. Seraing,37,84,21.6,1,9,1,11.1,0.42,...,4742,Christophe Lepoint,Belgium,10/24/84,Central Midfield,right,189.0,6/30/23 0:00,2022-11-04,75000
688,Jonas Hofmann,M'Gladbach,30,92,29.8,12,44,22,50.0,1.48,...,7161,Jonas Hofmann,Germany,7/14/92,Right Winger,right,176.0,6/30/25 0:00,2022-11-09,13000000
2155,Aleksandr Ryazantsev,Torpedo Moscow,35,86,9.9,0,17,5,29.4,1.72,...,7252,Aleksandr Ryazantsev,UdSSR,9/5/86,Central Midfield,right,179.0,6/30/23 0:00,2022-11-21,100000
1661,Joaquín,Betis,41,81,5.5,0,8,3,37.5,1.45,...,7663,Joaqu√≠n,Spain,7/21/81,Right Winger,right,181.0,6/30/23 0:00,2022-11-07,1500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3412,Modou Tambedou,Zulte Waregem,19,03,15.7,1,8,2,25.0,0.51,...,1035048,Modou Tambedou,,4/4/03,Centre-Back,right,,6/30/25 0:00,2022-11-04,1200000
3301,Andy Musayev,Oostende,19,03,2.3,0,2,1,50.0,0.87,...,1035758,Andy Musayev,,4/17/03,Second Striker,right,,6/30/25 0:00,2022-11-04,200000
3116,Denys Bunchukov,R.F.C. Seraing,19,03,3.3,0,3,1,33.3,0.92,...,1041552,Denys Bunchukov,,6/20/03,Central Midfield,right,,6/30/23 0:00,2022-09-07,100000
2674,Moussa Soumano,Ajaccio,17,05,5.5,1,6,2,33.3,1.09,...,1053240,Moussa Soumano,France,7/9/05,Centre-Forward,,,6/30/26 0:00,2023-03-27,900000


In [9]:
# Save the final merged table. The DataFrame index is written as the first
# (unnamed) CSV column because to_csv keeps the index by default.
merged.to_csv('merged_data.csv')