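This notebook scrapes per-player in-game statistics from fbref.com for 10 professional soccer leagues around the world, combines the passing, defense, shooting and playing-time tables into one row per player, and merges the result with player market valuations read from `transfermarkt_data.csv`.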

In [1]:
# libraries
import pandas as pd
import numpy as np
import requests
import time
import random
import fuzzywuzzy as fuzz

# --- Scrape configuration -------------------------------------------------
# Season label(s) as they appear in the fbref URL.
years_special = ['2022-2023']
# Competition ids used in the URL; entry i belongs to leagues[i] / league_names[i].
numbers = [11,20,9,12,30,13,40,37,23,32]
# Stat pages to download for every league.
stats = ['passing','defense','shooting','playingtime']
# URL suffix for each league (same order as `numbers`).
leagues = ['-Serie-A-Stats','-Bundesliga-Stats',
           '-Premier-League-Stats','-La-Liga-Stats','-Russian-Premier-League-Stats',
           '-Ligue-1-Stats','-Scottish-Premiership-Stats','-Belgian-Pro-League-Stats','-Eredivisie-Stats',
           '-Primeira-Liga-Stats']

# Label written to the 'Based' column of every scraped table (same order as `numbers`).
league_names = ['Serie A','Bundesliga','Premier League','La Liga',
               'Premier Liga', 'Ligue 1', 'Scottish Premiership',
               'Jupiler Pro League', 'Eredivisie', 'Liga Nos']

# The three lists above are consumed in lock-step; fail early if they drift apart.
if not (len(numbers) == len(leagues) == len(league_names)):
    raise ValueError('numbers, leagues and league_names must have the same length')

# One list of raw DataFrames per stat page, filled below (one frame per league).
shot = []
defense = []
passing = []
playingtime = []
keepers = []  # not filled in this cell

# Route every stat page to its list explicitly so that an unexpected stat name
# raises a KeyError instead of silently landing in `playingtime`.
tables_by_stat = {
    'passing': passing,
    'defense': defense,
    'shooting': shot,
    'playingtime': playingtime,
}

for comp_id, league_slug, league_name in zip(numbers, leagues, league_names):
    for stat in stats:
        for season in years_special:
            url = ('https://fbref.com/en/comps/' + str(comp_id) + '/' + season + '/'
                   + stat + '/' + season + league_slug)
            reply = requests.get(url, timeout=30)
            # Stop with a clear HTTP error (e.g. when throttled) rather than
            # failing later while parsing an error page.
            reply.raise_for_status()
            # Removing the comment markers makes any table that sits inside an
            # HTML comment visible to the parser.
            html = reply.text.replace('<!--', '').replace('-->', '')
            # header=1 uses the second header row as column names; the third
            # table on the page is taken (presumably the player table - confirm).
            df = pd.read_html(html, header=1)[2]
            df['Based'] = league_name
            tables_by_stat[stat].append(df)
            # Pause between requests.
            time.sleep(3)
            print('Imported ' + stat + ' data for ' + league_name)

Imported passing data for Serie A
Imported defense data for Serie A
Imported shooting data for Serie A
Imported playingtime data for Serie A
Imported passing data for Bundesliga
Imported defense data for Bundesliga
Imported shooting data for Bundesliga
Imported playingtime data for Bundesliga
Imported passing data for Premier League
Imported defense data for Premier League
Imported shooting data for Premier League
Imported playingtime data for Premier League
Imported passing data for La Liga
Imported defense data for La Liga
Imported shooting data for La Liga
Imported playingtime data for La Liga
Imported passing data for Premier Liga
Imported defense data for Premier Liga
Imported shooting data for Premier Liga
Imported playingtime data for Premier Liga
Imported passing data for Ligue 1
Imported defense data for Ligue 1
Imported shooting data for Ligue 1
Imported playingtime data for Ligue 1
Imported passing data for Scottish Premiership
Imported defense data for Scottish Premiership


In [2]:
# Short position codes -> readable labels; codes missing from the map are kept as-is.
POSITION_NAMES = {'MF': 'Midfielder', 'DF': 'Defender', 'FW': 'Forward', 'GK': 'Goalkeeper'}

# Columns that are not carried into the combined table.
UNUSED_COLUMNS = ['Rk', 'Nation', 'Pos', 'Matches']

# Columns that identify the same player row in all four stat tables.
MERGE_KEYS = ['Player', 'Squad', 'Born', 'Based', 'Age', '90s']

# The passing page repeats Cmp/Att/Cmp% once per distance band; give each a clear name.
PASSING_RENAMES = {'Cmp':'Total Cmp', 'Att': 'Total Att', 'Cmp%':'Total Cmp%','Cmp.1':'Short Cmp',
                   'Att.1':'Short Att','Cmp%.1':'Short Cmp%', 'Cmp.2':'Medium Cmp', 'Att.2':'Medium Att',
                   'Cmp%.2':'Medium Cmp%', 'Cmp.3':'Long Cmp', 'Att.3':'Long Att', 'Cmp%.3':'Long Cmp%'}

# 'Sh' also exists in the shooting table, so the defensive block columns are prefixed.
DEFENSE_RENAMES = {'Sh':'Blocks Sh','Pass':'Blocks Pass'}


def _clean_stat_table(raw, add_position=False, renames=None):
    """Return a cleaned copy of one scraped stat table.

    The input frame is not modified, so this cell can be re-run safely.

    Parameters
    ----------
    raw : pandas.DataFrame
        One scraped table; must contain 'Player', 'Age' and the columns
        listed in UNUSED_COLUMNS.
    add_position : bool, default False
        When True, add a 'Position' column built from the first two
        characters of 'Pos', mapped through POSITION_NAMES.
    renames : dict or None
        Optional column renames applied after the unused columns are dropped.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    """
    # Drop rows that merely repeat the column header inside the table body
    # (assumed to occur in the scraped pages - harmless when absent).
    table = raw[raw['Player'] != 'Player'].copy()

    # Keep only the first two characters of Age. This is applied to every
    # table, passing included, because Age is one of the merge keys and has to
    # be formatted identically everywhere.
    table['Age'] = table['Age'].str[:2]

    if add_position:
        table['Position'] = table['Pos'].str[:2].replace(POSITION_NAMES)

    table = table.drop(columns=UNUSED_COLUMNS)
    if renames:
        table = table.rename(columns=renames)
    return table


# Clean every league table and stack the leagues, once per stat page.
# The raw lists (shot/defense/passing/playingtime) are left untouched.
df_shot = pd.concat([_clean_stat_table(tbl) for tbl in shot])
df_def = pd.concat([_clean_stat_table(tbl, add_position=True, renames=DEFENSE_RENAMES)
                    for tbl in defense])
df_pass = pd.concat([_clean_stat_table(tbl, renames=PASSING_RENAMES) for tbl in passing])
df_pt = pd.concat([_clean_stat_table(tbl, add_position=True) for tbl in playingtime])

# Inner-join the four tables on the identifying columns. The join order
# (shooting, defense, passing, playing time) determines the _x/_y suffixes of
# overlapping column names, so it is kept fixed.
final = (
    df_shot
    .merge(df_def, on=MERGE_KEYS)
    .merge(df_pass, on=MERGE_KEYS)
    .merge(df_pt, on=MERGE_KEYS)
)

# Keep only the last two characters of the birth year.
final['Born'] = final['Born'].astype(str)
final['Born'] = final['Born'].str[-2:]
# player_code = lower-cased name with spaces turned into hyphens + two-digit birth year.
final['player_code'] = (final['Player'].str.lower().str.replace(' ', '-', regex=False)
                        + final['Born'].astype(str))

# Accented / special characters -> plain replacements (apostrophes are removed).
replacements = {
    'à': 'a', 'á': 'a', 'â': 'a', 'ä': 'a', 'ǎ': 'a', 'æ': 'a', 'ã': 'a', 'å': 'a', 'ā': 'a',
    'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e', 'ě': 'e', 'ẽ': 'e', 'ē': 'e', 'ė': 'e', 'ę': 'e',
    'ì': 'i', 'í': 'i', 'î': 'i', 'ï': 'i', 'ǐ': 'i', 'ĩ': 'i', 'ī': 'i', 'ı': 'i', 'į': 'i',
    'ò': 'o', 'ó': 'o', 'ô': 'o', 'ö': 'o', 'ǒ': 'o', 'œ': 'o', 'ø': 'o', 'õ': 'o', 'ō': 'o',
    'ù': 'u', 'ú': 'u', 'û': 'u', 'ü': 'u', 'ǔ': 'u', 'ũ': 'u', 'ū': 'u', 'ű': 'u', 'ů': 'u',
    'ğ':'g','ġ':'g','Ğ':'G','Ġ':'G','ç':'c','ć':'c','č':'c','ċ':'c','ł':'l','ļ':'l','ľ':'l',
    'ß':'s','ş':'s','ș':'s','ś':'s','š':'s','ķ':'k','ñ':'n','ń':'n','ņ':'n','ň':'n','ŵ':'w',
    'ź':'z','ž':'z','ż':'z','þ':'b','đ':'d',"'":'',"’":'','ý':'y','i̇':'i','ă':'a','ř':'r'
}

# regex=False: every key is a literal string; being explicit keeps the result
# independent of the pandas version's default for `regex`.
for special, plain in replacements.items():
    final['player_code'] = final['player_code'].str.replace(special, plain, regex=False)

In [11]:
# List every row whose (player_code, Born) pair occurs more than once;
# keep=False marks all members of a duplicate group, not just the repeats.
final.loc[final.duplicated(subset=['player_code', 'Born'], keep=False)]

Unnamed: 0,Player,Squad,Age,Born,90s,Gls,Sh,SoT,SoT%,Sh/90,...,+/-,+/-90,On-Off,onxG,onxGA,xG+/-,xG+/-90,On-Off.1,Position_y,player_code


In [12]:
# Keep one row per (player_code, Born): the one with the highest '90s' value.
# The sort key converts '90s' to numbers first, because a string-typed column
# would sort lexicographically ('9.5' > '24.3') and the wrong row could be
# kept. (Assumes '90s' may be stored as text after scraping - verify dtype.)
final = (
    final
    .sort_values('90s', ascending=False,
                 key=lambda col: pd.to_numeric(col, errors='coerce'))
    .drop_duplicates(['player_code', 'Born'])
    .sort_index()
)
# Sanity check: any player_code still occurring more than once is listed here.
final[final.duplicated(['player_code'], keep=False)]

Unnamed: 0,Player,Squad,Age,Born,90s,Gls,Sh,SoT,SoT%,Sh/90,...,+/-,+/-90,On-Off,onxG,onxGA,xG+/-,xG+/-90,On-Off.1,Position_y,player_code


In [13]:
# Display the `final` player table (bare last expression -> notebook rich display).
final

Unnamed: 0,Player,Squad,Age,Born,90s,Gls,Sh,SoT,SoT%,Sh/90,...,+/-,+/-90,On-Off,onxG,onxGA,xG+/-,xG+/-90,On-Off.1,Position_y,player_code
0,James Abankwah,Udinese,18,04,0.7,0,0,0,,0.00,...,0,0.00,+0.03,0.4,0.7,-0.3,-0.37,-0.25,Defender,james-abankwah04
1,Oliver Abildgaard,Hellas Verona,26,96,6.6,0,3,2,66.7,0.46,...,-9,-1.37,-0.76,4.9,9.3,-4.4,-0.67,-0.36,Midfielder,oliver-abildgaard96
2,Tammy Abraham,Roma,24,97,24.3,8,62,28,45.2,2.55,...,+7,+0.29,-0.08,37.7,22.9,+14.8,+0.61,-0.16,Forward,tammy-abraham97
3,Christian Acella,Cremonese,20,02,0.2,0,0,0,,0.00,...,-1,-6.00,-5.15,0.0,0.1,-0.1,-0.65,+0.07,Midfielder,christian-acella02
4,Francesco Acerbi,Inter,34,88,26.9,0,18,2,11.1,0.67,...,+23,+0.85,+0.31,49.5,26.6,+22.9,+0.85,+0.07,Defender,francesco-acerbi88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5403,Zarzana,Marítimo,20,02,3.2,0,5,1,20.0,1.56,...,-7,-2.18,-1.40,,,,,,Forward,zarzana02
5404,Ivan Zlobin,Famalicão,25,97,1.0,0,0,0,,0.00,...,0,0.00,+0.24,1.5,1.6,-0.1,-0.13,+0.18,Goalkeeper,ivan-zlobin97
5405,Kévin Zohi,Vizela,25,96,10.0,2,26,9,34.6,2.60,...,-2,-0.20,-0.12,12.8,10.6,+2.1,+0.21,+0.28,Forward,kevin-zohi96
5406,Nermin Zolotić,Casa Pia,29,93,24.3,0,6,2,33.3,0.25,...,-5,-0.21,+0.21,25.5,32.3,-6.8,-0.28,+0.16,Defender,nermin-zolotic93


In [14]:
# Write `final` to CSV in the working directory; the DataFrame index is written
# as the first column because `index=False` is not passed.
final.to_csv('player_data_2023.csv')

In [27]:
# Load the valuation table and store its 'Born' column as text.
pv = pd.read_csv('transfermarkt_data.csv')
pv['Born'] = pv['Born'].astype(str)

# Columns removed from the joined table.
columns_to_discard = [
    'current_club_domestic_competition_id', 'first_name',
    'last_name', 'player_club_domestic_competition_id',
    'club_id', 'last_season', 'current_club_name', 'club_code', 'date',
    'current_club_id', 'player_id', 'Unnamed: 0', 'name', 'Position_y',
    'date_of_birth', 'Based_y', 'Squad_y', 'Born_y',
]

# Inner join on player_code: only players present in both tables are kept.
merged = final.merge(pv, on=['player_code']).drop(columns=columns_to_discard)

In [28]:
# Display the `merged` table (bare last expression -> notebook rich display).
merged

Unnamed: 0,Player,Squad_x,Age,Born_x,90s,Gls,Sh,SoT,SoT%,Sh/90,...,xG+/-,xG+/-90,On-Off.1,player_code,country_of_birth,sub_position,foot,height_in_cm,contract_expiration_date,market_value_in_eur
0,James Abankwah,Udinese,18,04,0.7,0,0,0,,0.00,...,-0.3,-0.37,-0.25,james-abankwah04,Ireland,Centre-Back,right,182.0,6/30/26 0:00,200000
1,Oliver Abildgaard,Hellas Verona,26,96,6.6,0,3,2,66.7,0.46,...,-4.4,-0.67,-0.36,oliver-abildgaard96,Denmark,Defensive Midfield,left,192.0,6/30/23 0:00,5000000
2,Tammy Abraham,Roma,24,97,24.3,8,62,28,45.2,2.55,...,+14.8,+0.61,-0.16,tammy-abraham97,England,Centre-Forward,right,194.0,6/30/26 0:00,45000000
3,Christian Acella,Cremonese,20,02,0.2,0,0,0,,0.00,...,-0.1,-0.65,+0.07,christian-acella02,Italy,Central Midfield,right,,,150000
4,Francesco Acerbi,Inter,34,88,26.9,0,18,2,11.1,0.67,...,+22.9,+0.85,+0.07,francesco-acerbi88,Italy,Centre-Back,left,192.0,6/30/23 0:00,4000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4353,Moses Yaw,Arouca,23,99,3.5,0,1,1,100.0,0.29,...,-4.5,-1.32,-1.11,moses-yaw99,,Central Midfield,right,170.0,6/30/24 0:00,200000
4354,Zaydou Youssouf,Famalicão,23,99,23.8,2,24,6,25.0,1.01,...,-2.0,-0.08,+0.75,zaydou-youssouf99,France,Central Midfield,left,182.0,6/30/26 0:00,3000000
4355,Ivan Zlobin,Famalicão,25,97,1.0,0,0,0,,0.00,...,-0.1,-0.13,+0.18,ivan-zlobin97,Russia,Goalkeeper,right,191.0,6/30/25 0:00,300000
4356,Kévin Zohi,Vizela,25,96,10.0,2,26,9,34.6,2.60,...,+2.1,+0.21,+0.28,kevin-zohi96,Cote d'Ivoire,Right Winger,right,180.0,6/30/24 0:00,800000


In [29]:
# List every row of `merged` whose player_code occurs more than once;
# keep=False marks all members of a duplicate group.
merged.loc[merged.duplicated(subset=['player_code'], keep=False)]

Unnamed: 0,Player,Squad_x,Age,Born_x,90s,Gls,Sh,SoT,SoT%,Sh/90,...,xG+/-,xG+/-90,On-Off.1,player_code,country_of_birth,sub_position,foot,height_in_cm,contract_expiration_date,market_value_in_eur


In [18]:
# Write `merged` to CSV in the working directory; the DataFrame index is written
# as the first column because `index=False` is not passed.
merged.to_csv('merged_data.csv')