Python script to scrape player in-game statistics data from fbref.com and merge with player valuations. Extracted data from 10 professional soccer leagues around the world.

In [1]:
# libraries
import pandas as pd
import numpy as np
import requests
import time
import random
import fuzzywuzzy as fuzz

years_special = ['2021-2022','2022-2023']
numbers = [11,20,9,12,30,13,40,37,23,32,69,10,17,60,33,18,51,72]
leagues = ['-Serie-A-Stats','-Bundesliga-Stats',
           '-Premier-League-Stats','-La-Liga-Stats','-Russian-Premier-League-Stats',
           '-Ligue-1-Stats','-Scottish-Premiership-Stats','-Belgian-Pro-League-Stats','-Eredivisie-Stats',
           '-Primeira-Liga-Stats','-Challenger-Pro-League-Stats','-Championship-Stats','-Segunda-Division-Stats',
          '-Ligue-2-Stats','-2-Bundesliga-Stats','-Serie-B-Stats','-Eerste-Divisie-Stats','-Scottish-Championship-Stats']
           
league_names = ['Serie A','Bundesliga','Premier League','La Liga', 
               'Premier Liga', 'Ligue 1', 'Scottish Premiership',
               'Jupiler Pro League', 'Eredivisie', 'Liga Nos', 'Challenger Pro League', 'Championship',
               'Segunda Division','Ligue 2','2 Bundesliga','Serie B','Eerste Divisie','Scottish Championship']

weird_ones_1 = ['-Belgian-Pro-League-Stats','-Scottish-Premiership-Stats','-Challenger-Pro-League-Stats']
weird_ones_2 = ['-Belgian-Pro-League-Stats','-Scottish-Premiership-Stats',]

count = 0
prev_season_off = []
next_season_off = []
prev_season_def = []
next_season_def = []
tables = [2,3]
for n in range(len(numbers)):
    for y in years_special:
        for t in range(2):
        
            if (leagues[count] in weird_ones_2)  and y == '2021-2022':
                tables = [4,5]
            elif (leagues[count] in weird_ones_1) and y == '2022-2023':
                tables = [5,6]

            data = 'https://fbref.com/en/comps/' + str(numbers[count]) + '/' + y + '/' + y + leagues[count]
            response = requests.get(data).text.replace('<!--', '').replace('-->', '')
            df = pd.read_html(response, header=1)[tables[t]]
            tables = [2,3]
            df['Based'] = league_names[count]
            df['Season'] = y
            df = df[['Squad', 'Poss', 'Gls',
           'Ast', 'Gls.1','Ast.1', 'Based','Season']]

            if t == 0:
                df.rename(columns={'Poss':'sq_Poss', 'Gls':'sq_Gls', 'Ast':'sq_Ast', 'Gls.1':'sq_Gls/90',
                               'Ast.1':'sq_Ast/90'}, inplace = True)
            else:
                df.rename(columns={'Poss':'opp_sq_Poss', 'Gls':'opp_sq_Gls', 'Ast':'opp_sq_Ast', 'Gls.1':'opp_sq_Gls/90',
                                   'Ast.1':'opp_sq_Ast/90'}, inplace = True)

            if y == '2021-2022' and t == 0:
                prev_season_off.append(df)
            elif y == '2021-2022' and t == 1:
                prev_season_def.append(df)
            elif y == '2022-2023' and t == 1:
                next_season_def.append(df)
            else:
                next_season_off.append(df)

            time.sleep(3)
            print('Imported ' + str(tables[t]) + ' squad data for the ' + league_names[count] + ' ' + y + ' season')

    count += 1
    
prev_season_off = pd.concat(prev_season_off)
next_season_off = pd.concat(next_season_off)
prev_season_def = pd.concat(prev_season_def)
next_season_def = pd.concat(next_season_def)

Imported 2 squad data for the Serie A 2021-2022 season
Imported 3 squad data for the Serie A 2021-2022 season
Imported 2 squad data for the Serie A 2022-2023 season
Imported 3 squad data for the Serie A 2022-2023 season
Imported 2 squad data for the Bundesliga 2021-2022 season
Imported 3 squad data for the Bundesliga 2021-2022 season
Imported 2 squad data for the Bundesliga 2022-2023 season
Imported 3 squad data for the Bundesliga 2022-2023 season
Imported 2 squad data for the Premier League 2021-2022 season
Imported 3 squad data for the Premier League 2021-2022 season
Imported 2 squad data for the Premier League 2022-2023 season
Imported 3 squad data for the Premier League 2022-2023 season
Imported 2 squad data for the La Liga 2021-2022 season
Imported 3 squad data for the La Liga 2021-2022 season
Imported 2 squad data for the La Liga 2022-2023 season
Imported 3 squad data for the La Liga 2022-2023 season
Imported 2 squad data for the Premier Liga 2021-2022 season
Imported 3 squad dat

In [2]:
prev_season_def['Squad'] = prev_season_def['Squad'].str.replace('vs ','')
prev_season = pd.merge(prev_season_off,prev_season_def,on=['Squad','Based','Season']).drop(['Season','Based'], axis=1)
prev_season

Unnamed: 0,Squad,sq_Poss,sq_Gls,sq_Ast,sq_Gls/90,sq_Ast/90,opp_sq_Poss,opp_sq_Gls,opp_sq_Ast,opp_sq_Gls/90,opp_sq_Ast/90
0,Atalanta,55.0,62,48,1.63,1.26,44.9,45.0,29.0,1.18,0.76
1,Bologna,50.6,43,34,1.13,0.89,49.3,53.0,32.0,1.39,0.84
2,Cagliari,44.5,34,26,0.89,0.68,55.6,67.0,41.0,1.76,1.08
3,Empoli,47.4,47,27,1.24,0.71,52.9,64.0,44.0,1.68,1.16
4,Fiorentina,57.7,59,33,1.55,0.87,41.8,49.0,33.0,1.29,0.87
...,...,...,...,...,...,...,...,...,...,...,...
317,Inverness CT,,51,35,1.42,0.97,,34.0,22.0,0.94,0.61
318,Kilmarnock,,49,31,1.36,0.86,,27.0,17.0,0.75,0.47
319,Partick Thistle,,43,31,1.19,0.86,,38.0,23.0,1.06,0.64
320,Queens,,34,24,0.94,0.67,,51.0,29.0,1.42,0.81


In [3]:
next_season_def['Squad'] = next_season_def['Squad'].str.replace('vs ','')
next_season = pd.merge(next_season_off,next_season_def,on=['Squad','Based','Season']).drop(['Season','Based'], axis=1)
next_season

Unnamed: 0,Squad,sq_Poss,sq_Gls,sq_Ast,sq_Gls/90,sq_Ast/90,opp_sq_Poss,opp_sq_Gls,opp_sq_Ast,opp_sq_Gls/90,opp_sq_Ast/90
0,Atalanta,49.9,64,42,1.68,1.11,50.1,46,35,1.21,0.92
1,Bologna,54.7,52,41,1.37,1.08,45.3,46,29,1.21,0.76
2,Cremonese,42.1,35,18,0.92,0.47,57.9,68,46,1.79,1.21
3,Empoli,47.3,35,20,0.92,0.53,52.7,48,32,1.26,0.84
4,Fiorentina,56.1,51,37,1.34,0.97,43.9,41,28,1.08,0.74
...,...,...,...,...,...,...,...,...,...,...,...
321,Hamilton,,29,15,0.81,0.42,,62,45,1.72,1.25
322,Inverness CT,,50,28,1.39,0.78,,46,27,1.28,0.75
323,Partick Thistle,,65,53,1.81,1.47,,43,29,1.19,0.81
324,Queen's Park FC,,60,39,1.67,1.08,,52,28,1.44,0.78


In [4]:
prev_season.to_csv('squads_2021-2022.csv')
next_season.to_csv('squads_2022-2023.csv')

In [5]:
comparison = next_season.set_index(['Squad']).subtract(prev_season.set_index(['Squad'])).dropna()
comparison[comparison.isnull().any(axis=1)]

Unnamed: 0_level_0,sq_Poss,sq_Gls,sq_Ast,sq_Gls/90,sq_Ast/90,opp_sq_Poss,opp_sq_Gls,opp_sq_Ast,opp_sq_Gls/90,opp_sq_Ast/90
Squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [6]:
comparison.to_csv('squad_comparisons.csv')

In [7]:
comparison

Unnamed: 0_level_0,sq_Poss,sq_Gls,sq_Ast,sq_Gls/90,sq_Ast/90,opp_sq_Poss,opp_sq_Gls,opp_sq_Ast,opp_sq_Gls/90,opp_sq_Ast/90
Squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ADO Den Haag,-6.2,-26.0,-18.0,-0.69,-0.47,6.2,4.0,1.0,0.11,0.03
AZ Alkmaar,-2.3,3.0,-1.0,0.09,-0.03,2.5,-9.0,-6.0,-0.26,-0.18
Aberdeen,-10.8,15.0,4.0,0.40,0.10,10.8,13.0,10.0,0.35,0.26
Ajaccio,-6.7,-17.0,-14.0,-0.45,-0.36,6.5,54.0,37.0,1.42,0.97
Ajax,0.1,-8.0,-5.0,-0.23,-0.15,0.0,19.0,15.0,0.56,0.44
...,...,...,...,...,...,...,...,...,...,...
Wolves,0.4,-7.0,-10.0,-0.18,-0.26,-0.6,15.0,21.0,0.39,0.55
Zaragoza,-0.1,-2.0,0.0,-0.05,0.00,0.0,-3.0,-2.0,-0.07,-0.05
Zenit,2.5,7.0,4.0,0.23,0.13,-2.5,-7.0,-2.0,-0.23,-0.07
Zulte Waregem,0.2,7.0,7.0,0.20,0.21,-1.3,10.0,9.0,0.30,0.26
