In [5]:
import requests
from bs4 import BeautifulSoup
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [6]:
# Column lists, one per fbref table. Each entry is the `data-stat` attribute of
# a table cell that `scrape` looks up, listed in the order the columns are read.

# standard stats
standard_columns = [
    "player", "position", "nationality", "team", "comp_level", "age",
    "games", "games_starts", "minutes", "minutes_90s",
    "goals", "assists", "goals_per90", "assists_per90", "goals_assists_per90",
    "xg", "npxg", "xg_assist", "xg_per90", "npxg_per90", "xg_assist_per90",
    "progressive_passes", "progressive_carries", "progressive_passes_received",
]

# passing
passing_columns = [
    "player", "nationality", "position", "team", "comp_level", "age", "minutes_90s",
    "passes_completed", "passes", "passes_pct",
    "passes_total_distance", "passes_progressive_distance",
    "assists", "xg_assist", "pass_xa", "assisted_shots",
    "passes_into_final_third", "passes_into_penalty_area",
    "crosses_into_penalty_area", "progressive_passes",
]

# goal and shot creation (gca)
gca_columns = [
    "player", "nationality", "position", "team", "comp_level", "age", "minutes_90s",
    "sca", "sca_per90", "sca_passes_live", "sca_passes_dead",
    "sca_take_ons", "sca_shots", "sca_fouled", "sca_defense",
    "gca", "gca_per90", "gca_passes_live", "gca_passes_dead",
    "gca_take_ons", "gca_shots", "gca_fouled", "gca_defense",
]

# shooting (this list carries no 'nationality' or 'comp_level' entry)
shooting_columns = [
    "player", "team", "position", "age", "minutes_90s",
    "goals", "pens_made", "shots", "shots_on_target", "shots_on_target_pct",
    "shots_per90", "shots_on_target_per90",
    "goals_per_shot", "goals_per_shot_on_target", "average_shot_distance",
    "xg", "npxg", "npxg_per_shot", "xg_net", "npxg_net",
]

# possession
possession_columns = [
    "player", "nationality", "position", "team", "comp_level", "age", "minutes_90s",
    "touches", "touches_def_pen_area", "touches_def_3rd", "touches_mid_3rd",
    "touches_att_3rd", "touches_att_pen_area", "touches_live_ball",
    "take_ons_won", "take_ons", "take_ons_won_pct",
    "take_ons_tackled", "take_ons_tackled_pct",
    "carries", "carries_distance", "carries_progressive_distance",
    "progressive_carries", "carries_into_final_third", "carries_into_penalty_area",
    "miscontrols", "dispossessed", "passes_received", "progressive_passes_received",
]

# pass types (only 'player' identifies the row in this list)
pass_types_columns = [
    "player", "passes", "passes_live", "passes_dead",
    "through_balls", "passes_switches", "crosses", "passes_blocked",
]

# defense
defense_columns = [
    "player", "nationality", "position", "team", "comp_level", "age", "minutes_90s",
    "tackles", "tackles_won", "tackles_def_3rd", "tackles_mid_3rd", "tackles_att_3rd",
    "challenge_tackles", "challenges", "challenge_tackles_pct", "challenges_lost",
    "blocked_shots", "blocked_passes", "interceptions", "tackles_interceptions",
    "clearances", "errors",
]

# Disabled: column list for the misc table.
# NOTE(review): 'tram' looks like a typo for 'team' and 'minutes_90s' is listed
# twice; fix both before re-enabling.
#misc_columns = ['player', 'nationality', 'position', 'minutes_90s', 'tram', 'comp_level', 'age', 'minutes_90s', 'cards_yellow',
#               'cards_red', 'cards_yellow_red', 'fouls', 'fouled', 'interceptions', 'tackles_won', 'pens_won',
#               'pens_conceded', 'ball_recoveries', 'aerials_won', 'aerials_lost', 'aerials_won_pct']


In [7]:
def scrape(link, cols, text_cols=('player', 'nationality', 'position', 'team', 'comp_level'), timeout=30):
    """Download a stats page and return the first table on it as a DataFrame.

    Only the first ``<tbody>`` of the page is read. Rows that have no
    ``<th scope="row">`` cell are skipped (presumably the header rows repeated
    inside the table body -- confirm against the page markup). For every
    remaining row, the ``<td data-stat="...">`` cell of each requested column
    is read, stripped, and stored.

    Parameters
    ----------
    link : str
        URL of the page to download.
    cols : iterable of str
        ``data-stat`` names to extract. A name listed more than once is read
        only once, so all resulting columns have the same length.
    text_cols : iterable of str, optional
        Columns kept as strings. All other columns are converted to ``float``
        after removing thousands separators (``,``). The default is the set
        that used to be hard-coded in the function body.
    timeout : float, optional
        Seconds to wait for the server, passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        One column per unique name in ``cols`` (in the given order) and one
        row per data row of the table. Blank cells are stored as ``'0'`` in
        text columns and ``0.0`` in numeric columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no ``<tbody>``, if a requested column is absent from a
        data row, or if a numeric cell cannot be parsed as a float.
    """
    # Without a timeout a stalled connection would hang the notebook forever.
    res = requests.get(link, timeout=timeout)
    # Fail here with the HTTP status instead of parsing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'lxml')

    body = soup.find('tbody')
    if body is None:
        raise ValueError(f'no <tbody> found at {link!r}')

    # dict.fromkeys removes repeated names while keeping the caller's order.
    wanted = list(dict.fromkeys(cols))
    keep_as_text = set(text_cols)
    table = {col: [] for col in wanted}

    for row in body.find_all('tr'):
        if row.find('th', {'scope': 'row'}) is None:
            continue
        for col in wanted:
            cell = row.find('td', {'data-stat': col})
            if cell is None:
                # A misspelled or unavailable column would otherwise surface as
                # "'NoneType' object has no attribute 'text'".
                raise ValueError(f'column {col!r} not found in the table at {link!r}')
            # Blank cells are recorded as zero, as before.
            value = cell.text.strip() or '0'
            if col not in keep_as_text:
                value = float(value.replace(',', ''))
            table[col].append(value)

    return pd.DataFrame(table)

In [4]:
# stats
# The table name is fixed in code rather than read with `input()`: the cell that
# consumes `url` passes `standard_columns`, so only 'stats' is valid here, and a
# prompt would stop "Restart Kernel -> Run All" until someone types the value.
base = 'https://fbref.com/en/comps/Big5/'
data = 'stats'
url = base + data + '/players/Big-5-European-Leagues-Stats'

Please enter what data: stats


In [5]:
# Scrape the page at `url`, keeping the columns listed in `standard_columns`.
df_stats = scrape(link=url, cols=standard_columns)

In [6]:
# Preview the first five rows (the default for `head`).
df_stats.head()

Unnamed: 0,player,position,nationality,team,comp_level,age,games,games_starts,minutes,minutes_90s,...,goals_assists_per90,xg,npxg,xg_assist,xg_per90,npxg_per90,xg_assist_per90,progressive_passes,progressive_carries,progressive_passes_received
0,Brenden Aaronson,"MF,FW",us USA,Leeds United,eng Premier League,21.0,36.0,28.0,2372.0,26.4,...,0.15,3.9,3.9,4.2,0.15,0.15,0.16,86.0,43.0,151.0
1,Paxten Aaronson,"MF,DF",us USA,Eint Frankfurt,de Bundesliga,18.0,7.0,0.0,173.0,1.9,...,0.0,0.2,0.2,0.0,0.09,0.09,0.03,6.0,8.0,15.0
2,James Abankwah,DF,ie IRL,Udinese,it Serie A,18.0,2.0,1.0,63.0,0.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,George Abbott,MF,eng ENG,Tottenham,eng Premier League,16.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Yunis Abdelhamid,DF,ma MAR,Reims,fr Ligue 1,34.0,37.0,37.0,3330.0,37.0,...,0.08,2.4,2.4,1.0,0.06,0.06,0.03,215.0,40.0,10.0


In [3]:
# Shooting page for the 2021-2022 season.
# NOTE(review): the following cell rebuilds `url` without a season segment, so
# this address is only used if that cell is skipped.
url = 'https://fbref.com/en/comps/Big5/2021-2022/shooting/players/2021-2022-Big-5-European-Leagues-Stats'

In [10]:
# shooting
# The table name is fixed in code rather than read with `input()`: the cell that
# consumes `url` passes `shooting_columns`, so only 'shooting' is valid here, and
# a prompt would stop "Restart Kernel -> Run All" until someone types the value.
base = 'https://fbref.com/en/comps/Big5/'
data = 'shooting'
url = base + data + '/players/Big-5-European-Leagues-Stats'

Please enter what data: shooting


In [11]:
# Scrape the page at `url`, keeping the columns listed in `shooting_columns`.
df_shooting = scrape(link=url, cols=shooting_columns)

In [7]:
# passing
# The table name is fixed in code rather than read with `input()`: the cell that
# consumes `url` passes `passing_columns`, so only 'passing' is valid here, and a
# prompt would stop "Restart Kernel -> Run All" until someone types the value.
base = 'https://fbref.com/en/comps/Big5/'
data = 'passing'
url = base + data + '/players/Big-5-European-Leagues-Stats'

Please enter what data: passing


In [8]:
# Scrape the page at `url`, keeping the columns listed in `passing_columns`.
df_passing = scrape(link=url, cols=passing_columns)

In [9]:
# gca
# The table name is fixed in code rather than read with `input()`: the cell that
# consumes `url` passes `gca_columns`, so only 'gca' is valid here, and a prompt
# would stop "Restart Kernel -> Run All" until someone types the value.
base = 'https://fbref.com/en/comps/Big5/'
data = 'gca'
url = base + data + '/players/Big-5-European-Leagues-Stats'

Please enter what data: gca


In [10]:
# Scrape the page at `url`, keeping the columns listed in `gca_columns`.
df_gca = scrape(link=url, cols=gca_columns)

In [11]:
# Show the column labels of the scraped frame; they are the names requested
# through `gca_columns`.
df_gca.columns

Index(['player', 'nationality', 'position', 'team', 'comp_level', 'age',
       'minutes_90s', 'sca', 'sca_per90', 'sca_passes_live', 'sca_passes_dead',
       'sca_take_ons', 'sca_shots', 'sca_fouled', 'sca_defense', 'gca',
       'gca_per90', 'gca_passes_live', 'gca_passes_dead', 'gca_take_ons',
       'gca_shots', 'gca_fouled', 'gca_defense'],
      dtype='object')

In [12]:
# possession
# The table name is fixed in code rather than read with `input()`: the cell that
# consumes `url` passes `possession_columns`, so only 'possession' is valid here,
# and a prompt would stop "Restart Kernel -> Run All" until someone types it.
base = 'https://fbref.com/en/comps/Big5/'
data = 'possession'
url = base + data + '/players/Big-5-European-Leagues-Stats'

Please enter what data: possession


In [13]:
# Scrape the page at `url`, keeping the columns listed in `possession_columns`.
df_poss = scrape(link=url, cols=possession_columns)

In [14]:
# defense
# The table name is fixed in code rather than read with `input()`: the cell that
# consumes `url` passes `defense_columns`, so only 'defense' is valid here, and a
# prompt would stop "Restart Kernel -> Run All" until someone types the value.
base = 'https://fbref.com/en/comps/Big5/'
data = 'defense'
url = base + data + '/players/Big-5-European-Leagues-Stats'

Please enter what data: defense


In [15]:
# Scrape the page at `url`, keeping the columns listed in `defense_columns`.
df_defense = scrape(link=url, cols=defense_columns)

In [16]:
#base = 'https://fbref.com/en/comps/Big5/'
#data = str(input('Please enter what data: '))
#url = base + data + '/players/Big-5-European-Leagues-Stats'

In [17]:
#df_misc = scrape(url, misc_columns)

In [18]:
# passing types
# The table name is fixed in code rather than read with `input()`: the cell that
# consumes `url` passes `pass_types_columns`, so only 'passing_types' is valid
# here, and a prompt would stop "Restart Kernel -> Run All" until it is typed.
base = 'https://fbref.com/en/comps/Big5/'
data = 'passing_types'
url = base + data + '/players/Big-5-European-Leagues-Stats'

Please enter what data: passing_types


In [21]:
# Scrape the page at `url`, keeping the columns listed in `pass_types_columns`.
df_pass_types = scrape(link=url, cols=pass_types_columns)

In [39]:
# Row count of `df`.
# NOTE(review): `df` is not assigned in any visible cell, so unless it is
# defined elsewhere this raises NameError on a fresh kernel. Presumably it
# referred to one of the scraped frames -- confirm which one and use its name.
len(df)

2889

In [22]:
# Write each scraped table to its own CSV file, in the same order as before.
for csv_name, frame in [
    ('stats_2223.csv', df_stats),
    ('pass_2223.csv', df_passing),
    ('gca_2223.csv', df_gca),
    ('poss_2223.csv', df_poss),
    ('defense_2223.csv', df_defense),
    ('ptypes.csv', df_pass_types),
]:
    frame.to_csv(csv_name)


In [7]:
# Write the shooting table to its own CSV file.
df_shooting.to_csv(path_or_buf='shooting.csv')

In [54]:
# Put the six scraped tables side by side (rows are matched on the index) and
# report how many rows the combined frame has.
master = pd.concat(
    objs=[df_stats, df_passing, df_gca, df_poss, df_defense, df_pass_types],
    axis='columns',
)
print(master.shape[0])

2889


In [68]:
# Drop the columns that occur in more than one scraped table, keeping the first
# occurrence of each column name.
# Selecting by label keeps every column's dtype. A transpose ->
# drop_duplicates -> transpose round trip would instead turn the whole frame
# into `object` dtype (a transposed mixed-dtype frame is homogeneous) and would
# compare columns by their values rather than by their names.
master_df = master.loc[:, ~master.columns.duplicated(keep='first')].copy()

In [75]:
# Keep only the first row for each value of 'player'; later rows with the same
# name are discarded.
master_df = master_df.drop_duplicates(subset=['player'], keep='first')

In [76]:
# Number of rows left in the combined frame.
master_df.shape[0]

2716

In [77]:
# Number of distinct values in the 'player' column.
master_df['player'].unique().size

2716

In [12]:
# Export the shooting rows whose position is exactly 'FW' to an Excel workbook.
is_forward = df_shooting['position'] == 'FW'
df_shooting.loc[is_forward].to_excel('2022-2023_shooting_performance_pk.xlsx')