# Web Scraping Manchester United Players' Stats

In [1]:
import pandas as pd
import requests
import re
import numpy as np

# Fetch fbref's robots.txt and print it so the disallowed paths can be
# reviewed before any squad page is requested.
url = 'https://fbref.com/robots.txt'
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of printing an error body.
response = requests.get(url, timeout=30)
response.raise_for_status()
print(response.text)

User-agent:*
# Disallow: /cbb/
# Disallow: /cfb/
# Disallow: /olympics/

# Disallow: /awards/
# Disallow: /blog/
# Disallow: /boxscores/
# Disallow: /coaches/
# Disallow: /draft/
# Disallow: /executives/
# Disallow: /friv/
# Disallow: /hof/
# Disallow: /leaders/
# Disallow: /play-index/
# Disallow: /players/
# Disallow: /route.cgi
# Disallow: /schools/
# Disallow: /search/
# Disallow: /stadiums/
# Disallow: /static/
# Disallow: /teams/
# Disallow: /years/

Disallow: /feedback/
Disallow: /linker/
Disallow: /my/

Disallow: /news/
Disallow: /en/news/
Disallow: /pt/news/
Disallow: /de/news/
Disallow: /fr/news/
Disallow: /es/news/
Disallow: /it/news/
Disallow: /news/

Disallow: /req/
Disallow: /short/
Disallow: /nocdn/


User-agent: AhrefsBot
Disallow: /

User-agent: AhrefsBot/5.0
Disallow: /


## sitemaps generated by copyit/sitemaps/build_sitemaps.pl
## 
Sitemap: https://fbref.com/sitemaps/sitemap.xml



In [2]:
# Raise pandas' display limits (rows, columns, line width) for this session
display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [3]:
from bs4 import BeautifulSoup
import requests
import time, os

In [4]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Path to the chromedriver executable. It can be overridden with the
# CHROMEDRIVER_PATH environment variable so the notebook is not tied to one
# machine's layout; the previous hard-coded location remains the default.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

## 2018-2019 Season Stats

In [5]:
# fbref squad page URL for the 2018-2019 season; requested below to build the stats tables
manutd_1819_site = 'https://fbref.com/en/squads/19538871/2018-2019/Manchester-United'

In [6]:
# Download the squad page. The timeout avoids hanging on a stalled connection,
# and raise_for_status() fails loudly on HTTP errors (for example when the
# site rate-limits the request) instead of parsing an error page and hitting
# a confusing None lookup in a later cell.
squad_response = requests.get(manutd_1819_site, timeout=30)
squad_response.raise_for_status()
page = squad_response.text
soup = BeautifulSoup(page, 'lxml')

### Standard Stats

In [7]:
#standard stats table, looked up by its element id
player_table = soup.find('table', attrs={'id': 'stats_standard_ks_1889'})

In [8]:
# th cells of the table that carry the 'left' class
player_name = list(player_table.find_all('th', class_='left'))
# Text of every link found inside those cells
manutd_players = [link.text for cell in player_name for link in cell.find_all('a')]

In [9]:
# Every tr element found in the standard stats table
player_row = list(player_table.find_all('tr'))

In [10]:
# Collect the text of every td cell for each table row after the first two tr
# elements. Rows are keyed by their position instead of by the Tag object:
# bs4 treats Tags with identical markup as equal, so keying by Tag would
# silently merge duplicate rows. Rows without any td cells are skipped because
# they contribute no stats and would break the later DataFrame construction.
players_stats = {}

for position, player in enumerate(player_row[2:]):
    items = player.find_all('td')
    if items:
        players_stats[position] = [i.text for i in items]

In [11]:
# Per-row stat lists, in the order they were inserted into the dict
players_stats_list = list(players_stats.values())

In [12]:
# Pair each player name with the stat list at the same position
manutd_players_stats = dict(zip(manutd_players, players_stats_list))

In [13]:
#dataframe of manutd players' stats
standard_stat_columns = [
    'Nation', 'Pos', 'Age', 'MP', 'Starts', 'Min',
    'Gls', 'Ast', 'PK', 'PKatt', 'CrdY', 'CrdR',
    'P90Gls', 'P90Ast', 'P90G+A', 'P90G-PK', 'P90G+A-PK',
    'ExpectedxG', 'ExpectednpxG', 'ExpectedxA',
    'P90xG', 'P90xA', 'P90xG+xA', 'P90npxG', 'P90npxG+xA',
    'Matches',
]
# The dict maps player name -> stat list, so transpose to get one row per player
manutd_1819 = pd.DataFrame(manutd_players_stats).T
manutd_1819.columns = standard_stat_columns

In [14]:
#dropping unneeded columns
unneeded_standard_columns = [
    'Pos', 'Nation', 'PK', 'PKatt', 'CrdR', 'P90Gls', 'P90G+A', 'P90G-PK',
    'P90G+A-PK', 'ExpectedxG', 'ExpectednpxG', 'ExpectedxA', 'P90xG', 'P90xA',
    'P90xG+xA', 'P90npxG', 'P90npxG+xA', 'Matches',
]
manutd_1819 = manutd_1819.drop(columns=unneeded_standard_columns)

#dropping players who never played a premier league game in the season
manutd_1819 = manutd_1819.drop(index=['Lee Grant', 'Sergio Romero'])

In [15]:
#data cleaning
# Blank or whitespace-only cells become NaN
manutd_1819 = manutd_1819.replace(to_replace=r'^\s*$', value=np.nan, regex=True)
# Strip the comma from the 'Min' strings
manutd_1819['Min'] = manutd_1819['Min'].str.replace(',', '')
# Cast every row except the last two to float and write the values back
# NOTE(review): the reason the final two rows are excluded is not visible here - confirm
all_but_last_two = slice(None, -2)
manutd_1819.iloc[all_but_last_two, :] = manutd_1819.iloc[all_but_last_two, :].astype('float64')

In [16]:
# Display the standard-stats frame after cleaning
manutd_1819

Unnamed: 0,Age,MP,Starts,Min,Gls,Ast,CrdY,P90Ast
David de Gea,27,38,38,3420,0,0,1,0.0
Paul Pogba,25,35,34,3006,13,9,6,0.27
Victor Lindelöf,24,30,29,2601,1,1,1,0.03
Luke Shaw,23,29,29,2591,1,4,11,0.14
Ashley Young,33,30,28,2569,2,2,9,0.07
Nemanja Matić,29,28,28,2436,1,0,7,0.0
Marcus Rashford,20,33,26,2334,10,6,3,0.23
Chris Smalling,28,24,24,2127,1,0,1,0.0
Romelu Lukaku,25,32,22,2136,12,0,4,0.0
Jesse Lingard,25,27,19,1663,4,2,3,0.11


### Shooting Stats

In [17]:
from bs4 import BeautifulSoup, Comment

# Re-parse the HTML that was already downloaded into `page` instead of
# requesting the same URL a second time; this avoids a redundant hit on the
# site while leaving `soup` freshly built for the commented-out tables below.
soup = BeautifulSoup(page, 'lxml')

In [18]:
# The shooting table is read from an HTML comment that follows the
# placeholder element, so take the first Comment sibling and parse it.
placeholder = soup.select_one('#all_kitchen_sink_shooting .placeholder')
comment = next(
    filter(lambda elem: isinstance(elem, Comment), placeholder.next_siblings))
table_soup = BeautifulSoup(comment, 'lxml')

In [19]:
#shooting stats table, looked up by its element id
shooting_table = table_soup.find('table', attrs={'id': "stats_shooting_ks_1889"})

In [20]:
# th cells of the shooting table that carry the 'left' class
shooting_name = list(shooting_table.find_all('th', class_='left'))
# Text of every link found inside those cells
shooting_players = [
    link.text for cell in shooting_name for link in cell.find_all('a')
]

In [21]:
# Every tr element found in the shooting stats table
shooting_row = list(shooting_table.find_all('tr'))

In [22]:
# Collect the text of every td cell for each table row after the first two tr
# elements. Rows are keyed by their position instead of by the Tag object:
# bs4 treats Tags with identical markup as equal, so keying by Tag would
# silently merge duplicate rows. Rows without any td cells are skipped because
# they contribute no stats and would break the later DataFrame construction.
shooting_stats = {}

for position, shooter in enumerate(shooting_row[2:]):
    items = shooter.find_all('td')
    if items:
        shooting_stats[position] = [i.text for i in items]

In [23]:
# Per-row stat lists, in the order they were inserted into the dict
shooting_stats_list = list(shooting_stats.values())

In [24]:
# Pair each player name with the stat list at the same position
manutd_shooting_stats = dict(zip(shooting_players, shooting_stats_list))

In [25]:
#dataframe of manutd players' shooting stats
shooting_stat_columns = [
    'Nation', 'Pos', 'Age', '90s', 'Gls', 'PK', 'Pkatt',
    'Shot', 'ShotonTarget', 'FK', 'ShotonTarget%',
    'Sh/90', 'SoT/90', 'G/Sh', 'G/SoT',
    'xG', 'npxG', 'npxG/Sh', 'G-xG', 'np:G-xG',
    'Matches',
]
# The dict maps player name -> stat list, so transpose to get one row per player
manutd_shooting_df = pd.DataFrame(manutd_shooting_stats).T
manutd_shooting_df.columns = shooting_stat_columns

In [26]:
#dropping unneeded columns
unneeded_shooting_columns = [
    'Nation', 'Pos', 'Age', '90s', 'Gls', 'PK', 'Pkatt', 'FK', 'Sh/90',
    'SoT/90', 'G/Sh', 'G/SoT', 'xG', 'npxG', 'npxG/Sh', 'G-xG', 'np:G-xG',
    'Matches',
]
manutd_shooting_df = manutd_shooting_df.drop(columns=unneeded_shooting_columns)

In [27]:
#data cleaning
# Blank or whitespace-only cells become NaN so the float cast below succeeds
manutd_shooting_df = manutd_shooting_df.replace(r'^\s*$', np.nan, regex=True)
# Rebind the name to the converted frame. Writing the result back through
# `.iloc[:, :] = ...` sets values into the existing object-dtype columns on
# current pandas, which leaves the columns as dtype object rather than float64.
manutd_shooting_df = manutd_shooting_df.astype('float64')

In [28]:
# Display the shooting-stats frame after cleaning
manutd_shooting_df

Unnamed: 0,Shot,ShotonTarget,ShotonTarget%
David de Gea,0.0,0.0,
Paul Pogba,96.0,38.0,39.6
Victor Lindelöf,7.0,1.0,14.3
Luke Shaw,20.0,6.0,30.0
Ashley Young,12.0,3.0,25.0
Nemanja Matić,12.0,3.0,25.0
Marcus Rashford,83.0,40.0,48.2
Chris Smalling,12.0,4.0,33.3
Romelu Lukaku,59.0,33.0,55.9
Jesse Lingard,31.0,12.0,38.7


In [29]:
#merging with standard stats table
# Index-aligned left join: every row of manutd_1819 is kept
manutd_1819 = manutd_1819.join(manutd_shooting_df, how='left')

### Passing Stats

In [30]:
# The passing table is read from an HTML comment that follows the
# placeholder element, so take the first Comment sibling and parse it.
placeholder2 = soup.select_one('#all_kitchen_sink_passing .placeholder')
comment2 = next(
    filter(lambda elem: isinstance(elem, Comment), placeholder2.next_siblings))
table_soup2 = BeautifulSoup(comment2, 'lxml')

In [31]:
#passing stats table, looked up by its element id
passing_table = table_soup2.find('table', attrs={'id': 'stats_passing_ks_1889'})

In [32]:
# th cells of the passing table that carry the 'left' class
passing_name = list(passing_table.find_all('th', class_='left'))
# Text of every link found inside those cells
passing_players = [link.text for cell in passing_name for link in cell.find_all('a')]

In [33]:
# Every tr element found in the passing stats table
passing_row = list(passing_table.find_all('tr'))

In [34]:
# Collect the text of every td cell for each table row after the first two tr
# elements. Rows are keyed by their position instead of by the Tag object:
# bs4 treats Tags with identical markup as equal, so keying by Tag would
# silently merge duplicate rows. Rows without any td cells are skipped because
# they contribute no stats and would break the later DataFrame construction.
passing_stats = {}

for position, passer in enumerate(passing_row[2:]):
    items = passer.find_all('td')
    if items:
        passing_stats[position] = [i.text for i in items]

In [35]:
# Per-row stat lists, in the order they were inserted into the dict
passing_stats_list = list(passing_stats.values())

In [36]:
# Pair each player name with the stat list at the same position
manutd_passing_stats = dict(zip(passing_players, passing_stats_list))

In [37]:
#dataframe of manutd players' passing stats
passing_stat_columns = [
    'Nation', 'Pos', 'Age', '90s',
    'TotPassCmp', 'TotPassAtt', 'TotPassCmpPerc', 'TotDist', 'TotPrgDist',
    'ShortCmp', 'ShortAtt', 'ShortCmp%',
    'MediumCmp', 'MediumAtt', 'MediumCmp%',
    'LongCmp', 'LongAtt', 'LongCmp%',
    'Ast', 'xA', 'A-xA', 'KP', '1/3', 'PPA', 'CrsPA', 'Prog',
    'Matches',
]
# The dict maps player name -> stat list, so transpose to get one row per player
manutd_passing_df = pd.DataFrame(manutd_passing_stats).T
manutd_passing_df.columns = passing_stat_columns

In [38]:
#dropping unneeded columns
unneeded_passing_columns = [
    'Nation', 'Pos', 'Age', '90s', 'TotDist', 'TotPrgDist', 'ShortCmp',
    'ShortAtt', 'ShortCmp%', 'MediumCmp', 'MediumAtt', 'MediumCmp%', 'LongCmp',
    'LongAtt', 'LongCmp%', 'Ast', 'xA', 'A-xA', 'KP', '1/3', 'PPA', 'CrsPA',
    'Prog', 'Matches',
]
manutd_passing_df = manutd_passing_df.drop(columns=unneeded_passing_columns)

In [39]:
#data cleaning
# Blank or whitespace-only cells become NaN so the float cast below succeeds
manutd_passing_df = manutd_passing_df.replace(r'^\s*$', np.nan, regex=True)
# Rebind the name to the converted frame. Writing the result back through
# `.iloc[:, :] = ...` sets values into the existing object-dtype columns on
# current pandas, which leaves the columns as dtype object rather than float64.
manutd_passing_df = manutd_passing_df.astype('float64')

In [40]:
# Display the passing-stats frame after cleaning
manutd_passing_df

Unnamed: 0,TotPassCmp,TotPassAtt,TotPassCmpPerc
David de Gea,712.0,1087.0,65.5
Paul Pogba,1728.0,2115.0,81.7
Victor Lindelöf,1323.0,1481.0,89.3
Luke Shaw,1657.0,1984.0,83.5
Ashley Young,1380.0,1904.0,72.5
Nemanja Matić,1579.0,1797.0,87.9
Marcus Rashford,538.0,728.0,73.9
Chris Smalling,800.0,924.0,86.6
Romelu Lukaku,379.0,583.0,65.0
Jesse Lingard,668.0,788.0,84.8


In [41]:
#merging with other tables
# Index-aligned left join: every row of manutd_1819 is kept
manutd_1819 = manutd_1819.join(manutd_passing_df, how='left')

## Final Check on 2018-2019 Stats

In [42]:
# Display the combined frame after joining the shooting and passing columns
manutd_1819

Unnamed: 0,Age,MP,Starts,Min,Gls,Ast,CrdY,P90Ast,Shot,ShotonTarget,ShotonTarget%,TotPassCmp,TotPassAtt,TotPassCmpPerc
David de Gea,27,38,38,3420,0,0,1,0.0,0.0,0.0,,712.0,1087.0,65.5
Paul Pogba,25,35,34,3006,13,9,6,0.27,96.0,38.0,39.6,1728.0,2115.0,81.7
Victor Lindelöf,24,30,29,2601,1,1,1,0.03,7.0,1.0,14.3,1323.0,1481.0,89.3
Luke Shaw,23,29,29,2591,1,4,11,0.14,20.0,6.0,30.0,1657.0,1984.0,83.5
Ashley Young,33,30,28,2569,2,2,9,0.07,12.0,3.0,25.0,1380.0,1904.0,72.5
Nemanja Matić,29,28,28,2436,1,0,7,0.0,12.0,3.0,25.0,1579.0,1797.0,87.9
Marcus Rashford,20,33,26,2334,10,6,3,0.23,83.0,40.0,48.2,538.0,728.0,73.9
Chris Smalling,28,24,24,2127,1,0,1,0.0,12.0,4.0,33.3,800.0,924.0,86.6
Romelu Lukaku,25,32,22,2136,12,0,4,0.0,59.0,33.0,55.9,379.0,583.0,65.0
Jesse Lingard,25,27,19,1663,4,2,3,0.11,31.0,12.0,38.7,668.0,788.0,84.8


In [43]:
#further dropping unneeded columns
final_unneeded_columns = [
    'MP', 'Min', 'P90Ast', 'ShotonTarget', 'ShotonTarget%', 'TotPassAtt',
    'TotPassCmp',
]
manutd_1819 = manutd_1819.drop(columns=final_unneeded_columns)

In [44]:
# Display the frame with only the columns kept for export
manutd_1819

Unnamed: 0,Age,Starts,Gls,Ast,CrdY,Shot,TotPassCmpPerc
David de Gea,27,38,0,0,1,0.0,65.5
Paul Pogba,25,34,13,9,6,96.0,81.7
Victor Lindelöf,24,29,1,1,1,7.0,89.3
Luke Shaw,23,29,1,4,11,20.0,83.5
Ashley Young,33,28,2,2,9,12.0,72.5
Nemanja Matić,29,28,1,0,7,12.0,87.9
Marcus Rashford,20,26,10,6,3,83.0,73.9
Chris Smalling,28,24,1,0,1,12.0,86.6
Romelu Lukaku,25,22,12,0,4,59.0,65.0
Jesse Lingard,25,19,4,2,3,31.0,84.8


## Exporting 2018-2019 Stats

In [45]:
# Write the final frame (index included) to a CSV file in the working directory
output_csv = 'manutd_1819_stats.csv'
manutd_1819.to_csv(output_csv, index=True)