## Scraping Premier League

**Imports**

In [1]:
import re
import pandas as pd
import requests
from requests_toolbelt import threaded
import bs4
import stringcase

**Baseline crawling**

In [2]:
# Root of the site; relative links scraped from the pages are joined onto it.
base_url = 'https://www.transfermarkt.co.uk'


def premier_league(x):
    """Return the URL of page ``x`` of the Premier League market-value listing.

    ``x`` is interpolated verbatim into the path; the notebook passes
    1-based page numbers.
    """
    return base_url + f'/premier-league/marktwerte/wettbewerb/GB1/ajax/yw1/page/{x}'

In [3]:
# Browser-like headers sent with every request in this notebook.
# NOTE(review): 'br' is advertised in Accept-Encoding; this presumably relies
# on the environment being able to decode Brotli responses -- confirm.
headers = dict([
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Accept-Encoding', 'gzip, deflate, br'),
    ('Host', 'www.transfermarkt.co.uk'),
    ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15'),
    ('Accept-Language', 'en-GB,en;q=0.9'),
    ('Referer', 'https://www.google.com/'),
    ('Connection', 'keep-alive'),
])

In [4]:
def players_extractor(response):
    """Parse one market-value listing page into a DataFrame of players.

    ``response`` must expose the page HTML as ``.text``. The rows of the
    ``#yw1 table.items`` table are read and one record is produced per row
    with the columns ``player_profile_link``, ``player_name``,
    ``player_age``, ``player_club`` and ``player_market_value`` (all strings,
    exactly as they appear in the markup).
    """
    listing = bs4.BeautifulSoup(response.text, 'html.parser')
    table_rows = listing.select_one('#yw1 table.items').select('tbody > tr')

    def parse_row(row):
        # The same anchor carries both the relative profile href and the name.
        name_anchor = row.select_one('tr td:nth-child(2) a')
        return {
            'player_profile_link': base_url + name_anchor['href'],
            'player_name': name_anchor['title'],
            'player_age': row.select_one('tr td:nth-child(4)').text,
            'player_club': row.select_one('tr td:nth-child(5) img')['alt'],
            'player_market_value': row.select_one('tr td:nth-child(6) a').text,
        }

    # The first matched row is deliberately skipped.
    # NOTE(review): presumably it is not a player row -- confirm against the page.
    return pd.DataFrame([parse_row(row) for row in table_rows[1:]])

**First Page**

In [5]:
# Fetch and parse only the first listing page as a quick check of the parser.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
players = players_extractor(
    requests.get(premier_league(1), headers=headers, timeout=30)
)
players.head(3)

Unnamed: 0,player_profile_link,player_name,player_age,player_club,player_market_value
0,https://www.transfermarkt.co.uk/harry-kane/pro...,Harry Kane,28,Tottenham Hotspur,£90.00m
1,https://www.transfermarkt.co.uk/mohamed-salah/...,Mohamed Salah,29,Liverpool FC,£90.00m
2,https://www.transfermarkt.co.uk/bruno-fernande...,Bruno Fernandes,27,Manchester United,£81.00m


**All 4 pages**

In [6]:
# Fetch listing pages 1-4 one after another and stack them into one frame.
# Each request carries a timeout so a stalled connection cannot block the run.
players = pd.concat(
    [
        players_extractor(
            requests.get(premier_league(n), headers=headers, timeout=30)
        )
        for n in range(1, 5)
    ],
    ignore_index=True,
)
players

Unnamed: 0,player_profile_link,player_name,player_age,player_club,player_market_value
0,https://www.transfermarkt.co.uk/harry-kane/pro...,Harry Kane,28,Tottenham Hotspur,£90.00m
1,https://www.transfermarkt.co.uk/mohamed-salah/...,Mohamed Salah,29,Liverpool FC,£90.00m
2,https://www.transfermarkt.co.uk/bruno-fernande...,Bruno Fernandes,27,Manchester United,£81.00m
3,https://www.transfermarkt.co.uk/kevin-de-bruyn...,Kevin De Bruyne,30,Manchester City,£81.00m
4,https://www.transfermarkt.co.uk/marcus-rashfor...,Marcus Rashford,24,Manchester United,£76.50m
...,...,...,...,...,...
91,https://www.transfermarkt.co.uk/john-mcginn/pr...,John McGinn,27,Aston Villa,£27.00m
92,https://www.transfermarkt.co.uk/gabriel-martin...,Gabriel Martinelli,20,Arsenal FC,£25.20m
93,https://www.transfermarkt.co.uk/john-stones/pr...,John Stones,27,Manchester City,£25.20m
94,https://www.transfermarkt.co.uk/boubakary-soum...,Boubakary Soumaré,23,Leicester City,£25.20m


**Player Profile**

In [7]:
# Fetch the first player's profile page to develop the profile parser against.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
page = requests.get(
    players.iloc[0]['player_profile_link'], headers=headers, timeout=30
)
# Name the parser explicitly, as the extractor functions do; otherwise bs4
# picks whichever parser is installed and emits a guessed-parser warning.
soup = bs4.BeautifulSoup(page.text, 'html.parser')
page

<Response [200]>

In [8]:
def player_profile_extractor(response):
    """Extract the label/value pairs of a player's profile info table.

    ``response`` must expose the page HTML as ``.text`` and the final URL as
    ``.url``. Returns a dict whose first key, ``player_profile``, is that
    URL; every further key is a cleaned, snake-cased label and its value is
    the cleaned text of the matching info-table entry.

    Labels and values are paired positionally with ``zip``, so surplus
    entries on either side are dropped, and a label that repeats overwrites
    the earlier value. When the page has no matching spans the dict contains
    only ``player_profile``.
    """
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    profile = dict(player_profile=response.url)

    def clean_values(text, snake=False):
        """Collapse whitespace runs and drop colons; snake-case when asked.

        Leading and trailing spaces are collapsed to a single space, not
        stripped.
        """
        # Raw string: "\s" in a plain literal is an invalid escape sequence
        # that newer Python versions warn about.
        text = re.sub(r"\s+", " ", text)
        if snake:
            # NOTE(review): replace('_c', 'c') also joins words such as
            # 'current_club' -> 'currentclub'; kept unchanged so the existing
            # key names stay stable.
            text = stringcase.snakecase(text).replace('__','_').replace('_c','c')

        return text.replace(':','')

    labels = soup.select('.info-table.info-table--right-space.min-height-audio span.info-table__content--regular')
    values = soup.select('.info-table.info-table--right-space.min-height-audio span.info-table__content--bold')
    for label, item in zip(labels, values):
        profile[clean_values(label.text, snake=True)] = clean_values(item.text)

    return profile

# Try the extractor on the single profile page fetched in the previous cell.
player_profile_extractor(page)

{'player_profile': 'https://www.transfermarkt.co.uk/harry-kane/profil/spieler/132098',
 'name_in_homecountry': 'Harry Edward Kane',
 'date_of_birth': 'Jul 28, 1993 ',
 'place_of_birth': ' London ',
 'age': '28',
 'height': '1,88 m',
 'citizenship': ' England ',
 'position': ' attack - Centre-Forward ',
 'foot': 'right',
 'player_agent': ' CK66 ',
 'currentclub_': ' Tottenham Hotspur ',
 'joined': ' Jan 1, 2011 ',
 'contract_expires': 'Jun 30, 2024',
 'date_of_lastcontract_extension': 'Jun 8, 2018',
 'outfitter': 'Nike',
 'social_media': ' '}

In [9]:
def threaded_extractor(urls, extractor, headers=None, timeout=30):
    """Fetch ``urls`` concurrently and run ``extractor`` on each response.

    Parameters
    ----------
    urls : iterable of str
        Pages to request with ``GET``.
    extractor : callable
        Called with each response yielded by ``threaded.map``; its return
        values are collected.
    headers : dict, optional
        Headers sent with every request.
    timeout : float or tuple, optional
        Per-request timeout in seconds so a stalled connection cannot block
        the pool forever. Pass ``None`` to wait indefinitely.

    Returns
    -------
    tuple
        ``(extracted, exceptions)``: the list of extractor results, in the
        order responses are yielded (the notebook output shows this is not
        the order of ``urls``), and the iterable of request failures from
        ``threaded.map``. The failures are not consumed here; iterate over
        them to see which requests did not complete.
    """
    request_kwargs = [
        dict(url=url, method='GET', headers=headers, timeout=timeout)
        for url in urls
    ]

    # threaded.map rejects an empty request list with ValueError; having
    # nothing to fetch is treated as a valid no-op instead.
    if not request_kwargs:
        return [], iter(())

    responses_generator, exceptions_generator = threaded.map(request_kwargs)

    # NOTE(review): responses with HTTP error statuses are presumably still
    # yielded here and handed to the extractor -- confirm and filter if needed.
    extracted = [extractor(response) for response in responses_generator]

    return extracted, exceptions_generator
    

In [10]:
%%time
# Scrape every player's profile page concurrently. `errors` is the second
# value returned by threaded_extractor (the iterable of failed requests).
profiles, errors = threaded_extractor(
    players['player_profile_link'].tolist(),
    player_profile_extractor,
    headers,
)

CPU times: user 3.33 s, sys: 194 ms, total: 3.52 s
Wall time: 7.53 s


In [11]:
# One row per scraped profile dict; a field that a given profile dict lacks
# shows up as NaN in that row.
# NOTE(review): rows where everything except player_profile is NaN presumably
# come from responses in which no info-table spans were found -- check the
# status of those responses.
profiles = pd.DataFrame(profiles)
profiles

Unnamed: 0,player_profile,date_of_birth,place_of_birth,age,height,citizenship,position,foot,player_agent,currentclub_,joined,contract_expires,contract_option,date_of_lastcontract_extension,outfitter,social_media,name_in_homecountry,full_name,on_loan_from,contract_there_expires
0,https://www.transfermarkt.co.uk/marcus-rashfor...,"Oct 31, 1997",Manchester,24,"1,85 m",England St. Kitts & Nevis,attack - Left Winger,right,Relatives,Manchester United,"Jan 1, 2016","Jun 30, 2023",club option 1 year,"Jul 1, 2019",Nike,,,,,
1,https://www.transfermarkt.co.uk/mohamed-salah/...,,,,,,,,,,,,,,,,,,,
2,https://www.transfermarkt.co.uk/harry-kane/pro...,"Jul 28, 1993",London,28,"1,88 m",England,attack - Centre-Forward,right,CK66,Tottenham Hotspur,"Jan 1, 2011","Jun 30, 2024",,"Jun 8, 2018",Nike,,Harry Edward Kane,,,
3,https://www.transfermarkt.co.uk/jadon-sancho/p...,,,,,,,,,,,,,,,,,,,
4,https://www.transfermarkt.co.uk/raheem-sterlin...,"Dec 8, 1994",Kingston,27,"1,70 m",England Jamaica,attack - Left Winger,right,Relatives,Manchester City,"Jul 14, 2015","Jun 30, 2023",,"Nov 9, 2018",New Balance,,Raheem Shaquille Sterling,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,https://www.transfermarkt.co.uk/john-mcginn/pr...,,,,,,,,,,,,,,,,,,,
92,https://www.transfermarkt.co.uk/gabriel-martin...,,,,,,,,,,,,,,,,,,,
93,https://www.transfermarkt.co.uk/john-stones/pr...,"May 28, 1994",Barnsley,27,"1,88 m",England,Defender - Centre-Back,right,Wasserman,Manchester City,"Aug 9, 2016","Jun 30, 2026",,"Aug 10, 2021",Nike,,,,,
94,https://www.transfermarkt.co.uk/boubakary-soum...,,,,,,,,,,,,,,,,,,,
