This notebook builds a function to scrape transfer information from transfermarkt
* Website lists transfers in and out for each club in a league in a season
  * Need to scrape each individual table in and add a column for the purchasing club
* Create a loop to gather data from other leagues in other years (the url is formulaic) 

In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException
from selectorlib import Extractor
import bs4
import requests
import json
import time
import urllib.request
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import csv
import pandas as pd
import numpy as np
from datetime import date

In [2]:
ChromeOptions = webdriver.ChromeOptions()
ChromeOptions.add_argument('ignore-certificate-errors')
driver = webdriver.Chrome(ChromeDriverManager().install(), options=ChromeOptions)

[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280
[WDM] - Driver [/Users/ericroberts/.wdm/drivers/chromedriver/mac64/87.0.4280.88/chromedriver] found in cache


 


In [64]:
url = "https://www.transfermarkt.co.uk/premier-league/sommertransfers/wettbewerb/GB1/saison_id/2020"

driver.get(url)
html = driver.execute_script('return document.body.innerHTML;')
soup = bs4.BeautifulSoup(html,'lxml')

In [223]:
response_table = soup.find_all(class_="responsive-table")
for x in range(len(response_table)):
    if x % 2 == 0:
        ##Need to do this for all the even response tables 
        player_list = []
        club_left_list = []
        age_list = []
        position_list = []
        est_market_value = []
        fee_list = []
        nationality = []
        previous_league = []

        player = response_table[x].find_all(class_="spielprofil_tooltip tooltipstered")
        #The player tag gives two values - the full name, and first initial and 
        #last name.  Use modulo operator to grab full name (even entries).
        club_left = response_table[x].find_all(class_="vereinprofil_tooltip tooltipstered")
        #club_left returns a blank cell every other entry. First entry is blank
        #Use modulo operator to grab odds entries
        for i in range(len(player)):
            if i % 2 == 0:
                player_list.append(player[i].text)
        
        #Can't only use class_="vereinprofil_tooltip tooltipstered" because there are "without club entries"
        #that use a different tag.  The image tags for selling club appear to have same tag.  Getting the text
        #from the nested alt tag.
        club_left = response_table[x].find_all(class_="no-border-rechts zentriert")
        for i in range(len(club_left)):
            club_img = club_left[i].find_all("img")
            club_left_list.append(club_img[0]['alt'])
            #club_img comes as a single entry list so index with 0 before grabbing the 'alt' text

        age = response_table[x].find_all(class_="zentriert alter-transfer-cell")
        position = response_table[x].find_all(class_="pos-transfer-cell")
        #age and position have one entry per person and have a header
        # as the first entry - skip by indexing greater than 0
        for i in range(len(age)):
            if i > 0:
                age_list.append(age[i].text)
                position_list.append(position[i].text)

        fee = response_table[x].find_all("td", class_="rechts")
        #the fee tag grabs both estimate market value and actual fee in alternating
        #fashion
        for i in range(len(fee)):
            if i % 2 == 0:
                est_market_value.append(fee[i].text)
            if i % 2 == 1:
                fee_list.append(fee[i].text)


        nat_flag = response_table[x].find_all(class_="zentriert nat-transfer-cell")
        #Going to only grab the first nationality of a player
        for i in range(len(nat_flag)):
            if i > 0:
            #this has to start at 1 to get past the header    
                nation = nat_flag[i].find(class_="flaggenrahmen")
                nationality.append(nation['title'])

        league_flag = response_table[x].find_all(class_="no-border-links verein-flagge-transfer-cell")
        #The flaggenrahmen tag is on all flags and the indexing is messed up when dual nationals 
        #appear.  So using the outter tag on league to distinguish
        #League flag, unlike national flag doesn't have a header.  When a player is picked up without
        #a league they don't have a flag to grab from using try/except
        for i in range(len(league_flag)):
            try:
                league = league_flag[i].find(class_="flaggenrahmen")
                previous_league.append(league['title'])
            except TypeError:
                previous_league.append('Without Club')            
            
        giant_list = [player_list,age_list,nationality,position_list,
                      club_left_list,previous_league,est_market_value,fee_list]
        df = pd.DataFrame(giant_list).transpose()
        df.columns = ['player','age','nationality','position','selling_club','previous_league','est_market_value','fee']

        #Use this to get the buying club: The table-header tag is used twice on each page before
        #getting to the tables of in's and out's.  The in's and out's have same header therefore,
        # start at 2 and increment by 1 for each iteration through response table
        table_header = soup.find_all(class_='table-header')
        club = table_header[2].find_all(class_="vereinprofil_tooltip tooltipstered")
        df['buying_club'] = club[1].text
        buying_club = club[1].text
df

Unnamed: 0,player,age,nationality,position,selling_club,previous_league,est_market_value,fee,buying_club
0,Fábio Silva,18,Portugal,Centre-Forward,FC Porto,Portugal,£10.80m,£36.00m,Arsenal FC
1,Nélson Semedo,26,Portugal,Right-Back,FC Barcelona,Spain,£36.00m,£27.00m,Arsenal FC
2,Ki-Jana Hoever,18,Netherlands,Right-Back,Liverpool FC U23,England,£3.24m,£8.82m,Arsenal FC
3,Marçal,31,Brazil,Left-Back,Olympique Lyon,France,£2.16m,£1.80m,Arsenal FC
4,Toti Gomes,21,Portugal,Centre-Back,GD Estoril Praia,Portugal,-,£900Th.,Arsenal FC
5,Matija Sarkic,23,Montenegro,Goalkeeper,Aston Villa,England,£90Th.,Free transfer,Arsenal FC
6,Vitinha,20,Portugal,Central Midfield,FC Porto,Portugal,£4.50m,Loan,Arsenal FC
7,Rayan Aït Nouri,19,France,Left-Back,SCO Angers,France,£18.00m,Loan,Arsenal FC
8,Morgan Gibbs-White,21,England,Central Midfield,Swansea City,England,£5.40m,"End of loanMay 31, 2021",Arsenal FC
9,Matija Sarkic,23,Montenegro,Goalkeeper,Shrewsbury Town,England,£90Th.,"End of loanMay 31, 2021",Arsenal FC


In [226]:
def transfer_value_gather(table):
    '''function that takes in as an argument a bs4 result set that is the table of transfer
    information (for season for league).  Each webpage has 40 tables: one table for transfers
    into the club (even index) and one for transfers out (odd index) for each of the 20 clubs.
    ''' 
    player_list = []
    club_left_list = []
    age_list = []
    position_list = []
    est_market_value = []
    fee_list = []
    nationality = []
    previous_league = []

    player = table.find_all(class_="spielprofil_tooltip tooltipstered")
    #The player tag gives two values - the full name, and first initial and 
    #last name.  Use modulo operator to grab full name (even entries).
    club_left = table.find_all(class_="vereinprofil_tooltip tooltipstered")
    #club_left returns a blank cell every other entry. First entry is blank
    #Use modulo operator to grab odds entries
    for i in range(len(player)):
        if i % 2 == 0:
            player_list.append(player[i].text)
    
    #Can't only use class_="vereinprofil_tooltip tooltipstered" because there are "without club entries"
    #that use a different tag.  The image tags for selling club appear to have same tag.  Getting the text
    #from the nested alt tag.
    club_left = table.find_all(class_="no-border-rechts zentriert")
    for i in range(len(club_left)):
        club_img = club_left[i].find_all("img")
        club_left_list.append(club_img[0]['alt'])
        #club_img comes as a single entry list so index with 0 before grabbing the 'alt' text

    age = table.find_all(class_="zentriert alter-transfer-cell")
    position = table.find_all(class_="pos-transfer-cell")
    #age and position have one entry per person and have a header
    # as the first entry - skip by indexing greater than 0
    for i in range(len(age)):
        if i > 0:
            age_list.append(age[i].text)
            position_list.append(position[i].text)

    fee = table.find_all("td", class_="rechts")
    #the fee tag grabs both estimate market value and actual fee in alternating
    #fashion
    for i in range(len(fee)):
        if i % 2 == 0:
            est_market_value.append(fee[i].text)
        if i % 2 == 1:
            fee_list.append(fee[i].text)


    nat_flag = table.find_all(class_="zentriert nat-transfer-cell")
    #Going to only grab the first nationality of a player
    for i in range(len(nat_flag)):
        if i > 0:
        #this has to start at 1 to get past the header    
            nation = nat_flag[i].find(class_="flaggenrahmen")
            nationality.append(nation['title'])

    league_flag = table.find_all(class_="no-border-links verein-flagge-transfer-cell")
    #The flaggenrahmen tag is on all flags and the indexing is messed up when dual nationals 
    #appear.  So using the outter tag on league to distinguish
    #League flag, unlike national flag doesn't have a header.  When a player is picked up without
    #a league they don't have a flag to grab from using try/except
    for i in range(len(league_flag)):
        try:
            league = league_flag[i].find(class_="flaggenrahmen")
            previous_league.append(league['title'])
        except TypeError:
            previous_league.append('Without Club')            
        
    giant_list = [player_list,age_list,nationality,position_list,
                  club_left_list,previous_league,est_market_value,fee_list]
    df = pd.DataFrame(giant_list).transpose()
    df.columns = ['player','age','nationality','position','selling_club','previous_league','est_market_value','fee']

    #Use this to get the buying club: The table-header tag is used twice on each page before
    #getting to the tables of in's and out's.  The in's and out's have same header therefore,
    # start at 2 and increment by 1 for each iteration through response table
#     table_header = soup.find_all(class_='table-header')
#     club = table_header[2].find_all(class_="vereinprofil_tooltip tooltipstered")
#     df['buying_club'] = club[1].text
#     buying_club = club[1].text
    return df

In [229]:
transfer_value_gather(response_table[4])

Unnamed: 0,player,age,nationality,position,selling_club,previous_league,est_market_value,fee
0,Jakub Moder,21,Poland,Central Midfield,Lech Poznan,Poland,£2.70m,£9.90m
1,Michal Karbownik,19,Poland,Left-Back,Legia Warszawa,Poland,£5.40m,£4.95m
2,Andi Zeqiri,21,Switzerland,Centre-Forward,FC Lausanne-Sport,Switzerland,£1.35m,£3.60m
3,Jan Paul van Hecke,20,Netherlands,Centre-Back,NAC Breda,Netherlands,£1.62m,£1.80m
4,Joël Veltman,28,Netherlands,Centre-Back,Ajax Amsterdam,Netherlands,£7.20m,£900Th.
5,Adam Lallana,32,England,Attacking Midfield,Liverpool FC,England,£8.55m,Free transfer
6,Matt Clarke,24,England,Centre-Back,Derby County,England,£2.70m,"End of loanMay 31, 2021"
7,Leo Östigard,21,Norway,Centre-Back,Coventry City,England,£2.25m,"End of loanMay 31, 2021"
8,Glenn Murray,37,England,Centre-Forward,Watford FC,England,£900Th.,"End of loanMay 31, 2021"
9,Shane Duffy,29,Ireland,Centre-Back,Celtic FC,Scotland,£8.55m,"End of loanMay 31, 2021"


In [278]:
#This creates a list of the clubs in the league in that season in alphabetical order; mirrors the order that 
#response_table is ordered.  Use this along with above function when looking through response_tables to add
#the purchasing club as a column to the df.
list_of_clubs = []
table_header = soup.find_all(class_='table-header')
i=2
while i < 22:
    club = table_header[i].find_all(class_="vereinprofil_tooltip tooltipstered")
    list_of_clubs.append(club[1].text)
    i += 1
list_of_clubs

['Arsenal FC',
 'Aston Villa',
 'Brighton & Hove Albion',
 'Burnley FC',
 'Chelsea FC',
 'Crystal Palace',
 'Everton FC',
 'Fulham FC',
 'Leeds United',
 'Leicester City',
 'Liverpool FC',
 'Manchester City',
 'Manchester United',
 'Newcastle United',
 'Sheffield United',
 'Southampton FC',
 'Tottenham Hotspur',
 'West Bromwich Albion',
 'West Ham United',
 'Wolverhampton Wanderers']

In [300]:
#This removes the sold players table and makes this list equal in length to the names of the clubs.
list_of_table_of_purchases = []
response_table = soup.find_all(class_="responsive-table")
for i in range(len(response_table)):
    if i % 2 == 0:
        list_of_table_of_purchases.append(response_table[i])