# Football Data Webscraping

#### Importing libraries

In [49]:
import pandas as pd # DataFrame management
from selenium import webdriver # WebScraping driver
from selenium.webdriver.common.by import By # WebScraping property library
import time # Sleep library

### Initializing Firefox driver

In [50]:
# Create the Firefox WebDriver instance. The scraping functions in this file use this
# module-level `driver` object directly instead of receiving it as a parameter.
driver = webdriver.Firefox()

### Defining base scraping functions

In [21]:
def load_link_dicts(fbr_link: dict, tm_link: dict, size_link: dict)-> None:
    '''
    Loads dictionaries with FbRef and Transfermarkt leagues links

    Reads 'fbr_links.txt' (one "league,link" entry per line) and 'tm_links.txt'
    (one "league,link,size" entry per line) from the current working directory and
    fills the given dictionaries in place. Blank lines are skipped.

    Parameters
    ----------
    fbr_link : dict
        Dictionary containing league names as keys and FbRef links as values
    tm_link : dict
        Dictionary containing league names as keys and Transfermarkt links as values
    size_link : dict
        Dictionary containing league names as keys and number of teams it has as values.
        The size is stored exactly as read from the file, i.e. as a string
    '''
    with open('fbr_links.txt', 'r') as fbr_file:
        for line in fbr_file:
            line = line.strip()
            if not line:
                # A blank (often trailing) line would otherwise fail with IndexError
                continue
            fields = line.split(',')
            fbr_link[fields[0]] = fields[1]
    with open('tm_links.txt', 'r') as tm_file:
        for line in tm_file:
            line = line.strip()
            if not line:
                continue
            fields = line.split(',')
            tm_link[fields[0]] = fields[1]
            size_link[fields[0]] = fields[2]
            

In [22]:
def click_cookie_fbref() -> None:
    '''
    Finds FbRef cookie request if present and clicks its accept button. If not found just passes

    Works on the page currently loaded in the module-level driver. Any error raised
    while locating or clicking the button is ignored, so the call is safe on pages
    that show no cookie request.
    '''
    try:
        elem_cookie = driver.find_element(By.CLASS_NAME, 'css-47sehv')
        elem_cookie.click()
    except Exception:
        # Best effort on purpose. Catching Exception instead of using a bare except
        # keeps KeyboardInterrupt and SystemExit from being swallowed.
        pass

In [23]:
def click_cookie_tm() -> None:
    '''
    Finds Transfermarkt cookie request if present and clicks its accept button. If not found just passes

    The button is looked up inside the iframe whose title attribute is 'Iframe title';
    the driver waits up to 2 seconds for a visible button in that iframe. Whatever
    happens, the driver is switched back to the main document before returning.
    '''
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    try:
        iframe = driver.find_element(By.CSS_SELECTOR, "[title='Iframe title']")
        driver.switch_to.frame(iframe)
        element_button = WebDriverWait(driver, 2).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, 'button'))
        )
        element_button.click()
    except Exception:
        # Best effort on purpose. Catching Exception instead of using a bare except
        # keeps KeyboardInterrupt and SystemExit from being swallowed.
        pass
    finally:
        # Always leave the iframe so later lookups run against the main document
        driver.switch_to.default_content()

In [40]:
def go_to_tm(tm_url, wait_seconds=1):
    '''
    Directs the driver to the specified URL, waits a short time and, if present, clicks accept cookies button

    Parameters
    ----------
    tm_url : string
        URL of the website the driver wants to go to
    wait_seconds : int or float, optional
        Seconds to sleep after requesting the page and before looking for the
        cookie button. Defaults to 1
    '''
    driver.get(tm_url)
    time.sleep(wait_seconds)
    click_cookie_tm()

### Getting Transfermarkt links for each team

In [25]:
def get_teams_links(league_name):
    '''
    Saves the name and link of every team found on the page currently loaded in the driver

    Collects the elements whose class attribute is exactly "hauptlink no-border-links"
    and writes one "team name,link" line per element to '<league_name>/tm_teams_links.txt'
    (UTF-8). The link is the href of the first <a> inside the element. The league folder
    is created when it does not exist yet.

    Parameters
    ----------
    league_name : string
        Name of the league to get team TM links of
    '''
    import os  # local import: only needed to make sure the output folder exists

    if league_name:
        # Opening the file in 'w' mode fails when the folder is missing
        os.makedirs(league_name, exist_ok=True)
    tm_teams_links = driver.find_elements(By.CSS_SELECTOR, '[class="hauptlink no-border-links"]')
    with open(f"{league_name}/tm_teams_links.txt", 'w', encoding='utf-8') as tm_teams_links_files:
        for team in tm_teams_links:
            tm_teams_links_files.write(f"{team.text.strip()},{team.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')}\n")


In [26]:
def load_tm_teams_links_dict(league_name):
    '''
    Loads the team links stored in '<league_name>/tm_teams_links.txt'

    The file is read as UTF-8, the encoding it is written with. Each line is split on
    its last comma, so a team name that itself contains commas is kept whole (this
    assumes the link contains no comma). Blank lines and lines without a comma are skipped.

    Parameters
    ----------
    league_name : string
        Name of the league (also the folder that holds the file)

    Returns
    -------
    dict
        Team names as keys and their links as values
    '''
    tm_teams_links_dict = {}
    with open(f'{league_name}/tm_teams_links.txt', 'r', encoding='utf-8') as tm_team_file:
        for line in tm_team_file:
            line = line.strip()
            if ',' not in line:
                # Blank or malformed line: nothing usable to load
                continue
            team_name, _, team_link = line.rpartition(',')
            tm_teams_links_dict[team_name] = team_link
    return tm_teams_links_dict

In [57]:
def get_players_data(league_name, tm_teams_links_dict):
    '''
    Visits every team page and collects one record per player found in its "items" table

    For each team the driver opens the team link, waits 1 second and reads the rows of
    the element whose class attribute is exactly "items". The first row is dropped and
    then every third row is used. A team whose page cannot be read or parsed is reported
    with print and skipped; rows already collected for that team are kept.

    Parameters
    ----------
    league_name : string
        Name of the league; the result is saved to '<league_name>/tm_data.csv'
    tm_teams_links_dict : dict
        Team names as keys and team page links as values

    Returns
    -------
    pandas.DataFrame
        Columns 'Player', 'Role', 'Team', 'Birth' and 'Value'
    '''
    players_data = []
    for team, team_url in tm_teams_links_dict.items():
        try:
            driver.get(team_url)
            time.sleep(1)
            table_elements = driver.find_element(By.CSS_SELECTOR, '[class="items"]')
            table_rows = table_elements.find_elements(By.CSS_SELECTOR, 'tr')
            # Drop the first row, then keep every third one
            # NOTE(review): presumably a header row followed by three <tr> per player - confirm
            for row in table_rows[1:][::3]:
                line_data = row.text.split('\n')[1:]
                date = line_data[2].split()
                if len(date) > 4:
                    # Third text line has more than four tokens: the first three form the
                    # birth date and the fifth token is taken as the value
                    dob = " ".join(date[:3])
                    val = date[4]
                else:
                    # Otherwise the last token is discarded and the value is read from
                    # the next text line
                    date.pop(-1)
                    dob = " ".join(date)
                    val = line_data[3]
                players_data.append([line_data[0], line_data[1], team, dob, val])
        except Exception as error:
            # Best effort per team, but report the failure instead of hiding it;
            # a bare except would also swallow KeyboardInterrupt
            print(team, error)
            continue
    df = pd.DataFrame(players_data, columns =['Player', 'Role', 'Team', 'Birth', 'Value'])
    df.to_csv(f'{league_name}/tm_data.csv', index= False)
    return df


In [28]:
def go_to_fbr(url_fbr, wait_seconds=1):
    '''
    Directs the driver to the specified URL, waits a short time and, if present, clicks accept cookies button

    Parameters
    ----------
    url_fbr : string
        URL of the website the driver wants to go to
    wait_seconds : int or float, optional
        Seconds to sleep after requesting the page and before looking for the
        cookie button. Defaults to 1
    '''
    driver.get(url_fbr)
    time.sleep(wait_seconds)
    click_cookie_fbref()

### Get Team Name and URL

In [29]:
def get_fbr_teams_links(league_name, league_size):
    '''
    Saves the name and link of the teams listed on the page currently loaded in the driver

    Collects the elements with data-stat="team", drops the first one and keeps the next
    league_size elements. One "team name,link" line per element is written to
    '<league_name>/fbr_team_links.txt'. When no element matches, an empty file is written.

    Parameters
    ----------
    league_name : string
        Name of the league (also the folder the file is written to)
    league_size : int
        Number of teams to keep
    '''
    teams_links = driver.find_elements(By.CSS_SELECTOR, '[data-stat="team"]')
    # Slicing instead of pop(0) so an empty result does not raise IndexError
    # NOTE(review): the first match is presumably the table header cell - confirm
    teams_links = teams_links[1:][:league_size]
    with open(f'{league_name}/fbr_team_links.txt', 'w') as output:
        for team in teams_links:
            team_name = team.text
            # Built outside the f-string: reusing the same quote character inside an
            # f-string expression is a SyntaxError before Python 3.12
            team_url = team.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
            output.write(f'{team_name},{team_url}\n')

### Get player URLs from each team

In [30]:
def load_fbr_teams_links_dict(league_name):
    '''
    Loads the team links stored in '<league_name>/fbr_team_links.txt'

    Each line is split on its last comma, so a team name that itself contains commas is
    kept whole (this assumes the link contains no comma). Blank lines and lines without
    a comma are skipped. The file is opened with the platform default encoding, the
    same way it is written.

    Parameters
    ----------
    league_name : string
        Name of the league (also the folder that holds the file)

    Returns
    -------
    dict
        Team names as keys and their links as values
    '''
    teams_links = {}
    with open(f'{league_name}/fbr_team_links.txt', 'r') as teams:
        for line in teams:
            line = line.strip()
            if ',' not in line:
                # Blank or malformed line: nothing usable to load
                continue
            team_name, _, team_link = line.rpartition(',')
            teams_links[team_name] = team_link
    return teams_links

In [31]:
def get_fbr_players_links(league_name, df, teams_links):
    '''
    Saves the link of every player that appears both on a team page and in df

    For each team the driver opens the team link and reads the body rows of the first
    table whose class contains 'stats_table'. Rows whose header cell has an empty or
    missing 'csk' attribute are ignored. A player is kept only when the link text
    matches a value of the 'Player' column of df; kept players are written as
    "player name,link" lines to '<league_name>/fbr_player_links.txt' (UTF-8).

    Parameters
    ----------
    league_name : string
        Name of the league (also the folder the file is written to)
    df : pandas.DataFrame
        Data frame with a 'Player' column holding the accepted player names
    teams_links : dict
        Team names as keys and team page links as values

    Returns
    -------
    dict
        Team names as keys and the list of kept player names as values
    '''
    teams_players_dict = {}
    # Built once: set membership avoids scanning the whole column for every table row
    all_players = set(df['Player'].values)
    with open(f'{league_name}/fbr_player_links.txt', 'w', encoding='utf-8') as player_file:
        for team, team_url in teams_links.items():
            team_players = []
            driver.get(team_url)
            table = driver.find_element(By.XPATH, "//table[contains(@class, 'stats_table')]")
            tbody = table.find_element(By.CSS_SELECTOR, 'tbody')
            players = tbody.find_elements(By.CSS_SELECTOR, 'tr')
            for player in players:
                csk_val = player.find_element(By.XPATH, './th').get_attribute('csk')
                if not csk_val:
                    continue
                elem = player.find_element(By.XPATH, './th/a')
                # Read the text once; every access is a separate request to the browser
                player_name = elem.text
                if player_name in all_players:
                    player_file.write(f"{player_name},{elem.get_attribute('href')}\n")
                    team_players.append(player_name)
            teams_players_dict[team] = team_players
    return teams_players_dict

In [32]:
def load_fbr_players_links(league_name):
    '''
    Loads the player links stored in '<league_name>/fbr_player_links.txt'

    Every line is split on commas; the first field is used as the player name and the
    second one as the link. When a name appears more than once the last link wins.

    Parameters
    ----------
    league_name : string
        Name of the league (also the folder that holds the file)

    Returns
    -------
    dict
        Player names as keys and their links as values
    '''
    links_path = f'{league_name}/fbr_player_links.txt'
    with open(links_path, 'r', encoding='utf-8') as links_file:
        rows = (raw_line.strip().split(',') for raw_line in links_file)
        return {fields[0]: fields[1] for fields in rows}

In [33]:
def get_fbr_player_info(league_name, player_links_dict):
    '''
    Visits every player page and collects profile data plus the values of the first stats table

    For each player the driver opens the link, tries to click the element with
    id "meta_more_button" and parses the <p> elements inside the element with id "meta",
    starting at the paragraph that begins with "Position" and ending at the one that
    begins with "Wages" (when present). Afterwards, for every row but the first of the
    first table whose class contains 'stats_table', the row header text (spaces removed)
    becomes a column holding the 'csk' attribute of the row's first <td>.

    A failure while parsing the profile is printed together with the player name; the
    record then only holds the player name and whatever stats could be read.

    Parameters
    ----------
    league_name : string
        Name of the league; the result is saved to '<league_name>/fbr_data.csv'
    player_links_dict : dict
        Player names as keys and player page links as values

    Returns
    -------
    pandas.DataFrame
        One row per player
    '''
    players_data_list = []
    for key, player_url in player_links_dict.items():
        # Seed the record with the name so the row stays identifiable when parsing fails
        player_d = {'Player': key}
        go_to_fbr(player_url)
        try:
            driver.find_element(By.CSS_SELECTOR, '[id="meta_more_button"]').click()
        except Exception:
            # The button is optional; carry on with whatever is visible
            pass
        try:
            player_data = driver.find_elements(By.CSS_SELECTOR, '[id="meta"] p')
            init_index = next((i for i, x in enumerate(player_data) if x.text.startswith("Position")), None)
            end_index = next((i for i, x in enumerate(player_data) if x.text.startswith("Wages")), None)
            # NOTE(review): when no paragraph starts with "Position", init_index is None and
            # the slice silently starts at the first paragraph - confirm this is intended
            player_data = player_data[init_index:end_index+1] if end_index is not None else player_data[init_index:]
            first_line = player_data[0].text.strip().split('▪')
            pos_block = first_line[0].split()
            second_pos = pos_block[2] if len(pos_block)>2 else None
            foot = first_line[1].split()[1].strip() if len(first_line)>1 else None
            hw = player_data[1].text.split(' ')
            if len(player_data)>5:
                # Sixth paragraph of the slice: tokens 2 and 6 are taken as wage and expiration
                wage_line = player_data[5].text.split(" ")
                wage = wage_line[2]
                exp = wage_line[6].strip('.')
            else:
                wage, exp = None, None

            player_d.update({'Player': key, 'Role': pos_block[1], 'Position': second_pos, 'Foot': foot, 'Height': hw[0].strip(','), 'Weight': hw[1],
                    'Birth': " ".join(player_data[2].text.split(' ')[1:4]), "Nationality": player_data[3].text.split(": ")[1].split()[0],
                    'Club': player_data[4].text.split(": ")[1], 'Wage': wage, 'Expiration': exp})

        except Exception as e:
            print(key, e)
        try:
            player_stats = driver.find_element(By.XPATH, "//table[contains(@class, 'stats_table')]").find_elements(By.CSS_SELECTOR, 'tr')[1:]
            for elem in player_stats:
                category = elem.find_element(By.CSS_SELECTOR, "th").text.replace(" ", "")
                if category:
                    player_d[category] = elem.find_element(By.CSS_SELECTOR, "td").get_attribute('csk')

        except Exception:
            # Stats are optional; keep the values collected so far
            pass
        players_data_list.append(player_d)
    fbr_df = pd.DataFrame(players_data_list)
    fbr_df.to_csv(f'{league_name}/fbr_data.csv', index=False)
    return fbr_df
        

In [34]:
# League lookup tables, filled in place by load_link_dicts
fbr_link, tm_link, size_link = {}, {}, {}

In [58]:
# Fill the three league dictionaries from the link files
load_link_dicts(fbr_link, tm_link, size_link)
for index, link in enumerate(fbr_link.keys()):
    # Only the league at position 3 (the fourth key of fbr_link) is processed
    if index==3:
        curr_league = link
        # Size is the string read from the links file; it is converted with int() below
        curr_size = size_link[curr_league]
        url_tm = tm_link[curr_league]
        url_fbr = fbr_link[curr_league]
        # Transfermarkt pass: save team links, reload them, then collect player data
        go_to_tm(url_tm)
        get_teams_links(curr_league)
        tm_teams_links_dict = load_tm_teams_links_dict(curr_league)    
        tm_df = get_players_data(curr_league, tm_teams_links_dict)
        
        # FbRef pass: save team links, keep the players that also appear in tm_df,
        # then collect each player's profile and stats
        go_to_fbr(url_fbr)
        get_fbr_teams_links(curr_league, int(curr_size))
        teams_links = load_fbr_teams_links_dict(curr_league)
        fbr_teams_players_dict = get_fbr_players_links(curr_league, tm_df, teams_links)
        players_links_dict = load_fbr_players_links(curr_league)
        df = get_fbr_player_info(curr_league, players_links_dict)
        # NOTE(review): the result of head() is not assigned or printed here, so nothing
        # is shown from inside the loop - confirm whether a print/display was intended
        df.head()
        

2
3
