In [174]:
import pandas as pd
import polars as pl
import numpy as np
from bs4 import BeautifulSoup
import concurrent.futures
from datetime import datetime
import requests
import re
import time

In [193]:
# Base URL of a league page; a league code from LEAGUE_NUMBERS is appended to it.
LEAGUE_PREFIX = 'https://sofifa.com/league/'

# Site root; the relative team/player hrefs scraped from pages are appended to it.
TEAM_PREFIX = 'https://sofifa.com'

# League display name -> code appended to LEAGUE_PREFIX.
LEAGUE_NUMBERS = {
    'Premier League': '13',
    'Championship': '14',
    'La Liga': '53',
    'La Liga 2': '54',
    'Bundesliga': '19',
    '2. Bundesliga': '20',
    'Serie A': '31',
    'Serie B': '32',
    'Ligue 1': '16',
    'Ligue 2': '17',
    'Eredivisie': '10',
    'Pro League': '4',
    'Süper Lig': '68',
    'Primeira Liga': '308',
}

# Attribute labels looked up on a player page, in output column order.
PLAYER_QUALITIES = [
    'Crossing', 'Finishing', 'Heading accuracy', 'Short passing', 'Volleys', 'Dribbling',
    'Curve', 'FK Accuracy', 'Long passing', 'Ball control', 'Acceleration', 'Sprint speed',
    'Agility', 'Reactions', 'Balance', 'Shot power', 'Jumping', 'Stamina', 'Strength',
    'Long shots', 'Aggression', 'Interceptions', 'Att. Position', 'Vision', 'Penalties', 'Composure',
    'Defensive awareness', 'Standing tackle', 'Sliding tackle', 'GK Diving', 'GK Handling', 'GK Kicking',
    'GK Positioning', 'GK Reflexes',
]

# Column names for one player row. The order mirrors the row assembled by
# get_player_information: id, identity (name, nationality, age, height,
# weight), preferred foot and position, value and wage, then every quality.
# NOTE(review): 'Weigh' looks like a typo for 'Weight'; it is kept as-is
# because code outside this view may already select the column by that name.
PLAYER_DETAIL_COLUMNS = [
    'PlayerId', 'PlayerName', 'Nationality', 'Age', 'Height', 'Weigh',
    'PreferredFoot', 'Position', 'Value', 'Wage',
] + PLAYER_QUALITIES

In [250]:
def numpy_concatenate(a):
    """
    Joins a sequence of array-likes along their first axis and returns the pieces as a list.

    Parameters:
    - a: Sequence of array-likes accepted by np.concatenate.

    Returns:
    - List of the items obtained by iterating the concatenated array
      (row arrays for 2-D input, scalars for 1-D input).
    """
    merged = np.concatenate(a)
    return [*merged]

def scrap_page(page_link, timeout=30):
    """
    Fetches the content of a webpage and returns its BeautifulSoup parser object.

    Parameters:
    - page_link (str): The URL of the page to scrape.
    - timeout (float or tuple): Seconds to wait for the server, passed to
      requests.get. Without it a stalled connection would block the calling
      thread indefinitely.

    Returns:
    - BeautifulSoup object if successful, None otherwise (the request error
      is printed, not raised).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    }
    try:
        response = requests.get(page_link, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes
        return BeautifulSoup(response.content, 'html.parser')
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too
        print(f"Request error: {e}")
        return None

def extract_teams_from_league(url_prefix, league_name, league_code):
    """
    Scrapes a league page and lists the teams linked from it.

    Parameters:
    - url_prefix (str): The prefix URL to append the league code to.
    - league_name (str): The name of the league, copied into every result tuple.
    - league_code (str): The league code appended to the URL prefix.

    Returns:
    - List of (league name, team name, team href) tuples, one per link whose
      href contains "/team/" and whose text is not blank. When the page could
      not be fetched, a single placeholder [(None, None, None)] is returned.
    """
    page = scrap_page(f"{url_prefix}{league_code}")

    if not page:
        print("Failed to scrape the page or parse team data.")
        return [(None, None, None)]

    teams = []
    for anchor in page.find_all('a', href=True):
        # Skip non-team links and links without visible text
        if "/team/" in anchor['href'] and anchor.text.strip():
            teams.append((league_name, anchor.text.strip(), anchor['href']))
    return teams

def extract_players_from_team(url_prefixe, team_name, team_id, team_suffixe):
    """
    Scrapes a team page and lists the player links found in its squad section.

    Parameters:
    - url_prefixe (str): The prefix URL to append the team href to.
    - team_name (str): The name of the team, copied into every result tuple.
    - team_id: The identifier of the team, copied into every result tuple.
    - team_suffixe (str): The team href appended to the URL prefix.

    Returns:
    - List of unique (team name, team id, player href) tuples in first-seen
      order. When the page could not be fetched, a single placeholder
      [(team_name, team_id, None)] is returned.
    """
    page = scrap_page(f"{url_prefixe}{team_suffixe}")

    if not page:
        return [(team_name, team_id, None)]

    # The squad lies between the 'Squad' and 'On loan' headings. Either
    # heading may be absent, so only cut on a heading that was actually found:
    # str(None) would otherwise cut the page on the literal text "None".
    squad_content = str(page)

    start_marker = page.find('h5', string='Squad')
    if start_marker is not None:
        squad_content = squad_content.split(str(start_marker), 1)[-1]

    end_marker = page.find('h5', string='On loan')
    if end_marker is not None:
        squad_content = squad_content.split(str(end_marker), 1)[0]

    squad_soup = BeautifulSoup(squad_content, 'html.parser')

    # Keep links that point to a player page and have visible text
    player_rows = [
        (team_name, team_id, anchor['href'])
        for anchor in squad_soup.find_all('a', href=True)
        if "/player/" in anchor['href'] and anchor.text.strip()
    ]

    # dict.fromkeys drops duplicates while preserving the original order
    return list(dict.fromkeys(player_rows))

def get_player_identity(page):
    """
    Reads the identity fields from the "profile clearfix" div of a player page.

    Parameters:
    - page: Parsed player page exposing the BeautifulSoup find() interface.

    Returns:
    - List [name, nationality, age, height, weight]. Name is the text of the
      <h1>, nationality the title attribute of the first titled link, and the
      last three are digit strings taken from the first <p> ("..y.o.", "..cm",
      "..kg"), or None when the pattern is not present.
    """
    profile = page.find('div', class_="profile clearfix")

    name = profile.find('h1').text
    nationality = profile.find('a', title=True)['title']
    info_text = profile.find('p').text

    def first_group(pattern):
        # Captured digits of the first match in the info line, or None
        found = re.search(pattern, info_text)
        return found.group(1) if found else None

    return [
        name,
        nationality,
        first_group(r'(\d+)y\.o\.'),
        first_group(r'(\d+)cm'),
        first_group(r'(\d+)kg'),
    ]

def get_player_postion_preferred_foot(page):
    """
    Reads the preferred foot and the position from a player page.

    Parameters:
    - page: Parsed player page exposing the BeautifulSoup find() interface.

    Returns:
    - List [preferred foot, position]. Each entry is "Not Found" when the
      corresponding <label> is missing from the page.
    """
    # The foot is the stripped node that directly follows its label
    foot_label = page.find('label', string="Preferred foot")
    if foot_label:
        foot = foot_label.next_sibling.strip()
    else:
        foot = "Not Found"

    # The position is the text of the <span> sibling following its label
    spot_label = page.find('label', string="Position")
    if spot_label:
        spot = spot_label.find_next_sibling('span').text
    else:
        spot = "Not Found"

    return [foot, spot]

def get_player_wage_value(page):
    """
    Reads the market value and the wage from a player page.

    Parameters:
    - page: Parsed player page exposing the BeautifulSoup find() interface.

    Returns:
    - List [value, wage]: for each label, the string of the <em> that is the
      previous sibling of the <div> whose text matches the label.
    """
    amounts = []
    for label in ('Value', 'Wage'):
        label_div = page.find('div', string=re.compile(label))
        amounts.append(label_div.find_previous_sibling('em').string)
    return amounts

def get_player_qualities(page, player_qualities):
    """
    Reads the rating of each requested quality from a player page.

    Parameters:
    - page: Parsed player page exposing the BeautifulSoup find() interface.
    - player_qualities (list of str): Quality labels to look up, e.g. 'Vision'.

    Returns:
    - List of rating texts in the same order as player_qualities: for each
      label, the text of the <em> preceding the <span> that contains it.
    """
    # re.escape keeps labels literal: the '.' in 'Att. Position' must not act
    # as a wildcard that could match a different span.
    return [
        page.find('span', string=re.compile(re.escape(quality))).find_previous('em').text
        for quality in player_qualities
    ]

def get_player_information(url_prefix, player_suffix, player_id, player_qualities):
    """
    Scrapes one player page and assembles the player's row.

    Parameters:
    - url_prefix (str): The prefix URL to append the player href to.
    - player_suffix (str): The player href appended to the URL prefix.
    - player_id: The identifier of the player, placed first in the row.
    - player_qualities (list of str): Quality labels to read from the page.

    Returns:
    - List [player_id, name, nationality, age, height, weight, preferred foot,
      position, value, wage, *qualities]. When the page could not be fetched,
      every field after player_id is the string 'Null', so the row keeps the
      same length as a successful one.
    """
    link = f"{url_prefix}{player_suffix}"
    page = scrap_page(link)

    if not page:
        # 5 identity fields + foot/position + value/wage + one per quality
        return [player_id] + ['Null'] * (5 + 2 + 2 + len(player_qualities))

    player_identity = get_player_identity(page)
    player_position_foot = get_player_postion_preferred_foot(page)
    player_value_wage = get_player_wage_value(page)
    # The page must be passed along with the labels to look up
    players_quality_values = get_player_qualities(page, player_qualities)

    return [player_id] + player_identity + player_position_foot + player_value_wage + players_quality_values

def get_player_link_by_batch(temp_team_df):
    """
    Collects the player links of every team in a batch, retrying failed teams.

    Each pass requests all remaining teams on a thread pool. Teams whose page
    could not be fetched (null 'Link' in the result) are requested again on
    the next pass, after a 30 second pause; the loop ends once no team is left.

    Parameters:
    - temp_team_df (pl.DataFrame): Teams to process; the columns 'Team',
      'TeamId' and 'Link' are read.

    Returns:
    - pl.DataFrame with columns 'Team', 'TeamId', 'Link', one row per player link.
    """
    string_schema = {'Team': pl.String, 'TeamId': pl.String, 'Link': pl.String}
    player_df = pl.DataFrame(schema=['Team', 'TeamId', 'Link'])

    while temp_team_df.shape[0] > 0:
        # One request per team row (polars iter_rows), run on a thread pool
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(extract_players_from_team, TEAM_PREFIX, row['Team'], row['TeamId'], row['Link'])
                for row in temp_team_df.iter_rows(named=True)
            ]
            list_players = [done.result() for done in concurrent.futures.as_completed(futures)]

        batch_df = pl.DataFrame(numpy_concatenate(list_players), orient='row', schema=string_schema)

        # A null link marks a team whose page request failed
        failed_requested_teams = (
            batch_df
            .filter(pl.col('Link').is_null())
            .select(pl.col('TeamId'))
            .with_columns(pl.col("TeamId").cast(pl.Int32))
        )

        # Only the failed teams are kept for the next pass
        temp_team_df = temp_team_df.filter(pl.col('TeamId').is_in(failed_requested_teams['TeamId']))

        batch_df = batch_df.filter(pl.col("Link").is_not_null()).with_columns(
            pl.col("Team").cast(pl.String),
            pl.col("TeamId").cast(pl.String),
            pl.col("Link").cast(pl.String),
        )

        player_df = pl.concat([player_df, batch_df], how="vertical_relaxed")

        print(f"Number of players extracted : {player_df.shape[0]}")
        print(f"Number of failed extraction : {failed_requested_teams.shape[0]}")

        if failed_requested_teams.shape[0] > 0:
            print("We stop requesting the website for 30 seconds in order to stop overloading their server")
            time.sleep(30)  # Pauses the program for 30 seconds

    return player_df

def get_player_details_by_batch(team_prefix, temp_player_df, player_columns, player_qualities):
    """
    Scrapes the details of every player in a batch, retrying failed players.

    Each pass requests all remaining players on a thread pool. Players whose
    page could not be fetched come back with the string 'Null' as PlayerName;
    they are requested again on the next pass, after a 30 second pause. The
    loop ends once no player is left.

    Parameters:
    - team_prefix (str): The prefix URL to append each player href to.
    - temp_player_df (pl.DataFrame): Players to process; the columns 'Link'
      and 'PlayerId' are read.
    - player_columns (list of str): Column names of the resulting rows; must
      include 'PlayerId' and 'PlayerName'.
    - player_qualities (list of str): Quality labels to read from each page.

    Returns:
    - pl.DataFrame with the columns of player_columns, one row per player
      successfully scraped.
    """
    player_details_df = pl.DataFrame(schema=player_columns)
    while temp_player_df.shape[0] > 0:
        # One request per player row (polars iter_rows), run on a thread pool
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(get_player_information, team_prefix, row['Link'], row['PlayerId'], player_qualities) for row in temp_player_df.iter_rows(named=True)]
            list_players = [future.result() for future in concurrent.futures.as_completed(futures)]

        temp_players_detail_df = pl.DataFrame(list_players, orient='row', schema=player_columns)

        failed_requested_players = temp_players_detail_df \
            .filter(pl.col('PlayerName') == 'Null') \
            .select(pl.col('PlayerId')) \
            .with_columns(pl.col('PlayerId').cast(pl.Int32))

        # Only the failed players are kept for the next pass
        temp_player_df = temp_player_df.filter(pl.col('PlayerId').is_in(failed_requested_players['PlayerId']))

        # Failed rows carry the string 'Null', not a real null, so a plain
        # is_null() test would let the placeholder rows into the result.
        temp_players_detail_df = temp_players_detail_df.filter(
            pl.col("PlayerName").is_not_null() & (pl.col("PlayerName") != 'Null')
        )

        player_details_df = pl.concat([player_details_df, temp_players_detail_df], how="vertical_relaxed")

        print(f"Number of players details extracted : {temp_players_detail_df.shape[0]}")
        print(f"Number of failed extraction : {failed_requested_players.shape[0]}")

        if failed_requested_players.shape[0] > 0:
            print("We stop requesting the website for 30 seconds in order to stop overloading their server")
            time.sleep(30)  # Pauses the program for 30 seconds

    return player_details_df

### Create Team Dataframe

In [177]:
# Scrape every league page on a thread pool; results are gathered in completion order
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(extract_teams_from_league, LEAGUE_PREFIX, league_name, league_code)
        for league_name, league_code in LEAGUE_NUMBERS.items()
    ]
    list_teams = [done.result() for done in concurrent.futures.as_completed(futures)]

In [178]:
# Flatten the per-league lists into one list of rows and build the team table
list_teams_extended = numpy_concatenate(list_teams)
team_df = pl.DataFrame(list_teams_extended, orient='row', schema=['League', 'Team', 'Link'])

# A league whose page could not be scraped contributes a (None, None, None)
# placeholder row. Drop it so it never gets a TeamId and is never requested
# as if it were a team (its null link could not succeed on any retry).
team_df = team_df.filter(pl.col('Link').is_not_null())

# Sequential ids starting at 1, assigned in the current row order
team_df = team_df.with_columns(
    pl.Series("TeamId", list(range(1, len(team_df) + 1)))
)

### Create Player Dataframe

In [None]:
factor = 50  # number of teams requested per batch
final_end = team_df.shape[0] # 272

player_df = pl.DataFrame(schema=['Team', 'TeamId', 'Link'])
# Walk the team table in windows of `factor` rows; slicing past the last row
# is safe, so the final window may simply be shorter.
for start in range(0, final_end, factor):
    end = start + factor
    print(f"Start : {start}, end : {end}.")
    part_player_df = get_player_link_by_batch(team_df[start:end])
    player_df = pl.concat([player_df, part_player_df], how="vertical_relaxed")
    print(f"Total number of players: {player_df.shape[0]}\n")

# Sequential player ids starting at 1; TeamId is cast to an integer column
player_df = player_df \
    .with_columns(pl.Series('PlayerId', list(range(1, len(player_df) + 1)))) \
    .with_columns(pl.col("TeamId").cast(pl.Int32))

In [182]:
# Sanity check: number of distinct teams that have at least one player row
print(player_df["Team"].n_unique()) # should be 272

272


### Extract Player Information

In [None]:
factor = 50  # number of players requested per batch
final_end = player_df.shape[0]

player_detail_df = pl.DataFrame(schema=PLAYER_DETAIL_COLUMNS)
# Walk the player table in windows of `factor` rows; slicing past the last row
# is safe, so the final window may simply be shorter.
for start in range(0, final_end, factor):
    end = start + factor
    print(f"Start : {start}, end : {end}.")
    part_detail_player_df = get_player_details_by_batch(TEAM_PREFIX, player_df[start:end], PLAYER_DETAIL_COLUMNS, PLAYER_QUALITIES)
    player_detail_df = pl.concat([player_detail_df, part_detail_player_df], how="vertical_relaxed")
    # Keep a single row per player id
    player_detail_df = player_detail_df.unique(subset=['PlayerId'])
    print(f"Total number of players: {player_detail_df.shape[0]}\n")

In [251]:
def get_player_information(url_prefix, player_suffix, player_id, player_qualities):   
    """
    Debugging variant: fetches one player page and returns the parsed page itself.

    This definition has the same name as the get_player_information defined
    earlier in the file, so it replaces that one from the moment this cell runs.

    Parameters:
    - url_prefix (str): The prefix URL to append the player href to.
    - player_suffix (str): The player href appended to the URL prefix.
    - player_id: The identifier of the player; only used in the failure row.
    - player_qualities: Not used by the active code of this variant.

    Returns:
    - The object returned by scrap_page for the player URL, or
      [player_id] followed by 41 'Null' strings when the page could not be fetched.
    """
   
    link = f"{url_prefix}{player_suffix}"
    page = scrap_page(link)

    if not page:
        return [player_id] + ['Null' for i in range(41)]
    
    return page
    # Unreachable: the string literal below follows the return and is never
    # evaluated. It holds inline parsing code that has been disabled
    # (presumably kept for reference while debugging).
    """
    # Assuming the "profile clearfix" div is correctly provided in the html_snippet
    # Re-parse the HTML snippet focusing on <div class="profile clearfix">
    profile_soup = page.find('div', class_="profile clearfix")

    # Player's name is directly within an <h1> tag
    player_name_corrected = profile_soup.find('h1').text

    # Nationality is in the title attribute of the link within <p>
    nationality_corrected = profile_soup.find('a', title=True)['title']

    # Extracting player info text again, assuming we might need to refine the approach
    player_info_text_corrected = profile_soup.find('p').text

    # Corrected regular expressions for age, height, and weight
    # Adjusting regex to correctly match the patterns
    age_corrected = re.search(r'(\d+)y\.o\.', player_info_text_corrected)
    height_corrected = re.search(r'(\d+)cm', player_info_text_corrected)
    weight_corrected = re.search(r'(\d+)kg', player_info_text_corrected)

    # Extracting matched groups if found, else None
    age_extracted = age_corrected.group(1) if age_corrected else None
    height_extracted = height_corrected.group(1) if height_corrected else None
    weight_extracted = weight_corrected.group(1) if weight_corrected else None

    player_identity = [player_name_corrected, nationality_corrected, age_extracted, height_extracted, weight_extracted]

    # Use regular expressions to find the divs that contain 'Value' and 'Wage' and then extract the em text
    value_div = page.find('div', string=re.compile('Value')).find_previous_sibling('em').string
    wage_div = page.find('div', string=re.compile('Wage')).find_previous_sibling('em').string

    player_value_wage = [value_div, wage_div]

    players_quality_values = []

    for quality in player_qualities:
        v = page.find('span', string=re.compile(quality)).find_previous('em').text
        players_quality_values.append((v))

    player_info = [player_id] + player_identity + player_value_wage + players_quality_values

    return player_info
    """

In [257]:
# Fetch a single player page with the definition above, presumably to inspect
# the parsed HTML by hand; on a failed request `soup` is the 'Null' placeholder list instead.
soup = get_player_information(TEAM_PREFIX, '/player/188377/kyle-walker/240028/', '5', PLAYER_QUALITIES)