In [199]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium_stealth import stealth
from fuzzywuzzy import process
import re

In [200]:
def stealth_scraper(link):
    """Load ``link`` in headless Chrome with selenium-stealth applied and return the parsed page.

    Parameters
    ----------
    link : str
        URL of the page to load.

    Returns
    -------
    bs4.BeautifulSoup
        The page source as returned by the driver right after ``driver.get``,
        parsed with the ``html.parser`` backend.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/96.0.4664.110 Safari/537.36")
    options.add_argument("start-maximized")
    options.add_argument("--headless")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(options=options)
    try:
        # WhoScored and TransferMarkt use Cloudflare, so we have to work around it.
        # Not the most efficient solution, but acceptable for a small-scale scrape.
        stealth(driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
                )

        driver.get(link)
        html = driver.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a headless
        # Chrome process running, even when the page load raises.
        driver.quit()

    return BeautifulSoup(html, "html.parser")

In [201]:
def scrape_WhoScored(link):
    """Scrape minutes played and rating per player from a WhoScored team page.

    The page is fetched through ``stealth_scraper`` and the element with id
    ``team-squad-stats-summary`` is read.

    Parameters
    ----------
    link : str
        URL of the WhoScored team page.

    Returns
    -------
    pandas.DataFrame
        Indexed by player name, with columns ``Mins`` (int) and ``Rating`` (float).
    """
    page = stealth_scraper(link)
    summary = page.find("div", {"id": "team-squad-stats-summary"})

    name_spans = summary.find_all("span", {"class": "iconize iconize-icon-left"})
    rating_cells = summary.find_all("td", {"class": "rating"})
    minute_cells = summary.find_all("td", {"class": "minsPlayed"})

    names = [span.text for span in name_spans]

    # Ratings drive the iteration; the minutes cell at the same position is paired with each one.
    stat_rows = []
    for position, rating_cell in enumerate(rating_cells):
        minutes = int(minute_cells[position].text.strip())
        stat_rows.append([minutes, float(rating_cell.text)])

    stats_by_player = dict(zip(names, stat_rows))
    return pd.DataFrame.from_dict(stats_by_player, orient="index", columns=["Mins", "Rating"])

In [202]:
# Quick check of the WhoScored scraper: fetch the Austin FC squad ratings and print them.
ratings = scrape_WhoScored("https://www.whoscored.com/Teams/29664/Show/USA-Austin-FC")
print(ratings)

                        Mins  Rating
Sebastián Driussi       2311    7.21
Zan Kolmanic             432    6.89
Julio Cascante          2065    6.77
Brad Stuver             3060    6.75
Daniel Pereira          2530    6.73
Emiliano Rigoni         2200    6.67
Jon Gallagher           2793    6.64
Leo Väisänen            1763    6.64
Alexander Ring          2157    6.62
Nick Lima               2419    6.60
Gyasi Zardes            1474    6.58
Owen Wolff              1885    6.55
Diego Fagundez          1031    6.54
Matt Hedges              306    6.48
Jhojan Valencia         1435    6.48
Sofiane Djeffal          312    6.45
Aleksandar Radovanovic   353    6.44
Ethan Finlay            1700    6.43
Adam Lundkvist           854    6.37
Memo Rodríguez           236    6.26
Rodney Redes             484    6.24
CJ Fodrey                 29    6.24
Maximiliano Urruti       970    6.20
Will Bruin               335    6.18
Kipp Keller              436    6.16
Hector Jimenez            51    6.13


In [203]:
def scrape_transfermarkt(link):
    """Scrape player names and market-value strings from a Transfermarkt squad page.

    The page is fetched through ``stealth_scraper`` and the ``hauptlink`` cells
    of the table with class ``items`` are read.

    Parameters
    ----------
    link : str
        URL of the Transfermarkt squad page.

    Returns
    -------
    pandas.DataFrame
        Indexed by player name, with a single ``Value`` column holding the
        value text exactly as scraped (whitespace-stripped).
    """
    page = stealth_scraper(link)
    squad_table = page.find("table", {"class": "items"})

    cell_texts = []
    for cell in squad_table.find_all("td", {"class": "hauptlink"}):
        cell_texts.append(cell.text.strip())

    # Even-positioned cells are used as names, odd-positioned cells as their values.
    value_by_name = {}
    for player, value_text in zip(cell_texts[0::2], cell_texts[1::2]):
        value_by_name[player] = value_text

    return pd.DataFrame.from_dict(value_by_name, orient="index", columns=["Value"])

In [204]:
def _parse_value_millions(text):
    """Convert a scraped market-value string (e.g. "€450k", "€1.50m") to a float in millions."""
    text = str(text)
    number = re.search(r"\d+(?:\.\d+)?", text)
    if number is None:
        # No digits at all; leave it missing so the caller decides how to fill it.
        return float("nan")
    amount = float(number.group())
    if "k" in text.lower():
        # Thousands -> millions.
        amount /= 1000
    return amount


def _align_index_to_names(frame, valid_names, match_threshold):
    """Rename index labels of ``frame`` to their best fuzzy match in ``valid_names``."""
    for name in valid_names:
        best = process.extractOne(name, frame.index)
        # extractOne returns None when there is nothing to match against.
        if best is not None and best[1] >= match_threshold:
            frame = frame.rename(index={best[0]: name})
    return frame


def format_data(valid_names,
                values_url="https://www.transfermarkt.co.uk/austin-fc/kader/verein/72309/saison_id/2022",
                ratings_url="https://www.whoscored.com/Teams/29664/Show/USA-Austin-FC",
                out_path="player_info.csv",
                match_threshold=85):
    """Scrape values and ratings, align player names, and write the merged table to CSV.

    Parameters
    ----------
    valid_names : iterable of str
        Canonical player names. Scraped names whose best fuzzy-match score
        against one of these is at least ``match_threshold`` are renamed to it.
    values_url : str, optional
        Transfermarkt squad page passed to ``scrape_transfermarkt``.
    ratings_url : str, optional
        WhoScored team page passed to ``scrape_WhoScored``.
    out_path : str, optional
        Destination of the merged CSV.
    match_threshold : int, optional
        Minimum fuzzy-match score required to rename a scraped name.

    Returns
    -------
    None
        The result is written to ``out_path`` with columns ``Value €M``,
        ``Mins`` and ``Rating``, indexed by player name.

    Notes
    -----
    The ratings frame is left-joined onto the values frame, so every player
    from the values page is kept. Missing cells (players absent from the
    ratings page, or values with no digits in them) are written as 0.
    """
    values = scrape_transfermarkt(values_url)
    ratings = scrape_WhoScored(ratings_url)

    # Fuzzy name matching so both sources share the spelling used in valid_names.
    ratings = _align_index_to_names(ratings, valid_names, match_threshold)
    values = _align_index_to_names(values, valid_names, match_threshold)

    # Turn the scraped value text into a number of millions.
    values = values.assign(Value=values["Value"].map(_parse_value_millions))

    player_info = values.join(ratings).rename(columns={"Value": "Value €M"})
    # WhoScored has some missing data for players that haven't played yet this
    # season; in this case give them a rating of 0.
    player_info = player_info.fillna(0)
    player_info.to_csv(out_path)

In [205]:
# Canonical spellings of the players to analyse; scraped names are fuzzy-matched onto these.
player_names = [
    "Adam Lundqvist",
    "Alexander Ring",
    "Brad Stuver",
    "Brandon Craig",
    "Damian Las",
    "Daniel Pereira",
    "Emiliano Rigoni",
    "Ethan Finlay",
    "Gyasi Zardes",
    "Hector Jimenez",
    "Jhojan Valencia",
    "Jon Gallagher",
    "Julio Cesar Cascante",
    "Kipp Keller",
    "Leo Väisänen",
    "Matt Bersano",
    "Matt Hedges",
    "Maxi Urruti",
    "Memo Rodriguez",
    "Nick Lima",
    "Owen Wolff",
    "Rodney Redes",
    "Sebastian Driussi",
    "Sofiane Djeffal",
    "Will Bruin",
    "Zan Kolmanic",
]
format_data(player_names)

In [216]:
def player_analysis(df_path, valid_names, write_out=False,
                    rating_weight=0.6, mins_weight=0.25, value_weight=0.15):
    """Rank players on rating, minutes and market value and print a score per player.

    Parameters
    ----------
    df_path : str
        CSV with the player name in the first column and the columns
        ``Rating``, ``Value €M`` and ``Mins``.
    valid_names : list of str
        Players to report. Ranks and scores are computed over every row of
        the CSV first; the result is then restricted to these names.
    write_out : bool, optional
        When true, also write the scores to ``player_scores.csv``.
    rating_weight, mins_weight, value_weight : float, optional
        Weights applied to the rating, minutes and value ranks when they are
        combined into the average rank.

    Returns
    -------
    None
        The ``Score`` column is printed (and optionally written to CSV).

    Raises
    ------
    ValueError
        If the number of reported rows differs from ``len(valid_names)``,
        e.g. because a requested name is absent from the CSV. The missing
        names are listed in the message.
    """
    player_data = pd.read_csv(df_path, index_col=0)

    # Rank 1 is best in every category: highest rating, value and minutes.
    player_data["Rating Rank"] = player_data["Rating"].rank(ascending=False)
    player_data["Values Rank"] = player_data["Value €M"].rank(ascending=False)
    player_data["Mins Rank"] = player_data["Mins"].rank(ascending=False)

    # Weighted average of the three ranks (lower is better).
    player_data["Average Rank"] = (
        player_data["Rating Rank"] * rating_weight
        + player_data["Mins Rank"] * mins_weight
        + player_data["Values Rank"] * value_weight
    )
    player_data = player_data.sort_values("Average Rank")
    player_data["Overall Rank"] = player_data["Average Rank"].rank(ascending=True)

    # Map overall rank onto a score where rank 1 gets 10 and worse ranks get less.
    # NOTE(review): the divisor is len - 2 rather than len - 1, so with untied
    # ranks the last row of the full frame scores below 0, and a frame of
    # exactly two rows divides by zero. Confirm the offset is intentional.
    score_step = 10 / (len(player_data) - 2)
    player_data["Score"] = (10 - (player_data["Overall Rank"] - 1) * score_step).round(1)

    player_data = player_data[player_data.index.isin(valid_names)]

    # Check that no names are missing from the list of provided players to rank.
    missing = sorted(set(valid_names) - set(player_data.index))
    if missing or len(player_data) != len(valid_names):
        raise ValueError(
            "Different number of players in DF and analysis task "
            f"(expected {len(valid_names)}, found {len(player_data)}; "
            f"missing from data: {missing})"
        )

    print(player_data["Score"])
    if write_out:
        player_data["Score"].to_csv("player_scores.csv", index_label="Name")
# Score the players listed in player_names from the merged CSV and save the scores to player_scores.csv.
player_analysis("player_info.csv", player_names, write_out=True)

Sebastian Driussi       10.0
Daniel Pereira           9.6
Julio Cesar Cascante     9.3
Brad Stuver              8.9
Emiliano Rigoni          8.5
Jon Gallagher            8.1
Leo Väisänen             7.8
Zan Kolmanic             7.4
Nick Lima                7.0
Alexander Ring           6.7
Owen Wolff               6.3
Gyasi Zardes             5.9
Jhojan Valencia          5.6
Ethan Finlay             5.2
Adam Lundqvist           4.8
Matt Hedges              4.4
Sofiane Djeffal          4.1
Maxi Urruti              3.7
Memo Rodriguez           3.3
Rodney Redes             3.0
Will Bruin               2.6
Kipp Keller              2.2
Hector Jimenez           1.5
Brandon Craig            0.9
Matt Bersano             0.4
Damian Las               0.0
Name: Score, dtype: float64
