In [2]:
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Request headers passed to the requests.get(...) calls in this notebook:
# a desktop-browser User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/47.0.2526.106 Safari/537.36'
    ),
}

In [3]:
# Club-name aliases, one tuple per club, in the order
#   (FBREF name, SoccerPrime name, FIFA22 name)

# SoccerPrime is the naming standard: read_and_clean_data() rewrites the
# first spelling (player stats) and the third spelling (FIFA22 data) to the
# second one.
# The commented-out clubs below have an empty SoccerPrime slot and are not
# valid tuples as written, which is presumably why they are disabled.

la_liga_teams = [
    ('Real Madrid','Real Madrid','Real Madrid CF'), 
    ('Atlético Madrid','Atletico Madrid','Atlético de Madrid'), 
    ('Real Sociedad','Real Sociedad','Real Sociedad'), 
    ('Sevilla','Sevilla FC','Sevilla FC'), 
    ('Betis','Real Betis','Real Betis Balompié'), 
    # ('Rayo Vallecano',,'Rayo Vallecano'),#
    ('Barcelona', 'FC Barcelona','FC Barcelona'),
    ('Athletic Club','Athletic Bilbao','Athletic Club de Bilbao'),
    # ('Espanyol',,'RCD Espanyol de Barcelona'),#
    ('Osasuna','CA Osasuna','CA Osasuna'),
    ('Valencia','Valencia CF','Valencia CF'),
    ('Villarreal','Villarreal CF','Villarreal CF'),
    ('Celta Vigo','Celta Vigo','RC Celta de Vigo'),
    # ('Mallorca',,'RCD Mallorca'),#
    ('Alavés','Deportivo Alaves','Deportivo Alavés'),
    ('Granada','Granada CF','Granada CF'),
    ('Elche','Elche CF','Elche CF'),
    ('Cádiz','Cádiz CF','Cádiz CF'),
    ('Getafe','Getafe CF','Getafe CF'),
    ('Levante','Levante UD','Levante Unión Deportiva'),
]

In [4]:
def _age_in_years(age):
    """Return only the whole-year part of an age value.

    Integers (plain or NumPy) are returned as ``int``; missing values (NaN)
    are passed through unchanged; anything else is treated as text whose
    part before the first ``-`` is the number of years.
    """
    if isinstance(age, (int, np.integer)):
        return int(age)
    if isinstance(age, (float, np.floating)):
        return age if np.isnan(age) else int(age)
    # assumes 'years-days' strings such as '23-145' -- TODO confirm against players-2.csv
    return int(str(age).strip().split('-', 1)[0])


def read_and_clean_data():
    """Load the three CSV inputs and normalise ages and club names.

    Returns:
        tuple: ``(df_players, df_keepers, df_fifa)`` where

        * ``df_players`` holds the selected field-player columns, with ``age``
          reduced to whole years and club names rewritten from the first to
          the second spelling of each ``la_liga_teams`` tuple;
        * ``df_keepers`` is ``keepers-2.csv`` exactly as read;
        * ``df_fifa`` is ``players_22.csv`` with club names rewritten from the
          third to the second spelling of each ``la_liga_teams`` tuple.
    """
    df_players = pd.read_csv('players-2.csv')  # data about the field players
    df_keepers = pd.read_csv('keepers-2.csv')  # data about the keepers
    df_fifa = pd.read_csv('players_22.csv')  # statistics from FIFA22

    # desired values from the field players
    wanted_columns = [
        'season', 'player', 'position', 'squad', 'age',
        'games', 'minutes', 'goals', 'assists',
        'cards_yellow', 'cards_red'
    ]
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of the raw data (SettingWithCopyWarning).
    df_players = df_players[wanted_columns].copy()

    # removing the days in age of the players
    df_players["age"] = [_age_in_years(age) for age in df_players["age"]]

    # for consistency of the team names: one whole-frame replace per table.
    # Spellings that already equal the standard are left out of the mapping;
    # no replacement output is another replacement's input in la_liga_teams,
    # so this gives the same result as replacing one club at a time.
    stats_to_standard = {t[0]: t[1] for t in la_liga_teams if t[0] != t[1]}
    fifa_to_standard = {t[2]: t[1] for t in la_liga_teams if t[2] != t[1]}
    df_players = df_players.replace(stats_to_standard)
    df_fifa = df_fifa.replace(fifa_to_standard)

    return df_players, df_keepers, df_fifa

In [5]:
def get_fifa_data(df_fifa, df_team, team_name):
    """Attach FIFA ratings to every player of one team.

    A player is matched to the first ``df_fifa`` row of the same club whose
    ``long_name`` contains every space-separated word of the player's name
    (in any order, case- and accent-sensitive).  Players without a match get
    NaN in all rating columns.

    Args:
        df_fifa: FIFA table with ``long_name``, ``club_name`` and the rating
            columns listed below.
        df_team: Team table with a ``player`` column.  It is not modified.
        team_name: Club name compared for equality with ``df_fifa.club_name``.

    Returns:
        A copy of ``df_team`` with the columns ``overall``, ``potential``,
        ``pace``, ``shooting``, ``passing``, ``dribbling``, ``defending`` and
        ``physic`` added.
    """
    rating_columns = [
        "overall", "potential",
        "pace", "shooting", "passing", "dribbling", "defending", "physic",
    ]

    # The club filter does not depend on the player, so apply it once.
    club_players = df_fifa[df_fifa.club_name == team_name]
    club_long_names = club_players.long_name

    collected = {column: [] for column in rating_columns}

    for name in df_team.player:
        if isinstance(name, str):
            # One look-ahead per word: all words must occur somewhere in the
            # long name.  re.escape keeps characters such as '.' or '(' in a
            # player's name from being read as regex syntax.
            pattern = '^' + ''.join(
                '(?=.*{})'.format(re.escape(word)) for word in name.split(' ')
            )
            # na=False: a missing long_name simply does not match.
            matches = club_players[club_long_names.str.contains(pattern, na=False)]
        else:
            # Missing player name: nothing to look up.
            matches = club_players.iloc[0:0]

        for column in rating_columns:
            values = matches[column]
            collected[column].append(values.iloc[0] if len(values) > 0 else np.nan)

    # Work on a copy so a caller's filtered slice is never written to.
    df_team = df_team.copy()
    for column in rating_columns:
        df_team[column] = collected[column]

    return df_team

### JSON maker

In [9]:
import json

# makes JSON file for our MongoDB database
def make_json(df_team, team_name, file_name):
    """Write one team as ``{"team": ..., "players": [...]}`` to a JSON file.

    Each row of ``df_team`` becomes one dict keyed by column name.  NaN
    values are written as ``null``: the bare ``NaN`` token that ``json.dump``
    would otherwise emit is not valid JSON.

    Args:
        df_team: Table with one row per player.
        team_name: Value stored under the ``team`` key.
        file_name: Path of the file to (over)write.
    """

    def _json_safe(value):
        # NaN is the only float that is not equal to itself.
        if isinstance(value, float) and value != value:
            return None
        return value

    # astype(object) turns NumPy scalars into plain Python values that json
    # can serialise; orient="records" yields one dict per row and, unlike
    # transposing first, keeps rows whose index labels repeat.
    records = df_team.astype(object).to_dict(orient="records")
    team = [
        {column: _json_safe(value) for column, value in record.items()}
        for record in records
    ]

    final_object = {
        'team': team_name,
        'players': team
    }

    with open(file_name, "w") as outfile:
        json.dump(final_object, outfile)

In [7]:
def getSalaries(page):
    """Scrape a salary page into a ``{name: salary}`` dict.

    Every ``<table>`` on the page is read as consecutive groups of three
    ``<td>`` cells: the first cell is the name, the second is skipped and the
    third is the salary.  A trailing incomplete group is ignored.

    Args:
        page: URL of the page to fetch,
            e.g. 'https://soccerprime.com/fc-barcelona-player-salaries/'.

    Returns:
        dict mapping the text of each name cell to the text of its salary
        cell (both stripped of surrounding whitespace).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within 30 seconds.
    """
    response = requests.get(page, headers=headers, timeout=30)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    playerSalaries = {}
    for table in soup.find_all("table"):
        cells = table.find_all("td")
        # Step through complete 3-cell rows only, so a ragged table cannot
        # index past the end of the cell list.
        for start in range(0, len(cells) - 2, 3):
            # get_text() reads the cell's text regardless of attributes or
            # nested tags, unlike slicing the tag's string form.
            name = cells[start].get_text(strip=True)
            salary = cells[start + 2].get_text(strip=True)
            playerSalaries[name] = salary

    return playerSalaries

In [8]:
def _scrape_team_index():
    """Read the La Liga overview table into ``{team: [link, third-cell text]}``.

    The first ``<table>`` of the overview page is read as groups of three
    ``<td>`` cells.  The first cell's link supplies the team name (with "é"
    replaced by "e") and the URL of the team's salary page; the third cell's
    text is stored as-is (stripped).  Groups whose first cell has no link are
    skipped.
    """
    page = 'https://soccerprime.com/la-liga-player-salaries/'
    response = requests.get(page, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find("table")
    if table is None:
        raise ValueError("no <table> found on " + page)

    cells = table.find_all("td")
    teams = {}
    for start in range(0, len(cells) - 2, 3):
        anchor = cells[start].find("a")
        if anchor is None:
            continue
        name = anchor.get_text(strip=True).replace("é", "e")
        # ex: {Elche: [link, totalcontractvalue]}
        teams[name] = [anchor["href"], cells[start + 2].get_text(strip=True)]
    return teams


def getTeamsSalaryPage():
    """Scrape the overview page and the salary page of every listed team.

    Returns:
        dict ``{team: [link, third-cell text, salaries]}`` where ``salaries``
        is the dict returned by ``getSalaries(link)``.
    """
    teams = _scrape_team_index()
    for entry in teams.values():
        entry.append(getSalaries(entry[0]))
    return teams


def getSingleTeamsPage(team_name=None):
    """Scrape the overview page and the salary page of one team only.

    Args:
        team_name: Team whose salary page is fetched.  ``None`` fetches the
            salary page of every team.

    Returns:
        dict ``{team: [link, third-cell text]}`` for every listed team; the
        entry of each fetched team additionally has the ``getSalaries`` dict
        appended as third element.
    """
    teams = _scrape_team_index()
    for name, entry in teams.items():
        if team_name is not None and name != team_name:
            continue
        entry.append(getSalaries(entry[0]))
    return teams


In [9]:
def addSalaries(df, teamsSalary):
    """Add a ``salaries`` column looked up by exact player name.

    Args:
        df: Table with a ``player`` column.  It is not modified.
        teamsSalary: dict mapping player name to salary.

    Returns:
        A copy of ``df`` with a ``salaries`` column holding
        ``teamsSalary[player]``, or "-" for players missing from the dict.
    """
    # Copy first so a caller's filtered slice is never written to.
    df = df.copy()
    df["salaries"] = [teamsSalary.get(name, "-") for name in df["player"]]
    return df

In [10]:
def get_teamTemp(season, team_name):
    """Build one team's player table for a season, with FIFA ratings.

    Args:
        season: Value compared for equality with the ``season`` column.
        team_name: Pattern searched for in the ``squad`` column
            (``str.contains``); also passed on to ``get_fifa_data``.

    Returns:
        The table returned by ``get_fifa_data`` for the selected rows.
    """
    df_players, _, df_fifa = read_and_clean_data()

    # na=False: a row with a missing squad belongs to no team (na=True would
    # add such rows to every team).
    in_team = df_players["squad"].str.contains(team_name, na=False)
    in_season = df_players.season == season
    df_team = df_players[in_team & in_season]

    return get_fifa_data(df_fifa, df_team, team_name)

In [11]:
def addTeamSalaries(df, team_name):
    """Add the scraped salaries of ``team_name`` to ``df``.

    Only the overview page and this one team's salary page are fetched.

    Args:
        df: Table with a ``player`` column.
        team_name: Team name as it is keyed by ``getSingleTeamsPage``.

    Returns:
        The table returned by ``addSalaries``.

    Raises:
        KeyError: if ``team_name`` is not among the scraped teams.
    """
    # Entry layout is [link, total, salaries]; the salaries dict is appended
    # only for the requested team.
    teamsSalaries = getSingleTeamsPage(team_name)
    return addSalaries(df, teamsSalaries[team_name][2])



In [12]:
def get_team(season, team_name):
    """Build one team's player table for a season, with ratings and salaries.

    Args:
        season: Value compared for equality with the ``season`` column.
        team_name: Pattern searched for in the ``squad`` column
            (``str.contains``); also passed on to ``get_fifa_data`` and
            ``addTeamSalaries``.

    Returns:
        The table returned by ``addTeamSalaries`` for the selected rows.
    """
    df_players, _, df_fifa = read_and_clean_data()

    # na=False: a row with a missing squad belongs to no team (na=True would
    # add such rows to every team).
    in_team = df_players["squad"].str.contains(team_name, na=False)
    in_season = df_players.season == season
    df_team = df_players[in_team & in_season]

    df_team = get_fifa_data(df_fifa, df_team, team_name)
    df_team = addTeamSalaries(df_team, team_name)

    return df_team

In [14]:
# Squad table for FC Barcelona, season 2021 (reads the CSVs and scrapes salaries).
df_barça = get_team(2021, "FC Barcelona")


  df_players, df_keepers, df_fifa = read_and_clean_data()
  df_players, df_keepers, df_fifa = read_and_clean_data()


In [20]:
# Show the player names of the squad table (cell output below).
df_barça.player.values



array(['Jordi Alba', 'Carles Aleñá', 'Ronald Araújo',
       'Martin Braithwaite', 'Sergio Busquets', 'Philippe Coutinho',
       'Ousmane Dembélé', 'Sergiño Dest', 'Ansu Fati', 'Junior Firpo',
       'Antoine Griezmann', 'Ilaix', 'Frenkie de Jong', 'Clément Lenglet',
       'Lionel Messi', 'Óscar Mingueza', 'Pedri', 'Gerard Piqué',
       'Miralem Pjanić', 'Riqui Puig', 'Sergi Roberto',
       'Francisco Trincão', 'Samuel Umtiti'], dtype=object)

In [46]:
# Fetch and parse the Transfermarkt LaLiga overview page once.
# NOTE: `soup` is read as a global by transfermarkt_get_teams() below.
page = 'https://www.transfermarkt.co.uk/laliga/startseite/wettbewerb/ES1'
tree = requests.get(page, headers = headers)
soup = BeautifulSoup(tree.content, 'html.parser')

def get_players_link(page):
    """Collect ``{player name: profile URL}`` from a Transfermarkt team page.

    The integer in the first ``span.dataValue`` element (presumably the squad
    size -- TODO confirm) limits how many ``td.hauptlink`` cells are read:
    only the first ``2 * n`` cells are considered, and of those every second
    one, starting with the first.

    Args:
        page: URL of the team page.

    Returns:
        dict mapping the player name, taken from the first path segment of
        the link with "-" replaced by " ", to the absolute profile URL.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the count element is missing or not an integer.
    """
    base_url = "https://www.transfermarkt.co.uk"

    response = requests.get(page, headers=headers, timeout=30)
    response.raise_for_status()
    pageSoup = BeautifulSoup(response.content, 'html.parser')

    count_tag = pageSoup.find("span", {"class": "dataValue"})
    if count_tag is None:
        raise ValueError("no span.dataValue element found on " + page)
    # get_text() reads the element's text whatever its attributes or quoting,
    # unlike slicing the tag's string form at fixed offsets.
    players = int(count_tag.get_text(strip=True))

    PlayersLink = {}
    name_cells = pageSoup.find_all("td", {"class": "hauptlink"})
    for cell in name_cells[:2 * players:2]:
        anchor = cell.find("a")
        if anchor is None:
            continue
        href = anchor["href"]
        name = href.split("/")[1].replace("-", " ")
        PlayersLink[name] = base_url + href

    return PlayersLink

def transfermarkt_get_teams():
    """Collect every team's page link and player links from the league page.

    Reads the module-level ``soup`` (the parsed league overview page).  For
    each ``td`` with class "zentriert no-border-rechts" whose link contains
    "startseite", the link is cut down to its first four path segments and
    stored under the link's ``title``.

    Returns:
        dict ``{team name: [team page URL, players]}`` where ``players`` is
        the dict returned by ``get_players_link`` for that URL.
    """
    base_url = "https://www.transfermarkt.co.uk"

    teamLinks = {}
    for cell in soup.find_all("td", class_="zentriert no-border-rechts"):
        anchor = cell.find("a")
        if anchor is None:
            continue
        href = anchor.get("href", "")
        if "startseite" not in href:
            continue
        # href starts with "/", so the first split element is empty and the
        # join restores the leading slash: "/<club>/startseite/verein/<id>".
        teamLinks[anchor["title"]] = [base_url + "/".join(href.split("/")[:5]) + "/"]

    for entry in teamLinks.values():
        entry.append(get_players_link(entry[0]))
    return teamLinks
 
# Scrape all teams now; makes one request per team via get_players_link().
teams = transfermarkt_get_teams()
#get_players_link("https://www.transfermarkt.co.uk/real-madrid/startseite/verein/418/")
       

In [72]:
import unidecode

# How many squad players have a Transfermarkt entry with the same name once
# both sides are lower-cased and stripped of accents?
barcelona_entry = teams["FC Barcelona"]
squad_names = df_barça.player.values
print(len(squad_names))

# Normalise the Transfermarkt names once instead of once per squad player.
transfermarkt_names = [
    unidecode.unidecode(key.lower()) for key in barcelona_entry[1].keys()
]
count = sum(
    transfermarkt_names.count(unidecode.unidecode(player.lower()))
    for player in squad_names
)
print(count)

23
16
