In [1]:
%%capture
!pip3 install unidecode

In [2]:
TEST=False # doesn't save any files

In [3]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

In [4]:
# FBREF, TransferMarkt, FIFA22

# TransferMarkt is the naming standard

la_liga_teams = [
    ('Real Madrid','Real Madrid CF','Real Madrid CF'), 
    ('Atlético Madrid','Atlético de Madrid','Atlético de Madrid'), 
    ('Real Sociedad','Real Sociedad','Real Sociedad'), 
    ('Sevilla','Sevilla FC','Sevilla FC'), 
    ('Betis','Real Betis Balompié','Real Betis Balompié'), 
    ('Rayo Vallecano','Rayo Vallecano','Rayo Vallecano'),#
    ('Barcelona', 'FC Barcelona','FC Barcelona'),
    ('Athletic Club','Athletic Club','Athletic Club de Bilbao'),
    ('Espanyol','RCD Espanyol','RCD Espanyol de Barcelona'),#
    ('Osasuna','CA Osasuna','CA Osasuna'),
    ('Valencia','Valencia CF','Valencia CF'),
    ('Villarreal','Villarreal CF','Villarreal CF'),
    ('Celta Vigo','RC Celta de Vigo','RC Celta de Vigo'),
    ('Mallorca','RCD Mallorca','RCD Mallorca'),#
    ('Alavés','Deportivo Alavés','Deportivo Alavés'),
    ('Granada','Granada CF','Granada CF'),
    ('Elche','Elche CF','Elche CF'),
    ('Cádiz','Cádiz CF','Cádiz CF'),
    ('Getafe','Getafe CF','Getafe CF'),
    ('Levante','Levante UD','Levante Unión Deportiva'),
]

In [5]:
def read_and_clean_data():
    df_players = pd.read_csv('players-2.csv') # data about the field players
    df_keepers = pd.read_csv('keepers-2.csv') # data about the keepers
    df_fifa = pd.read_csv('players_22.csv') # statistics from FIFA22

    # desired values from the field players
    df_players = df_players[[
            'season', 'player', 'position', 'squad', 'age', 
            'games', 'minutes', 'goals', 'assists', 
            'cards_yellow', 'cards_red'
    ]]

    # removing the days in age of the players
    age_data = []
    for age in np.array(df_players.age):
        if isinstance(age, int):
            age_data.append(age)
        else:
            age_data.append(int(age[:2]))

    df_players["age"] = np.array(age_data)

    # for consistency of the team names
    for team in la_liga_teams:
        df_players = df_players.replace(team[0], team[1])
    for team in la_liga_teams:
        df_fifa = df_fifa.replace(team[2], team[1])

    return df_players, df_keepers, df_fifa

In [6]:
def get_fifa_data(df_fifa, df_team, team_name):

    def data_check(array, value):
        if len(np.array(value)) > 0:
            array.append(np.array(value)[0])
        else:
            array.append(np.nan)

        return array

    overall_values = []
    potential_values = []

    pace_values = []
    shooting_values = []
    passing_values = []
    dribbling_values = []
    defending_values = []
    physic_values = []

    league_values = []
    wage_values = []

    base = r'^{}'
    expr = '(?=.*{})'

    for name in np.array(df_team.player):
        names = name.split(' ')
        b = base.format(''.join(expr.format(w) for w in names))

        player = df_fifa[df_fifa.long_name.str.contains(b) & 
                        (df_fifa.club_name == team_name)]
        
        overall_values = data_check(overall_values, player.overall)
        potential_values = data_check(potential_values, player.potential)
                                      
        pace_values = data_check(pace_values, player.pace)
        shooting_values = data_check(shooting_values, player.shooting)
        passing_values = data_check(passing_values, player.passing)
        dribbling_values = data_check(dribbling_values, player.dribbling)
        defending_values = data_check(defending_values, player.defending)
        physic_values = data_check(physic_values, player.physic)

        league_values = data_check(league_values, player.league_name)
        wage_values = data_check(wage_values, player.wage_eur)

    df_team["overall"] = overall_values
    df_team["potential"] = potential_values

    df_team["pace"] = pace_values
    df_team["shooting"] = shooting_values
    df_team["passing"] = passing_values
    df_team["dribbling"] = dribbling_values
    df_team["defending"] = defending_values
    df_team["physic"] = physic_values 

    df_team["league"] = league_values
    df_team["wage_eur"] = wage_values

    return df_team

In [7]:
import unidecode

def get_players_link(page):
    pageTree = requests.get(page, headers = headers)
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')

    PlayersLink = {}

    playershtml = pageSoup.find_all("td", {"class": "hauptlink"})
    players = int(str(pageSoup.find("span", {"class" : "dataValue"}))[len("<span class='dataValue'>"): -len("</span>")])
    
    for i, v in enumerate(playershtml[:(2*players)]):
        if i % 2 != 0:
            continue
            
        link = "https://www.transfermarkt.co.uk" + v.find("a")["href"]
        name = v.find("a")["href"].split("/")[1].replace("-", " ")
        PlayersLink[name] = link
        
    return PlayersLink

def transfermarkt_get_teams_with_players_link():
    page = 'https://www.transfermarkt.es/laliga/startseite/wettbewerb/ES1'
    pageTree = requests.get(page, headers = headers)
    soup = BeautifulSoup(pageTree.content, 'html.parser')
    #Create a dict of {teamName: [link, players_dict]}
    teamLinks = {}
    teams = set()
    #Extract all links with the correct CSS selector
    links = soup.select("a.vereinprofil_tooltip")
    for name in soup.find_all("td", class_="zentriert no-border-rechts"):
        logo = name.find("img")["src"]
        teams.add(name.find("a")["title"])
        #print("added: ", name.find("a")["title"] )
        tempLink = "https://www.transfermarkt.es"
        if "startseite" in name.find("a")["href"]:
            array = name.find("a")["href"].split("/")[:5]
            for i, val in enumerate(array):
                if i < 5:
                    tempLink += array[i] + "/"
            teamLinks[name.find("a")["title"]] = [logo, tempLink] 
    
    for team in teamLinks:
        link = teamLinks[team][1]
        teamLinks[team].append(get_players_link(link))

    return teamLinks

def get_last_contract_signing(df, team_name, players, team_links):
    last_renov = [None]*players.shape[0]

    players_link = team_links[team_name][2]
    for index, player in enumerate(players):

        if unidecode.unidecode(player.lower()) in players_link.keys():
            last_renov[index] = get_last_renovation(players_link[unidecode.unidecode(player.lower())])
    
    df["contract_signing"] = last_renov

    return df

def get_last_renovation(page):
    
    pageTree = requests.get(page, headers = headers)
    soup = BeautifulSoup(pageTree.content, 'html.parser')
    renovation = soup.find("span", class_ = "info-table__content info-table__content--bold info-table__span--specific")
    if renovation:
        return renovation.string
    else: 
        return soup.find_all("span", class_ = "dataValue")[-2].string 

In [8]:
def get_team(season, team_name):

    df_players, df_keepers, df_fifa = read_and_clean_data()

    df_team = df_players[df_players["squad"].str.contains(team_name, na=True)]
    df_team = df_team[df_team.season == season]

    df_team = get_fifa_data(df_fifa, df_team, team_name)

    df_team = get_last_contract_signing(df_team, team_name, df_team.player.values, transfermarkt_get_teams_with_players_link())

    return df_team

In [9]:
########################################################
if not TEST:
    dfs_la_liga = []

    for _, team_name, _ in la_liga_teams:
        print(team_name)
        dfs_la_liga.append((team_name, get_team(2022, team_name)))
    
    pd.concat([team for _, team in dfs_la_liga]).to_csv('la_liga.csv')
else:
    df_team = get_team(2022, 'Real Madrid CF')
########################################################

Real Madrid CF


  exec(code_obj, self.user_global_ns, self.user_ns)


Atlético de Madrid


  exec(code_obj, self.user_global_ns, self.user_ns)


Real Sociedad


  exec(code_obj, self.user_global_ns, self.user_ns)


Sevilla FC


  exec(code_obj, self.user_global_ns, self.user_ns)


Real Betis Balompié


  exec(code_obj, self.user_global_ns, self.user_ns)


Rayo Vallecano


  exec(code_obj, self.user_global_ns, self.user_ns)


FC Barcelona


  exec(code_obj, self.user_global_ns, self.user_ns)


Athletic Club


  exec(code_obj, self.user_global_ns, self.user_ns)


RCD Espanyol


  exec(code_obj, self.user_global_ns, self.user_ns)


CA Osasuna


  exec(code_obj, self.user_global_ns, self.user_ns)


Valencia CF


  exec(code_obj, self.user_global_ns, self.user_ns)


Villarreal CF


  exec(code_obj, self.user_global_ns, self.user_ns)


RC Celta de Vigo


  exec(code_obj, self.user_global_ns, self.user_ns)


RCD Mallorca


  exec(code_obj, self.user_global_ns, self.user_ns)


Deportivo Alavés


  exec(code_obj, self.user_global_ns, self.user_ns)


Granada CF


  exec(code_obj, self.user_global_ns, self.user_ns)


Elche CF


  exec(code_obj, self.user_global_ns, self.user_ns)


Cádiz CF


  exec(code_obj, self.user_global_ns, self.user_ns)


Getafe CF


  exec(code_obj, self.user_global_ns, self.user_ns)


Levante UD


  exec(code_obj, self.user_global_ns, self.user_ns)


### JSON maker

In [11]:
import json

# makes JSON file for our MongoDB database
def team_dictionary(df_team, team_name):

    df_team = df_team.where(pd.notnull(df_team), None)

    players = df_team.T.to_dict()

    team = [players[key] for key in players]

    final_object = {
        'team': team_name,
        'players': team
    }

    return final_object
        
    with open(file_name, "w") as outfile:
        json.dump(final_object, outfile)

In [20]:
if not TEST:

    file_name = 'laliga.json'

    league_dictionary = {
        'league_name': 'Spain Primera Division',
        'teams': {},
        }

    for team_name, team_data in dfs_la_liga:
        league_dictionary['teams'][team_name] = team_dictionary(team_data, team_name)

    with open(file_name, "w") as outfile:
        json.dump(league_dictionary, outfile)