In [1]:
'''
Fetches league information for each of the competitions
and saves as a single CSV, also updating the existing JSON files
in the process.
'''

'\nFetches league information for each of the competitions\nand saves as a single CSV, also updating the existing JSON files\nin the process.\n'

In [2]:
from bs4 import BeautifulSoup
from io import BytesIO
import glob
import json
import pandas as pd
from pprint import pprint
import requests
import re
from selenium import webdriver
from tqdm.notebook import tqdm
import time

In [3]:
# HTTP headers sent with every page request made in get_league_info.
# The User-Agent mimics a desktop Chrome browser
# (presumably because the site rejects the default requests User-Agent — TODO confirm).
HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    }

In [4]:
def already_saved(league_id):
    '''
    Checks whether a JSON file for this specific league
    already exists in ../output/leagues. Returns True or False.
    
    Params:
    
    league_id -> str, the league id; it is compared against the
                 file names in ../output/leagues (without ".json")
    '''
    
    # glob joins directory and file name with the OS separator, so on
    # Windows the returned paths contain "\" instead of "/". Accept both,
    # escape the dot and anchor at the end so only the extension is stripped.
    id_pattern = re.compile(r"[\\/]leagues[\\/](.+)\.json$")
    
    for path in glob.glob("../output/leagues/*.json"):
        match = id_pattern.search(path)
        # Ignore any path that does not look like "<id>.json" instead of
        # crashing on a failed match.
        if match and match.group(1) == league_id:
            return True

    return False

In [5]:
def fetch_leagues(pattern):
    '''
    Fetches all files with league information
    and keeps unique entries to scrape.
    
    Params:
    
    pattern -> str, a glob pattern
    
    Returns a DataFrame with the columns 'league_url', 'league_name'
    and 'league_id', one row per unique combination and a fresh
    0..n-1 index. When no file matches the pattern, an empty DataFrame
    with those three columns is returned.
    '''
    
    columns = ['league_url', 'league_name', 'league_id']
    
    # glob makes no ordering promise; sorting keeps the row order
    # (and therefore the scraping order) the same on every run.
    league_files = sorted(glob.glob(pattern))
    
    frames = []
    for file in league_files:
        with open(file, "r") as f:
            data = json.load(f)
            
        frames.append(pd.DataFrame(data))
        
    # pd.concat raises ValueError on an empty list, so handle the
    # "nothing matched" case explicitly.
    if not frames:
        return pd.DataFrame(columns=columns)
    
    leagues = pd.concat(frames, ignore_index=True)[columns]
    
    return leagues.drop_duplicates().reset_index(drop=True)

In [6]:
def save_json(data, outpath):
    '''
    Serializes an object as JSON and writes it to a file.
    
    Params:
    
    data -> any object json.dump can serialize
    outpath -> str, path of the file to write; existing content
               is overwritten
    '''
    with open(outpath, 'w+') as handle:
        json.dump(data, fp=handle)

In [7]:
def get_league_info(league_url):
    '''
    Scrapes country, tier and competition type from a
    Transfermarkt competition page.
    
    Params:
    
    league_url -> str, path part of the url for the transfermarkt
                  competition page, e.g.
                  "/copa-ecuador/startseite/pokalwettbewerb/ECUP"
    
    Returns a dict with the keys 'country', 'league_tier' and 'type'
    ('cup' or 'league'). The dict is empty when the url contains
    neither "pokalwettbewerb" nor "wettbewerb".
    
    Raises requests.HTTPError when the server answers with an
    error status.
    '''
    
    url = f"https://transfermarkt.com{league_url}"
    print(url)
    
    # Without a timeout a single stalled connection would block the
    # whole run indefinitely.
    r = requests.get(url, headers=HEADERS, timeout=30)
    # Fail here with a clear HTTP error instead of later with an
    # AttributeError while parsing an error page.
    r.raise_for_status()
    
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(r.text, "html.parser")
    
    data = {}
    
    # Cup -- must be checked first, because "pokalwettbewerb" also
    # contains the substring "wettbewerb".
    if 'pokalwettbewerb' in league_url:
        content = soup.find('div', class_='dataContent')
        data_value = content.find('span', class_='dataValue')
    
        # The country is read from the alt text of the image inside
        # the value span; competitions without such an image get a
        # placeholder.
        country = data_value.find('img')
        if not country: 
            country = 'no country'
        else:
            country = country['alt'].strip()
            
        league_tier = data_value.text.strip()
        
        data['country'] = country
        data['league_tier'] = league_tier
        data['type'] = 'cup'
        
    # League
    elif 'wettbewerb' in league_url:
        
        box = soup.find("div", class_="data-header__box--big")
        country = box.find("span", class_="data-header__club").find('a').text.strip()
        league_tier = box.find("span", class_="data-header__label").find('span').text.strip()
        
        data['country'] = country
        data['league_tier'] = league_tier
        data['type'] = 'league'

   
    return data

In [8]:
def main():
    '''
    Scrapes every league listed in the league-info files under
    ../output/players and writes one JSON file per league to
    ../output/leagues. Leagues that already have a file are skipped,
    so an interrupted run can simply be started again.
    '''
    
    leagues_to_scrape = fetch_leagues("../output/players/league-info-*.json")
        
    for row in tqdm(leagues_to_scrape.itertuples(index=False), total=leagues_to_scrape.shape[0]):
  
        league_url = row.league_url
        league_id = row.league_id
        league_name = row.league_name
        
        print(f"{league_id}, {league_name}")
        
        if already_saved(league_id):
            print('Already saved')
            print()
            continue
    
        data = get_league_info(league_url)
        data['league_url'] = league_url
        data['league_id'] = league_id
        data['league_name']= league_name
        
        pprint(data)

        # Use the shared helper instead of repeating the open/dump logic.
        save_json(data, f"../output/leagues/{league_id}.json")
            
        print()

In [9]:
# Start the scrape only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

  0%|          | 0/489 [00:00<?, ?it/s]

L1, Bundesliga
Already saved

CL, Champions League
Already saved

BJ3, U17-Bundesliga Süd/Südwest
Already saved

L3, 3. Liga
Already saved

AJ3, U19-BL S/SW
Already saved

DFB, DFB-Pokal
Already saved

DFL, DFL-Supercup
Already saved

GB18, U18 Premier League
Already saved

KLUB, Club World Cup
Already saved

19YL, UEFA Youth League
Already saved

USC, UEFA Super Cup
Already saved

IT1, Serie A
Already saved

AR1N, Liga Profesional
Already saved

CIT, Italy Cup
Already saved

EL, Europa League
Already saved

CLI, Libertadores
Already saved

CS, Copa Sudamericana
Already saved

ARCA, Copa Argentina
Already saved

SCI, Supercoppa Italiana
Already saved

QSL, Stars League
Already saved

QEC1, Amir Cup
Already saved

QSCP, QSC
Already saved

GB1, Premier League
Already saved

NL1, Eredivisie
Already saved

PO1, Liga Portugal
Already saved

NLP, KNVB beker
Already saved

FAC, FA Cup
Already saved

CGB, EFL Cup
Already saved

CLQ, Champions League Qu.
Already saved

UEFA, UEFA Cup
Already sa

{'country': 'Ecuador',
 'league_id': 'EL1S',
 'league_name': 'Serie A Segunda Etapa',
 'league_tier': 'First Tier',
 'league_url': '/ligapro-serie-a-segunda-etapa/startseite/wettbewerb/EL1S',
 'type': 'league'}

EL1A, Serie A Primera Etapa
https://transfermarkt.com/ligapro-serie-a-primera-etapa/startseite/wettbewerb/EL1A
{'country': 'Ecuador',
 'league_id': 'EL1A',
 'league_name': 'Serie A Primera Etapa',
 'league_tier': 'First Tier',
 'league_url': '/ligapro-serie-a-primera-etapa/startseite/wettbewerb/EL1A',
 'type': 'league'}

ECUP, Copa Ecuador
https://transfermarkt.com/copa-ecuador/startseite/pokalwettbewerb/ECUP
{'country': 'Ecuador',
 'league_id': 'ECUP',
 'league_name': 'Copa Ecuador',
 'league_tier': 'Domestic Cup',
 'league_url': '/copa-ecuador/startseite/pokalwettbewerb/ECUP',
 'type': 'cup'}

LPPL, Serie A Tercera Etapa
https://transfermarkt.com/ligapro-serie-a-tercera-etapa/startseite/pokalwettbewerb/LPPL
{'country': 'Ecuador',
 'league_id': 'LPPL',
 'league_name': 'Serie A

Already saved

BRCG, Carioca - Taça Guanabara
Already saved

BRCF, Campeonato Carioca - Final
Already saved

BRCR, Carioca - Taça Rio
Already saved

CA1F, CanPL Fall Season
Already saved

CAN1, CanPL Spring Season
Already saved

CAPL, CanPL Playoffs
Already saved

C1P1, Liga DIMAYOR Liguilla
Already saved

URP1, Copa AUF Uruguay
Already saved

IJ2A, Primavera 2 - A
Already saved

BE2E, Final Proximus League
Already saved

OBLB, Oberliga Baden-Württemberg
Already saved

SLI, Sachsenliga
Already saved

WAL1, Cymru Premier
Already saved

WALE, Welsh Cup
Already saved

CLPD, Primera División
https://transfermarkt.com/primera-division-de-chile/startseite/wettbewerb/CLPD
{'country': 'Chile',
 'league_id': 'CLPD',
 'league_name': 'Primera División',
 'league_tier': 'First Tier',
 'league_url': '/primera-division-de-chile/startseite/wettbewerb/CLPD',
 'type': 'league'}

ITJ6, Under 17 - C
Already saved

ITJ7, Under 18 - A
Already saved

ITJF, Under 17 - finals
Already saved

TSP, MOL Cup
Alrea