In [1]:
'''
Fetches information about club origin for each of the clubs
and saves as a single CSV, also updating the existing JSON files
in the process.
'''

'\nFetches information about club origin for each of the clubs\nand saves as a single CSV, also updating the existing JSON files\nin the process.\n'

In [2]:
from bs4 import BeautifulSoup
from io import BytesIO
import glob
import json
import pandas as pd
from pprint import pprint
import requests
import re
from selenium import webdriver
from tqdm.notebook import tqdm
import time

In [3]:
# HTTP headers passed to requests.get in get_club_info. The User-Agent
# string is that of a desktop Chrome browser on Linux.
# NOTE(review): presumably needed because the site rejects the default
# python-requests user agent - confirm.
HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    }

In [4]:
def already_saved(club_id):
    '''
    Checks whether a JSON file for this specific club already
    exists in ../output/clubs. Returns True or False.

    Only files whose whole name is "<digits>.json" are considered;
    any other .json file in the folder is ignored instead of
    crashing the check.

    Params:

    club_id -> the unique club id at Transfermarkt; compared as a
               string, so an integer id matches as well
    '''

    saved_ids = set()

    for path in glob.glob("../output/clubs/*.json"):
        # Anchor on the file name (after the last path separator) so
        # digits elsewhere in the path can never be taken for an id.
        match = re.search(r"(?:^|[\\/])(\d+)\.json$", path)
        if match:
            saved_ids.add(match.group(1))

    return str(club_id) in saved_ids

In [5]:
def fetch_clubs():
    '''
    Collects every club mentioned in the player output files and
    returns one row per unique club id.

    Reads ../output/players/club-info-*.json (keys club_url,
    club_name, club_id) and ../output/players/transfer-info-*.json
    (keys left*/joined*), in that order, so on duplicate ids the
    entry from the club files wins.

    Returns a pandas DataFrame with at least the columns club_url,
    club_name and club_id. When no input file yields any club, an
    empty DataFrame with those three columns is returned instead of
    failing on the missing 'club_id' column.
    '''

    club_files = glob.glob("../output/players/club-info-*.json")
    transfer_files = glob.glob("../output/players/transfer-info-*.json")

    records = []

    # First the clubs in the club files
    for file in club_files:

        with open(file, "r") as f:
            data = json.load(f)

        for datum in data:
            records.append({
                'club_url': datum['club_url'],
                'club_name': datum['club_name'],
                'club_id': datum['club_id'],
            })

    # Then the clubs in the transfer files
    for file in transfer_files:

        with open(file, "r") as f:
            data = json.load(f)

        for datum in data:

            # First the left club
            records.append({
                'club_name': datum['left'],
                'club_url': datum['left_url'],
                'club_id': datum['left_club_id'],
            })

            # Then the joined club. Only these records carry
            # 'original_file'; it is kept so the returned columns
            # stay the same as before.
            records.append({
                'club_name': datum['joined'],
                'club_url': datum['joined_url'],
                'club_id': datum['joined_club_id'],
                'original_file': file
            })

    if not records:
        # A DataFrame built from an empty list has no columns, so
        # drop_duplicates(subset='club_id') would raise KeyError.
        return pd.DataFrame(columns=['club_url', 'club_name', 'club_id'])

    clubs = pd.DataFrame(records)

    return clubs.drop_duplicates(subset='club_id').reset_index(drop=True)

In [6]:
def save_json(data, outpath):
    '''
    Serializes `data` as JSON into the file at `outpath`.

    The file is opened in 'w+' mode, so it is created when missing
    and any previous content is replaced.
    '''
    with open(outpath, mode='w+') as handle:
        json.dump(obj=data, fp=handle)

In [7]:
def get_club_info(club_url):
    '''
    Returns the country of a club, read from the "keywords" meta
    tag of its Transfermarkt page.

    The tag content is expected to be "<club name>,<country>", or
    "<club name>,Korea,South"-like for the one country whose name
    itself contains a comma ("Korea, South").

    Params:

    club_url -> part of the url for the transfermakt club page
                (appended to https://transfermarkt.com)

    Returns the country as a string.

    Raises ValueError when the page has no usable keywords meta tag
    or its content does not have the expected number of
    comma-separated parts.
    '''

    url = f"https://transfermarkt.com{club_url}"
    print(url)

    # Exceptions with comma in name. Their country is hard-coded, so
    # they are answered before any request is made: the downloaded
    # page would not be used anyway.
    if club_url in ['/towol-middle-school-07-2011-2016-/transfers/verein/88377/saison_id/2003',
                    '/chungju-commercial-high-school-2013-2017-/transfers/verein/80477/saison_id/2017']:
        country = 'Korea, South'
        return country

    # Without a timeout requests waits indefinitely on a stalled
    # connection, which would hang the whole scraping loop.
    r = requests.get(url, headers=HEADERS, timeout=30)

    # Short pause between consecutive page requests
    time.sleep(.5)

    # Name the parser explicitly so the result does not depend on
    # which optional parsers happen to be installed.
    soup = BeautifulSoup(r.text, "html.parser")

    keywords = soup.find("meta", {"name": "keywords"})

    if keywords is None or not keywords.get('content'):
        raise ValueError(
            f"No keywords meta tag found at {url} (HTTP status {r.status_code})"
        )

    content = keywords['content']
    parts = content.split(',')

    if "Korea, South" in content:
        # The country name contributes one extra comma
        if len(parts) != 3:
            raise ValueError(f"Unexpected keywords format at {url}: {content!r}")
        club_name, country_a, country_b = parts
        country = f"{country_a.strip()}, {country_b.strip()}"

    else:
        if len(parts) != 2:
            raise ValueError(f"Unexpected keywords format at {url}: {content!r}")
        club_name, country = parts

    print(f"{club_name}, {country}")

    return country
   

In [8]:
def main():
    '''
    Walks over every club returned by fetch_clubs() and writes one
    JSON file per club to ../output/clubs/<club_id>.json.

    Clubs for which already_saved() is true are skipped. A handful
    of special club names are not scraped: they are stored with a
    fixed id and name and '-' as url and country. Every other club
    gets its country from get_club_info().
    '''

    # Special names -> (club_id, club_name) written to the file.
    # 'Ban' and 'Disqualification' share a single entry.
    special_entries = {
        'Unknown': ('75', 'Unknown'),
        'Without Club': ('515', 'Without Club'),
        'Career break': ('2113', 'Career break'),
        'Retired': ('123', 'Retired'),
        'Own Youth': ('12604', 'Own Youth'),
        'Ban': ('2077', 'Disqualification'),
        'Disqualification': ('2077', 'Disqualification'),
    }

    clubs_to_scrape = fetch_clubs()
    n_clubs = clubs_to_scrape.shape[0]

    for _, club in tqdm(clubs_to_scrape.iterrows(), total=n_clubs):

        club_url, club_id, club_name = club.club_url, club.club_id, club.club_name

        if already_saved(club_id):
            print('Already saved')
            print()
            continue

        if club_name in special_entries:
            fixed_id, fixed_name = special_entries[club_name]
            data = {
                'club_url': '-',
                'club_id': fixed_id,
                'club_name': fixed_name,
                'country': '-',
            }

        else:
            data = {
                'club_url': club_url,
                'club_id': club_id,
                'club_name': club_name,
                'country': get_club_info(club_url),
            }

        # The file name always uses the id from the row, also for
        # the special entries above.
        with open(f"../output/clubs/{club_id}.json", "w+") as out_file:
            json.dump(data, out_file)

        print()

In [9]:
# Run the scrape when this cell is executed (or the file is run as a
# script); does nothing when the code is imported as a module.
if __name__ == "__main__":
    main()

  0%|          | 0/2169 [00:00<?, ?it/s]

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already saved

Already sa