In [1]:
'''
Here we will combine and analyze the data
we gathered on the previous scripts
'''

'\nHere we will combine and analyze the data\nwe gathered on the previous scripts\n'

In [2]:
import glob
import json
import numpy as np
import pandas as pd
import re
import seaborn as sns

In [3]:
def read_players():
    '''
    Loads every scraped basic-info file into a single dataframe.

    Reads all files matching ../output/players/basic-info-*.json,
    tags each record with the player id taken from its file name,
    removes the "Happy Birthday" greeting from the date of birth
    and parses that column as a datetime ("%b %d, %Y").

    Returns a dataframe with one row per file, ordered by file path.
    '''
    # Sorted so that the row order does not depend on the order in which
    # the file system happens to list the files
    player_files = sorted(glob.glob("../output/players/basic-info-*.json"))
    players = []

    for path in player_files:

        with open(path, "r") as f:
            data = json.load(f)

        # Raw string: "\d" in a plain string literal is an invalid escape sequence.
        # The pattern is anchored on the file name prefix so that digits that
        # may appear earlier in the path are never mistaken for the id.
        data['player_id'] = re.search(r'basic-info-(\d+)', path).group(1)

        players.append(data)

    players = pd.DataFrame(players)

    # Removes birthday greetings
    players['date_of_birth'] = players.date_of_birth.str.replace("Happy Birthday", "").str.strip()

    players['date_of_birth'] = pd.to_datetime(players.date_of_birth, format="%b %d, %Y")

    return players

In [4]:
def read_clubs():
    '''
    Builds the clubs dataframe out of the scraped club files
    (../output/clubs/*.json), one row per file.
    '''

    def load_club(club_path):
        with open(club_path, "r") as handle:
            club = json.load(handle)

        # Any country value containing 'built-in' is relabelled as South Korea
        if 'built-in' in club['country']:
            club['country'] = 'Korea, South'

        return club

    return pd.DataFrame([load_club(club_path) for club_path in glob.glob("../output/clubs/*.json")])

In [5]:
def read_leagues():
    '''
    Builds the leagues dataframe out of the scraped league files
    (../output/leagues/*.json), one row per file.
    '''
    records = []

    for league_path in glob.glob("../output/leagues/*.json"):
        with open(league_path, "r") as handle:
            records.append(json.load(handle))

    return pd.DataFrame(records)

In [6]:
# One row per scraped basic-info file, with player_id and a parsed date_of_birth
players = read_players()

In [7]:
# One row per scraped club file; make_transfers_df reads this notebook-level dataframe
clubs = read_clubs()

In [8]:
# One row per scraped league file
leagues = read_leagues()

In [9]:
def share_of_matches_local(player_id, leagues):
    '''
    Returns the percentage (0-100, one decimal place) of the player's
    league matches that were played in leagues whose country equals
    the player's citizenship.

    player_id: id used to locate the player's basic-info and
               league-info files under ../output/players/
    leagues:   dataframe with at least the columns league_id,
               country and type

    Only rows whose type is 'league' are counted. Returns NaN when
    the player has no league matches at all.
    '''

    with open(f"../output/players/basic-info-{player_id}.json", "r") as f:
        basic_info = json.load(f)

    with open(f"../output/players/league-info-{player_id}.json", "r") as f:
        league_info = json.load(f)

    citizenship = basic_info['citizenship']

    league_info = pd.DataFrame(league_info)
    league_info = league_info.merge(leagues, on='league_id', suffixes=("", "_y"))

    # Drops only the duplicated columns that the merge suffixed with "_y".
    # A plain substring test would also drop any column that merely
    # contains "_y" somewhere in its name.
    league_info = league_info.drop(columns=[col for col in league_info.columns if col.endswith('_y')])

    # Keep only league matches
    league_info = league_info[league_info['type'] == 'league']

    # Hyphens are replaced by "0" so that a "-" entry counts as zero matches
    matches = league_info['matches'].str.replace("-", "0").astype(int)

    total_matches = matches.sum()
    if total_matches == 0:
        # No league matches: the share is undefined
        return np.nan

    local_matches = matches[league_info['country'] == citizenship].sum()

    # Rounding after scaling avoids results such as 33.300000000000004
    return round(float(local_matches) / float(total_matches) * 100, 1)
    

In [10]:
def load_player_info(player_id):
    '''
    Reads the scraped basic-info and transfer-info JSON files
    of one player and returns them, in that order, as a tuple.
    '''
    basic_path = f"../output/players/basic-info-{player_id}.json"
    transfer_path = f"../output/players/transfer-info-{player_id}.json"

    def read_json(json_path):
        with open(json_path, "r") as handle:
            return json.load(handle)

    return read_json(basic_path), read_json(transfer_path)

In [11]:
def make_transfers_df(transfer_info, clubs_df=None):
    '''
    Creates a dataframe with the player
    transfer information.

    transfer_info: the player's scraped transfer records; must provide
                   the fields season, date, left, left_url, left_club_id,
                   joined, joined_url and joined_club_id
    clubs_df:      optional clubs dataframe (club_id, club_name, club_url,
                   country). When omitted, the notebook-level `clubs`
                   dataframe is used, as before.

    Returns one row per transfer, sorted by date, where `left`/`joined`
    hold the club names and `left_country`/`joined_country` the club
    countries looked up by club id. Dates that do not match
    "%b %d, %Y" become NaT.
    '''
    club_table = clubs if clubs_df is None else clubs_df

    # One row per club id, so that the lookups below can never multiply transfer rows
    club_table = club_table.drop_duplicates(subset='club_id')

    # Formats the transfers data as a nice dataframe
    transfers = pd.DataFrame(transfer_info)

    transfers_left = transfers[['season', 'date', 'left', 'left_url', 'left_club_id']]
    transfers_joined = transfers[['season', 'date', 'joined', 'joined_url', 'joined_club_id']]

    # how='left' keeps exactly one output row per transfer, in the original order.
    # The two halves are glued back together by row position below; with the default
    # inner join, a club id missing from the clubs table would drop a row from only
    # one half and pair the "left" side of one transfer with the "joined" side of another.
    # Clubs that cannot be found now simply get missing name/country values.
    transfers_left = transfers_left.merge(club_table, how='left', left_on='left_club_id', right_on='club_id')
    transfers_joined = transfers_joined.merge(club_table, how='left', left_on='joined_club_id', right_on='club_id')

    # The club name/country from the clubs table replace the scraped names
    transfers_left = transfers_left.drop(columns=['left', 'club_id', 'club_url'])\
                        .rename(columns={"club_name": "left", 'country': 'left_country'})
    transfers_joined = transfers_joined.drop(columns=['joined', 'club_id', 'club_url', 'season', 'date']) \
                        .rename(columns={"club_name": "joined", 'country': 'joined_country'})

    transfers = pd.concat([transfers_left, transfers_joined], axis=1)

    transfers["date"] = pd.to_datetime(transfers.date, format="%b %d, %Y", errors='coerce')

    # Sorts by date so the earliest is at the beginning and resets index
    transfers = transfers.sort_values(by='date').reset_index(drop=True)

    return transfers

In [12]:
def _never_left_datapoint(started_abroad, no_transfers):
    '''
    Builds the result for a player without a home-to-abroad transfer:
    no departure date, no departure age, plus the two flags given.
    '''
    return {
        'first_left': np.nan,
        'age_first_left': np.nan,
        'started_abroad': started_abroad,
        'no_transfers': no_transfers,
    }


def first_left(player_id, player_info, transfers):
    '''
    Returns the details about the player's first move from a club
    of his own country to a club from another country.

    player_id:   player id as a string (only used for a hard-coded edge case)
    player_info: dict with 'citizenship' and 'date_of_birth'; also
                 'current_club_id' when the player has no transfers
    transfers:   dataframe sorted by date with a 0..n-1 index and the columns
                 date, left, left_country and joined_country

    Returns a dict with the keys
        first_left      date of the first home-to-abroad transfer (NaN if none)
        age_first_left  age in years (365.2425-day years) at that date (NaN if none)
        started_abroad  True when the first known club is not from the citizenship country
        no_transfers    True when there is no usable transfer history
    '''

    # Defines player citizenship
    citizenship = player_info['citizenship']

    # Edge case: player with an unknown double transfer in the beginning of the career,
    # but he likely never left Portugal. The result has to be returned right away:
    # without the return, it was silently overwritten by the generic logic below.
    if player_id == '550550':
        return _never_left_datapoint(started_abroad=False, no_transfers=False)

    # Player never had a transfer
    if transfers.shape[0] == 0:

        # Select the player current club
        current_club_id = player_info['current_club_id']
        with open(f"../output/clubs/{current_club_id}.json", "r") as f:
            club_data = json.load(f)

        # NOTE(review): this reads the raw club file, so the 'built-in' -> 'Korea, South'
        # country correction applied by read_clubs does not happen here - confirm whether it should

        # If the club is from the country, he never played abroad; if not, he started abroad
        started_abroad = bool(club_data['country'] != citizenship)
        return _never_left_datapoint(started_abroad=started_abroad, no_transfers=True)

    # If the first row has Unknown or Own Youth, we will remove it
    if transfers.loc[0, 'left'] in ['Unknown', 'Own Youth']:

        # If this is the only transfer, we will not presume anything
        # about the Unknown club – we will assume the player started
        # his career in the first known entry
        if transfers.shape[0] == 1: # one single row

            started_abroad = bool(transfers.loc[0, 'joined_country'] != citizenship)

            # Both outcomes now carry no_transfers=True; the started-abroad outcome
            # used to omit the key altogether, unlike every other result
            return _never_left_datapoint(started_abroad=started_abroad, no_transfers=True)

        # Removes the entry with the unknown origin (drop=True: the old index
        # must not come back as an extra "index" column)
        transfers = transfers.drop([0]).reset_index(drop=True)

    # The player may have started the career abroad. To check for this case, let's make sure that the
    # first country in the transfers is different than citizenship country.
    if transfers.loc[0, 'left_country'] != citizenship:
        # NOTE(review): no_transfers=True is kept from the original code although the player
        # does have transfers at this point - confirm that this is intended
        return _never_left_datapoint(started_abroad=True, no_transfers=True)

    # Player may also have never left
    transfers_to_abroad = transfers[(transfers.left_country==citizenship) & (transfers.joined_country!=citizenship)]
    if transfers_to_abroad.shape[0] == 0:
        return _never_left_datapoint(started_abroad=False, no_transfers=False)

    # If this is not the case, find the first instance where he joined a club from abroad
    first_left_date = transfers_to_abroad.date.min()
    date_of_birth = pd.to_datetime(player_info['date_of_birth'])

    # 365.2425 days is the length numpy gives to a 'Y' unit; dividing by an explicit
    # Timedelta avoids np.timedelta64(1, 'Y'), which recent pandas versions reject
    age_first_left = (first_left_date - date_of_birth) / pd.Timedelta(days=365.2425)

    return {
        'first_left': first_left_date,
        'started_abroad': False,
        'no_transfers': False,
        'age_first_left': age_first_left,
    }

In [13]:
def years_abroad(player_id, player_info, transfers):
    '''
    Returns how many days of his career as an adult (counted from
    his 16th birthday up to the start of the World Cup, 2022-11-20)
    the player spent at home clubs and at clubs abroad.

    player_id:   not used by the calculation
    player_info: dict with 'citizenship' and 'date_of_birth'; also
                 'current_club_id' when the player has no transfers
    transfers:   dataframe sorted by date with the columns date,
                 left_country and joined_country. It is not modified.

    Returns a dict with the keys days_abroad, days_home, currently_abroad,
    total_days and has_played_home_over_16.

    Time at a club is attributed to the club the player left on each
    transfer; the club joined on the last transfer gets the time between
    that transfer and the World Cup. The first stint is measured from the
    16th birthday, so it is negative when the first transfer happened
    before that age. Each total is clamped at zero after summing.
    '''

    # Define player citizenship
    citizenship = player_info['citizenship']

    # Define the timestamp when the player turned 16. DateOffset gives the calendar
    # birthday and avoids np.timedelta64(16, 'Y'), which recent pandas versions reject.
    date_of_birth = pd.to_datetime(player_info['date_of_birth'])
    birthday_16 = date_of_birth + pd.DateOffset(years=16)

    world_cup_start = pd.to_datetime("2022-11-20")

    # Work on a separate, 0..n-1 indexed frame so the caller's dataframe is left untouched
    stints = transfers.reset_index(drop=True)

    if stints.empty:
        # No transfers at all: the whole period belongs to the current club,
        # located the same way first_left does for players without transfers
        current_club_id = player_info['current_club_id']
        with open(f"../output/clubs/{current_club_id}.json", "r") as f:
            current_country = json.load(f)['country']

        countries = [current_country]
        durations = [world_cup_start - birthday_16]

    else:
        # For each transfer, compute the time spent in the club he left
        time_in_club = stints['date'].diff()

        # The first row should show the time between his 16th birthday and the first transfer date
        time_in_club.iloc[0] = stints['date'].iloc[0] - birthday_16

        # Plus one entry for the club he was at when the World Cup started
        current_country = stints['joined_country'].iloc[-1]
        countries = list(stints['left_country']) + [current_country]
        durations = list(time_in_club) + [world_cup_start - stints['date'].iloc[-1]]

    ledger = pd.DataFrame({
        'time_in_club_over_16': pd.to_timedelta(durations),
        'at_home': [country == citizenship for country in countries],
    })

    # Missing durations (unparseable dates) are skipped by sum()
    days_home = ledger.loc[ledger['at_home'], 'time_in_club_over_16'].sum().days
    days_abroad = ledger.loc[~ledger['at_home'], 'time_in_club_over_16'].sum().days

    data = {
        # A negative sum means the time before turning 16 outweighs the rest
        'days_abroad': max(days_abroad, 0),
        'days_home': max(days_home, 0),
        'currently_abroad': bool(current_country != citizenship) # only last club
    }

    # Now get the total career days over 16
    data['total_days'] = data['days_abroad'] + data['days_home']

    data['has_played_home_over_16'] = data['days_home'] > 0

    return data

In [14]:
def get_career_details(player_id, clubs):
    '''
    Computes the career summary of one player.

    player_id: id used to locate the player's scraped files
    clubs:     accepted for interface compatibility; it is not used here
               because make_transfers_df reads the notebook-level `clubs`

    Returns a single dict that combines the output of first_left
    (first move abroad) and years_abroad (days at home and abroad).
    '''

    # The player_id argument is used as given. It used to be overwritten with
    # `row.player_id`, which only worked while a notebook-level loop variable
    # called `row` happened to exist.

    # Loads the information from scraped data
    basic_info, transfer_info = load_player_info(player_id)

    # Cleans the birthday congratulation
    basic_info['date_of_birth'] = basic_info['date_of_birth'].replace("Happy Birthday", "").strip()

    # Formats the transfers data as a nice dataframe
    transfers = make_transfers_df(transfer_info)

    first_left_data = first_left(player_id, basic_info, transfers)
    years_abroad_data = years_abroad(player_id, basic_info, transfers)

    # Concatenate dictionaries
    # https://stackoverflow.com/questions/38987/how-do-i-merge-two-dictionaries-in-a-single-expression
    data = { **first_left_data, **years_abroad_data }

    return data

In [15]:
# When did each player first leave, and at which age?
career_rows = []

# NOTE(review): keep the loop variable named `row` for as long as
# get_career_details may read `row` from the notebook scope
for index, row in players.iterrows():
    career_rows.append(get_career_details(row.player_id, clubs))

new_info = pd.DataFrame(career_rows)

# Column-wise concat: rows are paired by index label, so both frames
# must carry the same index (here the default 0..n-1 on each)
full_player_data = pd.concat([players, new_info], axis=1)

In [16]:
# Sanity check: number of rows per citizenship value
full_player_data.citizenship.value_counts()

Morocco          26
Portugal         26
United States    26
Costa Rica       26
Croatia          26
Cameroon         26
Ghana            26
Belgium          26
Qatar            26
Saudi Arabia     26
Poland           26
Argentina        26
Canada           26
Japan            26
Denmark          26
Brazil           26
Spain            26
England          26
Korea, South     26
Netherlands      26
Wales            26
Uruguay          26
Australia        26
France           26
Germany          26
Mexico           26
Ecuador          26
Senegal          26
Switzerland      26
Tunisia          26
Serbia           26
Iran             25
Name: citizenship, dtype: int64

In [17]:
# Writes the combined player table to CSV, leaving out the dataframe index
full_player_data.to_csv("../output/2022-full-player-data.csv", index=False)