In [1]:
import pandas as pd

In [2]:
# Show every column when a DataFrame is rendered (None removes the column limit)
pd.options.display.max_columns = None

In [3]:
# Main player/season table that receives the Latest_Transfer_Fee column further down
df = pd.read_csv('../data/processed/merged_data_with_team_ratings.csv')
# NOTE(review): bio_stats is loaded but not referenced anywhere in the visible cells — confirm it is still needed
bio_stats = pd.read_csv('../data/tm/engineered/bio-status/tm_player_bio_status_all_1617-2122_latest.csv')
# Transfer history used to derive each player's most recent fee per season
trans_hist = pd.read_csv('../data/tm/engineered/transfer_history/tm_player_transfer_history_latest.csv')

In [4]:
# Grouped position label -> raw position labels that collapse into it
_raw_positions_by_group = {
    'Goalkeeper': ['Goalkeeper'],
    'Centre-Back': ['Centre-Back'],
    'Wing-Back': ['Left-Back', 'Right-Back'],
    'Defensive Midfielder': ['Defensive Midfield'],
    'Central Midfielder': ['Central Midfield'],
    'Attacking Midfielder': ['Attacking Midfield', 'Second Striker'],
    'Winger': ['Left Midfield', 'Right Midfield', 'Left Winger', 'Right Winger'],
    'Centre-Forward': ['Centre-Forward'],
}

# Flatten into the raw label -> grouped label lookup used by Series.map
position_map = {
    raw_label: group_label
    for group_label, raw_labels in _raw_positions_by_group.items()
    for raw_label in raw_labels
}

# Collapse raw position labels into the grouped labels of position_map.
# Labels that are already grouped (e.g. after re-running this cell) are kept
# as they are; a plain .map() would turn every grouped label that is not also
# a key of position_map (such as 'Wing-Back' or 'Winger') into NaN.
# Labels that are neither raw keys nor grouped values still become NaN.
raw_position = trans_hist['position']
already_grouped = raw_position.isin(set(position_map.values()))
trans_hist['position'] = raw_position.map(position_map).where(~already_grouped, raw_position)

In [5]:
# Keep only the specified leagues
leagues_to_keep = ['L1', 'FR1', 'GB1', 'ES1', 'IT1']
# .copy() makes the filtered rows an independent frame, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning
trans_hist = trans_hist[trans_hist['league_code'].isin(leagues_to_keep)].copy()

In [6]:
# Replace NaN values with 0 in fee_cleaned column.
# assign() builds a new frame instead of writing into a filtered one, which
# avoids SettingWithCopyWarning; the column keeps its position.
trans_hist = trans_hist.assign(fee_cleaned=trans_hist['fee_cleaned'].fillna(0))

# Drop entries whose 'fee' text mentions a loan (case-insensitive substring
# match); rows with a missing 'fee' are kept because na=False.
# .copy() keeps later column assignments on trans_hist warning-free.
trans_hist = trans_hist[~trans_hist['fee'].str.contains('Loan|loan', case=False, na=False)].copy()

In [7]:
# Create unique player identifier: "<player_name>_<position with underscores>".
# Positions that did not map to a known group are NaN; they are labelled
# 'Unknown' (the same convention the later cells use) so the identifier is
# never NaN. A NaN identifier would make the following drop_duplicates treat
# different unknown-position players with the same season and fee as one.
trans_hist = trans_hist.assign(
    unique_player=trans_hist['player_name'] + '_' + trans_hist['position'].fillna('Unknown').str.replace(' ', '_')
)

In [8]:
# A transfer can be listed twice with the same fee, once per transfer_movement.
# Sorting on transfer_movement puts 'in' ahead of 'out' within each
# player/season, so keep='first' retains the 'in' row.
trans_hist = (
    trans_hist
    .sort_values(by=['unique_player', 'season', 'transfer_movement'])
    .drop_duplicates(subset=['unique_player', 'season', 'fee_cleaned'], keep='first')
)

In [9]:
# Special cases for Chema and Luis Suárez: keep a single row per
# (unique_player, season) for these players, regardless of fee
players_to_deduplicate = ['Chema', 'Luis Suárez']

# Create a mask for the players we want to deduplicate
mask = trans_hist['player_name'].isin(players_to_deduplicate)

# Look for repeats only among the special-case rows and drop them by index
# label. Assigning the deduplicated frame back through .loc would not remove
# anything: it aligns on the index and leaves the duplicates as all-NaN rows.
special_rows = trans_hist.loc[mask]
repeated_labels = special_rows.index[
    special_rows.duplicated(subset=['unique_player', 'season'], keep='first')
]
trans_hist = trans_hist.drop(index=repeated_labels)

# Additional step for Luis Suárez: remove the third entry (ordered by season)
suarez_entries = trans_hist[trans_hist['player_name'] == 'Luis Suárez'].sort_values('season')
if len(suarez_entries) >= 3:
    third_entry_index = suarez_entries.index[2]
    trans_hist = trans_hist.drop(third_entry_index)

In [10]:
# Identify unique players by player_name and position, handling NaN values
# (assign() returns a new frame, so nothing is written into a filtered slice)
trans_hist = trans_hist.assign(
    unique_player=trans_hist['player_name'] + '_' + trans_hist['position'].fillna('Unknown').str.replace(' ', '_')
)

# Sort the dataframe to ensure consistent results, with the most recent entries first
trans_hist = trans_hist.sort_values(['unique_player', 'year'], ascending=[True, False])

# Create a mask for entries up to and including 2017
mask_2017 = trans_hist['year'] <= 2017

# Keep, per unique player, the most recent entry up to 2017 as one intact row.
# drop_duplicates(keep='first') is used instead of groupby().first() because
# first() picks the first non-null value per column and can therefore stitch
# one "row" together from several different transfers.
trans_hist_2017 = trans_hist[mask_2017].drop_duplicates(subset='unique_player', keep='first')

# Get all entries after 2017
trans_hist_after_2017 = trans_hist[trans_hist['year'] > 2017]

# Combine the filtered 2017 data with all entries after 2017
trans_hist_filtered = pd.concat([trans_hist_2017, trans_hist_after_2017])

# Sort by year to maintain a consistent order, then rebuild a clean 0..n-1 index
trans_hist_filtered = trans_hist_filtered.sort_values(by='year').reset_index(drop=True)

In [14]:
# One row per (unique_player, season): order each player/season group by
# fee_cleaned descending so the retained first row carries the highest fee,
# then rebuild a clean 0..n-1 index.
trans_hist_filtered = (
    trans_hist_filtered
    .sort_values(['unique_player', 'season', 'fee_cleaned'], ascending=[True, True, False])
    .drop_duplicates(subset=['unique_player', 'season'], keep='first')
    .reset_index(drop=True)
)

In [15]:
import unicodedata
import string
import re

# Function to normalize names
def normalize_name(text):
    """Return a lowercase, accent-free, punctuation-free version of a name.

    Steps, in order: replace Ł/ł with L/l, apply NFKD normalization, drop
    combining marks (Unicode category 'Mn'), remove every character that is
    neither a word character nor whitespace, then lowercase and strip the
    ends. Letters that NFKD does not decompose (for example ø or ð) are left
    unchanged.

    Args:
        text: the name as a str.

    Returns:
        The normalized name as a str.
    """
    # Ł/ł are swapped by hand before normalization; extend this table if needed
    stroke_letters = str.maketrans({'Ł': 'L', 'ł': 'l'})

    decomposed = unicodedata.normalize('NFKD', text.translate(stroke_letters))
    without_marks = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Punctuation goes, spaces stay
    letters_and_spaces = re.sub(r'[^\w\s]', '', without_marks)
    return letters_and_spaces.lower().strip()

# Normalize player names in both dataframes
df['normalized_name'] = df['Player'].map(normalize_name)
trans_hist_filtered['normalized_name'] = trans_hist_filtered['player_name'].map(normalize_name)

# Two lookup tables of {key: {season: fee}}:
#   player_transfer_fees       keyed by "<normalized name>_<position>"
#   player_name_transfer_fees  keyed by normalized name only (fallback)
player_transfer_fees = {}
player_name_transfer_fees = {}

transfer_records = zip(
    trans_hist_filtered['normalized_name'],
    trans_hist_filtered['position'],
    trans_hist_filtered['season'],
    trans_hist_filtered['fee_cleaned'],
)
for normalized_name, raw_position, season, fee in transfer_records:
    # Missing positions are filed under 'Unknown'
    position = 'Unknown' if pd.isna(raw_position) else raw_position
    unique_player = normalized_name + '_' + position.replace(' ', '_')

    # Later rows overwrite earlier ones for the same key and season
    player_transfer_fees.setdefault(unique_player, {})[season] = fee
    player_name_transfer_fees.setdefault(normalized_name, {})[season] = fee

# Update the get_transfer_fee function
def get_transfer_fee(normalized_name, position, season):
    """Look up the fee that applies to a player in a given season.

    The name+position table (player_transfer_fees) is consulted first, then
    the name-only table (player_name_transfer_fees). Within a table, an exact
    season match wins; otherwise the fee of the greatest season that is
    <= `season` is used. If a table has no such season, the next table is
    tried.

    Args:
        normalized_name: normalized player name.
        position: grouped position label; None/NaN is treated as 'Unknown'.
        season: season key, compared with the keys stored in the tables.

    Returns:
        The matching fee, or 0 when neither table has an applicable entry.
    """
    position_label = 'Unknown' if pd.isna(position) else position
    unique_player = normalized_name + '_' + position_label.replace(' ', '_')

    # Most specific key first, name-only key as the fallback
    lookups = (
        (player_transfer_fees, unique_player),
        (player_name_transfer_fees, normalized_name),
    )
    for fee_table, key in lookups:
        fees_by_season = fee_table.get(key)
        if fees_by_season is None:
            continue
        if season in fees_by_season:
            return fees_by_season[season]
        earlier_seasons = [s for s in fees_by_season if s <= season]
        if earlier_seasons:
            return fees_by_season[max(earlier_seasons)]

    return 0

# Create the 'Latest_Transfer_Fee' column by looking up every row of df
def _fee_for_row(record):
    """Return get_transfer_fee for one df row (normalized name, position, season)."""
    return get_transfer_fee(record['normalized_name'], record['Position'], record['Season'])

df['Latest_Transfer_Fee'] = df.apply(_fee_for_row, axis='columns')

In [17]:
# Manual aliases for players whose normalized df name does not match on its own.
# Keys are values of df['normalized_name']; values are the normalized spelling
# that is passed to get_transfer_fee instead (presumably the spelling found in
# the transfer history — verify when adding entries).
name_mapping = {
    'ji dongwon': 'dongwon ji',
    'pierre leesmelou': 'pierre lees melou',
    'martin agirregabiria': 'martin aguirregabiria',
    'papakouli diop': 'pape diop',
    'samir santos': 'xabier santos',
    'alfreð finnbogason': 'alfred finnbogason',
    'mickael cuisance': 'michael cuisance',
    'lukas kubler': 'lukas kubler',  # NOTE(review): key and value are identical, so this entry changes nothing
    'johann berg guðmundsson': 'johann berg gudmundsson',
    'pape cheikh diop': 'pape cheikh',
    'kwon changhoon': 'changhoon kwon',
    'son heungmin': 'heungmin son',
    'jose maria gimenez': 'jose gimenez',
    'dani carvajal': 'daniel carvajal',
    'geronimo rulli': 'gero rulli',
    'ohis felix uduokhai': 'felix uduokhai',
    'fabian ruiz pena': 'fabian ruiz',
    'dalbert henrique de souza': 'dalbert',
    'xabier etxeita': 'xabi etxeita',
    'fernando marcal': 'marcal',
    'pierre højbjerg': 'pierreemile hojbjerg',
    'andrefrank zambo anguissa': 'andre zambo anguissa',
    'levin oztunalı': 'levin oztunali',
    'marcelo junior': 'marcelo',
    'thiago alcantara': 'thiago',
    'john brooks': 'john anthony brooks',
    'gylfi sigurðsson': 'gylfi sigurdsson',
    'daniel parejo': 'dani parejo',
    'simon kjær': 'simon kjaer',
    'danilo larangeira': 'danilo',
    'idrissa gana gueye': 'idrissa gueye',
    'tomas pina isla': 'tomas pina',
    'kostas manolas': 'konstantinos manolas',
    'yunus mallı': 'yunus malli',
    'noah joel sarenren bazee': 'noah sarenren bazee',
    'charalambos lykogiannis': 'charalampos lykogiannis',
    'filip đuricic': 'filip djuricic',
    'dimitris siovas': 'dimitrios siovas',
    'jose luis gaya': 'jose gaya',
    'sehrou guirassy': 'serhou guirassy',
    'kepa arrizabalaga': 'kepa' 
}

# Normalized names that have a manual alias in name_mapping
mapped_players = set(name_mapping)

def is_mapped_player(row):
    """Return True when the row's 'normalized_name' has an alias in name_mapping."""
    return row['normalized_name'] in mapped_players

# Alias per row (NaN for players without one). Kept as a real column because
# a later cell drops 'mapped_name' by name.
df['mapped_name'] = df['normalized_name'].map(name_mapping)

# Name actually used for the lookup: the alias when there is one, otherwise
# the normalized name itself.
lookup_name = df['mapped_name'].fillna(df['normalized_name'])

# Single pass over all rows; this replaces the earlier two-step approach
# (None placeholders for aliased players, then a row-by-row df.at fill-in)
# and yields the same fee for every row.
df['Latest_Transfer_Fee'] = [
    get_transfer_fee(name, position, season)
    for name, position, season in zip(lookup_name, df['Position'], df['Season'])
]

In [19]:
# Manual fee overrides: {exact df['Player'] value: {season string: fee}}.
# update_transfer_fee uses these in preference to the looked-up fee whenever
# both the player and the season are present here.
# NOTE(review): fees presumably use the same unit as Latest_Transfer_Fee (looks like millions) — confirm.
manual_transfer_fees = {
    'Ashley Barnes': {'2017/2018': 0.545, '2018/2019': 0.545, '2019/2020': 0.545, '2020/2021': 0.545},
    'James Tarkowski': {'2017/2018': 4.0, '2018/2019': 4.0, '2019/2020': 4.0, '2020/2021': 4.0},
    'Marcel Halstenberg': {'2017/2018': 3.5, '2018/2019': 3.5, '2019/2020': 3.5, '2020/2021': 3.5},
    'Vincent Manceau': {'2017/2018': 0.0, '2018/2019': 0.0, '2019/2020': 0.0, '2020/2021': 0.0},
    'Yussuf Poulsen': {'2017/2018': 1.55, '2018/2019': 1.55, '2019/2020': 1.55, '2020/2021': 1.55},
    'Lukas Kübler': {'2017/2018': 0.0, '2018/2019': 0.0, '2019/2020': 0.0, '2020/2021': 0.0},
    'Isaac Hayden': {'2017/2018': 2.9, '2018/2019': 2.9, '2019/2020': 2.9, '2020/2021': 2.9},
    'Kike': {'2017/2018': 2.0, '2018/2019': 2.0, '2019/2020': 2.0, '2020/2021': 2.0},
    'Solly March': {'2017/2018': 0.0, '2018/2019': 0.0, '2019/2020': 0.0, '2020/2021': 0.0},
    'Gabriel Dos Santos': {'2017/2018': 3.0, '2018/2019': 3.0, '2019/2020': 3.0, '2020/2021': 3.0},
    'Jamie Vardy': {'2017/2018': 1.24, '2018/2019': 1.24, '2019/2020': 1.24, '2020/2021': 1.24},
    'Emerson Palmieri': {'2017/2018': 20.0, '2018/2019': 20.0, '2019/2020': 20.0, '2020/2021': 20.0},
    'Lewis Dunk': {'2017/2018': 0.0, '2018/2019': 0.0, '2019/2020': 0.0, '2020/2021': 0.0},
    'Yassine Bounou': {'2017/2018': 0.36, '2018/2019': 0.36, '2019/2020': 0.36, '2020/2021': 4.0},
    'Hugo Mallo': {'2017/2018': 0.0, '2018/2019': 0.0, '2019/2020': 0.0, '2020/2021': 0.0},
    'Mark Noble': {'2017/2018': 0.0, '2018/2019': 0.0, '2019/2020': 0.0, '2020/2021': 0.0},
    'Jonathan Calleri': {'2017/2018': 11.0, '2018/2019': 11.0, '2019/2020': 11.0, '2020/2021': 11.0},
    'Jordan Marié': {'2017/2018': 0.0, '2018/2019': 0.0, '2019/2020': 0.0, '2020/2021': 0.0},
    'Dimitri Liénard': {'2017/2018': 0.0, '2018/2019': 0.0, '2019/2020': 0.0, '2020/2021': 0.0},
    'Jonny Castro': {'2017/2018': 0.0, '2018/2019': 21.0, '2019/2020': 21.0, '2020/2021': 21.0},
    'Rober': {'2017/2018': 0.0, '2018/2019': 0.0, '2019/2020': 0.0, '2020/2021': 0.0},
    'Jaume Costa': {'2017/2018': 0.0, '2018/2019': 0.0, '2019/2020': 0.0, '2020/2021': 0.0},
    'Wes Morgan': {'2017/2018': 1.13, '2018/2019': 1.13, '2019/2020': 1.13, '2020/2021': 1.13},
    'Lukas Klostermann': {'2017/2018': 1.0, '2018/2019': 1.0, '2019/2020': 1.0, '2020/2021': 1.0},
    'Amir Abrashi': {'2017/2018': 0.15, '2018/2019': 0.15, '2019/2020': 0.15, '2020/2021': 0.15},
    'Marcel Sabitzer': {'2017/2018': 2.0, '2018/2019': 2.0, '2019/2020': 2.0, '2020/2021': 2.0},
    'Emil Forsberg': {'2017/2018': 3.70, '2018/2019': 3.70, '2019/2020': 3.70, '2020/2021': 3.70},
    'Dalbert Henrique': {'2017/2018': 21.0, '2018/2019': 21.0, '2019/2020': 21.0, '2020/2021': 21.0},
    'Vitorino Hilton': {'2017/2018': 5.0, '2018/2019': 5.0, '2019/2020': 5.0, '2020/2021': 5.0},
    'Manu Trigueros': {'2017/2018': 0.1, '2018/2019': 0.1, '2019/2020': 0.1, '2020/2021': 0.1}
}

In [20]:
# Function to update transfer fee
def update_transfer_fee(row):
    """Return the manual fee override for a row, or its current fee.

    Args:
        row: mapping with 'Player', 'Season' and 'Latest_Transfer_Fee'.

    Returns:
        manual_transfer_fees[player][season] when both the player and the
        season are listed there; otherwise the row's 'Latest_Transfer_Fee'.
    """
    player = row['Player']
    season = row['Season']
    current_fee = row['Latest_Transfer_Fee']

    season_overrides = manual_transfer_fees.get(player)
    if season_overrides is None:
        return current_fee
    return season_overrides.get(season, current_fee)

# Overwrite Latest_Transfer_Fee row by row with the manual override where one exists
corrected_fees = df.apply(update_transfer_fee, axis='columns')
df['Latest_Transfer_Fee'] = corrected_fees

In [28]:
# Express MV1 and MV2 in millions, rounded to 3 decimal places.
# Note: running this cell again divides the values by a million a second time.
for value_column in ('MV1', 'MV2'):
    df[value_column] = df[value_column].div(1000000).round(3)

# The fee column is only rounded, not rescaled
df['Latest_Transfer_Fee'] = df['Latest_Transfer_Fee'].round(3)

# Display the updated dataframe
df

Unnamed: 0,Player,Team,Age,Nationality,Season,MV1,MV2,Position,Comp,MP,Starts,Min,90s,Overall_Performance_Index,Team_Rating,Latest_Transfer_Fee
0,Aaron Cresswell,West Ham,27,England,2017/2018,12.0,12.0,Wing-Back,eng Premier League,36,35,3069.0,34.1,-0.180051,-0.500,4.280
1,Aaron Cresswell,West Ham,28,England,2018/2019,10.0,10.0,Wing-Back,eng Premier League,20,18,1589.0,17.7,-0.082492,-0.075,4.280
2,Aaron Cresswell,West Ham,29,England,2019/2020,8.0,6.5,Wing-Back,eng Premier League,31,31,2727.0,30.3,-0.070486,-0.325,4.280
3,Aaron Cresswell,West Ham,30,England,2020/2021,6.5,5.0,Wing-Back,eng Premier League,36,36,3170.0,35.2,-0.124919,0.375,4.280
4,Aaron Wan-Bissaka,Crystal Palace,19,England,2017/2018,1.0,1.0,Wing-Back,eng Premier League,7,7,627.0,7.0,-0.058601,-0.250,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4499,Łukasz Fabiański,West Ham,35,Poland,2020/2021,2.5,1.5,Goalkeeper,eng Premier League,35,35,3150.0,35.0,0.070433,0.375,7.200
4500,Łukasz Skorupski,Roma,26,Poland,2017/2018,7.0,7.0,Goalkeeper,it Serie A,1,1,90.0,1.0,0.340835,0.825,0.801
4501,Łukasz Skorupski,Bologna,27,Poland,2018/2019,7.0,7.0,Goalkeeper,it Serie A,38,38,3420.0,38.0,0.155131,-0.200,8.100
4502,Łukasz Skorupski,Bologna,28,Poland,2019/2020,7.0,5.5,Goalkeeper,it Serie A,37,37,3330.0,37.0,0.056816,-0.325,8.100


In [26]:
# Remove the helper columns that were only needed for the fee lookup
df = df.drop(columns=['mapped_name', 'normalized_name'])

In [31]:
# Write the final table to disk; index=False leaves the DataFrame index out of the file
df.to_csv('../data/processed/final_data.csv', index=False)