In [None]:
# !pip install thefuzz

In [None]:
import pandas as pd
import sys
import numpy as np
import re
from thefuzz import process

In [None]:
# Read the eight per-league CSV files, one DataFrame per league.
(bundesliga, eredivisie, liganos, ligue1,
 premier, premierliga, liga, seriea) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '1-bundesliga.csv',
        'eredivisie.csv',
        'liga-nos.csv',
        'ligue-1.csv',
        'premier-league.csv',
        'premier-liga.csv',
        'primera-division.csv',
        'serie-a.csv',
    )
)

In [None]:
# Stack the per-league frames into one table with a fresh 0..n-1 index.
league_frames = [bundesliga, eredivisie, liganos, ligue1,
                 premier, premierliga, liga, seriea]
df = pd.concat(league_frames, ignore_index=True)
df.head()

In [None]:
# Number of rows in the combined table.
df.shape[0]

In [None]:
# Keep only the first occurrence of every fully identical row.
is_repeat = df.duplicated()
df = df[~is_repeat]
df.shape[0]

In [None]:
# CLEANING

# Remove rows whose counterpart is not a real club:
# 'Career break', 'Unknown', 'Ban', 'Retired' and 'Without Club'
non_clubs = ['Career break', 'Unknown', 'Ban', 'Retired', 'Without Club']
df = df[~df['club_involved_name'].isin(non_clubs)]

# Remove df[df['fee'] == '0'] - Only one observation
df = df[df['fee'] != '0']

# Avoid deleting transfers involving team 'Willem II': the 'II' token in the
# pattern below would drop them, so they are set aside here and appended back
# afterwards. They are captured *after* the filters above so that re-adding
# them cannot bring back a row those filters removed.
willem = df[df['club_involved_name'] == 'Willem II']

# Remove transfers involving second teams/youth sectors
remove = ['II', 'U19', 'U20', ' B', ' C', '-D', 'Youth', 'U21', 'U17', 'U23', 'U18', 
          'Sub-23', ' 2', 'Res.', 'Juve Next Gen', 'Arsenal-2 Tula', 'FShM Torpedo 19', 
          'US Palermo Yout', 'Cremonese Giov.', 'Amkar-Junior', 'Blackburn Acad.', 'Reggina Primaver', 
          'Real Oviedo You', 'Tom-2 Tomsk', 'DYuSSh Spartak', 'CA Osasuna Prom', 'Y19', 
          'Yth.', 'ACR Messina You', 'Akademia KSS', 'Valladolid Prom', 'Southampton Aca', 
          'Siena Junior', 'Leeds Reserves', 'Anzhi-Yunior', 'Barça Atlètic', 'Akademia Ufa']
# Escape every token so characters such as '.' are matched literally, then
# require a word boundary before the token and no word character after it.
remove_escaped = [re.escape(x) for x in remove]
pattern = r'\b(?:' + '|'.join(remove_escaped) + r')(?!\w)'
# na=False: a missing club name counts as "no match", so the mask stays
# boolean and `~` cannot fail on NaN entries; such rows are kept, exactly as
# the `isin` / `!=` filters above keep them.
is_second_team = df['club_involved_name'].str.contains(pattern, regex = True, na = False)
df = df[~is_second_team]

df = pd.concat([df, willem], ignore_index = True)

len(df)

In [None]:
# Give every row a sequential integer identifier starting at 0.
row_count = len(df)
df['id'] = range(row_count)

In [None]:
# Select the columns in a fixed order, with 'id' first.
new_order = [
    'id',
    'club_name',
    'player_name',
    'age',
    'position',
    'club_involved_name',
    'fee',
    'transfer_movement',
    'transfer_period',
    'fee_cleaned',
    'league_name',
    'year',
    'season',
    'country',
]

df = df.loc[:, new_order]

In [None]:
# Renumber the rows 0..n-1, discarding the previous index.
df = df.reset_index(drop=True)
df.head()

In [None]:
# Print pandas' summary of `df` (columns, non-null counts, dtypes).
df.info()

In [None]:
def match_name(name, standard_names):
    """Return thefuzz's best match for `name` among `standard_names`.

    The value is passed through unchanged from `process.extractOne`; callers
    in this notebook read element ``[0]`` of it as the matched name.
    """
    best_match = process.extractOne(name, standard_names)
    return best_match

# Pair each 'in' row with the 'out' row that records the same move; both rows
# of a pair receive the same integer `transfer_id`.
df['transfer_id'] = None

# Explicit copies: both frames are written to inside the loop, and assigning
# into a bare boolean-mask selection of `df` raises SettingWithCopyWarning.
datain = df[df['transfer_movement'] == 'in'].copy()
dataout = df[df['transfer_movement'] == 'out'].copy()

k = 0  # next unused transfer_id

CLEAR_LINE = '\033[K'  # ANSI escape: erase from the cursor to the end of the line

for i, transfer_in in datain.iterrows():
    index_in = transfer_in['id']
    if transfer_in['transfer_id'] is not None:
        continue
    # Still-unpaired 'out' rows that agree with this 'in' row on season,
    # transfer period, player name, age, position and fee.
    search = dataout[dataout['season'] == transfer_in['season']]
    search = search[search['transfer_period'] == transfer_in['transfer_period']]
    search = search[search['player_name'] == transfer_in['player_name']]
    search = search[search['age'] == transfer_in['age']]
    search = search[search['position'] == transfer_in['position']]
    search = search[search['fee'] == transfer_in['fee']]
    search = search[search['transfer_id'].isnull()]
    if len(search) > 1:
        # Several candidates: keep those whose `club_involved_name` is the
        # closest fuzzy match to this row's `club_name`.
        teams = list(search['club_involved_name'])
        target = match_name(transfer_in['club_name'], teams)[0]
        search = search[search['club_involved_name'] == target]
    if len(search) > 0:
        # Pair with exactly one 'out' row (the first remaining candidate).
        # Looping over every candidate would burn one id per candidate, keep
        # only the last id on the 'in' row, and leave the other 'out' rows
        # flagged as paired although no 'in' row carries their id.
        index_out = search.iloc[0]['id']
        datain.loc[datain['id'] == index_in, 'transfer_id'] = k
        dataout.loc[dataout['id'] == index_out, 'transfer_id'] = k
        k += 1

    # Overwrite a single console line with the current progress.
    message = f"Iteration {i}, {np.round(i*100/len(df), 2)}%"
    sys.stdout.write('\r' + CLEAR_LINE + message)
    sys.stdout.flush()

In [None]:
# Keep only the rows that received a transfer_id (i.e. were paired).
in_notnull = datain[datain['transfer_id'].notnull()].copy()
out_notnull = dataout[dataout['transfer_id'].notnull()].copy()

# One 'out' row per transfer_id (the first one), indexed by transfer_id, so
# each 'in' row can look up its partner without rescanning `out_notnull`.
first_out = out_notnull.drop_duplicates(subset='transfer_id').set_index('transfer_id')
club_by_transfer = first_out['club_name'].to_dict()
country_by_transfer = first_out['country'].to_dict()

# Replace the 'in' row's `club_involved_name` with the partner row's
# `club_name`, and record the partner row's `country` as `country2`.
# NOTE: an id with no 'out' partner would yield NaN here; the pairing step is
# expected to assign every id to one 'in' and one 'out' row.
in_notnull['club_involved_name'] = in_notnull['transfer_id'].map(club_by_transfer)
in_notnull['country2'] = in_notnull['transfer_id'].map(country_by_transfer)

in_notnull.head()

In [None]:
# Build the edge list: one row per paired transfer, with source/target naming.
edge_columns = ['club_name', 'country', 'club_involved_name', 'country2',
                'fee', 'fee_cleaned', 'season', 'transfer_period']
edge_labels = {
    'club_name': 'target',
    'country': 'target_country',
    'club_involved_name': 'source',
    'country2': 'source_country',
}
network_data = (
    in_notnull[edge_columns]
    .rename(columns=edge_labels)
    .reset_index(drop=True)
)
network_data.head()

In [None]:
# Number of rows in the edge list.
network_data.shape[0]

In [None]:
# Print pandas' summary of `network_data` (columns, non-null counts, dtypes).
network_data.info()

# Team name cleaning


In [None]:
# Number of distinct club names in the 'source' and 'target' columns.
tuple(len(network_data[side].unique()) for side in ('source', 'target'))

## Serie A

In [None]:
# Distinct club names on rows whose source / target country is 'Italy'.
from_italy = network_data['source_country'] == 'Italy'
to_italy = network_data['target_country'] == 'Italy'
italian_source_teams = network_data.loc[from_italy, 'source'].unique()
italian_target_teams = network_data.loc[to_italy, 'target'].unique()

In [None]:
# List the Italian club names alphabetically, one per line, under a heading.
sorted_italian_source_teams = sorted(italian_source_teams)
print("Italian Source Teams in Alphabetical Order:", *sorted_italian_source_teams, sep="\n")

sorted_italian_target_teams = sorted(italian_target_teams)
print("\nItalian Target Teams in Alphabetical Order:", *sorted_italian_target_teams, sep="\n")

In [None]:
# Map alternative spellings of Italian clubs to one canonical name each.
# The same mapping is applied to both columns so that a club cannot remain
# under two names depending on which side of a transfer it appears on
# (this includes 'SPAL 2013' on the source side).
italian_aliases = {
    'Empoli FC': 'FC Empoli',
    'FC Internazionale': 'Inter Milan',
    'ACF Fiorentina': 'AC Fiorentina',
    'Milan AC': 'AC Milan',
    'Parma Calcio 1913': 'Parma FC',
    'AC Parma': 'Parma FC',
    'AC Venezia 1907': 'Venezia FC',
    'Torino Calcio': 'Torino FC',
    'US Salernitana': 'US Salernitana 1919',
    'Genoa CFC': 'Genoa 1893',
    'SPAL 2013': 'SPAL',
}

# Exact, whole-value replacement; names that are not keys are left untouched.
for side in ('source', 'target'):
    network_data[side] = network_data[side].replace(italian_aliases)

## Premier League

In [None]:
# Distinct club names on rows whose source / target country is 'England'.
from_england = network_data['source_country'] == 'England'
to_england = network_data['target_country'] == 'England'
english_source_teams = network_data.loc[from_england, 'source'].unique()
english_target_teams = network_data.loc[to_england, 'target'].unique()

In [None]:
# List the English club names alphabetically, one per line, under a heading.
sorted_english_source_teams = sorted(english_source_teams)
print("English Source Teams in Alphabetical Order:", *sorted_english_source_teams, sep="\n")

sorted_english_target_teams = sorted(english_target_teams)
print("\nEnglish Target Teams in Alphabetical Order:", *sorted_english_target_teams, sep="\n")

## Ligue 1

In [None]:
# Distinct club names on rows whose source / target country is 'France'.
from_france = network_data['source_country'] == 'France'
to_france = network_data['target_country'] == 'France'
french_source_teams = network_data.loc[from_france, 'source'].unique()
french_target_teams = network_data.loc[to_france, 'target'].unique()

In [None]:
# List the French club names alphabetically, one per line, under a heading.
sorted_french_source_teams = sorted(french_source_teams)
print("French Source Teams in Alphabetical Order:", *sorted_french_source_teams, sep="\n")

sorted_french_target_teams = sorted(french_target_teams)
print("\nFrench Target Teams in Alphabetical Order:", *sorted_french_target_teams, sep="\n")

In [None]:
# Map long-form French club names to the short names used elsewhere,
# first in the 'source' column and then in the 'target' column.
french_aliases = {
    'Association Troyes Aube Champagne': 'ESTAC Troyes',
    'Union Sportive Valenciennes-Anzin Arrondissement': 'Valenciennes FC',
}
for side in ('source', 'target'):
    network_data[side] = network_data[side].replace(french_aliases)

## LaLiga

In [None]:
# Distinct club names on rows whose source / target country is 'Spain'.
from_spain = network_data['source_country'] == 'Spain'
to_spain = network_data['target_country'] == 'Spain'
spanish_source_teams = network_data.loc[from_spain, 'source'].unique()
spanish_target_teams = network_data.loc[to_spain, 'target'].unique()

In [None]:
# List the Spanish club names alphabetically, one per line, under a heading.
sorted_spanish_source_teams = sorted(spanish_source_teams)
print("Spanish Source Teams in Alphabetical Order:", *sorted_spanish_source_teams, sep="\n")

sorted_spanish_target_teams = sorted(spanish_target_teams)
print("\nSpanish Target Teams in Alphabetical Order:", *sorted_spanish_target_teams, sep="\n")

In [None]:
# Strip the "(- 2010)" suffix from CF Extremadura in both columns.
spanish_aliases = {'CF Extremadura (- 2010)': 'CF Extremadura'}
for side in ('source', 'target'):
    network_data[side] = network_data[side].replace(spanish_aliases)

## Bundesliga

In [None]:
# Distinct club names on rows whose source / target country is 'Germany'.
from_germany = network_data['source_country'] == 'Germany'
to_germany = network_data['target_country'] == 'Germany'
german_source_teams = network_data.loc[from_germany, 'source'].unique()
german_target_teams = network_data.loc[to_germany, 'target'].unique()

In [None]:
# List the German club names alphabetically, one per line, under a heading.
sorted_german_source_teams = sorted(german_source_teams)
print("German Source Teams in Alphabetical Order:", *sorted_german_source_teams, sep="\n")

sorted_german_target_teams = sorted(german_target_teams)
print("\nGerman Target Teams in Alphabetical Order:", *sorted_german_target_teams, sep="\n")

In [None]:
# Normalise two German club names, first in 'source' and then in 'target'.
german_aliases = {
    'VfB Leipzig (- 2004)': 'VfB Leipzig',
    'Bayer 05 Uerdingen': 'KFC Uerdingen 05',
}
for side in ('source', 'target'):
    network_data[side] = network_data[side].replace(german_aliases)

## Liga Portugal 1

In [None]:
# Distinct club names on rows whose source / target country is 'Portugal'.
from_portugal = network_data['source_country'] == 'Portugal'
to_portugal = network_data['target_country'] == 'Portugal'
protuguese_source_teams = network_data.loc[from_portugal, 'source'].unique()
protuguese_target_teams = network_data.loc[to_portugal, 'target'].unique()

In [None]:
# List the Portuguese club names alphabetically, one per line, under a heading.
sorted_protuguese_source_teams = sorted(protuguese_source_teams)
print("Portuguese Source Teams in Alphabetical Order:", *sorted_protuguese_source_teams, sep="\n")

sorted_protuguese_target_teams = sorted(protuguese_target_teams)
print("\nPortuguese Target Teams in Alphabetical Order:", *sorted_protuguese_target_teams, sep="\n")

In [None]:
# Strip the "(- YYYY)" suffixes from two Portuguese club names in both columns.
portuguese_aliases = {
    'Desportivo Aves (- 2020)': 'Desportivo Aves',
    'SC Campomaiorense (- 2001)': 'SC Campomaiorense',
}
for side in ('source', 'target'):
    network_data[side] = network_data[side].replace(portuguese_aliases)

## Prem'er-Liga

In [None]:
# Distinct club names on rows whose source / target country is 'Russia'.
from_russia = network_data['source_country'] == 'Russia'
to_russia = network_data['target_country'] == 'Russia'
russian_source_teams = network_data.loc[from_russia, 'source'].unique()
russian_target_teams = network_data.loc[to_russia, 'target'].unique()

In [None]:
# List the Russian club names alphabetically, one per line, under a heading.
sorted_russian_source_teams = sorted(russian_source_teams)
print("Russian Source Teams in Alphabetical Order:", *sorted_russian_source_teams, sep="\n")

sorted_russian_target_teams = sorted(russian_target_teams)
print("\nRussian Target Teams in Alphabetical Order:", *sorted_russian_target_teams, sep="\n")

In [None]:
# Map alternative spellings of Russian clubs (including two Cyrillic entries)
# to one name each, first in the 'source' column and then in 'target'.
# No replacement value is itself a key, so applying the mapping in one pass
# gives the same result as applying the entries one after another.
russian_aliases = {
    'Dynamo Moscow': 'Dinamo Moscow',
    'Anzhi Makhachkala ( -2022)': 'Anzhi Makhachkala',
    'FC Nizhniy Novgorod': 'FC Pari Nizhniy Novgorod',
    'FC Tosno (-2018)': 'FC Tosno',
    'Kuban Krasnodar (-2018)': 'Kuban Krasnodar',
    'Mordovia Saransk (-2020)': 'Mordovia Saransk',
    'Saturn REN-TV Ramenskoe': 'Saturn Ramenskoe',
    'Sibir Novosibirsk (- 2019)': 'Sibir Novosibirsk',
    'Volga Nizhniy Novgorod (- 2016)': 'Volga Nizhniy Novgorod',
    'Spartak Vladikavkaz': 'Alania Vladikavkaz',
    'Spartak-Alania Vladikavkaz': 'Alania Vladikavkaz',
    'Torpedo-Metallurg Moscow': 'FC Moscow',
    'Torpedo-ZiL Moscow': 'FC Moscow',
    'Факел-Воронеж Воронеж': 'Fakel Voronezh',
    'PFK Tambov': 'FK Tambov',
    'Dinamo-Gazovik Tyumen': 'FK Tyumen',
    'Торпедо-Лужники Москва': 'Torpedo Moscow',
}
for side in ('source', 'target'):
    network_data[side] = network_data[side].replace(russian_aliases)

## Eredivisie

In [None]:
# Distinct club names on rows whose source / target country is 'Netherlands'.
from_netherlands = network_data['source_country'] == 'Netherlands'
to_netherlands = network_data['target_country'] == 'Netherlands'
dutch_source_teams = network_data.loc[from_netherlands, 'source'].unique()
dutch_target_teams = network_data.loc[to_netherlands, 'target'].unique()

In [None]:
# List the Dutch club names alphabetically, one per line, under a heading.
sorted_dutch_source_teams = sorted(dutch_source_teams)
print("Dutch Source Teams in Alphabetical Order:")
for team in sorted_dutch_source_teams:
    print(team)

# The target listing must be built from `dutch_target_teams`; sorting and
# printing the source names again would hide target-only spellings.
sorted_dutch_target_teams = sorted(dutch_target_teams)
print("\nDutch Target Teams in Alphabetical Order:")
for team in sorted_dutch_target_teams:
    print(team)

In [None]:
# Map alternative spellings of Dutch clubs to one name each,
# first in the 'source' column and then in the 'target' column.
dutch_aliases = {
    "Dordrecht'90": 'FC Dordrecht',
    'FC Zwolle': 'PEC Zwolle',
    'Cambuur-Leeuwarden bvo': 'SC Cambuur-Leeuwarden',
    'SC Cambuur Leeuwarden': 'SC Cambuur-Leeuwarden',
}
for side in ('source', 'target'):
    network_data[side] = network_data[side].replace(dutch_aliases)

In [None]:
# Number of distinct club names in the 'source' and 'target' columns.
tuple(len(network_data[side].unique()) for side in ('source', 'target'))

In [None]:
# Write the edge list to disk (the row index is written as the first column).
network_data.to_csv(path_or_buf='network_data.csv')