In [27]:
import pandas as pd
from difflib import get_close_matches

In [28]:
# Lift pandas' display limits so printed frames show every row and every column.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

# Intermediate Cleaning Step
Even after cleaning the transfermarkt player names for special characters in Tableau, the left join operation (where the left df is the fbref dataset) still leaves 169 mismatched names in the fbref data.

Upon close inspection in Tableau we realized that many of these mismatches arise because a player's name is often reported in a shorter form in one data set than in the other, or with a middle name in one and without it in the other.

Due to the diversity of discrepancies in names, the best solution is to perform fuzzy matching. Tableau is reportedly poor at this task and thus we decided to perform this intermediate cleaning step in python before returning to Tableau to perform the final join between the fbref and transfermarkt data sets.

We use fuzzy matching to find and replace each fbref player name with its transfermarkt equivalent where possible. Afterwards we import the new fbref table, with the more harmonized player names, back into Tableau to perform the final merge.

### Loading Transfermarkt data cleaned in Tableau + fbref data cleaned in pycharm

In [160]:
# Transfermarkt market-value table; the first CSV column is used as the index.
transfermarkt_df = pd.read_csv(
    'transfermarkt_player_marketvalue_cleaned.csv',
    index_col=0,
)

In [161]:
# fbref player statistics table; the first CSV column is used as the index.
fbref_df = pd.read_csv(
    'fbref_player_statistics.csv',
    index_col=0,
)

### Using fuzzy matching to replace player names in fbref data with transfermarkt variation if available

In [162]:
# Fuzzy-match every fbref player name against the transfermarkt player names.
#
# Outputs (all module-level, consumed by later cells):
#   harmonized_names - one entry per fbref row, in row order: the best transfermarkt
#                      match if one was found, otherwise the unchanged fbref name.
#   matched_pairs    - fbref name -> matched transfermarkt name, for quality control.
#   no_match         - fbref names (one entry per row) for which nothing reached the
#                      cutoff; useful when experimenting with the cutoff parameter.
harmonized_names = []
matched_pairs = dict()
no_match = []

# Materialise the candidate names once instead of re-iterating the Series on every call.
transfermarkt_names = transfermarkt_df['Player'].tolist()

# fbref name -> best match (or None). Avoids recomputing the match when the same
# name occurs on more than one fbref row.
best_match_by_name = {}

for fbref_name in fbref_df['Player']:
    if fbref_name not in best_match_by_name:
        # n=1 keeps only the single best candidate; candidates scoring below the
        # similarity cutoff are discarded, leaving an empty list.
        candidates = get_close_matches(word=fbref_name, possibilities=transfermarkt_names, n=1, cutoff=.6)
        best_match_by_name[fbref_name] = candidates[0] if candidates else None

    best_match = best_match_by_name[fbref_name]
    if best_match is not None:
        matched_pairs[fbref_name] = best_match
        harmonized_names.append(best_match)
    else:
        no_match.append(fbref_name)
        harmonized_names.append(fbref_name)

In [163]:
# fbref_df.sort_values('Player')

In [164]:
# Quality control of the fuzzy matches: print every pair whose fbref name differs from
# the transfermarkt name it was matched to, sorted case-insensitively by fbref name.
# The printout is inspected manually to spot wrong matches, which are then collected in
# a list used as a filter so that only the remaining matches are applied to the fbref names.

sortedDict = dict(sorted(matched_pairs.items(), key=lambda pair: pair[0].lower()))
inexact_pairs = [
    (fbref_name, transfermarkt_name)
    for fbref_name, transfermarkt_name in sortedDict.items()
    if fbref_name != transfermarkt_name
]
for fbref_name, transfermarkt_name in inexact_pairs:
    print('{}:{}'.format(fbref_name, transfermarkt_name))
print("# of inexact matches: " + str(len(inexact_pairs)))


Abdul Majeed Waris:Majeed Waris
Adri Embarba:Adrian Embarba
Adrian Guerrero:Adria Guerrero
Alejandro Baena:Alessandro Berardi
Alejandro Blesa:Alessandro Berardi
Alejandro Pozo Pozo:Alejandro Pozo
Amad Traore:Adama Traore
Anderson Lima:Jefferson Lerma
Andre Frank Zambo Anguissa:Andre Zambo Anguissa
Angelo da Costa Junior:Angelo da Costa
Antonio Cortes:Antonio Puertas
Arthur Melo:Arthur
Baba Rahman:Benito Raman
Bryan:Bryan Gil
Cauly Oliveira Souza:Paulo Oliveira
Charalambos Lykogiannis:Charalampos Lykogiannis
Churripi:Chumi
Cristian Rivera:Christian Rivera
Dalbert Henrique:Caio Henrique
Dani Carvajal:Daniel Carvajal
Daniel Martin:Dani Martin
Daniel Parejo:Dani Parejo
Danilo Barbosa:Mariano Barbosa
Danilo Larangeira:Danilo Cataldi
Didier Ibrahim Ndong:Didier Ndong
Dimitris Siovas:Dimitrios Siovas
Eddy Salcedo:Eddie Salcedo
Edmilson Indjai:Edimilson Fernandes
Efthimis Koulouris:Efthymios Koulouris
Elie Youan:Thody Elie Youan
Elif Elmas:Eljif Elmas
Emerson Palmieri:Emerson
Emi Buendia:Emili

In [165]:
# fbref player names whose fuzzy match to a transfermarkt name was judged erroneous
# during manual inspection of the inexact matches printed above. Names listed here
# keep their original fbref spelling instead of being replaced.

wrong_matches = ['Alejandro Baena','Alejandro Blesa','Anderson Lima','Antonio Cortes','Baba Rahman','Cauly Oliveira Souza',
                 'Churripi','Dalbert Henrique','Danilo Larangeira','Edmilson Indjai', 'Espeto','Ezequiel Avila',
                 'Felipe dal Belo','Fernando Marcal','Fernando Nino', 'Fernando','Gabriel Dos Santos','Gleison Bremer',
                 'Jonny Castro','Jose Holebas','Kike','Lee Kangin','Louis Beyer','Mathias Jorgensen','Moanes Dabour',
                 'Nico Ribaudo','Obite NDicka','Opoku Ampomah','Raphael Dias Belloli','Raul','Thiago Alcantara',
                 'Thomas Doyle','Victor Perea','Vitorino Hilton','Yoel']

In [166]:
# replacing fbrefs old Player names column with the newly matched names
# fbref_df['Player'].replace(list(fbref_df['Player']), harmonized_names, inplace=True)

In [174]:
# Replace each fbref player name with its fuzzy-matched transfermarkt name, except for
# names manually flagged in wrong_matches, which keep their original fbref spelling.
#
# harmonized_names is aligned row-by-row with fbref_df, so the new column is built
# positionally from a snapshot of the current names and assigned in one step. Assigning
# through a `Player == name` mask inside the loop would also rewrite rows that an earlier
# iteration had already renamed to that value, letting replacements cascade.
if len(harmonized_names) != len(fbref_df):
    raise ValueError(
        'harmonized_names has {} entries but fbref_df has {} rows; '
        're-run the fuzzy-matching cell first'.format(len(harmonized_names), len(fbref_df))
    )

excluded_names = set(wrong_matches)
fbref_df['Player'] = [
    original_name if original_name in excluded_names else harmonized_name
    for original_name, harmonized_name in zip(fbref_df['Player'].tolist(), harmonized_names)
]

# fbref_df['Player_fuzzy'] = fbref_df['Player'].copy()
# j=-1
# for i in fbref_df['Player_fuzzy']:
#      j+=1
#      if i not in wrong_matches:
#          fbref_df['Player_fuzzy'][j]= harmonized_names[j]

In [177]:
fbref_df.sort_values('Player')

Unnamed: 0,Player,Nation,Age,Squad,Comp,Rk,Min,Gls,Ast,xG,npxG,xA,GCA,Def_GCA,Drib_GCA,Fld_GCA,OG_GCA,PassDead,PassLive_GCA,Sh_SCA,SCA,Def_SCA,Drib_SCA,Fld_SCA,PassDead_SCA,PassLive_SCA,Sh_SCA.1,Blocks,Press,Succ_Press,Tkl,TklW,Int,Tkl+Int,Clr,Att_Pass,Cmp_Pass,TotPrgDist_Pass,#Prog_Pass,TotPrgDist_Carried,TotDist_Carried,Oppon_Drib,Att_Drib,Recep,Att_Recep,Live_Touches,played_two_leagues,Player_fuzzy
501,Aaron Connolly,IRL,19,Brighton,eng Premier League,547,1258,3,1,3.2,3.2,0.3,5.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,25.0,3.0,3.0,9.0,0.0,7.0,3.0,8.0,234.0,69.0,12.0,8,5,17,1.0,163.0,126.0,242.0,10.0,947.0,1713.0,6.0,16.0,235.0,535.0,334.0,False,Aaron Connolly
538,Aaron Cresswell,ENG,29,West Ham,eng Premier League,583,2727,3,0,1.3,1.3,1.8,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,46.0,0.0,2.0,4.0,6.0,33.0,1.0,64.0,355.0,118.0,48.0,29,15,63,92.0,1681.0,1241.0,10702.0,123.0,2310.0,4672.0,19.0,38.0,1047.0,1139.0,1581.0,False,Aaron Cresswell
720,Aaron Escandell,ESP,23,Granada,es La Liga,785,270,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0,0,0,0.0,128.0,77.0,2005.0,0.0,285.0,425.0,0.0,0.0,45.0,45.0,95.0,False,Aaron Escandell
1313,Aaron Lennon,ENG,32,Burnley,eng Premier League,1401,496,0,0,0.0,0.0,0.1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7.0,0.0,0.0,1.0,0.0,6.0,0.0,8.0,143.0,27.0,15.0,9,8,23,6.0,131.0,92.0,347.0,8.0,472.0,790.0,6.0,8.0,115.0,156.0,187.0,False,Aaron Lennon
1325,Aaron Leya Iseka,BEL,21,Toulouse,fr Ligue 1,1413,777,2,0,4.9,3.4,0.2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,20.0,1.0,1.0,4.0,0.0,13.0,1.0,6.0,137.0,28.0,5.0,5,2,7,7.0,183.0,142.0,252.0,10.0,605.0,1077.0,9.0,13.0,219.0,399.0,277.0,False,Aaron Leya Iseka
1466,Aaron Martin,ESP,22,Mainz 05,de Bundesliga,1561,1792,0,0,0.2,0.2,3.4,3.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,50.0,0.0,1.0,1.0,20.0,27.0,0.0,38.0,218.0,61.0,21.0,15,14,35,43.0,912.0,662.0,4792.0,54.0,2164.0,3953.0,26.0,40.0,527.0,610.0,819.0,False,Aaron Martin
1596,Aaron Mooy,AUS,28,Brighton,eng Premier League,1696,2085,2,2,2.0,2.0,3.4,7.0,0.0,2.0,0.0,0.0,1.0,4.0,0.0,70.0,0.0,3.0,2.0,12.0,51.0,2.0,48.0,421.0,125.0,43.0,31,18,61,29.0,1075.0,825.0,4142.0,96.0,3154.0,6448.0,39.0,51.0,927.0,1104.0,1279.0,False,Aaron Mooy
1942,Aaron Ramsdale,ENG,21,Bournemouth,eng Premier League,2064,3330,0,1,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,8.0,1.0,0.0,0,0,0,0.0,1124.0,648.0,14848.0,0.0,1776.0,3109.0,0.0,0.0,406.0,410.0,826.0,False,Aaron Ramsdale
1943,Aaron Ramsey,WAL,28,Juventus,it Serie A,2065,1051,3,1,2.7,2.7,1.6,4.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,44.0,0.0,2.0,2.0,0.0,40.0,0.0,21.0,229.0,65.0,28.0,19,9,37,7.0,746.0,649.0,1744.0,35.0,1419.0,2948.0,16.0,23.0,719.0,850.0,887.0,False,Aaron Ramsey
2466,Aaron Wan Bissaka,ENG,21,Manchester Utd,eng Premier League,2643,3070,0,4,0.9,0.9,2.9,6.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,47.0,0.0,5.0,0.0,2.0,40.0,0.0,101.0,611.0,176.0,138.0,65,49,187,108.0,2064.0,1676.0,9484.0,143.0,4973.0,8335.0,60.0,91.0,1456.0,1570.0,2182.0,False,Aaron Wan Bissaka
