In [63]:
import os
import json
import pandas as pd
from collections import defaultdict

# Bytes-encoded path of the match directory. None of the cells shown here read
# this variable (they rebuild the path as a str) - NOTE(review): confirm it is unused.
directory = os.fsencode('data/wtt_matches')

In [64]:
# Load the tournament table and order it by start time, so the event loop
# further down walks tournaments in that order.
tf = pd.read_csv(
    'data/tournaments_wtt.tsv',
    sep='\t',
    parse_dates=['StartDateTime', 'EndDateTime'],
).sort_values(by='StartDateTime')

In [65]:
MATCH_DIR = 'data/wtt_matches'

# Inclusive id range accepted as an individual player. An id outside this range
# makes the loop below treat the match as a team match (presumably such ids
# denote teams or placeholders - TODO confirm against the data source).
MIN_PLAYER_ID = 90000
MAX_PLAYER_ID = 100000000


def _is_individual(player):
    """Return True when the player's numeric id lies inside the accepted range."""
    return MIN_PLAYER_ID <= int(player['playerId']) <= MAX_PLAYER_ID


def _tag_player(player, competitor, gender):
    """Stamp `gender` on the player dict and backfill a missing org code.

    Mutates and returns `player`. The org code falls back to the competitor's
    'competitiorOrg' value only when the player's own 'playerOrgCode' is falsy.
    """
    player['gender'] = gender
    if not player['playerOrgCode']:
        player['playerOrgCode'] = competitor['competitiorOrg']
    return player


players = []
# Not read by any cell visible in this notebook; kept so the kernel namespace
# stays as before in case something outside this view relies on it.
genders = defaultdict(lambda: None)

for row in tf.itertuples():
    evt = os.path.join(MATCH_DIR, str(row.EventId))
    if not os.path.isdir(evt):
        continue

    # os.listdir returns entries in arbitrary order. Sort so the row order of
    # `players` is reproducible: the per-player cleanup later in the notebook
    # resolves name/org ties by first occurrence.
    for filename in sorted(os.listdir(evt)):
        if not filename.endswith(".json"):
            continue

        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open(os.path.join(evt, filename), encoding='utf-8') as f:
            m = json.load(f)

        # The 4th character of documentCode is used as the gender marker.
        gender = m['documentCode'][3]

        # First pass: top-level competitors. Stop at the first out-of-range id;
        # players appended before that point are kept.
        isTeam = False
        for c in m['competitiors']:
            for p in c['players']:
                if not _is_individual(p):
                    isTeam = True
                    break
                players.append(_tag_player(p, c, gender))
            if isTeam:
                break

        # Second pass (team matches only): collect the players of each
        # sub-match that has a result, skipping out-of-range ids.
        if isTeam and m['teamParentData']:
            for _m in m['teamParentData']['extended_info']['matches']:
                mm = _m.get('match_result')
                if not mm:
                    continue

                for c in mm['competitiors']:
                    for p in c['players']:
                        if _is_individual(p):
                            players.append(_tag_player(p, c, gender))


In [66]:
# One row per collected player dict; the playerPosition column is discarded.
pf = pd.DataFrame(players).drop(columns=['playerPosition'])
pf

Unnamed: 0,playerId,playerName,playerGivenName,playerFamilyName,playerOrgCode,gender
0,112074,FALCK Mattias,,,SWE,M
1,104379,KARLSSON Kristian,,,SWE,M
2,118994,JEON Jihee,Jihee,JEON,KOR,W
3,121706,Mariia TAILAKOVA,Mariia,TAILAKOVA,RUS,W
4,101648,CHEN Chien-An,,,TPE,M
...,...,...,...,...,...,...
64006,137627,LIU Yangzi,,,AUS,W
64007,102841,FREITAS Marcos,,,POR,M
64008,114715,QIU Dang,,,GER,M
64009,112442,PITCHFORD Liam,,,ENG,M


In [67]:
# Spot check a single player: the saved output shows the same playerId carrying
# gender 'X' on some rows and 'M' on others.
pf.loc[pf['playerName'] == 'Lubomir PISTEJ']

Unnamed: 0,playerId,playerName,playerGivenName,playerFamilyName,playerOrgCode,gender
191,107445,Lubomir PISTEJ,Lubomir,PISTEJ,SVK,X
432,107445,Lubomir PISTEJ,Lubomir,PISTEJ,SVK,M
497,107445,Lubomir PISTEJ,Lubomir,PISTEJ,SVK,M
549,107445,Lubomir PISTEJ,Lubomir,PISTEJ,SVK,M
919,107445,Lubomir PISTEJ,Lubomir,PISTEJ,SVK,M
1253,107445,Lubomir PISTEJ,Lubomir,PISTEJ,SVK,M
1606,107445,Lubomir PISTEJ,Lubomir,PISTEJ,SVK,X


In [68]:
# Rows lacking a playerId have to be fixed manually. An earlier run had only 2
# of them; the saved output below lists none.
pf.loc[pf['playerId'].isna()]

Unnamed: 0,playerId,playerName,playerGivenName,playerFamilyName,playerOrgCode,gender


In [69]:
# For every playerId, gather the distinct spellings of playerName and show the
# ids that have more than one.
name_conflicts = pf.groupby('playerId')['playerName'].apply(set)
conf = name_conflicts.to_frame()
conf['cnt'] = name_conflicts.map(len)
conf.loc[conf['cnt'] > 1]


Unnamed: 0_level_0,playerName,cnt
playerId,Unnamed: 1_level_1,Unnamed: 2_level_1
100032,"{Farah ABDELAZIZ, ABDEL-AZIZ Farah}",2
100154,"{Mawussi AGBETOGLO, AGBETOGLO Mawussi}",2
100439,"{SALEH Ahmed, Ahmed SALEH}",2
100486,"{ALTO Gaston, Gaston ALTO}",2
100534,"{Jimoh AMUSA, AMUSA Jimoh}",2
...,...,...
202914,"{YEE Seng, TERAWAUEA Hannah, AGARI Tammi}",3
202924,"{BYE, NUOPULA Gary, KOUTO Selwyn}",3
202925,"{CARLOT Brendan, LULU Ham, SHING Daiki}",3
202989,"{ZHANG Bei Yan, YEE Joshua, WU Vicky, CHAUHAN ...",4


In [70]:
# Reverse direction: names that are attached to more than one playerId.
# Caveat: these may genuinely be different people who share a name.
id_conflicts = pf.groupby('playerName')['playerId'].apply(set)
idconf = id_conflicts.to_frame()
idconf['cnt'] = id_conflicts.map(len)
idconf.loc[idconf['cnt'] > 1]

Unnamed: 0_level_0,playerId,cnt
playerName,Unnamed: 1_level_1,Unnamed: 2_level_1
AGARI Tammi,"{202914, 202502}",2
BELROSE Ocean,"{101040, 202869, 133613}",3
BYE,"{202991, 102213, 202812, 202924, 145172, 11323...",7
CARLOT Brendan,"{202877, 202925}",2
CARNET Bydhir,"{202869, 133613}",2
...,...,...
YEE Joshua,"{135693, 202989}",2
YEE Seng,"{202502, 202914, 202916}",3
Yang WANG,"{112735, 109995}",2
ZHANG Bei Yan,"{133554, 123935, 202989}",3


In [71]:
# Distinct org codes seen per playerId. The saved output includes combined
# codes such as 'SVK/FRA' (presumably doubles pairings - TODO confirm).
org_conflicts = pf.groupby('playerId')['playerOrgCode'].apply(set)
oconf = org_conflicts.to_frame()
oconf['cnt'] = org_conflicts.map(len)
oconf.loc[oconf['cnt'] > 1]


Unnamed: 0_level_0,playerOrgCode,cnt
playerId,Unnamed: 1_level_1,Unnamed: 2_level_1
100868,"{SVK/FRA, LUX/SVK, SVK, POL/SVK, SVK/CZE, SVK/...",7
102380,"{ENG, ENG/GER, GBR}",3
102441,"{CMR/ALG, CMR}",2
103163,"{USA/ESP, USA/SUI, USA}",3
103425,"{CMR/EGY, CMR/LUX, CMR}",3
...,...,...
202520,"{MAR, MAR/RSA}",2
202862,"{PYF, PNG/PYF, PYF/SOL}",3
204538,"{ETH, ETH/ZIM}",2
206094,"{RSA, MRI/RSA}",2


In [72]:
# Uppercase counts are compared only up to this cap, so any two names with at
# least this many capitals tie (and the earlier one is kept).
UPPERCASE_CAP = 4


def _uppercase_score(name):
    """Count the uppercase characters in `name`, saturating at UPPERCASE_CAP."""
    return min(sum(1 for ch in name if ch.isupper()), UPPERCASE_CAP)


def _prefer_name(current, candidate):
    """Pick the display name to keep between `current` and `candidate`.

    Rules, in order:
    - a non-string candidate (e.g. NaN) is ignored;
    - an empty current name, or one containing '^', is always replaced;
    - otherwise the candidate wins only with a strictly higher capped
      uppercase count (presumably to favour spellings with an upper-cased
      family name - TODO confirm).
    Taking the first name when `current` is empty means a player whose names
    contain no uppercase characters no longer ends up with an empty name.
    """
    if not isinstance(candidate, str):
        return current
    if not current or '^' in current:
        return candidate
    if current != candidate and _uppercase_score(current) < _uppercase_score(candidate):
        return candidate
    return current


def _prefer_org(current, candidate):
    """Keep the first org code seen, switching only to a strictly shorter one.

    Shorter codes are preferred so that combined doubles codes are not kept.
    """
    if not current:
        return candidate
    if candidate and len(candidate) < len(current):
        return candidate
    return current


def _clean_player(player_id, appearances):
    """Collapse all rows of one playerId into a single record.

    Gender is the last value seen that is not 'X' ('X' if there is none).
    Returns a dict with keys 'id', 'org', 'name', 'gender'.
    """
    record = {'id': player_id, 'org': None, 'name': '', 'gender': 'X'}
    for row in appearances.itertuples():
        if row.gender != 'X':
            record['gender'] = row.gender
        record['name'] = _prefer_name(record['name'], row.playerName)
        record['org'] = _prefer_org(record['org'], row.playerOrgCode)
    return record


# A comprehension keeps the loop variables out of the kernel namespace; the
# previous loop variable `id` overwrote the builtin of the same name.
cleaned = [_clean_player(player_id, appearances)
           for player_id, appearances in pf.groupby('playerId')]
# Explicit columns so the frame keeps its schema even when `cleaned` is empty.
cf = pd.DataFrame(cleaned, columns=['id', 'org', 'name', 'gender'])

In [73]:
# Show the raw rows of every player whose cleaned record ended up without an org.
orgless_ids = cf.loc[cf['org'].isna(), 'id']
pf.loc[pf['playerId'].isin(orgless_ids)]

Unnamed: 0,playerId,playerName,playerGivenName,playerFamilyName,playerOrgCode,gender


In [74]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing the cleaned player table.
os.makedirs('data/wtt_cleaned', exist_ok=True)
cf.to_csv('data/wtt_cleaned/players.tsv', index=False, sep='\t')