# Data cleaning

In [1]:
import glob
import re
import json

# Make sure we have all the races

### results downloaded

In [2]:
# Every downloaded results file, plus the bare file name of each one.
files = glob.glob('./../data/results/*')
filenames = [path.split('/')[-1] for path in files]

In [3]:
# Index the downloaded result files by race: for each lower-cased race id keep
# the list of years and the list of full dates (YYYYMMDD) found in file names.
results_downloaded = {}

# File names look like "<race>_<YYYYMMDD>.jl". A raw string keeps "\d" a regex
# escape instead of an invalid string escape, and "\." only accepts a literal
# dot in front of the extension.
filename_pattern = re.compile(r"(.+)_(\d{4})(\d{2})(\d{2})\.jl")

for name in filenames:
    parsed = filename_pattern.match(name)
    if parsed is None:
        # A stray file in the results directory must not abort the whole check.
        print(f'Skipping unexpected file name {name}')
        continue
    race, year, month, day = parsed.groups()
    race = race.lower()
    entry = results_downloaded.setdefault(race, {'years': [], 'dates': []})
    if year in entry['years']:
        print(f'Year {year} already present for race {race}')
    # The year is appended even when it is a duplicate, so 'years' and 'dates'
    # stay index-aligned.
    entry['years'].append(year)
    entry['dates'].append(f'{year}{month}{day}')

print(f'Data for {len(results_downloaded)} races have been downloaded.')

Year 2011 already present for race portmacquarie70.3
Year 2008 already present for race arizona
Data for 199 races have been downloaded.


### from races listing

In [4]:
# Load the race listing (one JSON record per line) and index it by the
# lower-cased race id, keeping only the id, name and website of each race.
all_races = {}
with open('./../data/races/races.jl', 'r') as f:
    data = [json.loads(line.strip()) for line in f]
for race in data:
    if not race:
        # Falsy records (e.g. null) carry no race information.
        continue
    all_races[race['id'].lower()] = {key: race[key] for key in ('id', 'name', 'website')}

print(f'A listing of {len(all_races)} races had been inputed')

A listing of 211 races had been inputed


In [5]:
# Report race ids that occur more than once in the listing (case-insensitive).
# The check runs on the raw `data` records: `all_races` is already keyed by the
# lower-cased id, so duplicates have been collapsed there and could never be
# detected by iterating over it.
ids = {}
for race in data:
    if not race:
        continue
    race_key = race['id'].lower()
    if race_key in ids:
        print(f'race id {race["id"]} already present')
    else:
        ids[race_key] = True

### Get info on races with missing results

In [6]:
# Collect (and list) every race from the listing that has no downloaded results.
missing_results = []
for race_key, race_info in all_races.items():
    if results_downloaded.get(race_key):
        continue
    missing_results.append(race_info)
    print(f"{race_info['id']} => {race_info['website']}")
print(f'{len(missing_results)} race results are missing')

buffalosprings70.3 => http://www.ironman.com/triathlon/events/americas/ironman-70.3/inactive/buffalo-springs-lake.aspx
traversecity70.3 => http://www.ironman.com/triathlon/events/americas/ironman-70.3/traverse-city.aspx
jeju70.3 => http://www.ironman.com/triathlon/events/asiapac/ironman-70.3/goseong-korea.aspx
stcroix70.3 => http://www.ironman.com/triathlon/events/americas/ironman-70.3/inactive/st.-croix.aspx
hefei70.3 => http://www.ironman.com/triathlon/events/asiapac/ironman-70.3/hefei.aspx
korea70.3 => http://www.ironman.com/triathlon/events/asiapac/ironman-70.3/gurye-korea.aspx
chungju70.3 => http://www.ironman.com/triathlon/events/asiapac/ironman-70.3/inactive/chungju-korea.aspx
Vineman => http://www.ironman.com/triathlon/events/americas/ironman/vineman.aspx
busan70.3 => http://www.ironman.com/triathlon/events/asiapac/ironman-70.3/inactive/busan-korea.aspx
putrajaya70.3 => http://www.ironman.com/triathlon/events/asiapac/ironman-70.3/inactive/putrajaya.aspx
ireland70.3 => http://ww

In [7]:
# Write the missing races out as JSON lines, one race per line, for download.
with open('missing_races.jl', 'w') as f:
    f.writelines(json.dumps(race) + '\n' for race in missing_results)

# Check if correct count has been downloaded

In [8]:
# Count the lines of every downloaded results file, keyed by
# "<race>_<YYYYMMDD>" with the race id lower-cased. These counts are later
# compared with the scraped athlete counts (presumably one result per line).
files = glob.glob('./../data/results/*')
filenames = list(map(lambda x: x.split('/')[-1], files))

counts_downloaded = {}

# Raw string so "\d" is a regex escape rather than an invalid string escape;
# "\." only accepts a literal dot in front of the extension.
result_file_pattern = re.compile(r"(.+)_(\d{4})(\d{2})(\d{2})\.jl")

for path, name in zip(files, filenames):
    parsed = result_file_pattern.match(name)
    if parsed is None:
        # A stray file in the results directory must not abort the whole check.
        print(f"skipping unexpected file name {name}")
        continue
    race, year, month, day = parsed.groups()
    race = race.lower()
    race_id = f"{race}_{year}{month}{day}"
    if race_id in counts_downloaded:
        print(f"race {race} ({year}{month}{day}) is already present")
        continue
    with open(path, 'r') as f:
        # Stream the file instead of materialising every line just to count them.
        counts_downloaded[race_id] = sum(1 for _ in f)

print(f'Data for {len(counts_downloaded)} races have been checked')

Data for 1071 races have been checked


### compare with scraped counts

In [9]:
# Load the scraped athlete counts (one JSON record per line).
with open('./../data/races/races-athletes-count.jl', 'r') as f:
    data_count = [json.loads(line.strip()) for line in f]

# Key each count by "<race id lower-cased>_<date>".
counts_scraped = {}
for race in data_count:
    race_id = f"{race['id'].lower()}_{race['date']}"
    if not race['count'] > 10:
        # Races with 10 or fewer athletes are treated as having no real results.
        continue
    counts_scraped[race_id] = race['count']

In [10]:
# Compare the scraped athlete counts with the number of lines downloaded.
n = 0              # number of races whose two counts differ
not_none = set()   # race ids that have a downloaded file with a differing count
incomplete = {}    # race id -> dates of those differing files
for race_key, scraped in counts_scraped.items():
    downloaded = counts_downloaded.get(race_key)
    if scraped == downloaded:
        continue
    n += 1
    print(race_key, downloaded, f'({scraped})')
    # A missing file (None) or an empty one (0 lines) is only printed above;
    # it is not recorded as "incomplete".
    if downloaded:
        # Split on the LAST underscore only: the date is always the final
        # component, while the race id itself may contain underscores.
        race, date = race_key.rsplit('_', 1)
        not_none.add(race)
        incomplete.setdefault(race, []).append(date)

australia_20170507 1026 (1132)
australia_20150503 1530 (1650)
australia_20180506 1108 (1206)
australia_20190505 1309 (1401)
australia_20160501 1249 (1355)
westernaustralia_20151206 1187 (1203)
westernaustralia_20161204 1365 (1382)
westernaustralia_20171203 1440 (1461)
westernaustralia_20181202 1120 (1146)
coquimbo70.3_20171021 389 (457)
uk_20130804 1601 (1602)
lakeplacid_20050724 2207 (2208)
miami70.3_20141026 2927 (3137)
branson70.3_20100919 1356 (1357)
regensburg_20120617 1292 (1322)
branson70.3_20120923 859 (860)


In [11]:
# Number of races whose downloaded line count differs from the scraped count.
n

16

In [28]:
# Write each race id that has a mismatching download, double-quoted, one per line.
with open('not-complete.txt', 'w') as f:
    f.writelines(f'"{race}"\n' for race in not_none)

In [34]:
# Persist the race -> dates mapping as a single JSON document.
with open('incomplete.jl', 'w') as f:
    json.dump(incomplete, f)

In [33]:
# Race id -> dates of the result files that were downloaded but whose line
# count differs from the scraped athlete count.
incomplete

{'taiwan70.3': ['20170319', '20180318', '20190324'],
 'newzealand': ['20140301', '20160305', '20170304', '20180303', '20190302'],
 'geelong70.3': ['20170219'],
 'davao70.3': ['20190324', '20180325'],
 'monterrey70.3': ['20180513'],
 'liuzhou70.3': ['20180414'],
 'florida70.3': ['20130519'],
 'australia': ['20170507',
  '20150503',
  '20110501',
  '20180506',
  '20190505',
  '20160501'],
 'mallorca70.3': ['20150509'],
 'vietnam70.3': ['20170507', '20180513'],
 'florida': ['20031108'],
 'malaysia': ['20161112'],
 'loscabos70.3': ['20161030', '20171112'],
 'xiamen70.3': ['20171112'],
 'sydney70.3': ['20171126'],
 'westernaustralia': ['20151206',
  '20161204',
  '20171203',
  '20181202',
  '20121209'],
 'atlanticcity70.3': ['20160918', '20170917', '20180923'],
 'worldchampionship': ['20181013'],
 'coquimbo70.3': ['20171021'],
 'neworleans70.3': ['20130421', '20150419', '20110417', '20140413'],
 'shanghai70.3': ['20181021'],
 'augusta70.3': ['20090927'],
 'sunshinecoast70.3': ['20170910', '

In [87]:
def get_races_urls(file, selection=None):
    """Build the list of results-page URLs from a race listing file.

    Args:
        file: path to the JSON-lines file created by racespider. Each
            non-empty record is read for its 'id', 'website' and 'region' keys.
        selection: optional list of dicts with an 'id' key. When given and
            non-empty, only races whose id contains one of these ids
            (case-insensitive substring match) are kept.

    Returns:
        A list of {'id', 'url', 'region'} dicts in file order, one per distinct
        results URL. Without a selection, the 70.3 world championship women's
        race is appended as an extra entry.
    """
    # Normalise the selected ids once instead of once per race.
    wanted_ids = [item['id'].lower() for item in selection] if selection else []
    urls = set()
    races = []
    with open(file, 'r') as f:
        # Blank lines (e.g. a trailing newline) are skipped rather than fed to
        # json.loads, which would raise on an empty string.
        data = [json.loads(line) for line in map(str.strip, f) if line]
    for race in data:
        if not race:
            continue
        if wanted_ids and not any(wanted in race['id'].lower() for wanted in wanted_ids):
            continue
        # Everything before the last ".asp" is the event root; the dot is
        # escaped so it only matches a literal dot.
        site = re.match(r'(.*)\.asp', race['website'] or '')
        if site is None:
            # No results URL can be derived for this race; report and move on
            # instead of failing on `.group` of None.
            print(f"skipping race {race['id']}: no .asp page in website {race['website']!r}")
            continue
        race_results_url = f"{site.group(1)}/results.aspx"
        if race_results_url not in urls:
            urls.add(race_results_url)
            races.append({'id': race['id'], 'url': race_results_url, 'region': race['region']})
    if not selection:
        # add world championship 70.3 (to also get female results)
        races.append({'id': 'worldchampionship70.3', 'region': 'americas' , 'url': 'http://www.ironman.com/triathlon/events/americas/ironman-70.3/70.3-world-championship-womens-race/results.aspx'})
    return races

In [89]:
# Restrict the URL list to a single race; no specific years are requested.
selection = [{"id": 'France70.3', "years": []}]
get_races_urls('./../data/races/races.jl', selection)

[{'id': 'france70.3',
  'url': 'http://www.ironman.com/triathlon/events/emea/ironman-70.3/pays-d-aix/results.aspx',
  'region': 'emea'}]

In [91]:
# Selection entries whose id is exactly "France70.3".
[entry for entry in selection if entry["id"] == "France70.3"]

[{'id': 'France70.3', 'years': []}]