In [1]:
import re
import numpy as np
import pandas as pd

### Clean the competition dates

In [2]:
df = pd.read_csv('../data/comp_data.csv')
dates = df['Dates'].copy()
years = dates.str[-4:]

pattern = re.compile(r'[^a-zA-Z]') # Grab only alpha chars
month_map = {'Februar':2,
             'Juli':7,
             'Dezember':12,
             'April':4,
             'Mrz':3,
             'September':9,
             'Juni':6,
             'Mai':5,
             'Oktober':10,
             'JuniMai':6,
             'August':8,
             'November':11,
             'Januar':1,
             'JuniJuli':6,
             'AugustSeptember':8,
             'SeptemberOktober':9,
             'MaiJuni':5,
             'JuliAugust':7,
             'MrzMai':3,
             'AprilMai':4,
             'OktoberNovember':10,
             'NovemberDezember':11,
             'DezemberNovember':11,
             'SeptemberAugust':8,
             'AugustJuli':7,
             'NovemberOktober':10,
             'FebruarJanuar':1,
             'MaiApril':4,
             'OktoberSeptember':9,
             'MrzApril':4} # unfortunately some of these when listing multiples months flip the order. Maybe something to watch for. Basically don't trust the days
months = dates.apply(lambda x: pattern.sub('', x))
months = months.apply(month_map.get)

def stop_at_alpha(text):
    non_alpha = ''
    for char in text:
        if char.isalpha():
            break
        non_alpha += char
    return non_alpha.strip()
        
days_maybe = dates.apply(stop_at_alpha)
df['day_maybe'] = days_maybe
df['month'] = months.astype(int)
df['year'] = years.astype(int)
df = df.sort_values(by=['year', 'month', 'day_maybe'], ascending=[False, True, True])
df = df.set_index('ID')

In [3]:
# df.to_csv('../data/comp_data_clean.csv')

In [4]:
IFSC = df[df['Name'].str[:4] == 'IFSC']
worldcup = IFSC[IFSC['Name'].str.contains('Worldcup')].copy()

def get_disciplines(text):
    pattern = re.compile(r'\(([^\)]+)\)')
    return pattern.search(text).group()
    
disciplines = worldcup['Name'].apply(get_disciplines)
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(disciplines)
has_lead = disciplines.str.contains('L').astype(int)
has_boulder = disciplines.str.contains('B').astype(int)
has_speed = disciplines.str.contains('S').astype(int)
worldcup['has_L'] = has_lead
worldcup['has_B'] = has_boulder
worldcup['has_S'] = has_speed
worldcup[:20]

Unnamed: 0_level_0,Name,Dates,day_maybe,month,year,has_L,has_B,has_S
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7926,"IFSC Climbing Worldcup (B,S) - Moscow (RUS) 2019",12 - 14 April 2019,12 - 14,4,2019,0,1,1
7929,"IFSC Climbing Worldcup (B,S) - Chongqing (CHN)...",26 - 28 April 2019,26 - 28,4,2019,0,1,1
7923,IFSC Climbing Worldcup (B) - Meiringen (SUI) 2...,5 - 6 April 2019,5 - 6,4,2019,0,1,0
7932,IFSC Climbing Worldcup (B) - Munich (GER) 2019,18 - 19 Mai 2019,18 - 19,5,2019,0,1,0
7959,"IFSC Climbing Worldcup (B,S) - Wujiang (CHN) 2019",3 - 5 Mai 2019,3 - 5,5,2019,0,1,1
7935,IFSC Climbing Worldcup (B) - Vail (USA) 2019,7 - 8 Juni 2019,7 - 8,6,2019,0,1,0
7944,"IFSC Climbing Worldcup (L, S) - Chamonix (FRA)...",11 - 13 Juli 2019,11 - 13,7,2019,1,0,1
7947,IFSC Climbing Worldcup (L) - Briançon (FRA) 2019,19 - 20 Juli 2019,19 - 20,7,2019,1,0,0
7941,"IFSC Climbing Worldcup (L, S) - Villars (SUI) ...",4 - 6 Juli 2019,4 - 6,7,2019,1,0,1
7938,IFSC Climbing Worldcup (L) - Kranj (SLO) 2019,28 - 29 September 2019,28 - 29,9,2019,1,0,0


In [7]:
worldcup.to_csv('../data/comp_data_ifsc_worldcup.csv')

Unnamed: 0_level_0,Name,Dates,day_maybe,month,year,has_L,has_B,has_S
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1106,IFSC Climbing Worldcup (B) - Hall (AUT) 2008,18 - 19 April 2008,18 - 19,4,2008,0,1,0
1093,IFSC Climbing Worldcup (B) - Reunion (FRA) 2008,2 - 3 Mai 2008,2 - 3,5,2008,0,1,0
1096,IFSC Climbing Worldcup (B) - Grindelwald (SUI)...,30 - 31 Mai 2008,30 - 31,5,2008,0,1,0
1097,IFSC Climbing Worldcup (B) - Fiera di Primiero...,13 - 14 Juni 2008,13 - 14,6,2008,0,1,0
1122,IFSC Climbing Worldcup (B) - Vail (USA) 2008,6 - 7 Juni 2008,6 - 7,6,2008,0,1,0
1099,IFSC Climbing Worldcup (B) - Montauban (FRA) 2008,4 - 5 Juli 2008,4 - 5,7,2008,0,1,0
1119,IFSC Climbing Worldcup (S+B) - Moscow (RUS) 20...,31 Oktober - 2 November 2008,31,10,2008,0,1,1
