# IFSC Data Wrangling

In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
# load data
boulder = pd.read_csv('data/boulder_results.csv')
lead = pd.read_csv('data/lead_results.csv')
speed = pd.read_csv('data/speed_results.csv')
combined = pd.read_csv('data/combined_results.csv')

## Boulder Category

In [3]:
boulder.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9741 entries, 0 to 9740
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Competition Title  9741 non-null   object 
 1   Competition Date   9741 non-null   object 
 2   FIRST              9741 non-null   object 
 3   LAST               9741 non-null   object 
 4   Nation             9741 non-null   object 
 5   StartNr            9325 non-null   float64
 6   Rank               9741 non-null   int64  
 7   Qualification      5432 non-null   object 
 8   Qualification 1    2154 non-null   object 
 9   Qualification 2    2155 non-null   object 
 10  Semifinal          1524 non-null   object 
 11  Final              1458 non-null   object 
 12  Category           9741 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 989.4+ KB


In [4]:
boulder.head()

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,StartNr,Rank,Qualification,Qualification 1,Qualification 2,Semifinal,Final,Category
0,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Naile,MEIGNAN,FRA,15.0,1,6T7z99,,,3T4z55,3T4z89,boulder
1,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Miriam,FOGU,ITA,27.0,2,4T8z814,,,3T4z55,2T4z27,boulder
2,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Vanda,MICHALKOVA,SVK,48.0,3,6T7z89,,,3T3z43,2T3z23,boulder
3,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Lola,SAUTIER,FRA,17.0,4,4T6z69,,,2T3z89,1T3z38,boulder
4,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Polina,KULAGINA,RUS,39.0,5,4T6z611,,,3T4z76,1T3z47,boulder


In [5]:
# checking for duplicated entries
boulder.duplicated().sum()

3272

In [6]:
# remove duplicates
boulder.drop_duplicates(inplace=True)
boulder.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6469 entries, 0 to 6468
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Competition Title  6469 non-null   object 
 1   Competition Date   6469 non-null   object 
 2   FIRST              6469 non-null   object 
 3   LAST               6469 non-null   object 
 4   Nation             6469 non-null   object 
 5   StartNr            6261 non-null   float64
 6   Rank               6469 non-null   int64  
 7   Qualification      2923 non-null   object 
 8   Qualification 1    1774 non-null   object 
 9   Qualification 2    1772 non-null   object 
 10  Semifinal          1162 non-null   object 
 11  Final              859 non-null    object 
 12  Category           6469 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 707.5+ KB


In [7]:
# investigate entries with null values in 'StartNr' column
boulder.loc[boulder['StartNr'].isnull()]

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,StartNr,Rank,Qualification,Qualification 1,Qualification 2,Semifinal,Final,Category
2124,European Youth Cup (B) - Soure (POR) 2019,27 - 28 April 2019,Luce,DOUADY,FRA,,1,8T8z1310,,,,1T3z16,boulder
2125,European Youth Cup (B) - Soure (POR) 2019,27 - 28 April 2019,Julia,LOTZ,AUT,,2,7T8z1111,,,,1T3z16,boulder
2126,European Youth Cup (B) - Soure (POR) 2019,27 - 28 April 2019,Kintana,ILTIS,FRA,,3,6T6z127,,,,1T3z48,boulder
2127,European Youth Cup (B) - Soure (POR) 2019,27 - 28 April 2019,Camille,POUGET,FRA,,4,7T7z1412,,,,1T2z13,boulder
2128,European Youth Cup (B) - Soure (POR) 2019,27 - 28 April 2019,Aida,TORRES ILLAMOLA,ESP,,5,5T7z1312,,,,1T1z22,boulder
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2360,European Youth Cup (B) - Soure (POR) 2019,27 - 28 April 2019,Albert,ILSAAS SUTHURST,NOR,,39,2T5z58,,,,,boulder
2361,European Youth Cup (B) - Soure (POR) 2019,27 - 28 April 2019,Uri,AGMON,ISR,,40,1T6z212,,,,,boulder
2362,European Youth Cup (B) - Soure (POR) 2019,27 - 28 April 2019,Joseph,WALMSLEY,GBR,,41,1T3z14,,,,,boulder
2363,European Youth Cup (B) - Soure (POR) 2019,27 - 28 April 2019,Diogo,PIRES,POR,,42,0T1z02,,,,,boulder


#### null values, scoring and ranking
According to the infomation on Kaggle from where this dataset is obtained, 'StartNr' is the running order of competitors. The data in this column is likely not of interest to our analyses.
There are also null values in the Qualification columns, Semifinal and final columns. This is expected because often competitors are devided into qualification groups. Qualification group 1 will only have scores for for Qualification 1 and not Qualification 2 and vice versa. And only those who made it into Semifinal and Fianl rounds will have scores for those rounds. 
However, since each boulder competition have unique boulder problems for each round, we cannot compare scores from competiton to competiton. To assess how an athlete perform from one competition to another, we will look at their overall ranking from each competition and disregard the scorings from each competition. 

In [8]:
# extracting the location of the competiton from the Competition Title column into a new column 'Location'
boulder['Location'] = boulder['Competition Title'].str.extract(r'\(([A-Z]{3})\)')
boulder.head()

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,StartNr,Rank,Qualification,Qualification 1,Qualification 2,Semifinal,Final,Category,Location
0,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Naile,MEIGNAN,FRA,15.0,1,6T7z99,,,3T4z55,3T4z89,boulder,ITA
1,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Miriam,FOGU,ITA,27.0,2,4T8z814,,,3T4z55,2T4z27,boulder,ITA
2,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Vanda,MICHALKOVA,SVK,48.0,3,6T7z89,,,3T3z43,2T3z23,boulder,ITA
3,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Lola,SAUTIER,FRA,17.0,4,4T6z69,,,2T3z89,1T3z38,boulder,ITA
4,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Polina,KULAGINA,RUS,39.0,5,4T6z611,,,3T4z76,1T3z47,boulder,ITA


In [9]:
# create a new column 'home' that indicates if the athlete is competeing in their home country

boulder['home'] = boulder['Nation'].eq(boulder['Location'])
boulder.head()

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,StartNr,Rank,Qualification,Qualification 1,Qualification 2,Semifinal,Final,Category,Location,home
0,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Naile,MEIGNAN,FRA,15.0,1,6T7z99,,,3T4z55,3T4z89,boulder,ITA,False
1,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Miriam,FOGU,ITA,27.0,2,4T8z814,,,3T4z55,2T4z27,boulder,ITA,True
2,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Vanda,MICHALKOVA,SVK,48.0,3,6T7z89,,,3T3z43,2T3z23,boulder,ITA,False
3,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Lola,SAUTIER,FRA,17.0,4,4T6z69,,,2T3z89,1T3z38,boulder,ITA,False
4,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Polina,KULAGINA,RUS,39.0,5,4T6z611,,,3T4z76,1T3z47,boulder,ITA,False


In [10]:
# check to see how many unique values for Nation
boulder_nations = boulder['Nation'].unique()
print(boulder_nations)
print('Climbers are from ', len(boulder_nations), ' different nations')

['FRA' 'ITA' 'SVK' 'RUS' 'AUT' 'ESP' 'SLO' 'BEL' 'BUL' 'SUI' 'NED' 'CRO'
 'UKR' 'NOR' 'CZE' 'GBR' 'GRE' 'DEN' 'SRB' 'LAT' 'IRL' 'ISR' 'POL' 'POR'
 'HUN' 'GER' 'ROU' 'FIN' 'SWE' 'BLR' 'LTU' 'JPN' 'USA' 'CAN' 'KOR' 'RSA'
 'AUS' 'CHI' 'NZL' 'IND' 'MEX' 'BRA' 'UZB' 'ECU' 'KAZ' 'THA' 'INA' 'ARG'
 'HKG' 'SGP' 'LUX' 'CHN' 'IRI' 'TPE' 'PAK' 'PER' 'GUA' 'EST' 'MAC' 'MAS'
 'PHI' 'VEN' 'CAM' 'GEO' 'KGZ' 'MKD' 'LKA' 'TUR' 'MGL' 'NEP']
Climbers are from  70  different nations


In [11]:
# How many unique countries where competitions are held? Also checking for re extraction mistakes
boulder_locations = boulder['Location'].unique()
print(boulder_locations)
print('Competitions are held in', len(boulder_locations), 'different countries')

['ITA' 'POL' 'JPN' 'USA' 'BUL' 'GER' 'AUT' 'CHN' 'POR' 'RUS' 'SUI' 'HKG'
 'ECU' 'BEL' 'THA' 'NED']
Competitions are held in 16 different countries


Only athletes from these 16 countries got chances to compete in their home country.

In [12]:
# How many individual athletes just by first and last name
boulder.groupby(['FIRST','LAST']).size().sort_values(ascending=False)

FIRST            LAST             
Heeyeon          PARK                 16
Flavy            COHAUT               16
Yoshiyuki        OGATA                16
Urska            REPUSIC              16
Alex             KHAZANOV             16
                                      ..
Leonardo         MAGALLANES TEJADA     1
                 GONTERO               1
                 DE RIVERO HUAMAN      1
Lenka            FURDIKOVA             1
ADLIYAH BAIQUNI  EGALITA               1
Length: 1840, dtype: int64

In [13]:
# How many individual athletes by name and nation
boulder.groupby(['FIRST', 'LAST', 'Nation']).size().sort_values(ascending=False)

FIRST            LAST       Nation
Mickael          MAWEM      FRA       16
Urska            REPUSIC    SLO       16
Alex             KHAZANOV   ISR       16
Heeyeon          PARK       KOR       16
Yoshiyuki        OGATA      JPN       16
                                      ..
Hristo           ALTANCHEV  BUL        1
Hongik           CHOI       KOR        1
Rudrangsho       DEY        IND        1
Hinayah          MUHAMMAD   INA        1
ADLIYAH BAIQUNI  EGALITA    INA        1
Length: 1842, dtype: int64

In [14]:
# Athletes with same name but different nation?
w_nation = boulder.groupby(['FIRST', 'LAST', 'Nation']).size().to_frame(name = 'count').reset_index()
no_nation = boulder.groupby(['FIRST', 'LAST']).size().to_frame(name = 'count').reset_index()
difference = pd.concat([w_nation.drop('Nation', 1), no_nation]).drop_duplicates(keep=False)
difference

Unnamed: 0,FIRST,LAST,count
962,Louis,FECHOZ,1
963,Louis,FECHOZ,2
962,Louis,FECHOZ,3
1419,Robin,CASEY,4


In [15]:
boulder.loc[boulder['LAST'] == 'FECHOZ']

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,StartNr,Rank,Qualification,Qualification 1,Qualification 2,Semifinal,Final,Category,Location,home
1830,European Youth Cup (B) - Graz (AUT) 2019,11 - 12 May 2019,Louis,FECHOZ,FRA,74.0,4,8T8z1212,,,,3T3z88,boulder,AUT,False
2282,European Youth Cup (B) - Soure (POR) 2019,27 - 28 April 2019,Louis,FECHOZ,FRA,,5,7T8z128,,,,2T3z35,boulder,POR,False
4394,IFSC Youth World Championships - Moscow (RUS)...,9 - 16 August 2018,Louis,FECHOZ,AUS,169.0,33,,2T3z78,,,,boulder,RUS,False


According to the IFSC website
Louis Fechoz is Australian but he is on the French team.

In [16]:
boulder.loc[boulder['LAST'] == 'CASEY']

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,StartNr,Rank,Qualification,Qualification 1,Qualification 2,Semifinal,Final,Category,Location,home
44,European Youth Championships (B) - Brixen (ITA...,20 - 22 September 2019,Robin,CASEY,IRL,23.0,45,1T2z43,,,,,boulder,ITA,False
423,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Robin,CASEY,IRL,370.0,51,,,0T1z02,,,boulder,ITA,False
3651,European Youth Championships (B) - Brussels (B...,31 August - 2 September 2018,Robin,CASEY,GBR,603.0,26,2T5z210,,,,,boulder,BEL,False
4554,European Youth Cup (B) - Sofia (BUL) 2018,21 - 22 July 2018,Robin,CASEY,GBR,70.0,16,4T4z54,,,,,boulder,BUL,False



Robin Casey was on the British team until she switch to team Ireland in 2019.

In [17]:
boulder[['FIRST', 'LAST', 'home', 'Rank']].sort_values(by=['FIRST', 'LAST', 'home'], ignore_index=True).head(50)

Unnamed: 0,FIRST,LAST,home,Rank
0,ADLIYAH BAIQUNI,EGALITA,False,77
1,ALEXANDR,PETROV,False,18
2,ALINA,PONOMARYOVA,False,77
3,ALINA,PONOMARYOVA,False,29
4,ALINA,PONOMARYOVA,False,63
5,ANDREY,BURZHINSKIY,False,81
6,ARAILYM,ONGALBAY,False,21
7,ARAILYM,ONGALBAY,False,56
8,Aaron,MATTES,False,79
9,Aaron,PEÑARANDA,False,71


In [18]:
# Create dataframe of rank averages when they compete at home or away for each athlete
ra_boulder = boulder[['FIRST', 'LAST', 'home', 'Rank']].groupby(['FIRST', 'LAST', 'home']).mean().reset_index()
ra_boulder

Unnamed: 0,FIRST,LAST,home,Rank
0,ADLIYAH BAIQUNI,EGALITA,False,77.000000
1,ALEXANDR,PETROV,False,18.000000
2,ALINA,PONOMARYOVA,False,56.333333
3,ANDREY,BURZHINSKIY,False,81.000000
4,ARAILYM,ONGALBAY,False,38.500000
...,...,...,...,...
2167,Ádám,BAKURECZ,False,26.000000
2168,Örjan,RÖDLAND VAAGE,False,38.666667
2169,Övgün,YILDIRIM,False,125.000000
2170,Šimon,POTŮČEK,False,34.000000


In [52]:
# Checking for duplicated athletes (those who have competed both home and away)
home_and_away_boulder = ra_boulder.duplicated(subset=['FIRST', 'LAST'], keep=False)
home_and_away_boulder.sum()

664

In [53]:
ra_boulder[home_and_away_boulder]

Unnamed: 0,FIRST,LAST,home,Rank
13,Adelaide,D'ADDARIO,False,10.142857
14,Adelaide,D'ADDARIO,True,31.000000
20,Afonso Mano,CRISANTO,False,43.500000
21,Afonso Mano,CRISANTO,True,30.500000
22,Afra,HÖNIG,False,29.100000
...,...,...,...,...
2136,Zander,WALLER,True,31.000000
2142,ZhuoYing,CHEN,False,54.333333
2143,ZhuoYing,CHEN,True,4.000000
2152,Zoé,EGLI,False,17.666667


In [20]:
ra_boulder.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2172 entries, 0 to 2171
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   FIRST   2172 non-null   object 
 1   LAST    2172 non-null   object 
 2   home    2172 non-null   bool   
 3   Rank    2172 non-null   float64
dtypes: bool(1), float64(1), object(2)
memory usage: 53.2+ KB


332 athletes have competed both in their home country and abroad.

In [379]:
# checking random athletes 
ra_boulder.loc[rank_average['LAST'] == 'NARASAKI']

Unnamed: 0,FIRST,LAST,home,Rank
1340,Meichi,NARASAKI,False,19.5
1341,Meichi,NARASAKI,True,8.0
1958,Tomoa,NARASAKI,False,4.363636
1959,Tomoa,NARASAKI,True,1.5


In [33]:
# unique competitons

print('There are ', len(boulder['Competition Title'].unique()), 'bouldering competitions in 2018-19')

boulder.groupby('Competition Title').size()

There are  33 bouldering competitions in 2018-19


Competition Title
Asia Cup (B) - Hong Kong (HKG) 2018                                   73
Asia Cup (B,S) - Bangkok (THA) 2018                                   59
Asian Championships - Kurayoshi (JPN) 2018                            87
Asian Cup (B) - Hong Kong (HKG) 2019                                  49
Asian Youth Championships - Chongqing (CHN) 2018                     151
European Championship (B) - Zakopane (POL) 2019                       91
European Youth Championships (B) - Brixen (ITA) 2019                 282
European Youth Championships (B) - Brussels (BEL) 2018               301
European Youth Cup (B) - Delft (NED) 2018                            211
European Youth Cup (B) - Graz (AUT) 2018                             275
European Youth Cup (B) - Graz (AUT) 2019                             328
European Youth Cup (B) - Sofia (BUL) 2018                            193
European Youth Cup (B) - Sofia (BUL) 2019                            258
European Youth Cup (B) - Soure (P

# Lead Category

In [34]:
lead.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9930 entries, 0 to 9929
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Competition Title  9930 non-null   object 
 1   Competition Date   9930 non-null   object 
 2   FIRST              9930 non-null   object 
 3   LAST               9930 non-null   object 
 4   Nation             9930 non-null   object 
 5   StartNr            9480 non-null   float64
 6   Rank               9930 non-null   int64  
 7   Qualification      648 non-null    object 
 8   Qualification 1    9034 non-null   object 
 9   Qualification 2    9694 non-null   object 
 10  Semifinal          2446 non-null   object 
 11  Final              1716 non-null   object 
 12  Points             8737 non-null   float64
 13  Category           9930 non-null   object 
dtypes: float64(2), int64(1), object(11)
memory usage: 1.1+ MB


Similar to the Boulder category, we are not concerned with the null values in the 'StartNr' column and the scorings in each rounds as we will not use them in our analyses for the same reasons.

In [35]:
lead.head()

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,StartNr,Rank,Qualification,Qualification 1,Qualification 2,Semifinal,Final,Points,Category
0,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Lucka,RAKOVEC,SLO,222.0,1,,"50+1.,50+1.,50+1.,50+1.","44 2.,44 2.,44 2.,44 2.",39,46+,1.73,lead
1,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Laura,ROGORA,ITA,234.0,2,,"16+24.,16+24.,16+24.,16+24.","36+7.,36+7.,36+7.,36+7.",39+,45+,15.62,lead
2,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Luce,DOUADY,FRA,229.0,3,,"48+3.,48+3.,48+3.,48+3.","30+15.,30+15.,30+15.,30+15.",38+,43,7.83,lead
3,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Mina,MARKOVIC,SLO,237.0,4,,"16+24.,16+24.,16+24.,16+24.","36+7.,36+7.,36+7.,36+7.",43+,42+,15.62,lead
4,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Jessica,PILZ,AUT,213.0,5,,"46+5.,46+5.,46+5.,46+5.","42+3.,42+3.,42+3.,42+3.",38+,42,3.87,lead


In [36]:
# Checking for duplicates
lead.duplicated().sum()

0

In [37]:
# extracting the location of the competiton from the Competition Title column into a new column 'Location'
lead['Location'] = lead['Competition Title'].str.extract(r'\(([A-Z]{3})\)')
lead.head()

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,StartNr,Rank,Qualification,Qualification 1,Qualification 2,Semifinal,Final,Points,Category,Location
0,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Lucka,RAKOVEC,SLO,222.0,1,,"50+1.,50+1.,50+1.,50+1.","44 2.,44 2.,44 2.,44 2.",39,46+,1.73,lead,GBR
1,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Laura,ROGORA,ITA,234.0,2,,"16+24.,16+24.,16+24.,16+24.","36+7.,36+7.,36+7.,36+7.",39+,45+,15.62,lead,GBR
2,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Luce,DOUADY,FRA,229.0,3,,"48+3.,48+3.,48+3.,48+3.","30+15.,30+15.,30+15.,30+15.",38+,43,7.83,lead,GBR
3,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Mina,MARKOVIC,SLO,237.0,4,,"16+24.,16+24.,16+24.,16+24.","36+7.,36+7.,36+7.,36+7.",43+,42+,15.62,lead,GBR
4,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Jessica,PILZ,AUT,213.0,5,,"46+5.,46+5.,46+5.,46+5.","42+3.,42+3.,42+3.,42+3.",38+,42,3.87,lead,GBR


In [38]:
# create a new column 'home' that indicates if the athlete is competeing in their home country

lead['home'] = lead['Nation'].eq(lead['Location'])
lead.head()

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,StartNr,Rank,Qualification,Qualification 1,Qualification 2,Semifinal,Final,Points,Category,Location,home
0,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Lucka,RAKOVEC,SLO,222.0,1,,"50+1.,50+1.,50+1.,50+1.","44 2.,44 2.,44 2.,44 2.",39,46+,1.73,lead,GBR,False
1,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Laura,ROGORA,ITA,234.0,2,,"16+24.,16+24.,16+24.,16+24.","36+7.,36+7.,36+7.,36+7.",39+,45+,15.62,lead,GBR,False
2,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Luce,DOUADY,FRA,229.0,3,,"48+3.,48+3.,48+3.,48+3.","30+15.,30+15.,30+15.,30+15.",38+,43,7.83,lead,GBR,False
3,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Mina,MARKOVIC,SLO,237.0,4,,"16+24.,16+24.,16+24.,16+24.","36+7.,36+7.,36+7.,36+7.",43+,42+,15.62,lead,GBR,False
4,"European Championship (L,S) - Edinburgh (GBR) ...",4 - 6 October 2019,Jessica,PILZ,AUT,213.0,5,,"46+5.,46+5.,46+5.,46+5.","42+3.,42+3.,42+3.,42+3.",38+,42,3.87,lead,GBR,False


In [39]:
# check to see how many unique values for Nation
lead_nations = lead['Nation'].unique()
print(lead_nations)
print('Climbers are from ', len(lead_nations), ' different nations')

['SLO' 'ITA' 'FRA' 'AUT' 'NOR' 'BEL' 'SUI' 'GBR' 'UKR' 'SVK' 'NED' 'RUS'
 'ISR' 'POL' 'CZE' 'ESP' 'GER' 'CRO' 'GRE' 'KOR' 'JPN' 'USA' 'IRI' 'TPE'
 'HKG' 'BRA' 'CAN' 'SWE' 'ARG' 'CHI' 'AUS' 'IRL' 'RSA' 'DEN' 'BUL' 'MEX'
 'NZL' 'UZB' 'ECU' 'LUX' 'HUN' 'LAT' 'IND' 'THA' 'SGP' 'ROU' 'POR' 'KGZ'
 'CHN' 'KAZ' 'INA' 'GEO' 'VEN' 'PER' 'GUA' 'PHI' 'CAM' 'MAC' 'MAS' 'SRB'
 'EST' 'BLR' 'MGL' 'MKD' 'TUR' 'FIN']
Climbers are from  66  different nations


In [40]:
# How many unique countries where competitions are held? Also checking for re extraction mistakes
lead_locations = lead['Location'].unique()
print(lead_locations)
print('Competitions are held in', len(lead_locations), 'different countries')

['GBR' 'SLO' 'ITA' 'JPN' 'AUT' 'FRA' 'SUI' 'ECU' 'CHN' 'RUS' 'GER' nan]
Competitions are held in 12 different countries


In [41]:
# investigate nan value in Location
lead['Competition Title'].unique()

array(['European Championship (L,S) - Edinburgh (GBR) 2019',
       'IFSC Climbing Worldcup (L) - Kranj (SLO) 2019 ',
       'IFSC Youth World Championships -  Arco (ITA) 2019 ',
       'IFSC Climbing World Championships - Hachioji (JPN) 2019',
       'European Youth Cup (S,L) - Imst (AUT) 2019',
       'IFSC Climbing Worldcup (L) - Briançon (FRA) 2019 ',
       'IFSC Climbing Worldcup (L, S) - Chamonix (FRA) 2019 ',
       'IFSC Climbing Worldcup (L, S) - Villars (SUI) 2019',
       'European Youth Cup (L) - St. Pierre Faucigny (FRA) 2019',
       'European Youth Cup (L) - Ostermundigen (SUI) 2019',
       'IFSC PanAmerican Championship (L, S, B, C) - Guayaquil (ECU) 2018',
       'Asian Championships - Kurayoshi (JPN) 2018 ',
       'Asian Youth Championships - Chongqing (CHN) 2018',
       'IFSC Climbing Worldcup (L,S) - Xiamen (CHN) 2018 ',
       'IFSC Climbing Worldcup (L,S) - Wujiang (CHN) 2018 ',
       'IFSC Climbing Worldcup (L) - Kranj (SLO) 2018 ',
       'IFSC Climbing Wor

The Imst 2018 competition does not have a country ISO code. It should be AUT 


In [42]:
# fill the nan values with 'AUT'
lead['Location'].fillna('AUT', inplace=True)
lead['Location'].unique()

array(['GBR', 'SLO', 'ITA', 'JPN', 'AUT', 'FRA', 'SUI', 'ECU', 'CHN',
       'RUS', 'GER'], dtype=object)

In [43]:
# how many individual athletes by names only?
lead.groupby(['FIRST','LAST']).size().sort_values(ascending=False)

FIRST        LAST             
Mykhayil     TKACHUK              37
Laura        ROGORA               33
Alberto      GINÉS LÓPEZ          33
Jakub        KONECNY              32
Mikel Asier  LINACISORO MOLINA    32
                                  ..
Georgii      KEDROV                2
Winai        RUANGRIT              2
Nikola       KRAMARIC              1
Veronika     SCHEUEROVA            1
Sergei       LUZHETSKII            1
Length: 1467, dtype: int64

In [44]:
# how many individul athletes by names and nations?
lead.groupby(['FIRST', 'LAST', 'Nation']).size().sort_values(ascending=False)

FIRST        LAST                 Nation
Mykhayil     TKACHUK              UKR       37
Laura        ROGORA               ITA       33
Alberto      GINÉS LÓPEZ          ESP       33
Jakub        KONECNY              CZE       32
Mikel Asier  LINACISORO MOLINA    ESP       32
                                            ..
Mariia       MUSIENKO             RUS        2
Frederik     VIBERG CHRISTIANSEN  DEN        2
Nikola       KRAMARIC             CRO        1
Veronika     SCHEUEROVA           CZE        1
Sergei       LUZHETSKII           RUS        1
Length: 1468, dtype: int64

In [45]:
# Athletes with same name but different nation?
w_nation2 = lead.groupby(['FIRST', 'LAST', 'Nation']).size().to_frame(name = 'count').reset_index()
no_nation2 = lead.groupby(['FIRST', 'LAST']).size().to_frame(name = 'count').reset_index()
difference2 = pd.concat([w_nation2.drop('Nation', 1), no_nation2]).drop_duplicates(keep=False)
difference2

Unnamed: 0,FIRST,LAST,count
1119,Robin,CASEY,8


In [46]:
lead.loc[lead['LAST'] == 'CASEY']

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,StartNr,Rank,Qualification,Qualification 1,Qualification 2,Semifinal,Final,Points,Category,Location,home
249,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Robin,CASEY,IRL,370.0,43,,"30+30.,30+30.,30+30.,30+30.","12+58.,12+58.,12+58.,12+58.",,,43.45,lead,ITA,False
871,"European Youth Cup (S,L) - Imst (AUT) 2019",2 - 4 August 2019,Robin,CASEY,IRL,150.0,40,,"28+42.,28+42.,28+42.,28+42.","27+36.,27+36.,27+36.,27+36.",,,39.15,lead,AUT,False
3868,European Youth Cup (L) - Munich (GER) 2018,7 - 8 July 2018,Robin,CASEY,GBR,16.0,23,,"24+26.,24+26.,24+26.,24+26.","25+15.,25+15.,25+15.,25+15.",,,20.91,lead,GER,False
4250,European Youth Cup (L) - Uster (SUI) 2018,30 June - 1 July 2018,Robin,CASEY,GBR,30.0,31,,"19 33.,19 33.,19 33.,19 33.","25+27.,25+27.,25+27.,25+27.",,,31.58,lead,SUI,False
5173,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Robin,CASEY,IRL,370.0,43,,"30+30.,30+30.,30+30.,30+30.,30+30.,30+30.,30+3...","12+58.,12+58.,12+58.,12+58.,12+58.,12+58.,12+5...",,,43.45,lead,ITA,False
5795,"European Youth Cup (S,L) - Imst (AUT) 2019",2 - 4 August 2019,Robin,CASEY,IRL,150.0,40,,"28+42.,28+42.,28+42.,28+42.,28+42.,28+42.,28+4...","27+36.,27+36.,27+36.,27+36.,27+36.,27+36.,27+3...",,,39.15,lead,AUT,False
8792,European Youth Cup (L) - Munich (GER) 2018,7 - 8 July 2018,Robin,CASEY,GBR,16.0,23,,"24+26.,24+26.,24+26.,24+26.,24+26.,24+26.,24+2...","25+15.,25+15.,25+15.,25+15.,25+15.,25+15.,25+1...",,,20.91,lead,GER,False
9174,European Youth Cup (L) - Uster (SUI) 2018,30 June - 1 July 2018,Robin,CASEY,GBR,30.0,31,,"19 33.,19 33.,19 33.,19 33.,19 33.,19 33.,19 3...","25+27.,25+27.,25+27.,25+27.,25+27.,25+27.,25+2...",,,31.58,lead,SUI,False


This is the same athlete that switched team from GBR to IRL whom we encounter before. But there seems to be duplicated entries with scorings in different format so it was not caught by the method before. 

In [47]:
# checking for duplicates again using subset of columns.
lead.duplicated(subset=['Competition Title', 'Competition Date', 'FIRST', 'LAST', 'Nation', 'Rank']).sum()

4924

In [48]:
# drop duplicated
lead.drop_duplicates(subset=['Competition Title', 'Competition Date', 'FIRST', 'LAST', 'Nation', 'Rank'], inplace=True)
lead.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5006 entries, 0 to 5005
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Competition Title  5006 non-null   object 
 1   Competition Date   5006 non-null   object 
 2   FIRST              5006 non-null   object 
 3   LAST               5006 non-null   object 
 4   Nation             5006 non-null   object 
 5   StartNr            4781 non-null   float64
 6   Rank               5006 non-null   int64  
 7   Qualification      324 non-null    object 
 8   Qualification 1    4561 non-null   object 
 9   Qualification 2    4888 non-null   object 
 10  Semifinal          1249 non-null   object 
 11  Final              866 non-null    object 
 12  Points             4781 non-null   float64
 13  Category           5006 non-null   object 
 14  Location           5006 non-null   object 
 15  home               5006 non-null   bool   
dtypes: bool(1), float64(2), 

In [49]:
# Create dataframe of rank averages when they compete at home or away for each athlete
ra_lead = lead[['FIRST', 'LAST', 'home', 'Rank']].groupby(['FIRST', 'LAST', 'home']).mean().reset_index()
ra_lead

Unnamed: 0,FIRST,LAST,home,Rank
0,ALEXANDR,PETROV,False,41.000000
1,ALINA,PONOMARYOVA,False,40.500000
2,ARAILYM,ONGALBAY,False,40.500000
3,Aaron,PEÑARANDA,False,51.000000
4,Aaron,PEÑARANDA,True,4.000000
...,...,...,...,...
1739,Àlex,HERNÁNDEZ CASTILLA,False,73.500000
1740,Óscar,MONTÓN GARCÍA,False,58.000000
1741,Örjan,RÖDLAND VAAGE,False,37.800000
1742,Šimon,POTŮČEK,False,35.307692


In [50]:
ra_lead.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1744 entries, 0 to 1743
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   FIRST   1744 non-null   object 
 1   LAST    1744 non-null   object 
 2   home    1744 non-null   bool   
 3   Rank    1744 non-null   float64
dtypes: bool(1), float64(1), object(2)
memory usage: 42.7+ KB


In [54]:
# Checking for duplicated athletes
home_and_away_lead = ra_lead.duplicated(subset=['FIRST', 'LAST'], keep=False)
home_and_away_lead.sum()

554

In [55]:
# Athletes who have competed both HOME and AWAY
ra_lead[home_and_away_lead]

Unnamed: 0,FIRST,LAST,home,Rank
3,Aaron,PEÑARANDA,False,51.000000
4,Aaron,PEÑARANDA,True,4.000000
10,Adelaide,D'ADDARIO,False,19.166667
11,Adelaide,D'ADDARIO,True,34.000000
16,Adrien,LEMAIRE,False,17.400000
...,...,...,...,...
1729,Zoé,EGLI,True,30.500000
1731,Zélia,AVEZOU,False,12.000000
1732,Zélia,AVEZOU,True,11.000000
1737,valentine,MANGIN,False,19.000000


277 athletes have competed both at home and away.

In [399]:
# checking random athletes 
ra_lead.loc[ra_lead['LAST'] == 'GARNBRET']

Unnamed: 0,FIRST,LAST,home,Rank
666,Janja,GARNBRET,False,2.090909
667,Janja,GARNBRET,True,7.5


# Speed Category

In [57]:
speed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11868 entries, 0 to 11867
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Competition Title  11868 non-null  object
 1   Competition Date   11868 non-null  object
 2   FIRST              11868 non-null  object
 3   LAST               11868 non-null  object
 4   Nation             11868 non-null  object
 5   StartNr            11868 non-null  int64 
 6   Rank               11868 non-null  int64 
 7   Qualification      11868 non-null  object
 8   1/8 - Final        2656 non-null   object
 9   1/4 - Final        2290 non-null   object
 10  1/2 - Final        1181 non-null   object
 11  Small final        592 non-null    object
 12  Final              589 non-null    object
 13  Category           11868 non-null  object
dtypes: int64(2), object(12)
memory usage: 1.3+ MB


As opposed to the Boulder and Lead categories, in the speed category, the route that the athletes climb is standardized and they are trying to get to the top in the shortest amount of time. Therefore, we can use the scores (time) in the qualification and subsequent rounds to analyse how they perform in each competition. Null values are expected as speed competition takes on an elimination format. Those who got eliminated in the qualification round will not have score in subsequent rounds and so on and so forth.  

In [58]:
speed.head()

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,StartNr,Rank,Qualification,1/8 - Final,1/4 - Final,1/2 - Final,Small final,Final,Category
0,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Anna,CALANCA,ITA,114,1,8.686,,8.855,9.442,,8.661,speed
1,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Capucine,VIGLIONE,FRA,67,2,8.612,,9.243,8.605,,10.175,speed
2,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Lison,GAUTRON,FRA,66,3,8.977,,8.546,13.409,8.886,,speed
3,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Polina,KULAGINA,RUS,159,4,8.872,,8.837,9.249,9.302,,speed
4,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Kamilla,KUSHAEVA,RUS,160,5,8.871,,8.639,,,,speed


In [59]:
# Checking for duplicated entries
speed.duplicated().sum()

7843

In [60]:
# Dropping duplicates
speed.drop_duplicates(inplace=True)
speed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4025 entries, 0 to 4024
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Competition Title  4025 non-null   object
 1   Competition Date   4025 non-null   object
 2   FIRST              4025 non-null   object
 3   LAST               4025 non-null   object
 4   Nation             4025 non-null   object
 5   StartNr            4025 non-null   int64 
 6   Rank               4025 non-null   int64 
 7   Qualification      4025 non-null   object
 8   1/8 - Final        896 non-null    object
 9   1/4 - Final        790 non-null    object
 10  1/2 - Final        407 non-null    object
 11  Small final        204 non-null    object
 12  Final              203 non-null    object
 13  Category           4025 non-null   object
dtypes: int64(2), object(12)
memory usage: 471.7+ KB


In [61]:
# extracting the location of the competiton from the Competition Title column into a new column 'Location'
speed['Location'] = speed['Competition Title'].str.extract(r'\(([A-Z]{3})\)')
speed.head()

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,StartNr,Rank,Qualification,1/8 - Final,1/4 - Final,1/2 - Final,Small final,Final,Category,Location
0,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Anna,CALANCA,ITA,114,1,8.686,,8.855,9.442,,8.661,speed,RUS
1,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Capucine,VIGLIONE,FRA,67,2,8.612,,9.243,8.605,,10.175,speed,RUS
2,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Lison,GAUTRON,FRA,66,3,8.977,,8.546,13.409,8.886,,speed,RUS
3,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Polina,KULAGINA,RUS,159,4,8.872,,8.837,9.249,9.302,,speed,RUS
4,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Kamilla,KUSHAEVA,RUS,160,5,8.871,,8.639,,,,speed,RUS


In [62]:
speed['home'] = speed['Nation'].eq(speed['Location'])
speed.head()

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,StartNr,Rank,Qualification,1/8 - Final,1/4 - Final,1/2 - Final,Small final,Final,Category,Location,home
0,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Anna,CALANCA,ITA,114,1,8.686,,8.855,9.442,,8.661,speed,RUS,False
1,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Capucine,VIGLIONE,FRA,67,2,8.612,,9.243,8.605,,10.175,speed,RUS,False
2,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Lison,GAUTRON,FRA,66,3,8.977,,8.546,13.409,8.886,,speed,RUS,False
3,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Polina,KULAGINA,RUS,159,4,8.872,,8.837,9.249,9.302,,speed,RUS,True
4,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Kamilla,KUSHAEVA,RUS,160,5,8.871,,8.639,,,,speed,RUS,True


In [63]:
# check to see how many unique values for Nation
speed_nations = speed['Nation'].unique()
print(speed_nations)
print('Climbers are from ', len(speed_nations), ' different nations')

['ITA' 'FRA' 'RUS' 'GER' 'BUL' 'AUT' 'SLO' 'CRO' 'NOR' 'ESP' 'SVK' 'DEN'
 'CZE' 'UKR' 'POL' 'HUN' 'GBR' 'ISR' 'SUI' 'GRE' 'USA' 'INA' 'KOR' 'KAZ'
 'BEL' 'CAN' 'AUS' 'JPN' 'NZL' 'RSA' 'IND' 'IRL' 'BRA' 'LAT' 'SWE' 'CHI'
 'THA' 'MEX' 'HKG' 'ECU' 'SGP' 'UZB' 'CHN' 'TPE' 'IRI' 'ARG' 'NED' 'PAK'
 'GEO' 'MAS' 'BLR' 'VEN' 'PER' 'GUA' 'PHI' 'CAM' 'KGZ' 'SRB' 'LKA' 'MGL'
 'MKD' 'TUR']
Climbers are from  62  different nations


In [64]:
# How many unique countries where competitions are held? Also checking for re extraction mistakes
speed_locations = speed['Location'].unique()
print(speed_locations)
print('Competitions are held in', len(speed_locations), 'different countries')

['RUS' 'GBR' 'ITA' 'JPN' 'AUT' 'POL' 'FRA' 'SUI' 'CHN' 'ECU' 'THA' nan]
Competitions are held in 12 different countries


In [65]:
# investigate nan value 
speed['Competition Title'].unique()

array(['European Youth Championships (L,S) - Voronezh (RUS) 2019',
       'European Championship (L,S) - Edinburgh (GBR) 2019',
       'IFSC Youth World Championships -  Arco (ITA) 2019 ',
       'IFSC Climbing World Championships - Hachioji (JPN) 2019',
       'European Youth Cup (S,L) - Imst (AUT) 2019',
       'European Youth Cup (S) - Tarnow (POL) 2019',
       'IFSC Climbing Worldcup (L, S) - Chamonix (FRA) 2019 ',
       'IFSC Climbing Worldcup (L, S) - Villars (SUI) 2019',
       'European Youth Cup (S) - Mezzolombardo (ITA) 2019',
       'IFSC Climbing Worldcup (B,S) - Wujiang (CHN) 2019',
       'IFSC Climbing Worldcup (B,S) - Chongqing (CHN) 2019',
       'IFSC Climbing Worldcup (B,S) - Moscow (RUS) 2019 ',
       'IFSC PanAmerican Championship (L, S, B, C) - Guayaquil (ECU) 2018',
       'Asian Championships - Kurayoshi (JPN) 2018 ',
       'Asian Youth Championships - Chongqing (CHN) 2018',
       'IFSC Climbing Worldcup (L,S) - Xiamen (CHN) 2018 ',
       'IFSC Climbing Wo

Same problem as before: The ISO code missing from the competition title from Imst 2018

In [66]:
# fill the nan values with 'AUT'
speed['Location'].fillna('AUT', inplace=True)
speed['Location'].unique()

array(['RUS', 'GBR', 'ITA', 'JPN', 'AUT', 'POL', 'FRA', 'SUI', 'CHN',
       'ECU', 'THA'], dtype=object)

In [67]:
# how many individual athletes by names only?
speed.groupby(['FIRST','LAST']).size().sort_values(ascending=False)

FIRST            LAST             
Gian Luca        ZODDA                19
Elena            REMIZOVA             17
Hana             KRIZOVA              17
Aleksandra       KALUCKA              17
Marcin           DZIENSKI             16
                                      ..
Leonid           OSADCHYI              1
Leonardo         VAZQUEZ RODRIGUEZ     1
Lela             HENTSCHEL             1
Leila            SHMIDKE               1
ADLIYAH BAIQUNI  EGALITA               1
Length: 1191, dtype: int64

In [68]:
# how many individual athletes by name and nation?
speed.groupby(['FIRST', 'LAST', 'Nation']).size().sort_values(ascending=False)

FIRST            LAST               Nation
Gian Luca        ZODDA              ITA       19
Elena            REMIZOVA           RUS       17
Aleksandra       KALUCKA            POL       17
Hana             KRIZOVA            CZE       17
Aurelia          SARISSON           FRA       16
                                              ..
Leonid           OSADCHYI           UKR        1
Leonardo         VAZQUEZ RODRIGUEZ  MEX        1
Lela             HENTSCHEL          SUI        1
Leila            SHMIDKE            KAZ        1
ADLIYAH BAIQUNI  EGALITA            INA        1
Length: 1192, dtype: int64

In [69]:
# Athletes with same name but different nation?
w_nation3 = speed.groupby(['FIRST', 'LAST', 'Nation']).size().to_frame(name = 'count').reset_index()
no_nation3 = speed.groupby(['FIRST', 'LAST']).size().to_frame(name = 'count').reset_index()
difference3 = pd.concat([w_nation3.drop('Nation', 1), no_nation3]).drop_duplicates(keep=False)
difference3

Unnamed: 0,FIRST,LAST,count
632,Louis,FECHOZ,1
633,Louis,FECHOZ,3
632,Louis,FECHOZ,4


This is the same athlete we encountered before who is a Australian national but competes for the French team.

In [70]:
speed.loc[speed['LAST'] == 'FECHOZ']

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,StartNr,Rank,Qualification,1/8 - Final,1/4 - Final,1/2 - Final,Small final,Final,Category,Location,home
65,"European Youth Championships (L,S) - Voronezh ...",18 - 20 Oktober 2019,Louis,FECHOZ,FRA,83,14,9.312,,,,,,speed,RUS,False
755,"European Youth Cup (S,L) - Imst (AUT) 2019",2 - 4 August 2019,Louis,FECHOZ,FRA,61,3,8.34,,8.02,8.39,7.97,,speed,AUT,False
878,European Youth Cup (S) - Tarnow (POL) 2019,19 - 20 July 2019,Louis,FECHOZ,FRA,80,5,7.72,,7.69,,,,speed,POL,False
2686,IFSC Youth World Championships - Moscow (RUS)...,9 - 16 August 2018,Louis,FECHOZ,AUS,298,26,9.33,,,,,,speed,RUS,False


In [71]:
# Create dataframe of rank averages when they compete at home or away for each athlete
ra_speed = speed[['FIRST', 'LAST', 'home', 'Rank']].groupby(['FIRST', 'LAST', 'home']).mean().reset_index()
ra_speed

Unnamed: 0,FIRST,LAST,home,Rank
0,ADLIYAH BAIQUNI,EGALITA,False,11.000000
1,ALEXANDR,PETROV,False,22.000000
2,ALINA,PONOMARYOVA,False,21.333333
3,ANDREY,BURZHINSKIY,False,41.000000
4,ARAILYM,ONGALBAY,False,28.500000
...,...,...,...,...
1420,pino,LEON,False,38.000000
1421,solene,MOREAU,False,11.000000
1422,valentine,MANGIN,False,7.000000
1423,Àlex,HERNÁNDEZ CASTILLA,False,71.000000


In [75]:
home_and_away_speed = ra_speed.duplicated(subset=['FIRST', 'LAST'], keep=False)
home_and_away_speed.sum()

468

234 athletes have both home and away entries.

In [76]:
ra_speed[home_and_away_speed]

Unnamed: 0,FIRST,LAST,home,Rank
13,Adrien,LEMAIRE,False,6.333333
14,Adrien,LEMAIRE,True,5.000000
17,Ai,MORI,False,51.000000
18,Ai,MORI,True,60.000000
23,Akiyo,NOGUCHI,False,36.583333
...,...,...,...,...
1405,ZhiXing,CHEN,True,15.000000
1407,ZhiYong,OU,False,19.333333
1408,ZhiYong,OU,True,26.750000
1409,ZhuoYing,CHEN,False,18.000000


In [77]:
# make a new dataframe that averages the times of athletes in each rounds

# slicing out the columns of interest
sa_speed = speed.loc[:,['FIRST', 'LAST', 'home', 'Qualification', '1/8 - Final', '1/4 - Final', '1/2 - Final', 'Small final', 'Final']]
sa_speed

Unnamed: 0,FIRST,LAST,home,Qualification,1/8 - Final,1/4 - Final,1/2 - Final,Small final,Final
0,Anna,CALANCA,False,8.686,,8.855,9.442,,8.661
1,Capucine,VIGLIONE,False,8.612,,9.243,8.605,,10.175
2,Lison,GAUTRON,False,8.977,,8.546,13.409,8.886,
3,Polina,KULAGINA,True,8.872,,8.837,9.249,9.302,
4,Kamilla,KUSHAEVA,True,8.871,,8.639,,,
...,...,...,...,...,...,...,...,...,...
4020,Mykhayil,TKACHUK,False,9.780,,,,,
4021,Guillem,MONSECH GASCA,False,10.230,,,,,
4022,Kristóf,TÓTH,False,10.270,,,,,
4023,Jorge,DÍAZ-RULLO CALVO,False,11.910,,,,,


In [78]:
# changeing data type to numeric values
rounds = ['Qualification', '1/8 - Final', '1/4 - Final', '1/2 - Final', 'Small final', 'Final']

for _round in rounds:
    
    sa_speed[_round] = pd.to_numeric(sa_speed[_round], errors='coerce')
    
sa_speed

Unnamed: 0,FIRST,LAST,home,Qualification,1/8 - Final,1/4 - Final,1/2 - Final,Small final,Final
0,Anna,CALANCA,False,8.686,,8.855,9.442,,8.661
1,Capucine,VIGLIONE,False,8.612,,9.243,8.605,,10.175
2,Lison,GAUTRON,False,8.977,,8.546,13.409,8.886,
3,Polina,KULAGINA,True,8.872,,8.837,9.249,9.302,
4,Kamilla,KUSHAEVA,True,8.871,,8.639,,,
...,...,...,...,...,...,...,...,...,...
4020,Mykhayil,TKACHUK,False,9.780,,,,,
4021,Guillem,MONSECH GASCA,False,10.230,,,,,
4022,Kristóf,TÓTH,False,10.270,,,,,
4023,Jorge,DÍAZ-RULLO CALVO,False,11.910,,,,,


In [79]:
sa_speed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4025 entries, 0 to 4024
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   FIRST          4025 non-null   object 
 1   LAST           4025 non-null   object 
 2   home           4025 non-null   bool   
 3   Qualification  3917 non-null   float64
 4   1/8 - Final    799 non-null    float64
 5   1/4 - Final    716 non-null    float64
 6   1/2 - Final    369 non-null    float64
 7   Small final    171 non-null    float64
 8   Final          179 non-null    float64
dtypes: bool(1), float64(6), object(2)
memory usage: 286.9+ KB


In [80]:
# averaging the scores from each round
sa_speed['average speed'] = sa_speed.loc[:, 'Qualification': 'Final'].mean(axis=1)
sa_speed

Unnamed: 0,FIRST,LAST,home,Qualification,1/8 - Final,1/4 - Final,1/2 - Final,Small final,Final,average speed
0,Anna,CALANCA,False,8.686,,8.855,9.442,,8.661,8.91100
1,Capucine,VIGLIONE,False,8.612,,9.243,8.605,,10.175,9.15875
2,Lison,GAUTRON,False,8.977,,8.546,13.409,8.886,,9.95450
3,Polina,KULAGINA,True,8.872,,8.837,9.249,9.302,,9.06500
4,Kamilla,KUSHAEVA,True,8.871,,8.639,,,,8.75500
...,...,...,...,...,...,...,...,...,...,...
4020,Mykhayil,TKACHUK,False,9.780,,,,,,9.78000
4021,Guillem,MONSECH GASCA,False,10.230,,,,,,10.23000
4022,Kristóf,TÓTH,False,10.270,,,,,,10.27000
4023,Jorge,DÍAZ-RULLO CALVO,False,11.910,,,,,,11.91000


For the time scores in each rounds, other than numerical and NaN values, there are 3 string type values, 'false start', 'fall', and 'wildcard. A 'false start' is an illegal event where a climber's foot leaves the sensor pad too early, before the beeping sound that indicates the start of the timer and the round will restart to give the climber 1 more chance to start properly. A 'wildcard' given as a score when a climber's opponent has 2 false start and is disqualified from the round, thus granting the climber to be advanced automatically without having to climb and therefore does not have a score. We do not care about 'wildcard' as it is due to purely luck of the draw. A 'fall' is when a climber slips off and fail to reach the top of the route and therefore does not receive a time score. These values need to be removed in order for the time averages to be calculated due to data type inconsistency. However, it is potentially intereting to see if mistakes like a 'fall' and 'false start' happen at different frequncy when a climber is competing in their home country vs when they are abroad. 

In [81]:
# adding the non numeric values back from the original dataframe
sa_speed.fillna(speed.loc[:, 'Qualification': 'Final'], inplace=True)
sa_speed

Unnamed: 0,FIRST,LAST,home,Qualification,1/8 - Final,1/4 - Final,1/2 - Final,Small final,Final,average speed
0,Anna,CALANCA,False,8.686,,8.855,9.442,,8.661,8.91100
1,Capucine,VIGLIONE,False,8.612,,9.243,8.605,,10.175,9.15875
2,Lison,GAUTRON,False,8.977,,8.546,13.409,8.886,,9.95450
3,Polina,KULAGINA,True,8.872,,8.837,9.249,9.302,,9.06500
4,Kamilla,KUSHAEVA,True,8.871,,8.639,,,,8.75500
...,...,...,...,...,...,...,...,...,...,...
4020,Mykhayil,TKACHUK,False,9.78,,,,,,9.78000
4021,Guillem,MONSECH GASCA,False,10.23,,,,,,10.23000
4022,Kristóf,TÓTH,False,10.27,,,,,,10.27000
4023,Jorge,DÍAZ-RULLO CALVO,False,11.91,,,,,,11.91000


In [82]:
# counting occurance of 'fall' and 'false start'

sa_speed['fall'] = sa_speed.apply(lambda x: x.str.contains('fall').sum(), axis=1)
sa_speed['false start'] = sa_speed.apply(lambda x: x.str.contains('false start').sum(), axis=1)
sa_speed.tail(50)

Unnamed: 0,FIRST,LAST,home,Qualification,1/8 - Final,1/4 - Final,1/2 - Final,Small final,Final,average speed,fall,false start
3975,Jacopo,STEFANI,False,6.6,,7.88,7.23,,fall,7.236667,1,0
3976,Gaetan,PETRI,True,7.77,,7.69,8.68,7.73,,7.9675,0,0
3977,Matteo,ZURLONI,False,8.6,,8.77,9.8,fall,,9.056667,1,0
3978,Marceau,GARNIER,True,7.8,,7.81,,,,7.805,0,0
3979,Filippo,MARESI,False,8.46,,8.5,,,,8.48,0,0
3980,Yaroslav,TKACH,False,6.67,,fall,,,,6.67,1,0
3981,Davide Marco,COLOMBO,False,8.81,,fall,,,,8.81,1,0
3982,Leonardo,GIANNOTTI,False,9.21,,,,,,9.21,0,0
3983,Luca,FERRARI,False,9.27,,,,,,9.27,0,0
3984,Ziga,ZAJC,False,9.38,,,,,,9.38,0,0


In [83]:
# slicing out the columns we need

sa_speed = sa_speed.loc[:, ['FIRST', 'LAST', 'home', 'average speed', 'fall', 'false start']]
sa_speed

Unnamed: 0,FIRST,LAST,home,average speed,fall,false start
0,Anna,CALANCA,False,8.91100,0,0
1,Capucine,VIGLIONE,False,9.15875,0,0
2,Lison,GAUTRON,False,9.95450,0,0
3,Polina,KULAGINA,True,9.06500,0,0
4,Kamilla,KUSHAEVA,True,8.75500,0,0
...,...,...,...,...,...,...
4020,Mykhayil,TKACHUK,False,9.78000,0,0
4021,Guillem,MONSECH GASCA,False,10.23000,0,0
4022,Kristóf,TÓTH,False,10.27000,0,0
4023,Jorge,DÍAZ-RULLO CALVO,False,11.91000,0,0


In [84]:
sa_speed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4025 entries, 0 to 4024
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   FIRST          4025 non-null   object 
 1   LAST           4025 non-null   object 
 2   home           4025 non-null   bool   
 3   average speed  3917 non-null   float64
 4   fall           4025 non-null   int64  
 5   false start    4025 non-null   int64  
dtypes: bool(1), float64(1), int64(2), object(2)
memory usage: 192.6+ KB


In [85]:
sa_speed.loc[sa_speed['average speed'].isnull()]

Unnamed: 0,FIRST,LAST,home,average speed,fall,false start
39,Varvara,LUZINA,True,,1,0
75,Jaka,JAKI,False,,1,0
97,Veronika,SCHEUEROVA,False,,0,1
129,Fedir,SAMOILOV,False,,0,1
130,Jakob,SCHUBERT,False,,1,0
...,...,...,...,...,...,...
3927,Bohdan,BOTNAR,False,,1,0
3928,Kai,HARADA,False,,0,1
3929,Dmitry,KARAVAEV,False,,0,1
3930,Alimzhan,MYRZABEKOV,False,,0,1


These entries have null values for average speed likely because they either fell or had false starts in the Qualification round in that particular event.

In [86]:
# average speed for individual athlete when competing and home and abroad, and summing the 'fall' and 'fall start'

sa_speed2 = sa_speed.groupby(['FIRST', 'LAST', 'home']).agg({'average speed': 'mean', 'fall': 'sum', 'false start': 'sum'}).reset_index()
sa_speed2

Unnamed: 0,FIRST,LAST,home,average speed,fall,false start
0,ADLIYAH BAIQUNI,EGALITA,False,9.9455,0,0
1,ALEXANDR,PETROV,False,8.3150,0,1
2,ALINA,PONOMARYOVA,False,12.5090,0,0
3,ANDREY,BURZHINSKIY,False,8.3620,0,0
4,ARAILYM,ONGALBAY,False,14.5650,0,0
...,...,...,...,...,...,...
1420,pino,LEON,False,9.3400,0,0
1421,solene,MOREAU,False,10.7541,0,0
1422,valentine,MANGIN,False,11.4100,0,0
1423,Àlex,HERNÁNDEZ CASTILLA,False,13.8100,0,0


In [444]:
# checking random athletes

sa_speed2.loc[sa_speed2['LAST'] == 'MAWEM']

Unnamed: 0,FIRST,LAST,home,average speed,fall,false start
172,Bassa,MAWEM,False,5.948904,5,0
173,Bassa,MAWEM,True,5.8,0,1
902,Mickael,MAWEM,False,7.162444,0,0
903,Mickael,MAWEM,True,6.5465,0,0


In [448]:
sa_speed2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1425 entries, 0 to 1424
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   FIRST          1425 non-null   object 
 1   LAST           1425 non-null   object 
 2   home           1425 non-null   bool   
 3   average speed  1398 non-null   float64
 4   fall           1425 non-null   int64  
 5   false start    1425 non-null   int64  
dtypes: bool(1), float64(1), int64(2), object(2)
memory usage: 57.2+ KB


In [449]:
# investigate the null values in 'average speed' column
sa_speed2.loc[sa_speed2['average speed'].isnull()]

Unnamed: 0,FIRST,LAST,home,average speed,fall,false start
91,Amanda,BROWNSTEIN,False,,0,1
99,Anahí,RIVEROS,False,,1,0
136,Antonia,VALENZUELA,False,,0,1
253,Daniele,BALESTRAZZI,True,,0,1
288,Davoud,REKABI,False,,0,1
306,Dmitrii,FAKIRYANOV,False,,1,0
578,Jakob,SCHUBERT,True,,0,1
648,Josefina,MESA BARRIENTOS,False,,1,0
736,Leila,SHMIDKE,False,,1,0
820,Maksym,ZINCHENKO,False,,0,1


These athletes have null values for average speed because they either fell or had false starts in the only speed competition they participated in 2018 and 2019. 

In [88]:
home_and_away_speed2 = sa_speed2.duplicated(subset=['FIRST', 'LAST'], keep=False)
home_and_away_speed2.sum()

468

In [90]:
sa_speed2[home_and_away_speed2]

Unnamed: 0,FIRST,LAST,home,average speed,fall,false start
13,Adrien,LEMAIRE,False,7.398333,0,0
14,Adrien,LEMAIRE,True,7.850000,0,0
17,Ai,MORI,False,13.790200,0,0
18,Ai,MORI,True,12.974000,0,0
23,Akiyo,NOGUCHI,False,11.033333,0,0
...,...,...,...,...,...,...
1405,ZhiXing,CHEN,True,6.912083,0,1
1407,ZhiYong,OU,False,6.677833,0,0
1408,ZhiYong,OU,True,6.946889,1,0
1409,ZhuoYing,CHEN,False,10.453000,0,0


# Combined Category

In [430]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Competition Title      879 non-null    object 
 1   Competition Date       879 non-null    object 
 2   FIRST                  879 non-null    object 
 3   LAST                   879 non-null    object 
 4   Nation                 879 non-null    object 
 5   Rank                   879 non-null    int64  
 6   Qualification lead     879 non-null    object 
 7   Qualification speed    879 non-null    object 
 8   Qualification boulder  879 non-null    object 
 9   Final lead             120 non-null    object 
 10  Final speed            120 non-null    object 
 11  Final boulder          120 non-null    object 
 12  Points                 879 non-null    float64
 13  Final Points           120 non-null    float64
 14  Category               879 non-null    object 
dtypes: flo

In [431]:
combined.head()

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,Rank,Qualification lead,Qualification speed,Qualification boulder,Final lead,Final speed,Final boulder,Points,Final Points,Category
0,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Natsumi,HIRANO,JPN,1,1.0,14.0,1.0,,,,14.0,,combined
1,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Julia,LOTZ,AUT,2,7.0,1.0,5.0,,,,35.0,,combined
2,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Emily,PHILLIPS,GBR,3,2.0,5.0,4.0,,,,40.0,,combined
3,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Jana,RAUTH,AUT,4,4.0,4.0,3.0,,,,48.0,,combined
4,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Vanda,MICHALKOVA,SVK,5,3.0,15.0,6.0,,,,270.0,,combined


In [92]:
# checking for duplicates
combined.duplicated().sum()

586

In [93]:
# dropping duplicates
combined.drop_duplicates(inplace=True)
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 293 entries, 0 to 292
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Competition Title      293 non-null    object 
 1   Competition Date       293 non-null    object 
 2   FIRST                  293 non-null    object 
 3   LAST                   293 non-null    object 
 4   Nation                 293 non-null    object 
 5   Rank                   293 non-null    int64  
 6   Qualification lead     293 non-null    object 
 7   Qualification speed    293 non-null    object 
 8   Qualification boulder  293 non-null    object 
 9   Final lead             40 non-null     object 
 10  Final speed            40 non-null     object 
 11  Final boulder          40 non-null     object 
 12  Points                 293 non-null    float64
 13  Final Points           40 non-null     float64
 14  Category               293 non-null    object 
dtypes: flo

In [94]:
# how many combined competition were there?b
combined.groupby('Competition Title').size()

Competition Title
IFSC Climbing World Championships Combined - Hachioji (JPN) 2019      40
IFSC Climbing World Championships Combined - Innsbruck (AUT) 2018     12
IFSC PanAmerican Championship (L, S, B, C) - Guayaquil (ECU) 2018     41
IFSC Youth World Championships -  Arco (ITA) 2019                    200
dtype: int64

In [95]:
# extracting the location of the competiton from the Competition Title column into a new column 'Location'
combined['Location'] = combined['Competition Title'].str.extract(r'\(([A-Z]{3})\)')
combined.head()

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,Rank,Qualification lead,Qualification speed,Qualification boulder,Final lead,Final speed,Final boulder,Points,Final Points,Category,Location
0,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Natsumi,HIRANO,JPN,1,1.0,14.0,1.0,,,,14.0,,combined,ITA
1,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Julia,LOTZ,AUT,2,7.0,1.0,5.0,,,,35.0,,combined,ITA
2,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Emily,PHILLIPS,GBR,3,2.0,5.0,4.0,,,,40.0,,combined,ITA
3,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Jana,RAUTH,AUT,4,4.0,4.0,3.0,,,,48.0,,combined,ITA
4,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Vanda,MICHALKOVA,SVK,5,3.0,15.0,6.0,,,,270.0,,combined,ITA


In [96]:
combined['home'] = combined['Nation'].eq(combined['Location'])
combined.head()

Unnamed: 0,Competition Title,Competition Date,FIRST,LAST,Nation,Rank,Qualification lead,Qualification speed,Qualification boulder,Final lead,Final speed,Final boulder,Points,Final Points,Category,Location,home
0,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Natsumi,HIRANO,JPN,1,1.0,14.0,1.0,,,,14.0,,combined,ITA,False
1,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Julia,LOTZ,AUT,2,7.0,1.0,5.0,,,,35.0,,combined,ITA,False
2,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Emily,PHILLIPS,GBR,3,2.0,5.0,4.0,,,,40.0,,combined,ITA,False
3,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Jana,RAUTH,AUT,4,4.0,4.0,3.0,,,,48.0,,combined,ITA,False
4,IFSC Youth World Championships - Arco (ITA) 2...,22 - 31 August 2019,Vanda,MICHALKOVA,SVK,5,3.0,15.0,6.0,,,,270.0,,combined,ITA,False


In [97]:
# check to see how many unique values for Nation
combined_nations = combined['Nation'].unique()
print(combined_nations)
print('Climbers are from ', len(combined_nations), ' different nations')

['JPN' 'AUT' 'GBR' 'SVK' 'BEL' 'SUI' 'BUL' 'SLO' 'KOR' 'ESP' 'CRO' 'POL'
 'NOR' 'AUS' 'GRE' 'CHI' 'NZL' 'IRL' 'DEN' 'HUN' 'IND' 'SWE' 'BRA' 'LAT'
 'ITA' 'CZE' 'CAN' 'THA' 'MEX' 'USA' 'GER' 'ISR' 'UKR' 'HKG' 'SGP' 'UZB'
 'ECU' 'RSA' 'FRA' 'CHN' 'KAZ' 'ARG' 'VEN' 'PER' 'GUA']
Climbers are from  45  different nations


In [98]:
# How many unique countries where competitions are held? Also checking for re extraction mistakes
combined_locations = combined['Location'].unique()
print(combined_locations)
print('Competitions are held in', len(combined_locations), 'different countries')

['ITA' 'JPN' 'ECU' 'AUT']
Competitions are held in 4 different countries


In [99]:
# how many individual athletes by names only?
combined.groupby(['FIRST','LAST']).size().sort_values(ascending=False)

FIRST    LAST       
Petra    KLINGLER       2
Rudolph  RUANA          2
Tomoa    NARASAKI       2
Kokoro   FUJII          2
Miho     NONAKA         2
                       ..
Matthew  JONES          1
Mattea   PÖTZI          1
Mathias  RASK JUNKER    1
Mateus   BELLOTTO       1
Abby     GEBERT         1
Length: 277, dtype: int64

In [100]:
# how many individual athletes by name and nation?
combined.groupby(['FIRST', 'LAST', 'Nation']).size().sort_values(ascending=False)

FIRST    LAST         Nation
Petra    KLINGLER     SUI       2
Rudolph  RUANA        USA       2
Tomoa    NARASAKI     JPN       2
Kokoro   FUJII        JPN       2
Miho     NONAKA       JPN       2
                               ..
Matthew  JONES        NZL       1
Mattea   PÖTZI        AUT       1
Mathias  RASK JUNKER  DEN       1
Mateus   BELLOTTO     BRA       1
Abby     GEBERT       NZL       1
Length: 277, dtype: int64

In [101]:
# Create dataframe of rank averages when they compete at home or away for each athlete
ra_combined = combined[['FIRST', 'LAST', 'home', 'Rank']].groupby(['FIRST', 'LAST', 'home']).mean().reset_index()
ra_combined

Unnamed: 0,FIRST,LAST,home,Rank
0,Abby,GEBERT,False,30.0
1,Abby,MANNING,False,21.0
2,Adam,ONDRA,False,10.0
3,Ai,MORI,True,6.0
4,Akiyo,NOGUCHI,False,4.0
...,...,...,...,...
280,Zoi,PALTATSIDOU,False,35.0
281,Zoé,EGLI,False,8.0
282,Zuzanna,MIENTUS,False,29.0
283,jose tomas,LEDESMA,False,30.0


In [103]:
home_and_away_combined = ra_combined.duplicated(subset=['FIRST', 'LAST'], keep=False)
home_and_away_combined.sum()

16

In [104]:
ra_combined[home_and_away_combined]

Unnamed: 0,FIRST,LAST,home,Rank
4,Akiyo,NOGUCHI,False,4.0
5,Akiyo,NOGUCHI,True,2.0
112,Jakob,SCHUBERT,False,2.0
113,Jakob,SCHUBERT,True,1.0
124,Jessica,PILZ,False,10.0
125,Jessica,PILZ,True,3.0
145,Kai,HARADA,False,4.0
146,Kai,HARADA,True,4.0
150,Kokoro,FUJII,False,6.0
151,Kokoro,FUJII,True,6.0


8 athletes have home and away entries.