# Imports

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

# Load the scraped race results

In [2]:
# Read the scraped men's triathlon results into a DataFrame and display it.
dataframe = pd.read_csv(filepath_or_buffer="../data/triathlon_m_results.csv")
dataframe

Unnamed: 0,Position,Number,Name,Pos_in_swim,Swim_time,T1,Pos_in_bike,Bike_time,T2,Pos_in_run,Run_time,Race_control,Time,Rank,Category
0,1.,14,VINUELA GONZALEZ Kevin Tarek,3,17:25,1:29,1,1:08:58,55,4,23:41,Finish,1:52:29,1/28,ELM
1,2.,18,LEJEUNE Emmanuel,15,19:18,1:30,5,1:12:08,55,5,23:52,Finish,1:57:46,2/28,ELM
2,3.,7,DUPUIS Igor,2,17:23,1:30,8,1:12:56,53,18,25:24,Finish,1:58:07,3/28,ELM
3,4.,347,NAVARRO Thomas,29,20:02,1:42,4,1:11:27,56,7,24:24,Finish,1:58:34,4/28,ELM
4,5.,27,LEBOIS Paul,6,17:32,1:32,11,1:14:28,52,9,24:30,Finish,1:58:56,5/28,ELM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1172,STRT,1533,POUPIN Yann,-,,,-,,,-,,start,0:00:00,-,M40-44
1173,STRT,1536,BELIOT Francis,-,,,-,,,-,,start,0:00:00,-,M25-29
1174,STRT,1538,CURT Adrien,-,,,-,,,-,,start,0:00:00,-,M25-29
1175,STRT,1559,NIQUE Nelson,-,,,-,,,-,,start,0:00:00,-,M30-34


In [3]:
# Some participants did not finish the race, or their times were not recorded
# properly; those rows contain missing values, so drop every row with a NaN.
# The result is rebound rather than dropped with inplace=True, so the frame
# object displayed by the loading cell is not mutated behind the reader's back.
dataframe = dataframe.dropna(axis=0)
dataframe

Unnamed: 0,Position,Number,Name,Pos_in_swim,Swim_time,T1,Pos_in_bike,Bike_time,T2,Pos_in_run,Run_time,Race_control,Time,Rank,Category
0,1.,14,VINUELA GONZALEZ Kevin Tarek,3,17:25,1:29,1,1:08:58,55,4,23:41,Finish,1:52:29,1/28,ELM
1,2.,18,LEJEUNE Emmanuel,15,19:18,1:30,5,1:12:08,55,5,23:52,Finish,1:57:46,2/28,ELM
2,3.,7,DUPUIS Igor,2,17:23,1:30,8,1:12:56,53,18,25:24,Finish,1:58:07,3/28,ELM
3,4.,347,NAVARRO Thomas,29,20:02,1:42,4,1:11:27,56,7,24:24,Finish,1:58:34,4/28,ELM
4,5.,27,LEBOIS Paul,6,17:32,1:32,11,1:14:28,52,9,24:30,Finish,1:58:56,5/28,ELM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1029,1296.,1445,DEMARET Florian,988,34:10,5:24,1304,2:31:04,2:54,1206,46:50,Finish,4:00:25,166/200,M25-29
1030,1297.,1323,VIAL Nicolas,1342,40:40,5:24,1251,2:23:49,3:44,1201,46:46,Finish,4:00:26,89/118,M45-49
1031,1298.,1092,DEFRANCE Gaëtan,1327,40:26,4:31,1197,2:18:36,2:45,1349,54:27,Finish,4:00:48,90/118,M45-49
1032,1299.,979,FLAVIO Amato,803,31:48,5:04,1314,2:32:14,2:34,1273,49:29,Finish,4:01:10,94/117,M50-54


In [4]:
# Web scraping yields text only, so most columns are loaded as `object`.
# Inspect the dtypes to see which columns still need to be converted.
dataframe.dtypes

Position        object
Number           int64
Name            object
Pos_in_swim     object
Swim_time       object
T1              object
Pos_in_bike     object
Bike_time       object
T2              object
Pos_in_run      object
Run_time        object
Race_control    object
Time            object
Rank            object
Category        object
dtype: object

In [6]:
# Cast the bib number and the three per-discipline positions to integers,
# then display the dtypes to confirm the conversion.
dataframe = dataframe.astype(
    {
        column: "int"
        for column in ("Number", "Pos_in_swim", "Pos_in_bike", "Pos_in_run")
    }
)
dataframe.dtypes

Position        object
Number           int64
Name            object
Pos_in_swim      int64
Swim_time       object
T1              object
Pos_in_bike      int64
Bike_time       object
T2              object
Pos_in_run       int64
Run_time        object
Race_control    object
Time            object
Rank            object
Category        object
dtype: object

In [7]:
# Parse the timing columns whose text layout is already uniform.
# Swim and T1 are "minutes:seconds"; bike and overall time are
# "hours:minutes:seconds". No date is supplied, so pandas fills in a
# placeholder date and only the time-of-day part is meaningful.
dataframe = dataframe.assign(
    **{
        column: pd.to_datetime(dataframe[column], format=layout)
        for column, layout in (
            ("Swim_time", "%M:%S"),
            ("T1", "%M:%S"),
            ("Bike_time", "%H:%M:%S"),
            ("Time", "%H:%M:%S"),
        )
    }
)

In [8]:
# T2 and Run_time are scraped with a varying number of fields:
#   * T2 is either bare seconds ("55") or "minutes:seconds" ("2:54");
#   * Run_time is either "minutes:seconds" ("23:41") or
#     "hours:minutes:seconds" ("1:02:03").
# Pad the shorter form with a leading "00:" so each column has one layout.
# The decision is based on the number of ":" separators rather than on the
# string length, so values such as "5" (single-digit seconds) or "9:59"
# (a run under ten minutes) are normalised as well.
dataframe["T2"] = dataframe["T2"].apply(lambda x: x if ":" in x else "00:" + x)
dataframe["Run_time"] = dataframe["Run_time"].apply(
    lambda x: "00:" + x if x.count(":") == 1 else x
)

In [9]:
# Parse the two remaining timing columns: T2 as "minutes:seconds" and
# Run_time as "hours:minutes:seconds".
dataframe = dataframe.assign(
    T2=pd.to_datetime(dataframe["T2"], format="%M:%S"),
    Run_time=pd.to_datetime(dataframe["Run_time"], format="%H:%M:%S"),
)

In [10]:
# Check the column dtypes after the integer and datetime conversions.
dataframe.dtypes

Position                object
Number                   int64
Name                    object
Pos_in_swim              int64
Swim_time       datetime64[ns]
T1              datetime64[ns]
Pos_in_bike              int64
Bike_time       datetime64[ns]
T2              datetime64[ns]
Pos_in_run               int64
Run_time        datetime64[ns]
Race_control            object
Time            datetime64[ns]
Rank                    object
Category                object
dtype: object

In [12]:
# Largest overall time among the remaining rows. The value is a Timestamp
# whose date part is only the parser's placeholder; read the time-of-day.
dataframe.loc[:, "Time"].max()

Timestamp('1900-01-01 04:01:26')