# Imports

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

# Get clean data from scraping

In [4]:
# Load the scraped men's and women's result tables, one data frame per file
men_dataframe, women_dataframe = map(
    pd.read_csv,
    ("../data/medium_men_results.csv", "../data/medium_women_results.csv"),
)

In [5]:
# Stack the men's rows on top of the women's rows in a single data frame
dataframe = pd.concat(objs=[men_dataframe, women_dataframe], axis="index")

In [6]:
# Flag the Did Not Finish (DNF) and non-starting (STRT) rows, then keep everything else
is_dnf_or_non_starter = dataframe["Position"].isin(["DNF", "STRT"])
dataframe = dataframe[~is_dnf_or_non_starter]

In [7]:
# Unfortunately, some racers were inaccurately recorded, so we remove every row
# that has a missing value in any column.
# The result is rebound instead of using `inplace=True`: `dataframe` was just
# produced by boolean-mask filtering, and mutating such a frame in place can
# trigger SettingWithCopyWarning. Rebinding also keeps the step easy to chain.
dataframe = dataframe.dropna(axis=0)

In [8]:
# Web scraping yields mostly text data, so inspect the current dtypes to see
# which columns still have to be converted
dataframe.dtypes

Position        object
Number           int64
Name            object
Pos_in_swim     object
Swim_time       object
T1              object
Pos_in_bike     object
Bike_time       object
T2              object
Pos_in_run      object
Run_time        object
Race_control    object
Time            object
Rank            object
Category        object
dtype: object

In [9]:
# Cast the numeric columns to integers, then show the resulting dtypes
dataframe = dataframe.astype(
    {
        column_name: "int"
        for column_name in ("Number", "Pos_in_swim", "Pos_in_bike", "Pos_in_run")
    }
)
dataframe.dtypes

Position        object
Number           int64
Name            object
Pos_in_swim      int64
Swim_time       object
T1              object
Pos_in_bike      int64
Bike_time       object
T2              object
Pos_in_run       int64
Run_time        object
Race_control    object
Time            object
Rank            object
Category        object
dtype: object

In [10]:
# Parse the timing columns into datetimes; each column is paired with the
# strptime format its text is parsed with
time_formats = {
    "Swim_time": "%M:%S",
    "T1": "%M:%S",
    "Bike_time": "%H:%M:%S",
    "Time": "%H:%M:%S",
}
for time_column, time_format in time_formats.items():
    dataframe[time_column] = pd.to_datetime(dataframe[time_column], format=time_format)

In [11]:
# T2 and Run_time are not written in one consistent layout, so pad the short values:
# a two-character T2 and a Run_time of at most five characters each get a "00:" prefix
dataframe["T2"] = dataframe["T2"].map(
    lambda value: value if len(value) != 2 else "00:" + value
)
dataframe["Run_time"] = dataframe["Run_time"].map(
    lambda value: value if len(value) > 5 else "00:" + value
)

In [12]:
# Parse the two remaining timing columns into datetimes
dataframe = dataframe.assign(
    T2=pd.to_datetime(dataframe["T2"], format="%M:%S"),
    Run_time=pd.to_datetime(dataframe["Run_time"], format="%H:%M:%S"),
)

In [13]:
# Set the right date of race.
# Parsing time-only strings leaves every timestamp on the placeholder date
# 1900-01-01, so each datetime column is moved onto the actual race day.
RACE_DATE = pd.Timestamp("2023-07-28")

# "datetime" selects the datetime64 columns without pinning a specific unit
# such as [ns].
date_time_columns = list(dataframe.select_dtypes(include=["datetime"]).columns)
for column in date_time_columns:
    # Keep only the time of day (offset from midnight) and re-anchor it on the
    # race date. This is pure datetime arithmetic, with no string round trip,
    # and re-running the cell leaves the values unchanged.
    time_of_day = dataframe[column] - dataframe[column].dt.normalize()
    dataframe[column] = time_of_day + RACE_DATE


In [14]:
# Display the data frame to check the converted columns
dataframe

Unnamed: 0,Position,Number,Name,Pos_in_swim,Swim_time,T1,Pos_in_bike,Bike_time,T2,Pos_in_run,Run_time,Race_control,Time,Rank,Category
0,1.,14,VINUELA GONZALEZ Kevin Tarek,3,2023-07-28 00:17:25,2023-07-28 00:01:29,1,2023-07-28 01:08:58,2023-07-28 00:00:55,4,2023-07-28 00:23:41,Finish,2023-07-28 01:52:29,1/28,ELM
1,2.,18,LEJEUNE Emmanuel,15,2023-07-28 00:19:18,2023-07-28 00:01:30,5,2023-07-28 01:12:08,2023-07-28 00:00:55,5,2023-07-28 00:23:52,Finish,2023-07-28 01:57:46,2/28,ELM
2,3.,7,DUPUIS Igor,2,2023-07-28 00:17:23,2023-07-28 00:01:30,8,2023-07-28 01:12:56,2023-07-28 00:00:53,18,2023-07-28 00:25:24,Finish,2023-07-28 01:58:07,3/28,ELM
3,4.,347,NAVARRO Thomas,29,2023-07-28 00:20:02,2023-07-28 00:01:42,4,2023-07-28 01:11:27,2023-07-28 00:00:56,7,2023-07-28 00:24:24,Finish,2023-07-28 01:58:34,4/28,ELM
4,5.,27,LEBOIS Paul,6,2023-07-28 00:17:32,2023-07-28 00:01:32,11,2023-07-28 01:14:28,2023-07-28 00:00:52,9,2023-07-28 00:24:30,Finish,2023-07-28 01:58:56,5/28,ELM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,1405.,283,FLASSE Vic,895,2023-07-28 00:32:56,2023-07-28 00:06:46,1401,2023-07-28 03:07:11,2023-07-28 00:03:14,1383,2023-07-28 01:01:05,Finish,2023-07-28 04:51:15,25/26,F20-24
367,1406.,1554,CAMILLE Bensiam,871,2023-07-28 00:32:41,2023-07-28 00:05:26,1400,2023-07-28 03:06:01,2023-07-28 00:02:51,1391,2023-07-28 01:06:34,Finish,2023-07-28 04:53:36,81/93,F25-29
368,1407.,1167,MOUROT Anne-Lise,1108,2023-07-28 00:35:16,2023-07-28 00:03:42,1405,2023-07-28 03:13:29,2023-07-28 00:01:48,1381,2023-07-28 01:00:22,Finish,2023-07-28 04:54:38,45/52,F45-49
370,1409.,745,SHE Michelle,1410,2023-07-28 00:45:53,2023-07-28 00:09:10,1393,2023-07-28 03:01:21,2023-07-28 00:03:19,1384,2023-07-28 01:01:48,Finish,2023-07-28 05:01:33,82/93,F25-29


In [15]:
# Reorder the rows from the fastest to the slowest overall time
dataframe = dataframe.sort_values("Time")

In [16]:
# Remove rows that are exact duplicates of an earlier row
dataframe = dataframe.drop_duplicates()

In [17]:
# Use the Position column as the row index
dataframe = dataframe.set_index("Position")

# Data analysis

In [18]:
# Display the cleaned data frame that the analysis works on
dataframe

Unnamed: 0_level_0,Number,Name,Pos_in_swim,Swim_time,T1,Pos_in_bike,Bike_time,T2,Pos_in_run,Run_time,Race_control,Time,Rank,Category
Position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1.,14,VINUELA GONZALEZ Kevin Tarek,3,2023-07-28 00:17:25,2023-07-28 00:01:29,1,2023-07-28 01:08:58,2023-07-28 00:00:55,4,2023-07-28 00:23:41,Finish,2023-07-28 01:52:29,1/28,ELM
2.,18,LEJEUNE Emmanuel,15,2023-07-28 00:19:18,2023-07-28 00:01:30,5,2023-07-28 01:12:08,2023-07-28 00:00:55,5,2023-07-28 00:23:52,Finish,2023-07-28 01:57:46,2/28,ELM
3.,7,DUPUIS Igor,2,2023-07-28 00:17:23,2023-07-28 00:01:30,8,2023-07-28 01:12:56,2023-07-28 00:00:53,18,2023-07-28 00:25:24,Finish,2023-07-28 01:58:07,3/28,ELM
4.,347,NAVARRO Thomas,29,2023-07-28 00:20:02,2023-07-28 00:01:42,4,2023-07-28 01:11:27,2023-07-28 00:00:56,7,2023-07-28 00:24:24,Finish,2023-07-28 01:58:34,4/28,ELM
5.,27,LEBOIS Paul,6,2023-07-28 00:17:32,2023-07-28 00:01:32,11,2023-07-28 01:14:28,2023-07-28 00:00:52,9,2023-07-28 00:24:30,Finish,2023-07-28 01:58:56,5/28,ELM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1405.,283,FLASSE Vic,895,2023-07-28 00:32:56,2023-07-28 00:06:46,1401,2023-07-28 03:07:11,2023-07-28 00:03:14,1383,2023-07-28 01:01:05,Finish,2023-07-28 04:51:15,25/26,F20-24
1406.,1554,CAMILLE Bensiam,871,2023-07-28 00:32:41,2023-07-28 00:05:26,1400,2023-07-28 03:06:01,2023-07-28 00:02:51,1391,2023-07-28 01:06:34,Finish,2023-07-28 04:53:36,81/93,F25-29
1407.,1167,MOUROT Anne-Lise,1108,2023-07-28 00:35:16,2023-07-28 00:03:42,1405,2023-07-28 03:13:29,2023-07-28 00:01:48,1381,2023-07-28 01:00:22,Finish,2023-07-28 04:54:38,45/52,F45-49
1409.,745,SHE Michelle,1410,2023-07-28 00:45:53,2023-07-28 00:09:10,1393,2023-07-28 03:01:21,2023-07-28 00:03:19,1384,2023-07-28 01:01:48,Finish,2023-07-28 05:01:33,82/93,F25-29
