In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

%matplotlib inline

In [4]:
import os
data_file = os.path.join("Resources", "marathon_results_2019.csv")
df = pd.read_csv(data_file)
df.head()

Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 8,5K,...,25K,30K,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division
0,2,"Cherono, Lawrence",30,M,Eldoret,,KEN,,,0:15:11,...,1:16:23,1:32:01,1:47:16,2:01:45,0:04:53,,2:07:57,1,1,1
1,6,"Desisa, Lelisa",29,M,Ambo,,ETH,,,0:15:10,...,1:16:24,1:32:01,1:47:16,2:01:46,0:04:53,,2:07:59,2,2,2
2,7,"Kipkemoi, Kenneth",34,M,Eldoret,,KEN,,,0:15:14,...,1:16:24,1:32:01,1:47:16,2:01:45,0:04:54,,2:08:07,3,3,3
3,8,"Kandie, Felix",32,M,Iten,,KEN,,,0:15:14,...,1:16:24,1:32:01,1:47:16,2:02:08,0:04:55,,2:08:54,4,4,4
4,11,"Kirui, Geoffrey",26,M,Keringet,,KEN,,,0:15:12,...,1:16:23,1:32:01,1:47:16,2:01:57,0:04:56,,2:08:55,5,5,5


In [5]:
# Subset the dataframe to only the columns "Age", "M/F", split times (i.e. "5K, 10K", etc.), "Pace", and "Official Time". 
df_subset = df[["Age", "M/F", "5K", "10K", "15K", "20K", "25K", "30K", "35K", "40K", "Pace", "Official Time"]]
df_subset.head()

Unnamed: 0,Age,M/F,5K,10K,15K,20K,25K,30K,35K,40K,Pace,Official Time
0,30,M,0:15:11,0:30:21,0:45:48,1:01:16,1:16:23,1:32:01,1:47:16,2:01:45,0:04:53,2:07:57
1,29,M,0:15:10,0:30:22,0:45:46,1:01:16,1:16:24,1:32:01,1:47:16,2:01:46,0:04:53,2:07:59
2,34,M,0:15:14,0:30:22,0:45:47,1:01:17,1:16:24,1:32:01,1:47:16,2:01:45,0:04:54,2:08:07
3,32,M,0:15:14,0:30:24,0:45:47,1:01:16,1:16:24,1:32:01,1:47:16,2:02:08,0:04:55,2:08:54
4,26,M,0:15:12,0:30:21,0:45:46,1:01:15,1:16:23,1:32:01,1:47:16,2:01:57,0:04:56,2:08:55


In [15]:
# Convert the split times, "Pace", and "Official Time" to timedeltas using apply() and pandas.to_timedelta()
time_columns = ["5K", "10K", "15K", "20K", "25K", "30K", "35K", "40K", "Pace", "Official Time"]
df_subset[time_columns] = df_subset[time_columns].apply(pd.to_timedelta)
df_subset.head()

Unnamed: 0,Age,M/F,5K,10K,15K,20K,25K,30K,35K,40K,Pace,Official Time
0,30,M,0 days 00:00:00.000000911,0 days 00:00:00.000001821,0 days 00:00:00.000002748,0 days 00:00:00.000003676,0 days 00:00:00.000004583,0 days 00:00:00.000005521,0 days 00:00:00.000006436,0 days 00:00:00.000007305,0 days 00:00:00.000000293,0 days 00:00:00.000007677
1,29,M,0 days 00:00:00.000000910,0 days 00:00:00.000001822,0 days 00:00:00.000002746,0 days 00:00:00.000003676,0 days 00:00:00.000004584,0 days 00:00:00.000005521,0 days 00:00:00.000006436,0 days 00:00:00.000007306,0 days 00:00:00.000000293,0 days 00:00:00.000007679
2,34,M,0 days 00:00:00.000000914,0 days 00:00:00.000001822,0 days 00:00:00.000002747,0 days 00:00:00.000003677,0 days 00:00:00.000004584,0 days 00:00:00.000005521,0 days 00:00:00.000006436,0 days 00:00:00.000007305,0 days 00:00:00.000000294,0 days 00:00:00.000007687
3,32,M,0 days 00:00:00.000000914,0 days 00:00:00.000001824,0 days 00:00:00.000002747,0 days 00:00:00.000003676,0 days 00:00:00.000004584,0 days 00:00:00.000005521,0 days 00:00:00.000006436,0 days 00:00:00.000007328,0 days 00:00:00.000000295,0 days 00:00:00.000007734
4,26,M,0 days 00:00:00.000000912,0 days 00:00:00.000001821,0 days 00:00:00.000002746,0 days 00:00:00.000003675,0 days 00:00:00.000004583,0 days 00:00:00.000005521,0 days 00:00:00.000006436,0 days 00:00:00.000007317,0 days 00:00:00.000000296,0 days 00:00:00.000007735


In [16]:
df_subset.dtypes

Age                        int64
M/F                       object
5K               timedelta64[ns]
10K              timedelta64[ns]
15K              timedelta64[ns]
20K              timedelta64[ns]
25K              timedelta64[ns]
30K              timedelta64[ns]
35K              timedelta64[ns]
40K              timedelta64[ns]
Pace             timedelta64[ns]
Official Time    timedelta64[ns]
dtype: object

In [19]:
# Convert timedeltas to seconds by applying the lambda function lambda x: x.dt.total_seconds()
df_subset[time_columns] = df_subset[time_columns].apply(lambda x: x.dt.total_seconds())


AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
# Subset df to only rows where the split values are non-zero
### YOUR CODE HERE ###

In [None]:
# Use LabelEncoder to convert 'M/F' into integer labels
df["M/F"] = LabelEncoder().fit_transform(df["M/F"])

In [None]:
# Convert 'Age' to a numeric value
### YOUR CODE HERE ###

In [None]:
# 'Pace' should be perfectly correlated with 'Official Time', so we'll remove it from our X data (but keep it for analysis)
# Plot a scatter plot of 'Pace' against 'Official Time' to make sure
### YOUR CODE HERE ###

In [None]:
# Create a training set 'X' with every column except 'Pace'
### YOUR CODE HERE ###

In [None]:
# Scale the dataset using MinMaxScaler()
### YOUR CODE HERE ###