# Prelude

In [65]:
import pandas as pd
import altair as alt

In [66]:
csvpath = input("File path? (defaults to: ./marathon_small.csv): ") or "./marathon_small.csv"
print(csvpath)

./marathon_small.csv


# Step 1: Load and Explore the Data

In [67]:
# Load the marathon CSV (path chosen in the prelude cell) into a DataFrame.
dataframe = pd.read_csv(filepath_or_buffer=csvpath)

# Bare trailing expression: lets the notebook render the frame as a table.
dataframe

Unnamed: 0,age,bmi,km5_time_seconds,km10_time_seconds,sex
0,25.0,21.622116,,2798.0,female
1,41.0,23.905970,1210.0,,male
2,25.0,21.640728,994.0,,male
3,35.0,23.592323,1075.0,2135.0,male
4,34.0,22.706404,1186.0,,male
...,...,...,...,...,...
1828,32.0,22.727272,,2591.0,female
1829,55.0,20.585695,1157.0,2771.0,male
1830,42.0,23.747681,1203.0,,male
1831,23.0,24.209032,2040.0,,female


## Display top 25

In [68]:
dataframe.iloc[:25]  # first 25 rows by position; equivalent to dataframe.head(25)

Unnamed: 0,age,bmi,km5_time_seconds,km10_time_seconds,sex
0,25.0,21.622116,,2798.0,female
1,41.0,23.90597,1210.0,,male
2,25.0,21.640728,994.0,,male
3,35.0,23.592323,1075.0,2135.0,male
4,34.0,22.706404,1186.0,,male
5,45.0,42.087543,3240.0,,female
6,33.0,22.518295,1292.0,,male
7,58.0,25.234079,,3420.0,male
8,29.0,24.505407,1440.0,3240.0,male
9,36.0,25.408615,2115.0,4210.0,female


## \# of Rows and Columns

In [69]:
# Row count comes from the index, column count from the column labels
# (together these are the two entries of dataframe.shape).
rows = len(dataframe.index)
cols = len(dataframe.columns)
print(f"Rows: {rows}, Columns: {cols}")

Rows: 1833, Columns: 5


# Step 2: Subset and Filter Data

## Only male runners

In [70]:
# Boolean mask: True for rows whose `sex` value is exactly "male".
is_male = dataframe["sex"] == "male"

# Keep only the masked rows; the original row labels are preserved.
male_runners = dataframe[is_male]
male_runners

Unnamed: 0,age,bmi,km5_time_seconds,km10_time_seconds,sex
1,41.0,23.905970,1210.0,,male
2,25.0,21.640728,994.0,,male
3,35.0,23.592323,1075.0,2135.0,male
4,34.0,22.706404,1186.0,,male
6,33.0,22.518295,1292.0,,male
...,...,...,...,...,...
1825,36.0,25.968874,1380.0,2940.0,male
1826,58.0,29.749651,,4260.0,male
1829,55.0,20.585695,1157.0,2771.0,male
1830,42.0,23.747681,1203.0,,male


## Only `bmi` and `km10_time_seconds` columns

In [71]:
# Keep only the two columns needed for the BMI vs. 10 km time comparison.
# Naming note: "reduced" is a little vague; in a larger codebase a more explicit
# name (e.g. male_runners_reduced__bmi__km10_time_seconds) may be preferable,
# depending on the local style guide. It is good enough for this notebook.
male_runners_reduced = male_runners.loc[:, ["bmi", "km10_time_seconds"]]
male_runners_reduced

Unnamed: 0,bmi,km10_time_seconds
1,23.905970,
2,21.640728,
3,23.592323,2135.0
4,22.706404,
6,22.518295,
...,...,...
1825,25.968874,2940.0
1826,29.749651,4260.0
1829,20.585695,2771.0
1830,23.747681,


In [72]:
# Data cleaning (not explicitly asked for in the instructions): drop every row
# that is missing a value in either remaining column.
# An imputation strategy (e.g. assuming some linear relationship between bmi and
# km10 time) was considered, but it assumes a lot and could bias the
# interpretation, so removing incomplete rows outright seemed safer.
male_runners_cleaned = male_runners_reduced.dropna(axis="index", how="any")
male_runners_cleaned

Unnamed: 0,bmi,km10_time_seconds
3,23.592323,2135.0
7,25.234079,3420.0
8,24.505407,3240.0
10,26.401688,3360.0
11,25.560312,2580.0
...,...,...
1823,22.263451,2890.0
1825,25.968874,2940.0
1826,29.749651,4260.0
1829,20.585695,2771.0


# Step 3: Transform Data

In [73]:
# Add a derived column with the 10 km time in minutes (seconds / 60).
# `assign` returns a new frame, so male_runners_cleaned is left untouched.
# (As above, "transformed" is a somewhat vague name but fine for a notebook.)
male_runners_transformed = male_runners_cleaned.assign(
    km10_time_minutes=lambda frame: frame["km10_time_seconds"] / 60
)
male_runners_transformed

Unnamed: 0,bmi,km10_time_seconds,km10_time_minutes
3,23.592323,2135.0,35.583333
7,25.234079,3420.0,57.000000
8,24.505407,3240.0,54.000000
10,26.401688,3360.0,56.000000
11,25.560312,2580.0,43.000000
...,...,...,...
1823,22.263451,2890.0,48.166667
1825,25.968874,2940.0,49.000000
1826,29.749651,4260.0,71.000000
1829,20.585695,2771.0,46.183333


# Step 4: Visualise the Relationship

## Create scatter plot

In [74]:
# Baseline scatter plot: BMI on the x-axis, 10 km time (minutes) on the y-axis,
# one point per row of male_runners_transformed.
plot = (
    alt.Chart(male_runners_transformed)
    .mark_point()
    .encode(x="bmi", y="km10_time_minutes")
)
plot

## Customize the visualization

In [75]:
# Same scatter plot with human-readable axis titles and 12pt axis fonts.
# Rebinding `plot` is deliberate: the chart is rebuilt from
# male_runners_transformed, so this cell does not depend on the previous
# value of `plot` and can be re-run on its own.
plot = (
    alt.Chart(male_runners_transformed)
    .mark_point()
    .encode(
        x=alt.X("bmi", title="Body Mass Index (BMI)"),
        y=alt.Y("km10_time_minutes", title="10 km run time (minutes)"),
    )
    .configure_axis(labelFontSize=12, titleFontSize=12)
)
plot

# Step 5: Interpret the Results:

B. Positive correlation, i.e. higher BMI → longer race time.

**Other comments:**

There is a large cluster of points, which may reflect the typical (most common) 10k race participant: people need a certain confidence in their own athletic ability to take part, and those without it self-select out of the race. The data may therefore be limited to people who are likely to run a 10k, and may not be representative of the wider population.

Assuming a 10 km time is recorded only for runners who actually reached the 10 km mark (i.e. it was not extrapolated from the shorter `km5_time_seconds` split), the data also excludes anyone who was unable to make it that far. Alternatively, the race may have had separate divisions (e.g. a 5k and a 10k). Without further context on the dataset, it is not possible to tell which of these applies.

