In [1]:
# Third-party libraries for data handling (pandas, numpy) and plotting
# (plotly express, altair).
import pandas as pd
import numpy as np
import plotly.express as px
import altair as alt
# NOTE(review): vegafusion is imported but not referenced in the cells visible
# here — confirm whether a later cell enables it as an Altair data transformer.
import vegafusion
# Turn off Altair's max-rows check on chart data.
alt.data_transformers.disable_max_rows()

# Suppress FutureWarning messages so they do not clutter cell output.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Location of the Boston Marathon 2023 results CSV.
url = "https://data.scorenetwork.org/data/boston_marathon_2023.csv"

# Read the remote CSV into the working frame, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0,age_group,place_overall,place_gender,place_division,name,team,bib_number,half_time,finish_net,finish_gun,gender,half_time_sec,finish_net_sec,finish_gun_sec,finish_net_minutes
0,18-39,1,1,1,"Chebet, Evans",Team–,1,1H 2M 20S,2H 5M 54S,2H 5M 54S,M,3740.0,7554,7554,125.9
1,18-39,2,2,2,"Geay, Gabriel",Team–,3,1H 2M 20S,2H 6M 4S,2H 6M 4S,M,3740.0,7564,7564,126.066667
2,18-39,3,3,3,"Kipruto, Benson",Team–,5,1H 2M 19S,2H 6M 6S,2H 6M 6S,M,3739.0,7566,7566,126.1
3,18-39,4,4,4,"Korir, Albert",Team–,19,1H 2M 20S,2H 8M 1S,2H 8M 1S,M,3740.0,7681,7681,128.016667
4,18-39,5,5,5,"Talbi, Zouhair",Team–,31,1H 2M 20S,2H 8M 35S,2H 8M 35S,M,3740.0,7715,7715,128.583333


In [3]:

# Reinterpret the net finish time (seconds) as a timestamp counted from
# 1970-01-01, so the clock part of the value reads as elapsed HH:MM:SS.
df["fake_finish"] = pd.to_datetime(
    df["finish_net_sec"],
    unit="s",
    origin="1970-01-01",
)

# Preview the frame with the new column appended.
df.head(n=5)

Unnamed: 0,age_group,place_overall,place_gender,place_division,name,team,bib_number,half_time,finish_net,finish_gun,gender,half_time_sec,finish_net_sec,finish_gun_sec,finish_net_minutes,fake_finish
0,18-39,1,1,1,"Chebet, Evans",Team–,1,1H 2M 20S,2H 5M 54S,2H 5M 54S,M,3740.0,7554,7554,125.9,1970-01-01 02:05:54
1,18-39,2,2,2,"Geay, Gabriel",Team–,3,1H 2M 20S,2H 6M 4S,2H 6M 4S,M,3740.0,7564,7564,126.066667,1970-01-01 02:06:04
2,18-39,3,3,3,"Kipruto, Benson",Team–,5,1H 2M 19S,2H 6M 6S,2H 6M 6S,M,3739.0,7566,7566,126.1,1970-01-01 02:06:06
3,18-39,4,4,4,"Korir, Albert",Team–,19,1H 2M 20S,2H 8M 1S,2H 8M 1S,M,3740.0,7681,7681,128.016667,1970-01-01 02:08:01
4,18-39,5,5,5,"Talbi, Zouhair",Team–,31,1H 2M 20S,2H 8M 35S,2H 8M 35S,M,3740.0,7715,7715,128.583333,1970-01-01 02:08:35


In [4]:
# Confirm the converted column is stored with a datetime dtype.
df.dtypes["fake_finish"]

dtype('<M8[ns]')

In [10]:
# Histogram of the pseudo-timestamp finish times; nbins caps the bin count.
simple_fig = px.histogram(x=df["fake_finish"], nbins=100)

# Title the x-axis and render its ticks as HH:MM:SS.
simple_fig.update_layout(
    xaxis={
        "title": {"text": "Finish Time (HH:MM:SS)"},
        "tickformat": "%H:%M:%S",
    }
)
simple_fig

In [18]:
# Width of each histogram bin in minutes. Used for both the rounding step and
# the chart title so the two cannot drift apart.
BIN_MINUTES = 5

# Rebuild the pseudo-timestamp column (net seconds counted from 1970-01-01) so
# this cell does not rely on an earlier cell having created it.
df["fake_finish"] = pd.to_datetime(df["finish_net_sec"], unit="s", origin="1970-01-01")

# Assign each finish to the START of its bin by rounding down (floor): a bar
# labelled 02:05:00 covers finishes in [02:05:00, 02:10:00). Using ceil here
# would instead label each bar with the END of its bin.
df["binned_time"] = df["fake_finish"].dt.floor(f"{BIN_MINUTES}min")

# Count runners per bin: one row per non-empty bin, columns binned_time / count.
grouped_df = df.groupby("binned_time").size().reset_index(name="count")

# Bar chart over the pre-aggregated counts (reads like a histogram).
alt_fig = alt.Chart(grouped_df).mark_bar().encode(
    # Temporal x-axis, with ticks shown as HH:MM:SS.
    x=alt.X("binned_time:T", title="Finish Time (HH:MM:SS)", axis=alt.Axis(format="%H:%M:%S")),
    y=alt.Y("count:Q", title="Count"),
    tooltip=[
        alt.Tooltip("binned_time:T", title="Binned Time", format="%H:%M:%S"),
        alt.Tooltip("count:Q", title="Count")
    ]
).properties(
    title=f"Finish Time Distribution ({BIN_MINUTES}-Minute Bins)",
    width=800
).interactive()

alt_fig