# Marathon Runner Books

Marathon runners rarely write books, but some do. What can we learn from the books that we have by these runners?

In [None]:
%%capture
import json
from google.colab import userdata

!pip install kaggle

!mkdir -p ~/.config/kaggle
!touch ~/.config/kaggle/kaggle.json

secrets = {
  "username": userdata.get('KAGGLE_USERNAME'),
  "key": userdata.get('KAGGLE_KEY'),
}

with open('/root/.config/kaggle/kaggle.json', 'w') as file:
    json.dump(secrets, file)

In [None]:
%%capture

import kaggle

!kaggle datasets download --force -d evgenyarbatov/marathon-record-runner-books
!unzip -o marathon-record-runner-books.zip

!kaggle datasets download --force -d evgenyarbatov/marathon-running-times
!unzip -o marathon-running-times.zip

In [None]:
import ast

import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the book metadata and reshape it to one row per (book, author).
books_df = pd.read_csv('/content/marathon-runner-books.csv')

# The first four characters of `published_date` are taken as the year. The
# integer cast is strict: a missing or non-numeric date raises here rather than
# flowing into the analysis as a bogus year. The result is already the year as
# an integer, so no further datetime conversion is needed.
books_df['publish_year'] = books_df['published_date'].str[:4].astype(int)

# `authors` holds Python literals stored as text (presumably lists of names —
# confirm against the CSV). Parse them, then emit one row per author so that
# each author can be matched against a runner name individually.
books_df['authors'] = books_df['authors'].apply(ast.literal_eval)
books_df = books_df.explode('authors')

books_df.head(3)

In [None]:
# Load the marathon results and turn both date columns into real datetimes.
marathon_df = pd.read_csv('/content/marathon.csv')

# Both columns are parsed with the same day.month.year format.
for date_column in ('Date', 'Date of Birth'):
    marathon_df[date_column] = pd.to_datetime(marathon_df[date_column], format='%d.%m.%Y')

marathon_df.head(3)

In [None]:
# Inner join: keep only marathon rows whose runner name exactly equals a book
# author, pairing every such marathon row with every book by that author.
df = marathon_df.merge(
    books_df,
    how='inner',
    left_on='Name',
    right_on='authors',
)

# Narrow the merged frame to the columns used in the rest of the notebook.
df = df[
    ['Rank', 'Time', 'Name', 'Country', 'Date of Birth', 'City', 'Date']
    + ['title', 'publish_year']
]

df

In [None]:
# Count distinct titles per publication year and draw the counts as a line.
# `nunique` is used so a title that appears on several merged rows counts once.
grouped = (
    df.groupby('publish_year')['title']
    .nunique()
    .reset_index(name='Count')
)

fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
ax.plot(grouped['publish_year'], grouped['Count'])

ax.set_title('Marathon Runner Books by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Count of Books Published')

plt.xticks(rotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Derive the two year-difference measures used by the charts and tables below.
#
# `df` was produced by selecting a subset of columns, so writing new columns
# into it in place can raise SettingWithCopyWarning. `assign` builds a new
# frame instead, which also keeps this cell safe to re-run.
#
# Both measures are differences of calendar years only (months and days are
# ignored), so each can be off by up to one year from the exact elapsed time.
df = df.assign(**{
    # Publication year minus the year of the marathon result on that row.
    'Years to Write': df['publish_year'] - df['Date'].dt.year,
    # Publication year minus the runner's birth year.
    'Age at Publication': df['publish_year'] - df['Date of Birth'].dt.year,
})

df.head(3)

In [None]:
# Bar chart of runner age at publication, grouped into six equal-width bins.

# The merged frame repeats a (runner, book) pair once for every marathon row
# of that runner; the age is identical on each repeat, so count each
# publication only once.
publications = df.drop_duplicates(subset=['Name', 'title'])

# Bin into a local Series rather than a scratch column on the shared frame.
age_bins = pd.cut(publications['Age at Publication'], bins=6)

# value_counts() orders by frequency; sort by the bin itself so the bars run
# from the youngest age range to the oldest, as a distribution should.
bin_counts = age_bins.value_counts().sort_index()

fig, ax = plt.subplots(dpi=300)
bin_counts.plot(kind='bar', ax=ax)

ax.set_xlabel('Age')
ax.set_ylabel('Count')
ax.set_title('Distribution of Runner Age at Publication Time')

plt.show()

In [None]:
# Bar chart of the gap, in calendar years, between a marathon result and the
# publication of a book by the same runner, grouped into six equal-width bins.

# Bin into a local Series rather than a scratch column on the shared frame.
years_bins = pd.cut(df['Years to Write'], bins=6)

# value_counts() orders by frequency; sort by the bin itself so the bars run
# from the shortest gap to the longest instead of from tallest to shortest.
bin_counts = years_bins.value_counts().sort_index()

fig, ax = plt.subplots(dpi=300)
bin_counts.plot(kind='bar', ax=ax)

ax.set_xlabel('Years')
ax.set_ylabel('Count')
ax.set_title('How Many Years To Write Book After Marathon')

plt.show()

In [None]:
# Runner/book pairs ordered from the longest marathon-to-publication gap down.
df.sort_values('Years to Write', ascending=False)[
    ['Name', 'title', 'Years to Write']
]

In [None]:
# Runner/book pairs ordered from the oldest age at publication down.
df.sort_values('Age at Publication', ascending=False)[
    ['Name', 'title', 'Age at Publication']
]