# Get info on marathon cities

In [28]:
import pandas as pd 
from functools import reduce

In [29]:
df = pd.read_csv(
    '../data/running-records.csv'
)

In [30]:
marathons_df = df.loc[
    df['Event'] == 'Marathon'
]

In [53]:
unique_cities_sorted = marathons_df['City'].value_counts().sort_values(ascending=False)
major_cities = unique_cities_sorted[unique_cities_sorted > 50]

record_count = major_cities.reset_index()
record_count.columns = ['City', 'Record Count']

record_count

Unnamed: 0,City,Record Count
0,London,649
1,Berlin,452
2,Tokyo,429
3,Chicago,368
4,Boston,359
5,Paris,346
6,Valencia,344
7,Amsterdam,321
8,Rotterdam,293
9,New York City,292


In [54]:
count_by_gender = marathons_df.groupby(['City', 'Gender'])\
.agg(unique_count=('Name', 'nunique'))\
.reset_index()\
.rename(columns={'unique_count': 'People Count By Gender'})

gender_count = count_by_gender.pivot(
    index='City', 
    columns='Gender', 
    values='People Count By Gender'
).reset_index()
gender_count.columns.name = None

gender_count

Unnamed: 0,City,Men,Women
0,Abu Dhabi,11.0,13.0
1,Ad-Dawhah,,1.0
2,Ampugnano,27.0,17.0
3,Amsterdam,161.0,90.0
4,Annecy,,1.0
...,...,...,...
242,Yamaguchi,1.0,1.0
243,Yingkou,,1.0
244,Yokohama,,26.0
245,Zhengzhou,3.0,6.0


In [55]:
country_count = marathons_df.groupby('City')['Country'].nunique().reset_index(name='Country Count')
country_count

Unnamed: 0,City,Country Count
0,Abu Dhabi,6
1,Ad-Dawhah,1
2,Ampugnano,11
3,Amsterdam,29
4,Annecy,1
...,...,...
242,Yamaguchi,1
243,Yingkou,1
244,Yokohama,11
245,Zhengzhou,3


In [56]:
people_count = marathons_df.groupby('City')['Name'].nunique().reset_index(name='People Count')
people_count

Unnamed: 0,City,People Count
0,Abu Dhabi,24
1,Ad-Dawhah,1
2,Ampugnano,44
3,Amsterdam,251
4,Annecy,1
...,...,...
242,Yamaguchi,2
243,Yingkou,1
244,Yokohama,26
245,Zhengzhou,9


In [57]:
marathon_summary = [record_count, country_count, people_count, gender_count]

merged_df = reduce(lambda left, right: pd.merge(left, right, on='City', how='outer'), marathon_summary)

merged_df = merged_df.dropna(subset=['Record Count'], axis=0)

merged_df

Unnamed: 0,City,Record Count,Country Count,People Count,Men,Women
0,London,649.0,50,350,146.0,204.0
1,Berlin,452.0,34,356,162.0,194.0
2,Tokyo,429.0,33,309,183.0,126.0
3,Chicago,368.0,30,255,134.0,121.0
4,Boston,359.0,42,235,100.0,135.0
5,Paris,346.0,23,274,161.0,113.0
6,Valencia,344.0,47,271,149.0,122.0
7,Amsterdam,321.0,29,251,161.0,90.0
8,Rotterdam,293.0,35,229,150.0,79.0
9,New York City,292.0,29,165,60.0,105.0


In [44]:
merged_df.to_json(
    '../public/marathon_cities.json', 
    orient='records',
    indent=2,
)