# PREPARE

## MODULES IMPORT

In [3]:
import math as math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## CLEAN DATA IMPORT

In [5]:
# Load the cleaned results file into the working DataFrame used by every cell below.
clean_csv_path = './boston_clean.csv'
df = pd.read_csv(clean_csv_path)

### CONVERTING THE COLUMNS THAT HOLD TIME VALUES TO TIMEDELTA

In [7]:
# Checkpoint split columns are named '5K', '10K', ..., '40K'.
split_columns = [f'{km}K' for km in range(5, 45, 5)]
# All columns that hold durations: pace, the checkpoint splits and the finish time.
time_columns = ['Pace', *split_columns, 'Official Time']
# Parse every duration column to timedelta; values that cannot be parsed
# become NaT because of errors='coerce'.
df[time_columns] = df[time_columns].apply(pd.to_timedelta, errors='coerce')

# BASIC INFORMATION

In [9]:
# Every row of the dataset is counted as one participant.
num_participants = df.shape[0]
participants_summary = f"Number of participants: {num_participants}"
print(participants_summary)

Number of participants: 79638


## PARTICIPANT ATTRIBUTES
The dataset contains the following attributes for each marathon participant:
- Name, gender, age
- Country and city
- Split times at every 5 km checkpoint (5K to 40K)
- Pace, official finish time, rank
- Year of participation

# UNIVARIATE ANALYSIS

## M/F

In [13]:
# Series with the gender code of every participant ('M' / 'F' are the values used below).
gender = df.loc[:, 'M/F']

### GENDER DISTRIBUTION 

#### workshop

In [16]:
# Boolean row masks, one per gender code
is_male = gender == 'M'
is_female = gender == 'F'

# Sub-frames restricted to a single gender
df_male = df.loc[is_male]
df_female = df.loc[is_female]

# Head count per gender
num_male = df_male.shape[0]
num_female = df_female.shape[0]

# Share of all participants in percent, rounded to two decimals
percent_male = round((num_male / num_participants * 100), 2)
percent_female = round((num_female / num_participants * 100), 2)

#### RESULTS

In [18]:
# Gender split summary: absolute counts first, then percentage shares (one line each).
print(
    f"Number of males: {num_male}",
    f"Number of females: {num_female}",
    f"Percent of males: {percent_male}",
    f"Percent of females: {percent_female}",
    sep='\n',
)

Number of males: 43482
Number of females: 36156
Percent of males: 54.6
Percent of females: 45.4


## AGE

In [20]:
# Series with the age of every participant
age = df.loc[:, 'Age']

### MIN, MAX, AVG, MED

#### workshop

In [23]:
# Youngest and oldest participant
age_min, age_max = age.min(), age.max()
# Central tendency: mean (rounded to two decimals) and median
age_mean = round(age.mean(), 2)
age_median = age.median()

#### RESULTS

In [25]:
# Age summary, one statistic per line.
print(
    f"Max age of participants: {age_max}",
    f"Min age of participants: {age_min}",
    f"Average age: {age_mean}",
    f"Median age: {age_median}",
    sep='\n',
)

Max age of participants: 84
Min age of participants: 18
Average age: 42.42
Median age: 42.0


### DISTRIBUTION OF AGE GROUPS

In [27]:
# Series with the age-group label of every participant
age_groups = df.loc[:, 'Age Group']

#### workshop

In [29]:
# count of age groups
age_groups_count = age_groups.value_counts().sort_index()
# percentage of age groups
age_groups_percent = round(age_groups.value_counts(normalize=True).sort_index() * 100, 2)
# creating df to present results
df_age_groups_dist = pd.concat([age_groups_count, age_groups_percent], axis=1)

#### RESULTS

In [31]:
# The newline is written in the literal part of the f-string: a backslash inside
# a {...} replacement field is a SyntaxError on Python versions before 3.12.
print(f"Percentage distribution of age groups:\n{df_age_groups_dist}")

# RESULTS EXPORT
# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it is already there).
os.makedirs('./export', exist_ok=True)
df_age_groups_dist.to_csv('./export/age_groups_dist.csv')

Percentage distribution of age groups:
           count  proportion
Age Group                   
18–21        952        1.20
22–25       4461        5.60
26–29       7128        8.95
30–33       7188        9.03
34–37       8379       10.52
38–41       9457       11.87
42–45       9875       12.40
46–49       9900       12.43
50–53       8282       10.40
54–57       6154        7.73
58–61       4140        5.20
62–65       2121        2.66
66–69       1018        1.28
70–84        583        0.73


## COUNTRY, CONTINENT

In [33]:
# Series with the country code and the continent name of every participant
country = df.loc[:, 'Country']
continent = df.loc[:, 'Continent']

### COUNTRIES COUNT

#### workshop

In [36]:
# Number of distinct countries represented; missing values are not counted.
country_num = country.nunique(dropna=True)

#### RESULTS

In [38]:
# Report how many distinct countries appear in the dataset.
countries_summary = f"Number of countries in marathon: {country_num}"
print(countries_summary)

Number of countries in marathon: 105


### COUNTRIES AND CONTINENTS DISTRIBUTION

#### workshop

In [41]:
# countries
country_count = country.value_counts()
country_percent = round(country.value_counts(normalize=True) * 100, 1)
# continents
continent_count = continent.value_counts()
continent_percent = round(continent.value_counts(normalize=True) * 100, 1)
# creating df to present results
df_country_dist = pd.concat([country_count, country_percent], axis=1)
df_continent_dist = pd.concat([continent_count, continent_percent], axis=1)

#### RESULTS

In [43]:
# Newlines are written in the literal part of the f-strings: a backslash inside
# a {...} replacement field is a SyntaxError on Python versions before 3.12.
print(f"Top 10 countries distribution:\n\n{df_country_dist.head(10)}\n\n")
print(f"Continents distribution:\n\n{df_continent_dist}")

# RESULTS EXPORT
# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it is already there).
os.makedirs('./export', exist_ok=True)
df_country_dist.to_csv('./export/country_dist.csv')
df_continent_dist.to_csv('./export/continent_dist.csv')

Top 10 countries distribution:

         count  proportion
Country                   
USA      64474        81.0
CAN       6171         7.7
GBR       1072         1.3
MEX        768         1.0
GER        573         0.7
JPN        491         0.6
AUS        475         0.6
ITA        474         0.6
CHN        430         0.5
BRA        428         0.5


Continents distribution:

               count  proportion
Continent                       
North America  71701        90.0
Europe          4499         5.6
Asia            1772         2.2
South America    971         1.2
Oceania          554         0.7
Africa           141         0.2


## TIME

In [64]:
# Preview the first five rows, including the converted time columns.
df.head(5)

Unnamed: 0,Name,M/F,Age,Age Group,Country,Continent,City,Pace,5K,10K,15K,20K,25K,30K,35K,40K,Official Time,Rank,Year
0,"Desisa, Lelisa",M,25,22–25,ETH,Africa,Ambo,0 days 00:03:03.925872902,0 days 00:14:43,0 days 00:29:43,0 days 00:44:57,0 days 01:00:29,0 days 01:16:07,0 days 01:32:00,0 days 01:47:59,0 days 02:02:39,0 days 02:09:17,1,2015
1,"Kirui, Geoffrey",M,24,22–25,KEN,Africa,Keringet,0 days 00:03:04.547244094,0 days 00:15:25,0 days 00:30:28,0 days 00:45:44,0 days 01:01:15,0 days 01:16:59,0 days 01:33:01,0 days 01:48:19,0 days 02:02:53,0 days 02:09:37,2,2017
2,"Tsegay, Yemane Adhane",M,30,30–33,ETH,Africa,Addis Ababa,0 days 00:03:05.168615286,0 days 00:14:43,0 days 00:29:43,0 days 00:44:58,0 days 01:00:28,0 days 01:16:07,0 days 01:31:59,0 days 01:47:59,0 days 02:02:42,0 days 02:09:48,3,2015
3,"Rupp, Galen",M,30,30–33,USA,North America,Portland,0 days 00:03:05.168615286,0 days 00:15:24,0 days 00:30:27,0 days 00:45:44,0 days 01:01:15,0 days 01:16:59,0 days 01:33:01,0 days 01:48:19,0 days 02:03:14,0 days 02:09:58,4,2017
4,"Chebet, Wilson",M,29,26–29,KEN,Africa,Marakwet,0 days 00:03:05.789986478,0 days 00:14:43,0 days 00:29:43,0 days 00:44:57,0 days 01:00:29,0 days 01:16:07,0 days 01:32:00,0 days 01:47:59,0 days 02:03:01,0 days 02:10:22,5,2015


In [66]:
# Series with each participant's pace and official finish time (both timedelta after the conversion above)
pace = df.loc[:, 'Pace']
official_time = df.loc[:, 'Official Time']

### FASTEST, LONGEST, AVG, MED

#### workshop

In [94]:
# OFFICIAL FINISH TIME
time_fastest = official_time.min()
time_longest = official_time.max()
time_avg = official_time.mean()
time_med = official_time.median()

# PACE
pace_fastest = pace.min()
pace_longest = pace.max()
pace_avg = pace.mean()
pace_med = pace.median()

#### RESULTS

In [97]:
# Finish-time section: header line followed by one statistic per line.
print(
    'OFFICIAL FINISH TIME STATISTICS',
    f"Fastest time in marathon is: {time_fastest}",
    f"Longest time in marathon is: {time_longest}",
    f"Average time in marathon is: {time_avg}",
    f"Median time in marathon is: {time_med}",
    sep='\n',
)
# Pace section, same layout.
print(
    'PACE STATISTICS',
    f"Fastest pace in marathon is: {pace_fastest}",
    f"Longest pace in marathon is: {pace_longest}",
    f"Average pace in marathon is: {pace_avg}",
    f"Median pace in marathon is: {pace_med}",
    sep='\n',
)

OFFICIAL FINISH TIME STATISTICS
Fastest time in marathon is: 0 days 02:09:17
Longest time in marathon is: 0 days 10:30:23
Average time in marathon is: 0 days 03:53:09.929166980
Median time in marathon is: 0 days 03:46:32
PACE STATISTICS
Fastest pace in marathon is: 0 days 00:03:03.925872902
Longest pace in marathon is: 0 days 00:15:13.415652588
Average pace in marathon is: 0 days 00:05:31.865983309
Median pace in marathon is: 0 days 00:05:22.491648771
