# PREPARE

## MODULES IMPORT

In [3]:
import math as math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## CLEAN DATA IMPORT

In [5]:
# Read the cleaned results file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='./boston_clean.csv')

### CONVERTING COLUMNS THAT HOLD TIME VALUES TO TIMEDELTA

In [7]:
# Columns whose values are durations: pace, the 5K split times and the finish time.
time_columns = [
    'Pace',
    '5K', '10K', '15K', '20K', '25K', '30K', '35K', '40K',
    'Official Time',
]
# Convert every listed column with pd.to_timedelta. errors='coerce' replaces any
# value that cannot be parsed with NaT instead of raising, so bad entries become
# missing values silently.
df[time_columns] = df[time_columns].apply(pd.to_timedelta, errors='coerce')

# BASIC INFORMATION

In [9]:
# The number of rows in the frame is reported as the number of participants.
num_participants = df.shape[0]
print(f"Number of participants: {num_participants}")

Number of participants: 79638


## PARTICIPANT ATTRIBUTES
The dataset contains the following attributes for each marathon participant:
- Name, gender, age
- Country and city
- Split times at each 5K
- Pace, official finish time, rank
- Year of participation

# UNIVARIATE ANALYSIS

## M/F

In [13]:
# Series with the gender code of every row (compared against 'M' / 'F' below).
gender = df.loc[:, 'M/F']

### GENDER DISTRIBUTION 

#### workshop

In [16]:
# One sub-frame per gender code.
df_male = df.loc[gender.eq('M')]
df_female = df.loc[gender.eq('F')]

# Absolute number of rows per gender.
gender_count = gender.value_counts()
# Share of each gender in percent, rounded to two decimals.
# NOTE: this Series keeps pandas' default name 'proportion' even though the
# values are percentages (already multiplied by 100).
gender_percent = gender.value_counts(normalize=True).mul(100).round(2)
# Counts and percentages side by side, indexed by gender.
df_gender_dist = pd.concat([gender_count, gender_percent], axis='columns')

#### RESULTS

In [18]:
print(f"Gender distribution:\n{df_gender_dist}")
# RESULTS EXPORT
# to_csv does not create missing directories; make sure the target folder exists
# so the cell also works on a fresh checkout (Restart & Run All).
os.makedirs('./export', exist_ok=True)
df_gender_dist.to_csv('./export/gender_dist.csv')

Gender distribution:
     count  proportion
M/F                   
M    43482        54.6
F    36156        45.4


## AGE

In [20]:
# Series with the age of every row.
age = df.loc[:, 'Age']

### MIN, MAX, AVG, MED

#### workshop

In [23]:
# Range of ages: oldest and youngest value in the column.
age_max, age_min = age.max(), age.min()
# Central tendency: mean rounded to two decimals, and the median.
age_avg, age_med = round(age.mean(), 2), age.median()

#### RESULTS

In [25]:
# Print the four age statistics, one per line.
print(
    f"Max age of participants: {age_max}",
    f"Min age of participants: {age_min}",
    f"Average age: {age_avg}",
    f"Median age: {age_med}",
    sep='\n',
)

Max age of participants: 84
Min age of participants: 18
Average age: 42.42
Median age: 42.0


### DISTRIBUTION OF AGE GROUPS

In [27]:
# Series with the age-group label of every row.
age_groups = df.loc[:, 'Age Group']

#### workshop

In [29]:
# Rows per age group, ordered by group label rather than by frequency.
age_groups_count = age_groups.value_counts().sort_index()
# Share of each age group in percent (two decimals), in the same label order.
age_groups_percent = (
    age_groups.value_counts(normalize=True)
    .sort_index()
    .mul(100)
    .round(2)
)
# Counts and percentages side by side, indexed by age group.
df_age_groups_dist = pd.concat(
    [age_groups_count, age_groups_percent],
    axis='columns',
)

#### RESULTS

In [31]:
# The newline is written directly in the string: a backslash inside an f-string
# replacement field ({'\n'}) is a SyntaxError on Python versions before 3.12.
print(f"Percentage distribution of age groups:\n{df_age_groups_dist}")

# RESULTS EXPORT
# to_csv does not create missing directories; make sure the target folder exists
# so the cell also works on a fresh checkout (Restart & Run All).
os.makedirs('./export', exist_ok=True)
df_age_groups_dist.to_csv('./export/age_groups_dist.csv')

Percentage distribution of age groups:
           count  proportion
Age Group                   
18–21        952        1.20
22–25       4461        5.60
26–29       7128        8.95
30–33       7188        9.03
34–37       8379       10.52
38–41       9457       11.87
42–45       9875       12.40
46–49       9900       12.43
50–53       8282       10.40
54–57       6154        7.73
58–61       4140        5.20
62–65       2121        2.66
66–69       1018        1.28
70–84        583        0.73


## COUNTRY, CONTINENT

In [33]:
# Series with the country code and the continent name of every row.
country, continent = df['Country'], df['Continent']

### COUNTRIES AND CONTINENTS DISTRIBUTION

#### workshop

In [36]:
# countries
country_num = country.nunique() # number of countries
country_count = country.value_counts()
country_percent = round(country.value_counts(normalize=True) * 100, 1)
# continents
continent_count = continent.value_counts()
continent_percent = round(continent.value_counts(normalize=True) * 100, 1)
# creating df to present results
df_country_dist = pd.concat([country_count, country_percent], axis=1)
df_continent_dist = pd.concat([continent_count, continent_percent], axis=1)

#### RESULTS

In [38]:
# Newlines are written directly in the strings: backslashes inside f-string
# replacement fields ({'\n'*2}) are a SyntaxError on Python versions before 3.12.
print(f"Number of countries in marathon: {country_num}")
print(f"\nTop 10 countries distribution:\n\n{df_country_dist.head(10)}\n\n")
print(f"Continents distribution:\n\n{df_continent_dist}")

# RESULTS EXPORT
# to_csv does not create missing directories; make sure the target folder exists
# so the cell also works on a fresh checkout (Restart & Run All).
os.makedirs('./export', exist_ok=True)
df_country_dist.to_csv('./export/country_dist.csv')
df_continent_dist.to_csv('./export/continent_dist.csv')

Number of countries in marathon: 105

Top 10 countries distribution:

         count  proportion
Country                   
USA      64474        81.0
CAN       6171         7.7
GBR       1072         1.3
MEX        768         1.0
GER        573         0.7
JPN        491         0.6
AUS        475         0.6
ITA        474         0.6
CHN        430         0.5
BRA        428         0.5


Continents distribution:

               count  proportion
Continent                       
North America  71701        90.0
Europe          4499         5.6
Asia            1772         2.2
South America    971         1.2
Oceania          554         0.7
Africa           141         0.2


## TIME

In [40]:
# Series with the pace and the official finish time of every row.
pace, official_time = df['Pace'], df['Official Time']

### BEST, LONGEST, AVG, MED

#### workshop

In [43]:
# Official finish time: smallest and largest value, then mean and median.
time_fastest, time_longest = official_time.min(), official_time.max()
time_avg, time_med = official_time.mean(), official_time.median()

# Pace: the same four statistics.
pace_fastest, pace_longest = pace.min(), pace.max()
pace_avg, pace_med = pace.mean(), pace.median()

#### RESULTS

In [45]:
# The durations are formatted arithmetically rather than by slicing
# str(Timedelta) at fixed character offsets. Fixed offsets only work while the
# value happens to have a particular width (single- vs two-digit minutes, with or
# without fractional seconds, fewer than 10 days).

def _format_hms(td):
    """Return a Timedelta as 'HH:MM:SS', dropping fractional seconds.

    Hours are not wrapped at 24, so durations of a day or more stay readable.
    A missing value (NaT) is returned as the string 'NaT'.
    """
    if pd.isna(td):
        return 'NaT'
    whole_seconds = td // pd.Timedelta(seconds=1)  # integer floor division
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


def _format_pace(td):
    """Return a Timedelta as 'M:SS.cc' (minutes, seconds, hundredths).

    Hundredths are truncated, not rounded. Minutes are not zero-padded and are
    not wrapped at 60. A missing value (NaT) is returned as the string 'NaT'.
    """
    if pd.isna(td):
        return 'NaT'
    hundredths_total = td // pd.Timedelta(milliseconds=10)
    minutes, remainder = divmod(hundredths_total, 6000)
    seconds, hundredths = divmod(remainder, 100)
    return f"{minutes}:{seconds:02d}.{hundredths:02d}"


print('OFFICIAL FINISH TIME STATISTICS')
print(f"Fastest time in marathon is: {_format_hms(time_fastest)}")
print(f"Longest time in marathon is: {_format_hms(time_longest)}")
print(f"Average time in marathon is: {_format_hms(time_avg)}")
print(f"Median time in marathon is: {_format_hms(time_med)}")
print('\nPACE STATISTICS')
print(f"Fastest pace in marathon is: {_format_pace(pace_fastest)} min/km")
print(f"Longest pace in marathon is: {_format_pace(pace_longest)} min/km")
print(f"Average pace in marathon is: {_format_pace(pace_avg)} min/km")
print(f"Median pace in marathon is: {_format_pace(pace_med)} min/km")

OFFICIAL FINISH TIME STATISTICS
Fastest time in marathon is: 02:09:17
Longest time in marathon is: 10:30:23
Average time in marathon is: 03:53:09
Median time in marathon is: 03:46:32

PACE STATISTICS
Fastest pace in marathon is: 3:03.92 min/km
Longest pace in marathon is: 15:13.41 min/km
Average pace in marathon is: 5:31.86 min/km
Median pace in marathon is: 5:22.49 min/km


# BIVARIATE ANALYSIS

## GENDER X AGE

In [60]:
# Keep only the columns needed to relate gender to age.
df_gender_age = df.loc[:, ['M/F', 'Age', 'Age Group']]
# Split that narrow frame by gender code.
df_male_age = df_gender_age.loc[df_gender_age['M/F'].eq('M')]
df_female_age = df_gender_age.loc[df_gender_age['M/F'].eq('F')]

In [62]:
# Men: oldest and youngest age, then mean (rounded to a whole number) and median.
male_age_max, male_age_min = df_male_age['Age'].max(), df_male_age['Age'].min()
male_age_avg, male_age_med = round(df_male_age['Age'].mean()), df_male_age['Age'].median()

# Women: the same four statistics.
female_age_max, female_age_min = df_female_age['Age'].max(), df_female_age['Age'].min()
female_age_avg, female_age_med = round(df_female_age['Age'].mean()), df_female_age['Age'].median()

## GENDER X AGE GROUPS

In [64]:
# Preview the first five rows of the gender/age frame.
df_gender_age.head(n=5)

Unnamed: 0,M/F,Age,Age Group
0,M,25,22–25
1,M,24,22–25
2,M,30,30–33
3,M,30,30–33
4,M,29,26–29
