# PREPARE

## MODULES IMPORT

In [3]:
import math as math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## CLEAN DATA IMPORT

In [5]:
# Read the cleaned dataset; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='./boston_clean.csv')

### CONVERTING TIME-VALUED COLUMNS TO TIMEDELTA

In [7]:
# Duration columns: pace, the 5K..40K splits, and the official finish time.
time_columns = ['Pace', *(f'{km}K' for km in range(5, 45, 5)), 'Official Time']
# Convert every listed column to timedelta; errors='coerce' turns unparseable values into NaT.
df[time_columns] = df[time_columns].apply(pd.to_timedelta, errors='coerce')

# BASIC INFORMATION

In [9]:
# The number of participants is taken as the row count of the dataset.
num_participants = df.shape[0]
print(f"Number of participants: {num_participants}")

Number of participants: 79638


## PARTICIPANT ATTRIBUTES
The dataset contains the following attributes for each marathon participant:
- Name, gender, age
- Country and city
- Split times at each 5K
- Pace, official finish time, rank
- Year of participation

# UNIVARIATE ANALYSIS

## M/F

In [13]:
gender = df.loc[:, 'M/F']  # Series with the gender code of each row

#### M/F - lab

In [15]:
# Per-gender subsets of the full dataset
df_male = df.loc[gender == 'M']
df_female = df.loc[gender == 'F']

# Row count per gender and the matching share in percent (2 decimals)
gender_count = gender.value_counts()
gender_percent = gender.value_counts(normalize=True).mul(100).round(2)
# Both series side by side, one row per gender
df_gender_dist = pd.concat((gender_count, gender_percent), axis='columns')

#### M/F - results

In [17]:
print(f"Gender distribution:\n{df_gender_dist}")
# RESULTS EXPORT
# to_csv does not create missing parent folders, so make sure the export
# directory exists before the first export (no-op when it is already there).
os.makedirs('./export', exist_ok=True)
df_gender_dist.to_csv('./export/gender_dist.csv')

Gender distribution:
     count  proportion
M/F                   
M    43482        54.6
F    36156        45.4


## AGE

In [19]:
age = df.loc[:, 'Age']  # Series with the age of each row

#### AGE - lab

In [21]:
# Summary statistics of the age column
age_min, age_max = age.min(), age.max()  # lowest / highest age
age_avg = round(age.mean(), 2)           # mean, rounded to 2 decimals
age_med = age.median()                   # median

#### AGE - results

In [23]:
# One print call; sep="\n" puts each statistic on its own line.
print(
    f"Max age of participants: {age_max}",
    f"Min age of participants: {age_min}",
    f"Average age: {age_avg}",
    f"Median age: {age_med}",
    sep="\n",
)

Max age of participants: 84
Min age of participants: 18
Average age: 42.42
Median age: 42.0


### DISTRIBUTION OF AGE GROUPS

In [25]:
age_groups = df.loc[:, 'Age Group']  # Series with the age-group label of each row

#### DISTRIBUTION OF AGE GROUPS - lab

In [27]:
# count of age groups
age_groups_count = age_groups.value_counts().sort_index()

# percentage of age groups
age_groups_percent = round(age_groups.value_counts(normalize=True).sort_index() * 100, 2)

# creating df to present results
df_age_groups_dist = pd.concat([age_groups_count, age_groups_percent], axis=1)

#### DISTRIBUTION OF AGE GROUPS - results

In [29]:
# The newline is written directly in the literal: a backslash inside an
# f-string replacement field is a SyntaxError before Python 3.12.
print(f"Percentage distribution of age groups:\n{df_age_groups_dist}")

# results export
df_age_groups_dist.to_csv('./export/age_groups_dist.csv')

Percentage distribution of age groups:
           count  proportion
Age Group                   
18–21        952        1.20
22–25       4461        5.60
26–29       7128        8.95
30–33       7188        9.03
34–37       8379       10.52
38–41       9457       11.87
42–45       9875       12.40
46–49       9900       12.43
50–53       8282       10.40
54–57       6154        7.73
58–61       4140        5.20
62–65       2121        2.66
66–69       1018        1.28
70–84        583        0.73


## COUNTRY

In [31]:
country = df.loc[:, 'Country']  # Series with the country code of each row

#### COUNTRY - lab

In [33]:
# countries
country_num = country.nunique() # number of countries
country_count = country.value_counts()
country_percent = round(country.value_counts(normalize=True) * 100, 1)

# creating df to present results
df_country_dist = pd.concat([country_count, country_percent], axis=1)

#### COUNTRY - results

In [35]:
print(f"Number of countries in marathon: {country_num}")
# Blank lines are written directly in the literal: a backslash inside an
# f-string replacement field is a SyntaxError before Python 3.12.
print(f"\nTop 10 countries distribution:\n\n{df_country_dist.head(10)}\n\n")

# results export
df_country_dist.to_csv('./export/country_dist.csv')

Number of countries in marathon: 105

Top 10 countries distribution:

         count  proportion
Country                   
USA      64474        81.0
CAN       6171         7.7
GBR       1072         1.3
MEX        768         1.0
GER        573         0.7
JPN        491         0.6
AUS        475         0.6
ITA        474         0.6
CHN        430         0.5
BRA        428         0.5




### CONTINENT

In [37]:
continent = df.loc[:, 'Continent']  # Series with the continent of each row

#### CONTINENT - lab

In [39]:
# continents
continent_count = continent.value_counts()
continent_percent = round(continent.value_counts(normalize=True) * 100, 1)

# creating df to present results
df_continent_dist = pd.concat([continent_count, continent_percent], axis=1)

#### CONTINENT - results

In [41]:
# The blank line is written directly in the literal: a backslash inside an
# f-string replacement field is a SyntaxError before Python 3.12.
print(f"Continents distribution:\n\n{df_continent_dist}")

# results export
df_continent_dist.to_csv('./export/continent_dist.csv')

Continents distribution:

               count  proportion
Continent                       
North America  71701        90.0
Europe          4499         5.6
Asia            1772         2.2
South America    971         1.2
Oceania          554         0.7
Africa           141         0.2


## OFFICIAL TIME

In [43]:
official_time = df.loc[:, 'Official Time']  # Series with the official finish time of each row

#### OFFICIAL TIME - lab

In [45]:
# official finish time
time_fastest = official_time.min()
time_longest = official_time.max()
time_avg = official_time.mean()
time_med = official_time.median()

#### OFFICIAL TIME - results

In [47]:
def _format_hms(duration):
    """Return a Timedelta as zero-padded HH:MM:SS.

    Fractional seconds are dropped and days are folded into the hour count.
    A missing value (NaT) is rendered as 'n/a'.

    This replaces slicing ``str(duration)`` at fixed offsets, which depends on
    the exact width of the '<n> days ' prefix and on whether a fractional part
    is present.
    """
    if pd.isna(duration):
        return 'n/a'
    # Floor division by one second yields the whole-second count as an integer.
    whole_seconds = duration // pd.Timedelta(seconds=1)
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


print('OFFICIAL FINISH TIME STATISTICS')
print(f"Fastest time in marathon is: {_format_hms(time_fastest)}")
print(f"Longest time in marathon is: {_format_hms(time_longest)}")
print(f"Average time in marathon is: {_format_hms(time_avg)}")
print(f"Median time in marathon is: {_format_hms(time_med)}")

OFFICIAL FINISH TIME STATISTICS
Fastest time in marathon is: 02:09:17
Longest time in marathon is: 10:30:23
Average time in marathon is: 03:53:09
Median time in marathon is: 03:46:32


### PACE

In [49]:
pace = df.loc[:, 'Pace']  # Series with the pace of each row

#### PACE - lab

In [51]:
# pace
# summary statistics of the pace column
pace_fastest, pace_longest = pace.min(), pace.max()  # smallest / largest pace value
pace_avg, pace_med = pace.mean(), pace.median()      # mean / median

#### PACE - results

In [53]:
def _format_pace(duration):
    """Return a Timedelta as M:SS.cc (minutes, seconds, hundredths of a second).

    Anything finer than a hundredth of a second is dropped. A missing value
    (NaT) is rendered as 'n/a'.

    This replaces slicing ``str(duration)`` at offsets tuned per value, which
    loses the leading digit as soon as a pace reaches 10 minutes unless the
    slice start is adjusted by hand.
    """
    if pd.isna(duration):
        return 'n/a'
    # Floor division by 10 ms yields the duration in whole hundredths of a second.
    hundredths = duration // pd.Timedelta(milliseconds=10)
    minutes, hundredths = divmod(hundredths, 6000)
    seconds, hundredths = divmod(hundredths, 100)
    return f"{minutes}:{seconds:02d}.{hundredths:02d}"


print('PACE STATISTICS')
print(f"Fastest pace in marathon is: {_format_pace(pace_fastest)} min/km")
print(f"Longest pace in marathon is: {_format_pace(pace_longest)} min/km")
print(f"Average pace in marathon is: {_format_pace(pace_avg)} min/km")
print(f"Median pace in marathon is: {_format_pace(pace_med)} min/km")

PACE STATISTICS
Fastest pace in marathon is: 3:03.92 min/km
Longest pace in marathon is: 15:13.41 min/km
Average pace in marathon is: 5:31.86 min/km
Median pace in marathon is: 5:22.49 min/km


# BIVARIATE ANALYSIS

## GENDER X AGE

In [56]:
# Keep only the columns used in the gender/age comparison
df_gender_age = df.loc[:, ['M/F', 'Age', 'Age Group']]

# One frame per gender
df_male_age = df_gender_age.loc[df_gender_age['M/F'] == 'M']
df_female_age = df_gender_age.loc[df_gender_age['M/F'] == 'F']

#### GENDER X AGE - lab

In [58]:
# male
# NOTE(review): the averages below are rounded to a whole year, whereas the
# overall average age is kept at two decimals — confirm this is intended.

# male: lowest/highest, mean (whole years) and median age
male_age_min, male_age_max = df_male_age['Age'].min(), df_male_age['Age'].max()
male_age_avg = round(df_male_age['Age'].mean())
male_age_med = df_male_age['Age'].median()

# female: lowest/highest, mean (whole years) and median age
female_age_min, female_age_max = df_female_age['Age'].min(), df_female_age['Age'].max()
female_age_avg = round(df_female_age['Age'].mean())
female_age_med = df_female_age['Age'].median()

#### GENDER X AGE - results

In [113]:
# One print call; sep="\n" puts each item on its own line, and the trailing
# "\n" inside an item leaves a blank line after it.
print(
    "male\n",
    f"Male max age: {male_age_max}",
    f"Male min age: {male_age_min}",
    f"Male average age: {male_age_avg}",
    f"Male median age: {male_age_med}\n",
    "female\n",
    f"Female max age: {female_age_max}",
    f"Female min age: {female_age_min}",
    f"Female average age: {female_age_avg}",
    f"Female median age: {female_age_med}",
    sep="\n",
)

male

Male max age: 83
Male min age: 18
Male average age: 45
Male median age: 45.0

female

Female max age: 84
Female min age: 18
Female average age: 40
Female median age: 40.0


### GENDER X AGE GROUPS

#### GENDER X AGE GROUPS - lab

In [63]:
# males
male_age_groups_count = df_male_age['Age Group'].value_counts().sort_index()
male_age_groups_percent = round(df_male_age['Age Group'].value_counts(normalize=True).sort_index() * 100,2)
# females
female_age_groups_count = df_female_age['Age Group'].value_counts().sort_index()
female_age_groups_percent = round(df_female_age['Age Group'].value_counts(normalize=True).sort_index() * 100,2)
# creating df to present results
df_male_age_groups_dist = pd.concat([male_age_groups_count, male_age_groups_percent], axis=1)
df_female_age_groups_dist = pd.concat([female_age_groups_count, female_age_groups_percent], axis=1)

#### GENDER X AGE GROUPS - results

In [119]:
# Both tables in one print call, each preceded by its label
print('male', df_male_age_groups_dist, '\nfemale', df_female_age_groups_dist, sep='\n')

# write each table to its own CSV file
df_male_age_groups_dist.to_csv('./export/male_age_groups_dist.csv')
df_female_age_groups_dist.to_csv('./export/female_age_groups_dist.csv')

male
           count  proportion
Age Group                   
18–21        460        1.06
22–25       1641        3.77
26–29       2869        6.60
30–33       3405        7.83
34–37       4082        9.39
38–41       4691       10.79
42–45       5316       12.23
46–49       5707       13.12
50–53       5089       11.70
54–57       4236        9.74
58–61       3026        6.96
62–65       1635        3.76
66–69        828        1.90
70–84        497        1.14

female
           count  proportion
Age Group                   
18–21        492        1.36
22–25       2820        7.80
26–29       4259       11.78
30–33       3783       10.46
34–37       4297       11.88
38–41       4766       13.18
42–45       4559       12.61
46–49       4193       11.60
50–53       3193        8.83
54–57       1918        5.30
58–61       1114        3.08
62–65        486        1.34
66–69        190        0.53
70–84         86        0.24


## GENDER X COUNTRY

In [142]:
# Keep only the columns used in the gender/origin comparison
df_gender_country = df.loc[:, ['M/F', 'Country', 'Continent']]

# One frame per gender
df_male_country = df_gender_country.loc[df_gender_country['M/F'] == 'M']
df_female_country = df_gender_country.loc[df_gender_country['M/F'] == 'F']

#### GENDER X COUNTRY - lab

In [161]:
# males
male_country_count = df_male_country['Country'].value_counts()
male_country_percent = round(df_male_country['Country'].value_counts(normalize=True) * 100, 2)
# results df
df_male_country_dist = pd.concat([male_country_count, male_country_percent], axis=1)

# females
female_country_count = df_female_country['Country'].value_counts()
female_country_percent = round(df_female_country['Country'].value_counts(normalize=True) * 100, 2)
# results df
df_female_country_dist = pd.concat([female_country_count, female_country_percent], axis=1)

#### GENDER X COUNTRY - results

In [176]:
# First ten rows of each per-gender table, printed in a single call
print(
    f"males country distribution:\n{df_male_country_dist.head(10)}\n",
    f"females country distribution:\n{df_female_country_dist.head(10)}",
    sep="\n",
)

males country distribution:
         count  proportion
Country                   
USA      33390       76.79
CAN       3499        8.05
GBR        707        1.63
MEX        510        1.17
GER        437        1.01
ITA        378        0.87
JPN        357        0.82
CHN        307        0.71
BRA        305        0.70
AUS        294        0.68

females country distribution:
         count  proportion
Country                   
USA      31084       85.97
CAN       2672        7.39
GBR        365        1.01
MEX        258        0.71
AUS        181        0.50
GER        136        0.38
JPN        134        0.37
BRA        123        0.34
CHN        123        0.34
ITA         96        0.27


### GENDER X CONTINENT

#### GENDER X CONTINENT - lab

In [183]:
# males
male_continent_count = df_male_country['Continent'].value_counts()
male_continent_percent = round(df_male_country['Continent'].value_counts(normalize=True) * 100, 2)
# results df
df_male_continent_dist = pd.concat([male_continent_count, male_continent_percent], axis=1)

# females
female_continent_count = df_female_country['Continent'].value_counts()
female_continent_percent = round(df_female_country['Continent'].value_counts(normalize=True) * 100, 2)
# results df
df_female_continent_dist = pd.concat([female_continent_count, female_continent_percent], axis=1)

#### GENDER X CONTINENT - results

In [192]:
# Both tables in one print call, each preceded by its label
print('male', df_male_continent_dist, '\nfemale', df_female_continent_dist, sep='\n')

# write each table to its own CSV file
df_male_continent_dist.to_csv('./export/male_continent_dist.csv')
df_female_continent_dist.to_csv('./export/female_continent_dist.csv')

male
               count  proportion
Continent                       
North America  37577       86.42
Europe          3425        7.88
Asia            1332        3.06
South America    713        1.64
Oceania          345        0.79
Africa            90        0.21

female
               count  proportion
Continent                       
North America  34124       94.38
Europe          1074        2.97
Asia             440        1.22
South America    258        0.71
Oceania          209        0.58
Africa            51        0.14


## GENDER X TIME

#### GENDER X TIME - lab

#### GENDER X TIME - results

## AGE X COUNTRY

## AGE X TIME

## COUNTRY X TIME