# PREPARE

## MODULES IMPORT

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math as math

## CLEAN DATA IMPORT

In [5]:
# Load the already-cleaned dataset from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='./boston_clean.csv')

### CONVERTING COLUMNS THAT HOLD TIME VALUES TO TIMEDELTA

In [7]:
# Columns that hold durations: pace, the 5K split times and the official finish time.
time_columns = [
    'Pace',
    '5K', '10K', '15K', '20K', '25K', '30K', '35K', '40K',
    'Official Time',
]
# Convert each listed column with pd.to_timedelta; errors='coerce' replaces any
# value that cannot be parsed with NaT instead of raising.
df[time_columns] = df[time_columns].apply(pd.to_timedelta, errors='coerce')

# BASIC INFORMATION

In [9]:
# The row count of df is used as the number of participants.
num_participants = df.shape[0]
print(f"Number of participants: {num_participants}")

Number of participants: 79638


## PARTICIPANT ATTRIBUTES
The dataset contains the following attributes for each marathon participant:
- Name, gender, age
- Country and city
- Split times at each 5K
- Pace, official finish time, rank
- Year of participation

# UNIVARIATE ANALYSIS

## M/F

In [13]:
gender = df['M/F'] # Series with each participant's gender code (compared against 'M' and 'F' later on)

### GENDER DISTRIBUTION 

In [15]:
# Split the participants by gender code using boolean masks built from the gender Series.
df_male = df.loc[gender.eq('M')] # Rows of male participants only
df_female = df.loc[gender.eq('F')] # Rows of female participants only

#### ASSIGNING VARIABLES

In [17]:
# Head counts per gender.
num_male, num_female = df_male.shape[0], df_female.shape[0]
# Share of all participants in percent, rounded to two decimals.
percent_male = round(num_male / num_participants * 100, 2)
percent_female = round(num_female / num_participants * 100, 2)

#### RESULTS

In [19]:
# Report the gender counts and shares, one line per figure.
print(
    f"Number of males: {num_male}",
    f"Number of females: {num_female}",
    f"Percent of males: {percent_male}",
    f"Percent of females: {percent_female}",
    sep='\n',
)

Number of males: 43482
Number of females: 36156
Percent of males: 54.6
Percent of females: 45.4


## AGE

In [21]:
age = df['Age'] # pandas Series with the age of every participant

### MIN, MAX, AVG, MED

#### ASSIGNING VARIABLES

In [24]:
# Extremes of the age column.
age_min, age_max = age.min(), age.max()
# Central tendency: the median as returned, the mean rounded to two decimals.
age_median = age.median()
age_mean = round(age.mean(), 2)

#### RESULTS

In [26]:
# Report the age statistics, one line per figure.
print(
    f"Max age of participants: {age_max}",
    f"Min age of participants: {age_min}",
    f"Average age: {age_mean}",
    f"Median age: {age_median}",
    sep='\n',
)

Max age of participants: 84
Min age of participants: 18
Average age: 42.42
Median age: 42.0


### PERCENTAGE DISTRIBUTION OF AGE GROUPS

#### ASSIGNING VARIABLES

In [29]:
age_groups = df['Age Group']
# Participants per age-group label, ordered by label rather than by frequency.
age_groups_counts = age_groups.value_counts().sort_index()
# Total over all groups; the denominator for the shares below.
age_groups_sum = age_groups_counts.sum()
# Share of each group in percent, rounded to one decimal place.
age_groups_percent = age_groups_counts.div(age_groups_sum).mul(100).round(1)

#### RESULTS

In [31]:
# The newline is written in the literal part of the f-string: a backslash inside
# a replacement field ({'\n'}) is a SyntaxError on Python versions before 3.12.
# The printed text is unchanged.
print(f"Percentage distribution of age groups:\n{age_groups_percent.astype(str) + '%'}")

Percentage distribution of age groups:
Age Group
18–21     1.2%
22–25     5.6%
26–29     9.0%
30–33     9.0%
34–37    10.5%
38–41    11.9%
42–45    12.4%
46–49    12.4%
50–53    10.4%
54–57     7.7%
58–61     5.2%
62–65     2.7%
66–69     1.3%
70–84     0.7%
Name: count, dtype: object


## COUNTRY, CONTINENT

In [33]:
country = df['Country'] # Series with the country value of every participant
continent = df['Continent'] # Series with the continent value of every participant

### COUNTRIES COUNT

#### ASSIGNING VARIABLES

In [36]:
# Count the distinct country values; missing values are excluded (dropna=True is the default).
country_num = country.nunique(dropna=True)

#### RESULTS

In [38]:
# Report how many distinct countries appear in the data.
print("Number of countries in marathon:", country_num)

Number of countries in marathon: 105


### COUNTRIES AND CONTINENTS PERCENTAGE DISTRIBUTION

#### ASSIGNING VARIABLES

In [41]:
# countries
country_percent = round(country.value_counts(normalize=True) * 100, 1)
country_percent_top = country_percent.head(10)
# continents
continent_percent = round(continent.value_counts(normalize=True) * 100, 1)

#### RESULTS

In [43]:
# Newlines are written in the literal parts of the f-strings: a backslash inside
# a replacement field ({'\n'*2}, {'\n'}) is a SyntaxError on Python versions
# before 3.12. The printed text is unchanged.
print(f"Top 10 countries percentage distribution:\n\n{country_percent_top.astype(str) + '%'}\n")
print(f"Continents percentage distribution:\n\n{continent_percent.astype(str) + '%'}")

Top 10 countries percentage distribution:

Country
USA    81.0%
CAN     7.7%
GBR     1.3%
MEX     1.0%
GER     0.7%
JPN     0.6%
AUS     0.6%
ITA     0.6%
CHN     0.5%
BRA     0.5%
Name: proportion, dtype: object

Continents percentage distribution:

Continent
North America    90.0%
Europe            5.6%
Asia              2.2%
South America     1.2%
Oceania           0.7%
Africa            0.2%
Name: proportion, dtype: object
