# PREPARE

### MODULES IMPORT

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math as math

### CLEAN DATA IMPORT

In [5]:
# Load the cleaned dataset into `df`. The path is relative, so it is resolved against
# the kernel's current working directory; boston_clean.csv must be in that directory.
df = pd.read_csv('./boston_clean.csv')

### CHANGING THE TYPE OF COLUMNS THAT HOLD TIME VALUES

In [7]:
# Columns whose values are durations: pace, the split columns, and the finish time.
time_columns = [
    'Pace',
    '5K', '10K', '15K', '20K', '25K', '30K', '35K', '40K',
    'Official Time',
]
# Convert every listed column to timedelta; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
df[time_columns] = df[time_columns].apply(pd.to_timedelta, errors='coerce')

# BASIC INFORMATION

In [9]:
# The number of participants is taken as the number of rows in df.
num_participants = df.shape[0]
print(f"Number of participants: {num_participants}")

Number of participants: 79638


##### PARTICIPANT ATTRIBUTES
The dataset contains the following attributes for each marathon participant:
- Name, gender, age
- Country and city
- Split times at every 5 km checkpoint (5K–40K)
- Pace, official finish time, rank
- Year of participation

# UNIVARIATE ANALYSIS

## M/F

In [13]:
# Split the rows by the value of the 'M/F' column.
df_male = df.loc[df['M/F'].eq('M')]      # rows where 'M/F' is 'M'
df_female = df.loc[df['M/F'].eq('F')]    # rows where 'M/F' is 'F'

In [14]:
# Row counts of the male and female subsets.
num_male, num_female = df_male.shape[0], df_female.shape[0]

print(
    f"Number of males: {num_male}",
    f"Number of females: {num_female}",
    sep="\n",
)

Number of males: 43482
Number of females: 36156


In [15]:
# Share of each gender among all rows of df, in percent, rounded to two decimals.
percent_male = round(num_male / num_participants * 100, ndigits=2)
percent_female = round(num_female / num_participants * 100, ndigits=2)

print(
    f"Percent of males: {percent_male}",
    f"Percent of females: {percent_female}",
    percent_male + percent_female,  # sanity check: the two shares added together
    sep="\n",
)

Percent of males: 54.6
Percent of females: 45.4
100.0


## AGE

In [17]:
# Shorthand for the 'Age' column; the age statistics and age-group cells read from it.
age = df['Age']

In [18]:
# Highest and lowest age found in the 'Age' column.
age_max, age_min = age.max(), age.min()

In [19]:
# Mean age rounded to two decimals; the median is kept as returned by pandas.
age_mean, age_median = round(age.mean(), 2), age.median()

In [20]:
# Print the four age statistics, one per line.
print(
    f"Max age of participants: {age_max}",
    f"Min age of participants: {age_min}",
    f"Average age: {age_mean}",
    f"Median age: {age_median}",
    sep="\n",
)

Max age of participants: 84
Min age of participants: 18
Average age: 42.42
Median age: 42.0


### CREATING AGE GROUPS

In [22]:
# Build the age-group edges: regular groups starting at the youngest participant,
# followed by one open-ended group for the oldest runners.
AGE_GROUP_WIDTH = 4   # width of a regular age group, in years
SENIOR_AGE = 70       # everyone at or above this age shares the last group

# .tolist() yields plain Python numbers instead of NumPy scalars, so the printed
# groups look the same on every NumPy version (NumPy >= 2 prints scalars inside a
# list as np.int64(...)).
age_edges = np.arange(age_min, SENIOR_AGE, AGE_GROUP_WIDTH).tolist()
age_edges.append(SENIOR_AGE)  # closes the last regular group and starts the 70+ group

if age_max >= SENIOR_AGE:
    # The groups are used as half-open intervals [start, end), so the final edge
    # must be one above the oldest age for that participant to be included.
    age_edges.append(int(age_max) + 1)
# Otherwise nobody is 70 or older: appending age_max + 1 would create a duplicate or
# decreasing edge, which pd.cut refuses, so the 70+ group is simply left out.

# Pair consecutive edges into [start, end] groups.
age_groups = [[start, end] for start, end in zip(age_edges, age_edges[1:])]
print(f"Age groups: {age_groups}")

Age groups: [[18, 22], [22, 26], [26, 30], [30, 34], [34, 38], [38, 42], [42, 46], [46, 50], [50, 54], [54, 58], [58, 62], [62, 66], [66, 70], [70, 85]]


#### CREATING COLUMN WITH AGE GROUP

In [24]:
# One label per group; the end edge is exclusive, so the label shows "end - 1".
labels = [f"{start}–{end - 1}" for start, end in age_groups]

# Bin edges for pd.cut: the start of every group plus the end of the last group.
bins = [start for start, _ in age_groups]
bins.append(age_groups[-1][1])

# right=False makes every bin a half-open interval [start, end).
df['Age Group'] = pd.cut(age, bins, right=False, labels=labels)
age_group = df['Age Group']

In [73]:
# Number of rows per age group, sorted by the age-group index.
age_distribution = age_group.value_counts().sort_index()
# Share of each group among all rows of df, in percent, rounded to two decimals.
age_distribution_percent = (age_distribution / len(df) * 100).round(2)

# The newline is written directly in the literal: a backslash inside an f-string
# replacement field ({'\n'}) is a SyntaxError on Python versions before 3.12.
print(f"Age distribution in percentage:\n{age_distribution_percent.astype(str) + '%'}")

Age distribution in percentage:
Age Group
18–21      1.2%
22–25      5.6%
26–29     8.95%
30–33     9.03%
34–37    10.52%
38–41    11.87%
42–45     12.4%
46–49    12.43%
50–53     10.4%
54–57     7.73%
58–61      5.2%
62–65     2.66%
66–69     1.28%
70–84     0.73%
Name: count, dtype: object
