# **Feature engineering**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pycountry_convert as pc

In [2]:
# Load the cyclists and races tables from the local CSV files
races_df = pd.read_csv(filepath_or_buffer='../dataset/races.csv')
cyclists_df = pd.read_csv(filepath_or_buffer='../dataset/cyclists.csv')

#### ```categorical_height``` column


Since cyclists of the same height tend, on average, to have a similar weight, we group heights into categories of 5 cm each and use conditional boxplots to look for possible outliers in weight.

In [None]:
# Height categories: ten consecutive, left-closed intervals of width 5
# ([154, 159), [159, 164), ..., [199, 204)), labelled 0 through 9.
bins = list(range(154, 205, 5))
labels = list(range(len(bins) - 1))

# Heights that fall outside the bin range are left as NaN by pd.cut
height_categories = pd.cut(
    cyclists_df['height'],
    bins=bins,
    labels=labels,
    right=False,
    include_lowest=True,
)
cyclists_df['categorical_height'] = height_categories

print(cyclists_df.loc[:, ['height', 'categorical_height']].head(10))

Conditional Box Plot

In [None]:
# Weight distribution inside each height category
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=cyclists_df, x='categorical_height', y='weight', ax=ax)
ax.set_title('Boxplot of Weight Conditioned on Height Category')
ax.set_xlabel('Height Category')
ax.set_ylabel('Weight')

# Place a y tick every 5 units, starting at 50 and covering the largest weight
heaviest = cyclists_df['weight'].max()
ax.set_yticks(np.arange(50, heaviest + 5, 5))

plt.show()

We show the number of cyclists in each defined category (excluding those with a missing weight) to evaluate the significance of the outliers identified in the boxplots.

In [None]:
# Keep only the cyclists whose weight is recorded
cyclists_df_non_null_weight = cyclists_df.dropna(subset=['weight'])

# Cyclists per height category, ordered by category label
categorical_height_counts = (
    cyclists_df_non_null_weight['categorical_height']
    .value_counts()
    .sort_index()
)

# Bar chart of the counts
fig, ax = plt.subplots(figsize=(10, 6))
categorical_height_counts.plot(kind='bar', ax=ax)
ax.set_title('Number of Cyclists per Height Category (Excluding Null Weights)')
ax.set_xlabel('Height Category')
ax.set_ylabel('Number of Cyclists')

# Write each count just above its bar
for position, n_cyclists in enumerate(categorical_height_counts.tolist()):
    ax.text(position, n_cyclists + 5, str(n_cyclists), ha='center', va='bottom')

# Keep the x tick labels horizontal
plt.xticks(rotation=0)

plt.show()

#### ```BMI_idx``` column

In [None]:
from utils import compute_bmi


def _row_bmi(row):
    """Compute the BMI index of one cyclist row from its weight and height."""
    return compute_bmi(row['weight'], row['height'])


# Row-wise computation of the BMI index for every cyclist
cyclists_df['BMI_idx'] = cyclists_df.apply(_row_bmi, axis=1)

print(cyclists_df.loc[:, ['weight', 'height', 'BMI_idx']].head(10))

Categorizing BMI index

In [None]:
# Define the BMI categories and their corresponding labels.
# pd.cut is called with right=False, so every interval is left-closed and
# right-open: [0, 18.5), [18.5, 25), [25, 30), [30, inf).
# The edges must therefore be 25 and 30 (not 24.9 and 29.9); otherwise a BMI
# of exactly 24.9 would be labelled 'Overweight' and 29.9 'Obese', which
# contradicts the usual cut-offs (normal weight < 25, overweight < 30).
bmi_bins = [0, 18.5, 25, 30, np.inf]
bmi_labels = ['Underweight', 'Normal weight', 'Overweight', 'Obese']

# Create the 'BMI' categorical column; a missing BMI_idx stays NaN
cyclists_df['BMI'] = pd.cut(cyclists_df['BMI_idx'], bins=bmi_bins, labels=bmi_labels, right=False, include_lowest=True)

# Display the first few rows to verify
print(cyclists_df[['_url', 'BMI_idx', 'BMI']].head(10))

In [None]:
# Cyclists per BMI category, ordered by category
bmi_category_counts = cyclists_df['BMI'].value_counts().sort_index()

# Bar chart of the counts
fig, ax = plt.subplots(figsize=(10, 6))
bmi_category_counts.plot(kind='bar', ax=ax)
ax.set_title('Number of Cyclists per BMI Category')
ax.set_xlabel('BMI Category')
ax.set_ylabel('Number of Cyclists')

# Write each count just above its bar
for position, n_cyclists in enumerate(bmi_category_counts.tolist()):
    ax.text(position, n_cyclists + 5, str(n_cyclists), ha='center', va='bottom')

# Keep the x tick labels horizontal
plt.xticks(rotation=0)

plt.show()

As expected, most of the cyclists fall into the normal-weight category.

In [None]:
# Remove the numeric BMI index now that the categorical 'BMI' column exists.
# errors='ignore' keeps this cell re-runnable: on a second execution
# 'BMI_idx' is already gone and a plain drop would raise KeyError.
cyclists_df = cyclists_df.drop(columns=['BMI_idx'], errors='ignore')

# Create a pairplot coloured by BMI category, excluding the
# 'categorical_height' column (as the other pairplot cells do)
sns.pairplot(cyclists_df.drop(columns=['categorical_height']), hue='BMI')
plt.show()

#### ```continent``` column

In [None]:
# Function to convert country name to continent
def country_to_continent(country_name):
    """Return the continent name for a country name, or 'Unknown'.

    The lookup goes through pycountry_convert in three steps:
    country name -> alpha-2 country code -> continent code -> continent name.

    Parameters
    ----------
    country_name : str
        Country name to resolve (here, a value of the 'nationality' column).

    Returns
    -------
    str
        The continent name, or 'Unknown' when any of the lookup steps fails
        (for example an unrecognised or missing country name).
    """
    try:
        country_alpha2 = pc.country_name_to_country_alpha2(country_name)
        continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except (KeyError, TypeError):
        # Only failed lookups are mapped to 'Unknown': pycountry_convert
        # signals an unknown name/code with KeyError, and TypeError covers
        # values that cannot be looked up at all. A bare `except:` would
        # also swallow KeyboardInterrupt/SystemExit and hide real bugs.
        return 'Unknown'

# Derive the 'continent' column by mapping every nationality to its continent
nationalities = cyclists_df['nationality']
cyclists_df['continent'] = nationalities.apply(country_to_continent)

# Quick visual check on the first rows
print(cyclists_df.loc[:, ['nationality', 'continent']].head(10))

In [None]:
# Cyclists per continent (value_counts orders them by decreasing count)
continent_counts = cyclists_df['continent'].value_counts()

# Bar chart of the number of cyclists per continent
fig, ax = plt.subplots(figsize=(8, 5))
continent_counts.plot(kind='bar', ax=ax)
ax.set_title('Number of Cyclists per Continent')
ax.set_xlabel('Continent')
ax.set_ylabel('Number of Cyclists')

# Write each count just above its bar
for position, n_cyclists in enumerate(continent_counts.tolist()):
    ax.text(position, n_cyclists + 50, str(n_cyclists), ha='center', va='bottom')

# Tilt the continent names
plt.xticks(rotation=45)

plt.show()

In [None]:
# Pairplot coloured by continent; 'categorical_height' is left out
pairplot_data = cyclists_df.drop(columns=['categorical_height'])
sns.pairplot(data=pairplot_data, hue='continent')
plt.show()

Given that the majority of cyclists are European, let's dig deeper into the analysis of European cyclists.

In [13]:
# Keep only the European cyclists; 'continent' is then constant, so drop it
is_european = cyclists_df['continent'].eq('Europe')
european_df = cyclists_df.loc[is_european].drop(columns=['continent'])

In [None]:
# European cyclists per nationality (value_counts orders them by decreasing count)
nationality_counts = european_df['nationality'].value_counts()

# Bar chart of the number of cyclists per nationality
fig, ax = plt.subplots(figsize=(14, 7))
nationality_counts.plot(kind='bar', ax=ax)
ax.set_title('Number of Cyclists per Nationality')
ax.set_xlabel('Nationality')
ax.set_ylabel('Number of Cyclists')

# Write each count just above its bar
for position, n_cyclists in enumerate(nationality_counts.tolist()):
    ax.text(position, n_cyclists + 10, str(n_cyclists), ha='center', va='bottom')

# Vertical labels so the many nationality names stay readable
plt.xticks(rotation=90)

plt.show()

In [None]:
# Pairplot of the European cyclists coloured by nationality;
# 'categorical_height' is left out
european_pairplot_data = european_df.drop(columns=['categorical_height'])
sns.pairplot(data=european_pairplot_data, hue='nationality')
plt.show()