In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Preprocess the marathon dataset: impute and standardize the numeric columns,
# one-hot encode the text (object) columns, and write the combined table to a
# new CSV file.

# Load the dataset
file_path = '../data/marathon_time_predictions.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Handle missing values (fill with the column mean for numeric columns only).
# NOTE(review): columns are picked purely by dtype, so identifier-like and
# target-like columns (presumably `id` and `MarathonTime`) are imputed and
# scaled along with the real features -- confirm that this is intended.
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
df[numeric_features] = df[numeric_features].fillna(df[numeric_features].mean())

# Re-check missing values. Only the numeric columns were imputed above, so
# object columns can still report missing entries here; those are left as-is
# and handed to the encoder below.
print(df.isnull().sum())

# Normalize numerical features (zero mean, unit variance per column)
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Identify categorical columns
# NOTE(review): every object column is encoded, presumably including
# free-text ones such as `Name`, which yields one output column per distinct
# value -- confirm that this is intended.
categorical_features = df.select_dtypes(include=['object']).columns

# Encode categorical features
encoder = OneHotEncoder()  # default output is sparse, hence .toarray() below
if len(categorical_features) > 0:
    encoded_categorical = encoder.fit_transform(df[categorical_features])
    encoded_categorical_df = pd.DataFrame(
        encoded_categorical.toarray(),
        columns=encoder.get_feature_names_out(categorical_features),
        # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a
        # fresh RangeIndex would misalign rows whenever df's index is not
        # the default 0..n-1.
        index=df.index,
    )
else:
    # Nothing to encode; keep an empty, row-aligned frame so the concat below
    # still works instead of the encoder failing on a zero-column input.
    encoded_categorical_df = pd.DataFrame(index=df.index)

# Combine numerical and encoded categorical features
df_processed = pd.concat([df[numeric_features], encoded_categorical_df], axis=1)

# Display the processed dataset
print(df_processed.head())

# Save preprocessed data to a new CSV file
df_processed.to_csv('../data/preprocessed_marathon_data.csv', index=False)


   id  Marathon           Name Category  km4week    sp4week CrossTraining  \
0   1  Prague17   Blair MORGAN      MAM    132.8  14.434783           NaN   
1   2  Prague17  Robert Heczko      MAM     68.6  13.674419           NaN   
2   3  Prague17  Michon Jerome      MAM     82.7  13.520436           NaN   
3   4  Prague17  Daniel Or lek      M45    137.5  12.258544           NaN   
4   5  Prague17   Luk ? Mr zek      MAM     84.6  13.945055           NaN   

  Wall21  MarathonTime CATEGORY  
0   1.16          2.37        A  
1   1.23          2.59        A  
2   1.30          2.66        A  
3   1.32          2.68        A  
4   1.36          2.74        A  
id                0
Marathon          0
Name              0
Category          6
km4week           0
sp4week           0
CrossTraining    74
Wall21            0
MarathonTime      0
CATEGORY          0
dtype: int64
id                0
Marathon          0
Name              0
Category          6
km4week           0
sp4week           0
