In [13]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

from load_data import load_raw_data
from preprocess import (
    clean_data, encode_features, scale_features, 
    create_train_test_split, NUMERICAL_FEATURES 
)

#### Loading the data

In [15]:
# Load the raw dataset through the project's loader and preview the first rows.
# The preview already shows a row whose measurements and sex are all missing
# (index 3), which the cleaning step has to deal with.
df_raw = load_raw_data()
df_raw.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [16]:
# Run the project's cleaning step on the raw frame, then report how many null
# cells remain in each column as a quick sanity check.
df_clean = clean_data(df_raw)

print("\nMissing values after cleaning:")
nulls_per_column = df_clean.isna().sum()
print(nulls_per_column)


Missing values after cleaning:
species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64


In [17]:
# Preview the cleaned frame: the row with missing measurements (index 3) is
# gone and the original index labels are kept, so the index now has gaps.
df_clean.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


#### Feature encoding


In [18]:
# Encode features. Judging by the resulting columns, `island` and `sex` are
# replaced by dummy columns (island_Dream, island_Torgersen, sex_MALE), so one
# level of each appears to be dropped as the baseline, while `species` is kept
# as a single column.
df_encoded = encode_features(df_clean)

# Report the resulting shape and column names to confirm the encoding.
print(f"\nShape after encoding: {df_encoded.shape}")
print(f"\nColumns after encoding: {list(df_encoded.columns)}")


Shape after encoding: (333, 8)

Columns after encoding: ['species', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'island_Dream', 'island_Torgersen', 'sex_MALE']


In [19]:
# Display encoded data: `species` now holds integer codes and the dummy
# columns hold booleans.
df_encoded.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Dream,island_Torgersen,sex_MALE
0,0,39.1,18.7,181.0,3750.0,False,True,True
1,0,39.5,17.4,186.0,3800.0,False,True,False
2,0,40.3,18.0,195.0,3250.0,False,True,False
4,0,36.7,19.3,193.0,3450.0,False,True,False
5,0,39.3,20.6,190.0,3650.0,False,True,True


#### Feature scaling

In [20]:

# Summarize the numerical columns before scaling. They are on very different
# scales (e.g. body_mass_g in the thousands vs. bill_depth_mm in the tens),
# which is the motivation for the scaling step that follows.
print("Statistics before scaling:")
print(df_encoded[NUMERICAL_FEATURES].describe())

Statistics before scaling:
       bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g
count      333.000000     333.000000         333.000000   333.000000
mean        43.992793      17.164865         200.966967  4207.057057
std          5.468668       1.969235          14.015765   805.215802
min         32.100000      13.100000         172.000000  2700.000000
25%         39.500000      15.600000         190.000000  3550.000000
50%         44.500000      17.300000         197.000000  4050.000000
75%         48.600000      18.700000         213.000000  4775.000000
max         59.600000      21.500000         231.000000  6300.000000


In [21]:
# Scale features. The result is unpacked as (scaled frame, scaler); per the
# cell output the scaler is also persisted to ./models/scaler.pkl as a side
# effect.
# NOTE(review): the scaler is fitted here on the full encoded dataset, and the
# train/test split is only created afterwards from df_scaled. Test-set rows
# therefore contribute to the scaling statistics (data leakage). Consider
# splitting first and fitting on the training rows only, then transforming the
# test rows with fit=False -- TODO confirm that scale_features and
# create_train_test_split support that order.
df_scaled, scaler = scale_features(df_encoded, fit=True)


Scaler saved to ./models/scaler.pkl


In [22]:
# Summarize the same columns after scaling: means are ~0 and spreads ~1.
# The reported std is ~1.0015 rather than exactly 1 because describe() uses the
# sample standard deviation (ddof=1). sqrt(333/332) ~= 1.0015, which is
# consistent with the scaler having normalized by the population standard
# deviation (ddof=0) -- TODO confirm against scale_features.
print("statistics after scaling:")
print(df_scaled[NUMERICAL_FEATURES].describe())

statistics after scaling:
       bill_length_mm  bill_depth_mm  flipper_length_mm   body_mass_g
count    3.330000e+02   3.330000e+02       3.330000e+02  3.330000e+02
mean     3.840772e-16   6.401286e-16       2.133762e-16 -1.707010e-16
std      1.001505e+00   1.001505e+00       1.001505e+00  1.001505e+00
min     -2.177987e+00  -2.067291e+00      -2.069852e+00 -1.874435e+00
25%     -8.227879e-01  -7.958519e-01      -7.836512e-01 -8.172292e-01
50%      9.288742e-02   6.872642e-02      -2.834620e-01 -1.953432e-01
75%      8.437412e-01   7.807321e-01       8.598276e-01  7.063915e-01
max      2.858227e+00   2.204743e+00       2.146028e+00  2.603144e+00


#### Create Train/Test Split

Split the processed data into training and testing sets.

In [23]:
# Create train/test split. With save_output=True the splits are also written
# to disk (the cell output reports ./data/processed/penguins_train_test.pkl).
splits = create_train_test_split(df_scaled, save_output=True)

X_train = splits['X_train']
X_test = splits['X_test']
y_train = splits['y_train']
y_test = splits['y_test']

# Sanity checks: features and labels must stay aligned, and the two splits
# together must account for every row of the scaled frame. Failing loudly here
# is cheaper than debugging a silently misaligned model input later.
assert X_train.shape[0] == y_train.shape[0], "train features/labels misaligned"
assert X_test.shape[0] == y_test.shape[0], "test features/labels misaligned"
assert X_train.shape[0] + X_test.shape[0] == len(df_scaled), (
    "rows lost or duplicated by the split"
)

print("\nDataset splits:")
print(f"Training features shape: {X_train.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Test features shape: {X_test.shape}")
print(f"Test labels shape: {y_test.shape}")


Saved train/test splits to ./data/processed/penguins_train_test.pkl

Dataset splits:
Training features shape: (266, 7)
Training labels shape: (266,)
Test features shape: (67, 7)
Test labels shape: (67,)


In [24]:
# Compare the per-species label counts of the two splits to check that the
# split did not badly skew the class balance.
distribution_reports = (
    ("Training set species distribution:", y_train),
    ("Test set species distribution:", y_test),
)
for report_idx, (report_heading, split_targets) in enumerate(distribution_reports):
    if report_idx:
        print()  # blank separator line between the two reports
    print(report_heading)
    species_counts = split_targets.value_counts()
    print(species_counts.sort_index())

Training set species distribution:
species
0    115
1     55
2     96
Name: count, dtype: int64

Test set species distribution:
species
0    31
1    13
2    23
Name: count, dtype: int64
