In [32]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to the project's .py files
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [42]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

from load_data import load_raw_data
from preprocess import (
    clean_data, encode_features, scale_features, 
    preprocess_pipeline, create_train_test_split, NUMERICAL_FEATURES 
)

#### Loading the data

In [34]:
# Fetch the source table and keep it under its own name: both the cleaning
# step and the full-pipeline cell further down start from `df_raw`.
df_raw = load_raw_data()
df_raw.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [35]:
# Cleaning produces a new frame; the raw one stays available as `df_raw`.
df_clean = clean_data(df_raw)

# Per-column count of nulls still present after cleaning (all zeros expected).
print("\nMissing values after cleaning:")
remaining_nulls = df_clean.isna().sum()
print(remaining_nulls)


Missing values after cleaning:
species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64


In [36]:
# Preview the cleaned frame. Row labels are carried over from the raw frame,
# so removed rows appear as gaps in the index.
df_clean.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


#### Feature encoding


In [37]:
# Convert the categorical columns of the cleaned frame into numeric/boolean
# columns; the result is a new frame, `df_clean` is still used as-is above.
df_encoded = encode_features(df_clean)

# Report the resulting dimensions and the full column list.
encoded_shape = df_encoded.shape
encoded_columns = df_encoded.columns.tolist()
print(f"\nShape after encoding: {encoded_shape}")
print(f"\nColumns after encoding: {encoded_columns}")


Shape after encoding: (333, 8)

Columns after encoding: ['species', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'island_Dream', 'island_Torgersen', 'sex_MALE']


In [38]:
# First rows of the encoded frame, to eyeball the new indicator columns.
df_encoded.head(n=5)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Dream,island_Torgersen,sex_MALE
0,0,39.1,18.7,181.0,3750.0,False,True,True
1,0,39.5,17.4,186.0,3800.0,False,True,False
2,0,40.3,18.0,195.0,3250.0,False,True,False
4,0,36.7,19.3,193.0,3450.0,False,True,False
5,0,39.3,20.6,190.0,3650.0,False,True,True


#### Feature scaling

In [39]:

# Baseline summary of the numeric feature columns, recorded so it can be
# compared with the same columns after scaling.
print("Statistics before scaling:")
numeric_summary = df_encoded.loc[:, NUMERICAL_FEATURES].describe()
print(numeric_summary)

Statistics before scaling:
       bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g
count      333.000000     333.000000         333.000000   333.000000
mean        43.992793      17.164865         200.966967  4207.057057
std          5.468668       1.969235          14.015765   805.215802
min         32.100000      13.100000         172.000000  2700.000000
25%         39.500000      15.600000         190.000000  3550.000000
50%         44.500000      17.300000         197.000000  4050.000000
75%         48.600000      18.700000         213.000000  4775.000000
max         59.600000      21.500000         231.000000  6300.000000


In [None]:
# Scale features
# fit=True is passed, so the scaler is expected to be fitted on this frame;
# the call presumably returns (scaled frame, fitted scaler) — verify against
# preprocess.scale_features.
# NOTE(review): the recorded run of this cell ended in
# "NotFittedError: This StandardScaler instance is not fitted yet" even though
# fit=True was requested. That points at the fit path inside scale_features
# (e.g. transform being reached before fit) — confirm and fix in preprocess.py.
# Until then `df_scaled` is never assigned, so the next cell fails too.
df_scaled, scaler = scale_features(df_encoded, fit=True)


NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
# Spot-check the first rows of the scaled frame.
df_scaled.head(n=5)

## Complete Preprocessing Pipeline

Run the entire preprocessing pipeline at once and save outputs.

In [None]:
# Single-call version of the steps above, starting again from the raw frame.
# save_output=True asks the pipeline to persist its results as well.
# Note: this rebinds `scaler` from the step-by-step scaling cell.
df_processed, scaler = preprocess_pipeline(df_raw, save_output=True)

final_shape = df_processed.shape
print("\nPreprocessing complete!")
print(f"Final shape: {final_shape}")

In [None]:
# First rows of the pipeline output.
df_processed.head(n=5)

## Create Train/Test Split

Split the processed data into training and testing sets.

In [None]:
# Split the processed frame; save_output=True asks the helper to persist the
# splits in addition to returning them.
splits = create_train_test_split(df_processed, save_output=True)

# Unpack the returned mapping into notebook-level names, features first.
X_train, X_test = splits['X_train'], splits['X_test']
y_train, y_test = splits['y_train'], splits['y_test']

# One line per split so feature/label row counts can be compared at a glance.
print(
    "\nDataset splits:",
    f"Training features shape: {X_train.shape}",
    f"Training labels shape: {y_train.shape}",
    f"Test features shape: {X_test.shape}",
    f"Test labels shape: {y_test.shape}",
    sep="\n",
)

In [None]:
# Per-class label counts in each split, ordered by class label, so the two
# distributions can be compared side by side.
print("Training set species distribution:")
train_class_counts = y_train.value_counts().sort_index()
print(train_class_counts)
print()

print("Test set species distribution:")
test_class_counts = y_test.value_counts().sort_index()
print(test_class_counts)

## Verify Saved Artifacts

Check that all preprocessing artifacts were saved correctly.

In [None]:
# Import from the same top-level module root as the `load_data` / `preprocess`
# imports in the first code cell, rather than through a `src.` package prefix.
from config import PROCESSED_DATA_FILE, TRAIN_TEST_FILE, SCALER_FILE

# Human-readable label -> expected location of each saved artifact.
artifacts = {
    "Processed data": PROCESSED_DATA_FILE,
    "Train/test splits": TRAIN_TEST_FILE,
    "Scaler": SCALER_FILE,
}

print("Saved files:")
report = []
for label, path in artifacts.items():
    found = path.exists()
    # Show the check mark only when the file is really on disk; previously a
    # "✓" was printed unconditionally, even next to "Exists: False".
    marker = "✓" if found else "✗"
    report.append(f"{marker} {label}: {path}\n  Exists: {found}")
print("\n\n".join(report))

# Fail loudly after the report has been printed, so a missing artifact cannot
# go unnoticed when the notebook is run top to bottom.
missing = [label for label, path in artifacts.items() if not path.exists()]
assert not missing, f"Missing preprocessing artifacts: {missing}"

## Load and Verify Saved Data

Test loading the saved artifacts to ensure they can be used later.

In [None]:
# Import from the same top-level `preprocess` module the first code cell uses.
# Going through `src.preprocess` would load a second copy of the module (or
# fail outright if `src` is not importable from this notebook's path).
from preprocess import load_train_test_split, load_scaler, load_processed_data

# Load processed data
df_loaded = load_processed_data()
print(f"Loaded processed data shape: {df_loaded.shape}")
print()

# Load train/test splits (same 'X_train' / 'X_test' keys as the in-memory splits)
splits_loaded = load_train_test_split()
print(f"Loaded training set shape: {splits_loaded['X_train'].shape}")
print(f"Loaded test set shape: {splits_loaded['X_test'].shape}")
print()

# Load scaler; reading mean_ / scale_ only works on a fitted scaler, so these
# two lines double as a check that the persisted scaler was fitted.
scaler_loaded = load_scaler()
print("Scaler loaded successfully")
print(f"Scaler mean: {scaler_loaded.mean_[:2]}...")  # Show first 2 values
print(f"Scaler scale: {scaler_loaded.scale_[:2]}...")

## Summary

The preprocessing pipeline:

1. ✓ Loaded raw data from source
2. ✓ Cleaned data by removing missing values
3. ✓ Encoded categorical features (island, sex) using one-hot encoding
4. ✓ Mapped species names to numerical labels
5. ✓ Scaled numerical features using StandardScaler
6. ✓ Split data into training (80%) and testing (20%) sets
7. ✓ Saved all artifacts for future use

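Once every cell above has run without error, the processed data is ready for model training. In this saved run the feature-scaling cell raised `NotFittedError` and the later cells were not executed, so steps 5–7 are still unverified.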