# Supervised Modeling

In [1]:
# Core libraries
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt

# Scikit-learn utilities
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## Data Loading and Preprocessing Decisions

In [2]:
# ------------------------------------------------------------------------------
# Notebook-wide settings
# ------------------------------------------------------------------------------
# Seed shared by every component that accepts a random_state.
RANDOM_STATE = 42
# Name of the column the models are trained to predict.
TARGET_COL = "SalePrice"

# Numeric columns reported as skewed by the EDA notebook (skewness > 0.75);
# these are the candidates for a log transform.
SKEWED_FEATURES = (
    "LotArea MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF "
    "TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea "
    "GarageArea WoodDeckSF OpenPorchSF EnclosedPorch "
    "3SsnPorch ScreenPorch PoolArea MiscVal"
).split()

# Columns removed before modeling (Id is not useful for prediction).
DROP_COLS = ['Id']

# Echo the settings so they are recorded in the cell output.
config_report = [
    "Configuration:",
    f"  Random State: {RANDOM_STATE}",
    f"  Target Column: {TARGET_COL}",
    f"  Skewed Features to Transform: {len(SKEWED_FEATURES)}",
    f"  Columns to Drop: {DROP_COLS}",
]
print("\n".join(config_report))

Configuration:
  Random State: 42
  Target Column: SalePrice
  Skewed Features to Transform: 18
  Columns to Drop: ['Id']


In [3]:
# Read the raw training set (same file the EDA notebook used)
train_path = "../data/train.csv"
train_df = pd.read_csv(train_path)

n_rows, n_cols = train_df.shape
print("Training data loaded:")
print(f"  Shape: {train_df.shape}")
print(f"  Samples: {n_rows}")
print(f"  Features: {n_cols}")

# Remove the columns listed in DROP_COLS, but only report a drop when at
# least one of them is actually present in the frame.
droppable = [col for col in DROP_COLS if col in train_df.columns]
if droppable:
    train_df = train_df.drop(columns=DROP_COLS, errors='ignore')
    print(f"\nDropped columns: {DROP_COLS}")
    print(f"  New shape: {train_df.shape}")

# Missing-value overview: total NaN cells and number of affected columns
missing_per_column = train_df.isna().sum()
missing_total = missing_per_column.sum()
print("\nMissing values summary:")
print(f"  Total missing values: {missing_total}")
print(f"  Columns with missing values: {(missing_per_column > 0).sum()}")

Training data loaded:
  Shape: (1460, 81)
  Samples: 1460
  Features: 81

Dropped columns: ['Id']
  New shape: (1460, 80)

Missing values summary:
  Total missing values: 7829
  Columns with missing values: 19


In [4]:
# Target: log1p of the raw target column (transformation chosen during EDA)
sale_price = train_df[TARGET_COL]
y = np.log1p(sale_price)

# Feature matrix: every remaining column once the target is removed
X = train_df.drop(TARGET_COL, axis="columns")

y_min, y_max, y_mean = y.min(), y.max(), y.mean()
print("Target variable (y):")
print(f"  Name: log1p({TARGET_COL})")
print(f"  Shape: {y.shape}")
print(f"  Min: {y_min:.4f}, Max: {y_max:.4f}, Mean: {y_mean:.4f}")

print("\nFeature matrix (X):")
print(f"  Shape: {X.shape}")
print(f"  Features: {X.shape[1]}")

Target variable (y):
  Name: log1p(SalePrice)
  Shape: (1460,)
  Min: 10.4603, Max: 13.5345, Mean: 12.0241

Feature matrix (X):
  Shape: (1460, 79)
  Features: 79


In [5]:
# Split numeric vs categorical features by dtype (same rule as in EDA)
numeric_features = X.select_dtypes(include=["number"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Guard: the ColumnTransformers built later in this notebook use
# remainder='drop', so a column that lands in neither list (e.g. a bool or
# datetime column) would be silently excluded from every feature space.
# Fail loudly here instead of training on an incomplete feature set.
assigned_features = set(numeric_features) | set(categorical_features)
unassigned_features = [col for col in X.columns if col not in assigned_features]
assert not unassigned_features, (
    f"Columns with unhandled dtypes would be dropped: {unassigned_features}"
)

print(f"Feature type breakdown:")
print(f"  Numeric features: {len(numeric_features)}")
print(f"  Categorical features: {len(categorical_features)}")
print(f"  Total: {len(numeric_features) + len(categorical_features)}")

print(f"\nNumeric features ({len(numeric_features)}):")
print(numeric_features)

print(f"\nCategorical features ({len(categorical_features)}):")
print(categorical_features)

Feature type breakdown:
  Numeric features: 36
  Categorical features: 43
  Total: 79

Numeric features (36):
['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']

Categorical features (43):
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', '

## Preprocessing Strategy and Feature Space Definitions



In [6]:
# Recap of the preprocessing decisions used in the rest of the notebook.
strategy_notes = (
    "  Missing values: Median (numeric), Most Frequent (categorical)",
    "  Scaling: StandardScaler for all numeric features",
    "  Target: log1p(SalePrice) for training, expm1 for predictions",
    "  Feature spaces: Z0 (baseline), Z1 (log), Z2 (polynomial), Z3 (PCA)",
    "  Outliers: Keep all data, rely on model robustness",
)

print("Preprocessing strategy defined:")
for note in strategy_notes:
    print(note)

Preprocessing strategy defined:
  Missing values: Median (numeric), Most Frequent (categorical)
  Scaling: StandardScaler for all numeric features
  Target: log1p(SalePrice) for training, expm1 for predictions
  Feature spaces: Z0 (baseline), Z1 (log), Z2 (polynomial), Z3 (PCA)
  Outliers: Keep all data, rely on model robustness


In [7]:
# Outlier policy: nothing is removed.
# The EDA flagged a few houses with very large GrLivArea (> 4000 sq ft); the
# decision is to keep every observation and rely on model robustness and
# regularization. This cell only counts the flagged rows for the record.

print("Before outlier filtering:")
print(f"  X shape: {X.shape}")
print(f"  y shape: {y.shape}")

# Rows above the living-area threshold (identified for documentation only)
outlier_threshold_grliv = 4000
is_large_house = X['GrLivArea'] > outlier_threshold_grliv
potential_outliers = X.loc[is_large_house]
n_flagged = len(potential_outliers)
print(f"\nPotential outliers identified (GrLivArea > {outlier_threshold_grliv}): {n_flagged}")

if n_flagged > 0:
    print(f"  These {n_flagged} observations will be KEPT in the training set.")
    print("  Rationale: Insufficient evidence of data errors, models are robust to outliers.")

# X and y are intentionally left untouched
print("\nAfter outlier filtering:")
print(f"  X shape: {X.shape} (no change)")
print(f"  y shape: {y.shape} (no change)")

Before outlier filtering:
  X shape: (1460, 79)
  y shape: (1460,)

Potential outliers identified (GrLivArea > 4000): 4
  These 4 observations will be KEPT in the training set.
  Rationale: Insufficient evidence of data errors, models are robust to outliers.

After outlier filtering:
  X shape: (1460, 79) (no change)
  y shape: (1460,) (no change)


### Build Preprocessing Pipelines

Now we implement the preprocessing pipelines that will be used across all feature spaces.

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# ------------------------------------------------------------------------------
# NUMERIC PREPROCESSING PIPELINE
# Steps: fill missing values with the column median, then standardize
#        (zero mean, unit variance).
# Why:   the median is less sensitive to extreme values than the mean, and
#        scaled inputs suit distance-based and gradient-based models.
# ------------------------------------------------------------------------------
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# ------------------------------------------------------------------------------
# CATEGORICAL PREPROCESSING PIPELINE
# Steps: fill missing values with the most frequent category, then one-hot
#        encode. handle_unknown='ignore' keeps transform() from failing on
#        categories that were not seen during fit (e.g. in the test set);
#        sparse_output=False makes the encoder return a dense array.
# NOTE(review): for several columns a missing value appears to mean the feature
#        is absent (e.g. no garage, no basement) — confirm against the data
#        description. most_frequent replaces NaN with the modal observed
#        category; it does NOT create a separate "None" level.
# ------------------------------------------------------------------------------
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

for summary_line in (
    "Base preprocessing pipelines created:",
    "  Numeric: Median imputation → Standard scaling",
    "  Categorical: Most frequent imputation → One-hot encoding",
):
    print(summary_line)

Base preprocessing pipelines created:
  Numeric: Median imputation → Standard scaling
  Categorical: Most frequent imputation → One-hot encoding


### Define Feature Spaces (Z0, Z1, Z2, Z3)

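We define the four feature spaces used for model training and comparison. Preprocessors are built here for Z0, Z1 and Z3; for Z2 only the strategy is specified, and its preprocessor is built later during model training.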

In [9]:
# ==============================================================================
# Z0: BASELINE FEATURE SPACE
# Plain preprocessing with no feature engineering: the numeric pipeline is
# applied to the numeric columns and the categorical pipeline to the
# categorical columns. Any column not listed in either group is dropped.
# ==============================================================================
z0_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor_z0 = ColumnTransformer(transformers=z0_transformers, remainder='drop')

n_numeric = len(numeric_features)
n_categorical = len(categorical_features)
print("Z0 (Baseline) preprocessor created:")
print(f"  Numeric features: {n_numeric}")
print(f"  Categorical features: {n_categorical}")
print(f"  Total input features: {n_numeric + n_categorical}")

Z0 (Baseline) preprocessor created:
  Numeric features: 36
  Categorical features: 43
  Total input features: 79


In [10]:
# ==============================================================================
# Z1: LOG-TRANSFORMED FEATURE SPACE
# log1p is applied to the skewed numeric columns up front; the transformed
# frame then goes through the same impute + scale + one-hot preprocessing as
# Z0. Goal: reduce the impact of extreme values for linear models.
# ==============================================================================

# Only the skewed columns that are actually present in X are transformed
skewed_in_data = [col for col in SKEWED_FEATURES if col in X.columns]

# Build the transformed copy in one step; X itself is left unchanged.
# Values are clipped at 0 first so log1p never receives a negative input.
X_z1 = X.assign(**{
    col: np.log1p(X[col].clip(lower=0))
    for col in skewed_in_data
})

print("Z1 (Log-transformed) feature space created:")
print(f"  Skewed features transformed: {len(skewed_in_data)}")
print(f"  Features: {skewed_in_data[:5]}... (showing first 5)")

# Same column routing as Z0; the scaler now sees the log-transformed values
z1_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor_z1 = ColumnTransformer(transformers=z1_transformers, remainder='drop')

print("  Preprocessor: Same as Z0 (median impute + scale)")

Z1 (Log-transformed) feature space created:
  Skewed features transformed: 18
  Features: ['LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF']... (showing first 5)
  Preprocessor: Same as Z0 (median impute + scale)


In [11]:
# ==============================================================================
# Z2: POLYNOMIAL FEATURE SPACE
# Plan: add degree-2 polynomial terms for a handful of key predictors to
# capture non-linear effects and interactions.
# Caveat: this increases dimensionality, so pair it with regularization.
# NOTE: this cell only records the feature list and the strategy; no
# preprocessor object is built here.
# ==============================================================================

# Candidate predictors (chosen from the EDA correlation analysis), restricted
# to those that are present among the numeric features
poly_candidates = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath']
poly_features = [name for name in poly_candidates if name in numeric_features]

print("Z2 (Polynomial) feature space definition:")
print(f"  Base features for polynomial expansion: {poly_features}")
print("  Polynomial degree: 2")
print("  Note: Will be combined with Z0 baseline features")

# The actual pipeline (polynomial expansion of the selected numerics combined
# with the Z0 preprocessing) is deferred to the model-training stage.
print("  Implementation: PolynomialFeatures(degree=2, include_bias=False)")
print("  Combined with baseline Z0 preprocessing")

Z2 (Polynomial) feature space definition:
  Base features for polynomial expansion: ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath']
  Polynomial degree: 2
  Note: Will be combined with Z0 baseline features
  Implementation: PolynomialFeatures(degree=2, include_bias=False)
  Combined with baseline Z0 preprocessing


In [12]:
# ==============================================================================
# Z3: PCA-COMPRESSED FEATURE SPACE
# Numeric columns are imputed, standardized and then projected onto a fixed
# number of principal components; categorical columns keep the regular
# impute + one-hot treatment.
# Goal: reduce multicollinearity and compress redundant numeric information.
# The component count comes from the EDA PCA analysis (~90% variance at ~11
# components, as reported there).
# ==============================================================================

# PCA settings
PCA_N_COMPONENTS = 11          # number of components kept
PCA_VARIANCE_THRESHOLD = 0.90  # variance share quoted in the report below

# Numeric branch: impute -> scale -> PCA
pca_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=PCA_N_COMPONENTS, random_state=RANDOM_STATE)),
]
numeric_transformer_pca = Pipeline(steps=pca_steps)

# Full Z3 preprocessor: PCA branch for numerics, one-hot branch for categoricals
z3_transformers = [
    ('num_pca', numeric_transformer_pca, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor_z3 = ColumnTransformer(transformers=z3_transformers, remainder='drop')

print("Z3 (PCA-compressed) feature space definition:")
print(f"  Target components: {PCA_N_COMPONENTS} (explains ~{PCA_VARIANCE_THRESHOLD*100}% variance)")
print(f"  Applied to: {len(numeric_features)} numeric features")
print("  Categorical features: Kept as one-hot encoded (not compressed)")
print(f"  Preprocessor created: PCA({PCA_N_COMPONENTS}) + StandardScaler + OneHotEncoder")

Z3 (PCA-compressed) feature space definition:
  Target components: 11 (explains ~90.0% variance)
  Applied to: 36 numeric features
  Categorical features: Kept as one-hot encoded (not compressed)
  Preprocessor created: PCA(11) + StandardScaler + OneHotEncoder


In [13]:
# Registry of the feature spaces defined above. Each entry holds a display
# name, a short description, the preprocessor (None when it has not been built
# yet) and the input frame that preprocessor should be fitted on.
feature_spaces = {
    'Z0': dict(
        name='Baseline',
        description='Standard preprocessing (median impute + scale + one-hot)',
        preprocessor=preprocessor_z0,
        X_data=X,
    ),
    'Z1': dict(
        name='Log-transformed',
        description=f'Log1p on {len(skewed_in_data)} skewed features + standard preprocessing',
        preprocessor=preprocessor_z1,
        X_data=X_z1,
    ),
    'Z2': dict(
        name='Polynomial',
        description=f'Degree-2 polynomials on {len(poly_features)} key features + baseline',
        preprocessor=None,  # built later, during model training
        X_data=X,
    ),
    'Z3': dict(
        name='PCA-compressed',
        description=f'PCA({PCA_N_COMPONENTS} components) on numerics + one-hot categoricals',
        preprocessor=preprocessor_z3,
        X_data=X,
    ),
}

# Print a short report: one stanza per feature space between two rules
rule = "=" * 70
print("\n" + rule)
print("FEATURE SPACE SUMMARY")
print(rule)
for key, space in feature_spaces.items():
    is_ready = space['preprocessor'] is not None
    status = "Ready" if is_ready else "Will be built during model training"
    print(f"\n{key}: {space['name']}")
    print(f"  {space['description']}")
    print(f"  Preprocessor: {status}")
print(rule)


FEATURE SPACE SUMMARY

Z0: Baseline
  Standard preprocessing (median impute + scale + one-hot)
  Preprocessor: Ready

Z1: Log-transformed
  Log1p on 18 skewed features + standard preprocessing
  Preprocessor: Ready

Z2: Polynomial
  Degree-2 polynomials on 5 key features + baseline
  Preprocessor: Will be built during model training

Z3: PCA-compressed
  PCA(11 components) on numerics + one-hot categoricals
  Preprocessor: Ready
