# 01 - Data Preprocessing

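This notebook:
- Loads the Kaggle House Prices dataset (`train.csv` and `test.csv`)
- Imputes missing values and one-hot encodes the categorical features
- Applies a `log1p` transformation to the target (`SalePrice`)
- Splits the training data 80/20 into train and validation sets
- Saves the processed splits as headerless CSV files
- Uploads the processed data to S3 for SageMaker training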


In [1]:
import os

import numpy as np
import pandas as pd
import sagemaker
from sklearn.model_selection import train_test_split


In [2]:
# Read the raw train and test files from the shared data directory.
train_df, test_df = map(pd.read_csv, ("../data/train.csv", "../data/test.csv"))

# Quick sanity check on the number of rows and columns that were loaded.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


Train shape: (1460, 81)
Test shape: (1459, 80)


In [3]:
# Split the labelled frame into predictors and target; copies keep the
# loaded frames untouched by later cells.
X_train_raw = train_df.drop("SalePrice", axis=1)
y = train_df["SalePrice"].copy()
X_test_raw = test_df.copy()


In [4]:
# Stack train rows on top of test rows (fresh 0..n-1 index) so both sets go
# through the same preprocessing; the train rows come first.
full = pd.concat((X_train_raw, X_test_raw), ignore_index=True)


In [5]:
# Column groups are taken from the combined frame so that train and test rows
# are treated identically.
num_cols = full.select_dtypes(include=[np.number]).columns
cat_cols = full.select_dtypes(exclude=[np.number]).columns

# Fit the numeric imputation values on the labelled training rows only, i.e.
# the first len(X_train_raw) rows of `full`. Using the pooled train+test
# median would leak test-set statistics into the training features.
# NOTE: validation rows are still part of these medians because the
# train/validation split happens in a later cell.
train_medians = full.iloc[:len(X_train_raw)][num_cols].median()

# If a column has no observed value in the training rows its median is NaN;
# fall back to the pooled median so that no gaps are left behind.
train_medians = train_medians.fillna(full[num_cols].median())

full[num_cols] = full[num_cols].fillna(train_medians)

# Missing categories become their own explicit level.
full[cat_cols] = full[cat_cols].fillna("Missing")


In [6]:
# One-hot encode every categorical column on the combined frame so train and
# test end up with exactly the same set of columns.
# The dtype is pinned to an integer type: pandas >= 2.0 defaults to bool
# dummies, which `to_csv` would write as "True"/"False" text instead of 0/1
# in the headerless numeric CSVs saved further down.
full_encoded = pd.get_dummies(
    full,
    columns=cat_cols,
    drop_first=False,
    dtype=np.uint8,
)


In [7]:
# Undo the earlier concatenation: the first n_train rows are the labelled
# training rows, everything after them is the test set.
n_train = len(X_train_raw)
X_all = full_encoded.iloc[:n_train]
X_test = full_encoded.iloc[n_train:]


In [9]:
# Model log(1 + SalePrice) rather than the raw price; predictions must be
# mapped back with np.expm1 to recover prices.
y_log = y.pipe(np.log1p)


In [10]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_all,
    y_log,
    test_size=0.2,
    random_state=42,
)

# Report the resulting (rows, columns) of each split.
print("Train size:", X_train.shape)
print("Validation size:", X_val.shape)


Train size: (1168, 311)
Validation size: (292, 311)


In [11]:
# Destination folder for the processed splits; created on first run.
out_dir = "../processed"
os.makedirs(out_dir, exist_ok=True)

# Put the target in the first column, followed by the features. Indexes are
# reset first so the two pieces line up positionally after the shuffle.
# (Presumably this label-first layout is what the downstream SageMaker
# training job expects -- confirm against the training notebook.)
train_out = pd.concat(
    [part.reset_index(drop=True) for part in (y_train, X_train)],
    axis=1,
)
val_out = pd.concat(
    [part.reset_index(drop=True) for part in (y_val, X_val)],
    axis=1,
)

# All three files are written as bare values: no index column, no header row.
csv_opts = dict(index=False, header=False)
train_out.to_csv(f"{out_dir}/train.csv", **csv_opts)
val_out.to_csv(f"{out_dir}/validation.csv", **csv_opts)
X_test.to_csv(f"{out_dir}/test.csv", **csv_opts)  # features only, no target

print("Processed files saved.")


Processed files saved.


In [12]:
# Upload the processed splits to the SageMaker session's default bucket.
# `sagemaker` is imported in the notebook's import cell.
sess = sagemaker.Session()
bucket = sess.default_bucket()
prefix = "house-price-kaggle-xgb"

# Build the local paths from `out_dir` (set in the save cell) instead of
# repeating the literal folder, so the upload always picks up the files that
# were just written.
train_s3, val_s3, test_s3 = [
    sess.upload_data(
        path=f"{out_dir}/{filename}",
        bucket=bucket,
        key_prefix=f"{prefix}/data",
    )
    for filename in ("train.csv", "validation.csv", "test.csv")
]

# Echo every resulting S3 URI, including the test set.
print("Data uploaded to S3.")
print(train_s3)
print(val_s3)
print(test_s3)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Data uploaded to S3.
s3://sagemaker-us-east-1-730223111567/house-price-kaggle-xgb/data/train.csv
s3://sagemaker-us-east-1-730223111567/house-price-kaggle-xgb/data/validation.csv
