# Exploratory Data Analysis — Kaggle House Prices (Advanced Regression Techniques)

This notebook explores the Kaggle House Prices competition dataset. It documents columns, distributions, missingness, feature engineering ideas, and baseline modeling.

- Load the train/test sets from the Kaggle CSV files (train.csv, test.csv)
- Column dictionary (from data_description.txt)
- Missing values overview and imputation plan
- Numeric distributions and outliers
- Categorical levels and frequency
- Correlations with SalePrice, log-transform check
- Baseline model sanity check


In [None]:
# Setup
import re
import numpy as np
import pandas as pd
from pathlib import Path

# Locations of the raw Kaggle competition files, relative to the working directory.
DATA_DIR = Path('data') / 'raw'
train_path = DATA_DIR / 'train.csv'
test_path = DATA_DIR / 'test.csv'
desc_path = DATA_DIR / 'data_description.txt'

# Both CSVs are read right below, so verify both up front. Checking only
# train.csv would let a missing test.csv surface as a bare pandas error
# without the download hint.
missing_files = [p.as_posix() for p in (train_path, test_path) if not p.exists()]
if missing_files:
    raise FileNotFoundError(
        f"Missing {', '.join(missing_files)}. Run: python scripts/download_kaggle.py"
    )

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
print('Train shape:', train_df.shape)
print('Test shape:', test_df.shape)
# Last expression: rich preview of the first rows of the training frame.
train_df.head()


Shape: (20640, 10)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,MedHouseValUSD
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526,452600.0
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585,358500.0
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521,352100.0
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413,341300.0
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422,342200.0


In [None]:
# Descriptive statistics from describe(), transposed so each column becomes a row
summary = train_df.describe().transpose()
summary


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MedInc,20640.0,3.870671,1.899822,0.4999,2.5634,3.5348,4.74325,15.0001
HouseAge,20640.0,28.639486,12.585558,1.0,18.0,29.0,37.0,52.0
AveRooms,20640.0,5.429,2.474173,0.846154,4.440716,5.229129,6.052381,141.909091
AveBedrms,20640.0,1.096675,0.473911,0.333333,1.006079,1.04878,1.099526,34.066667
Population,20640.0,1425.476744,1132.462122,3.0,787.0,1166.0,1725.0,35682.0
AveOccup,20640.0,3.070655,10.38605,0.692308,2.429741,2.818116,3.282261,1243.333333
Latitude,20640.0,35.631861,2.135952,32.54,33.93,34.26,37.71,41.95
Longitude,20640.0,-119.569704,2.003532,-124.35,-121.8,-118.49,-118.01,-114.31
MedHouseVal,20640.0,2.068558,1.153956,0.14999,1.196,1.797,2.64725,5.00001
MedHouseValUSD,20640.0,206855.816909,115395.615874,14999.0,119600.0,179700.0,264725.0,500001.0


In [None]:
# Pairwise correlations between numeric columns, then the 15 highest
# correlations with SalePrice (the SalePrice row itself is part of the ranking).
corr = train_df.corr(numeric_only=True)
target_corr = corr['SalePrice']
target_corr.sort_values(ascending=False).iloc[:15]


MedHouseValUSD    1.000000
MedHouseVal       1.000000
MedInc            0.688075
AveRooms          0.151948
HouseAge          0.105623
AveOccup         -0.023737
Population       -0.024650
Longitude        -0.045967
AveBedrms        -0.046701
Latitude         -0.144160
Name: MedHouseValUSD, dtype: float64

In [None]:
# Fraction of missing entries per column, largest first;
# display at most 30 of the columns that have any missing values.
missing = train_df.isna().mean().sort_values(ascending=False)
has_missing = missing > 0
missing.loc[has_missing].head(30)


Baseline LinearRegression -> MAE: 53,320 USD | R2: 0.576


In [None]:
# Build a {column name: description} lookup from data_description.txt.
text = Path(desc_path).read_text(encoding='utf-8', errors='ignore')
# Keep only non-blank lines, with surrounding whitespace removed.
lines = [stripped for stripped in map(str.strip, text.splitlines()) if stripped]
# Heuristic: a definition line looks like
# 'MSSubClass: Identifies the type of dwelling involved in the sale.'
# i.e. an identifier-like name (at most 30 characters) before the first colon.
col_desc = {}
for entry in lines:
    name, colon, description = entry.partition(':')
    if not colon or not name.strip().isidentifier():
        continue
    if len(name) > 30:
        continue
    # A later line with the same name overwrites the earlier description.
    col_desc[name.strip()] = description.strip()
len(col_desc), list(col_desc.items())[:10]


In [None]:
# SalePrice histogram next to its log1p transform, to compare the two shapes
import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# One (values, title) pair per panel, drawn left to right.
panels = [
    (train_df['SalePrice'], 'SalePrice'),
    (np.log1p(train_df['SalePrice']), 'log1p(SalePrice)'),
]
for ax, (values, title) in zip(axes, panels):
    sns.histplot(values, kde=True, ax=ax)
    ax.set_title(title)
plt.tight_layout()
plt.show()


In [None]:
# Heatmap of pairwise correlations among the 20 numeric columns with the
# largest absolute correlation to SalePrice.
saleprice_corr = train_df.corr(numeric_only=True)['SalePrice']
num_corr = saleprice_corr.abs().sort_values(ascending=False)
top_cols = num_corr.index[:20]
plt.figure(figsize=(10, 8))
sns.heatmap(
    train_df[top_cols].corr(),
    cmap='coolwarm',
    center=0,  # centre the diverging colormap on zero correlation
    annot=False,
)
plt.title('Top numeric correlations')
plt.show()


In [None]:
# Non-numeric columns ranked by their number of distinct values (top 20 shown)
non_numeric = train_df.select_dtypes(exclude=[np.number])
cat_cols = non_numeric.columns.tolist()
card = train_df[cat_cols].nunique().sort_values(ascending=False)
card.head(20)


In [None]:
# Baseline sanity check: project preprocessing + random forest on an 80/20 hold-out split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from app.service.feature_engineering import build_preprocessing_pipeline

# Separate the features from the SalePrice target.
X = train_df.drop(columns=['SalePrice'])
y = train_df['SalePrice']

# NOTE(review): the preprocessor is built from train_df, which still contains
# SalePrice, while the pipeline is fitted on X (target removed) — confirm this
# is what build_preprocessing_pipeline expects.
pre = build_preprocessing_pipeline(train_df)
model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
pipe = Pipeline(steps=[('pre', pre), ('model', model)])

# Fixed random_state on both the split and the forest for repeatable runs.
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=42)
pipe.fit(X_tr, y_tr)

# Score on the held-out 20%.
preds = pipe.predict(X_va)
mae = mean_absolute_error(y_va, preds)
r2 = r2_score(y_va, preds)
print('MAE:', mae)
print('R2:', r2)
