In [None]:
# One-time environment setup, kept commented out so that Restart & Run All
# does not reinstall on every run. Uncomment on a fresh environment.
# `%pip` (rather than `!pip`) installs into the running kernel's environment;
# the pin matches the version recorded in this cell's saved output.
# %pip install seaborn==0.13.2

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


# House Price Prediction with Geospatial Features

**Goal**: Predict the sale price of a house using its attributes (size, rooms, age, location, etc.) and geospatial context (distance to downtown, schools, parks, etc.).

Some key questions I aim to answer:
- *What are the top features influencing house prices in this region?*
- *Can we predict a home's market value to within $\pm 10\%$ of its actual sale price?*

## Stages to follow for the analysis:
(This will help me keep track)

1. Load the data
2. Clean/preprocess
3. Split train/validation/test
4. Train model
5. Evaluate model
6. Save model/predictions

## Data
- Source: [Ames Housing Data](https://www.kaggle.com/datasets/prevek18/ames-housing-dataset/data) from Kaggle
- Also using [this reference notebook](https://www.kaggle.com/code/lauvfpitipak/house-price-advanced-regression-xgboost-with-ridge) for analysis

In [1]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingRegressor

import xgboost as xgb

import os
import sys

In [2]:
# Read the training CSV (path is relative to the notebook's working directory)
# into the frame that the cells below operate on.
df = pd.read_csv(filepath_or_buffer='../data/train.csv')

# Preview the first five rows to sanity-check column names and values.
df.head(n=5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,...,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,2760,906392120,20,RL,11645,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,8,2006,New,Partial,294900
1,809,906226080,90,RL,7018,Pave,Reg,Bnk,AllPub,Inside,...,0,0,0,0,0,6,2009,WD,Alloca,153337
2,581,534127130,20,RL,11717,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2009,WD,Normal,185000
3,1144,531385060,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,190500
4,553,531477020,20,RH,8400,Pave,Reg,Lvl,AllPub,Inside,...,24,0,0,0,0,9,2009,WD,Normal,82000


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the quartiles are spelled out here but are pandas' defaults.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Order,PID,MS SubClass,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,...,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice
count,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2032.0,2050.0,...,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0
mean,1471.05412,717268000.0,56.923452,10197.518284,6.068747,5.555339,1971.288152,1984.082886,100.843504,447.444878,...,93.768406,46.800098,23.94783,2.845929,15.05412,2.544125,55.010239,6.168211,2007.788396,180071.861043
std,842.460771,188862400.0,42.585277,8032.955701,1.425432,1.115406,30.113223,20.826108,180.779553,461.323101,...,126.300352,66.782384,66.518574,25.302379,54.100417,36.725446,577.304449,2.7114,1.312262,79680.819439
min,2.0,526302000.0,20.0,1470.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,13100.0
25%,745.0,528488100.0,20.0,7484.0,5.0,5.0,1954.0,1965.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,129500.0
50%,1469.0,535475000.0,50.0,9503.0,6.0,5.0,1973.0,1993.0,0.0,376.0,...,0.0,24.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,160000.0
75%,2205.5,907200200.0,70.0,11619.0,7.0,6.0,2000.0,2003.0,161.0,744.75,...,168.0,69.5,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,212000.0
max,2930.0,1007100000.0,190.0,215245.0,10.0,9.0,2010.0,2010.0,1378.0,5644.0,...,1424.0,742.0,1012.0,407.0,576.0,738.0,17000.0,12.0,2010.0,755000.0
