## Kaggle: Housing Prices Competition

Entry for the Kaggle Housing Prices Competition.  Given a set of historical data on housing sale prices, predict the sale price of a home.

In [6]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

## EDA

Load and explore the data.

In [2]:
# Where the training set lives. The Kaggle kernel location is kept for
# reference; the local copy is the one actually read.
# iowa_file_path = '../input/train.csv'  # competition data
iowa_file_path = 'data/train.csv'  # local data

# Raw, unmodified training frame.
df_raw = pd.read_csv(filepath_or_buffer=iowa_file_path)

# Preview the first five rows.
df_raw.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Dimensions of the raw training frame as (rows, columns).
df_raw.shape

In [None]:
# Data type of every column in the raw training frame.
df_raw.dtypes

In [None]:
# Summary statistics of the raw training frame.
df_raw.describe()

In [None]:
# Distribution of the target variable.
# Missing prices are dropped here because the cell that removes them from
# df_raw only runs later, and hist() cannot bin NaN values.
fig, ax = plt.subplots()
ax.hist(df_raw['SalePrice'].dropna(), label='SalePrice')
ax.set_title('Distribution of SalePrice')
ax.set_xlabel('SalePrice')
ax.set_ylabel('Number of sales')
ax.legend()
plt.show()

In [None]:
# Distribution of the number of above-ground bedrooms.
fig, ax = plt.subplots()
ax.hist(df_raw['BedroomAbvGr'].dropna(), label='BedroomAbvGr')
ax.set_title('Distribution of BedroomAbvGr')
ax.set_xlabel('BedroomAbvGr')
ax.set_ylabel('Number of sales')
ax.legend()
plt.show()

# Sale price against bedroom count, one point per sale.
fig, ax = plt.subplots()
ax.scatter(df_raw['BedroomAbvGr'], df_raw['SalePrice'])
ax.set_title('SalePrice vs. BedroomAbvGr')
ax.set_xlabel('BedroomAbvGr')
ax.set_ylabel('SalePrice')
plt.show()

In [3]:
# Keep only rows with a known sale price and append a constant bias column.
# The frame is rebuilt and re-bound instead of being mutated with
# inplace=True / item assignment, so the cell is idempotent and cannot trip
# pandas' chained-assignment warnings.
df_raw = (
    df_raw
    .dropna(axis=0, subset=['SalePrice'])  # drop rows without a sale price
    .assign(ones=1)                        # add bias term
)

# Regression target, aligned row-for-row with df_raw.
y = df_raw['SalePrice']

In [None]:
# 1D linear regression
def linear_regression_plot(X, y, label=None):
    """Fit a one-variable least-squares line, plot it and print its MAE.

    The line ``y ~ a * X + b`` is obtained from the closed-form solution of
    simple linear regression. A figure is shown with the scatter of the data
    and the fitted line on top and a histogram of ``X`` below, then the mean
    absolute error of the fitted line on the same data is printed.

    Parameters
    ----------
    X : 1-D pandas Series or numpy array
        The single predictor. Must support ``.dot``, ``.mean`` and ``.sum``.
    y : 1-D pandas Series or numpy array
        The target, same length as ``X``.
    label : str, optional
        Used as the title of the upper plot and in the printed MAE line.

    Raises
    ------
    ValueError
        If ``X`` is empty, contains missing values or has no spread (all
        values identical). The slope is undefined in those cases.

    Notes
    -----
    The printed message says "Validation MAE", but the error is computed on
    the very data the line was fitted to, so it is an in-sample error.
    """
    sum_xx = X.dot(X)
    sum_xy = X.dot(y)
    denominator = sum_xx - X.mean() * X.sum()

    # denominator equals n * var(X). It is (numerically) zero for a constant
    # predictor and NaN for empty or NaN-containing input. `not (... > ...)`
    # is used so that NaN is caught as well.
    if not abs(denominator) > 1e-12 * abs(sum_xx):
        raise ValueError(
            "cannot fit a line for {}: X must be non-empty, free of missing "
            "values and contain at least two distinct values".format(label)
        )

    a = (sum_xy - y.mean() * X.sum()) / denominator
    b = (y.mean() * sum_xx - X.mean() * sum_xy) / denominator

    # predictions of the fitted line on the data it was fitted to
    test_preds = a * X + b

    fig = plt.figure()
    fig.subplots_adjust(top=0.8)

    # upper panel: data and fitted line
    ax1 = fig.add_subplot(211)
    ax1.set_ylabel('SalePrice')
    if label:
        ax1.set_title(label)
    ax1.scatter(X, y)
    ax1.plot(X, test_preds, color='r')

    # lower panel: distribution of the predictor
    ax2 = fig.add_axes([0.15, 0.1, 0.7, 0.3])
    ax2.hist(X)
    ax2.set_xlabel('hist')

    plt.show()

    # scikit-learn's signature is (y_true, y_pred)
    val_mae = mean_absolute_error(y, test_preds)
    print("Validation MAE for {}: {:,.0f}".format(label, val_mae))

In [5]:
# Candidate predictors to inspect one at a time against the sale price.
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

# One fitted-line figure and one MAE line per candidate feature.
for feature in features:
    linear_regression_plot(X=df_raw[feature], y=y, label=feature)

NameError: name 'MLUtils' is not defined

In [None]:
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
features2 = ['YearBuilt', '1stFlrSF']

# Hold-out evaluation on every non-object column except the target.
# NOTE(review): this keeps 'Id' and, if the bias cell has run, the constant
# 'ones' column as model inputs - confirm that is intended.
df = df_raw.drop(['SalePrice'], axis=1).select_dtypes(exclude=[object])

# A fixed seed makes the split, and therefore the reported MAE, reproducible
# across kernel restarts.
train_X, val_X, train_y, val_y = train_test_split(
    df.values, y.values, test_size=0.25, random_state=0
)

# NOTE(review): run_xgboost is neither defined nor imported in the visible
# notebook - confirm where it comes from. From this call it takes the
# train/validation arrays and returns (predictions, mean absolute error).
xgboost_preds, xgboost_mae = run_xgboost(train_X, train_y, val_X, val_y)
print("Validation MAE for xgboost: {:,.0f}".format(xgboost_mae))


In [None]:
# Create a new model based on all the training data.
# The design matrix is built explicitly from `features`: the submission cell
# predicts on test_df[features], so the model has to be trained on exactly
# the same columns. (`X` was previously never defined in the notebook, so the
# fit failed with a NameError on a fresh kernel.)
X = df_raw[features]
my_model = XGBRegressor(n_estimators=170, learning_rate=0.05)
my_model.fit(X, y)

In [None]:
# Kaggle kernel location of the test set, kept for reference:
# test_data_path = '../input/test.csv'
# NOTE(review): the training set is read from 'data/train.csv' while this
# path points into a sub-folder - confirm both locations are correct.
test_data_path = 'data/kaggle-housing-prices/test.csv'
test_df = pd.read_csv(filepath_or_buffer=test_data_path)

# Restrict the test set to the columns chosen for the model.
val_X = test_df[features]

# Score the test set.
val_preds = my_model.predict(val_X)

# Submission file: one row per test home with its Id and predicted price.
output = test_df[['Id']].assign(SalePrice=val_preds)
output.to_csv('submission.csv', index=False)