# Build Model

## Step-1 : Load and Shape Data

In [1]:
import os
import urllib.request

# Preferred copy of the dataset (relative to this notebook) and its public mirror.
data_location = "../data/house-prices/house-sales-full.csv"
data_url = 'https://elephantscale-public.s3.amazonaws.com/data/house-prices/house-sales-full.csv'

# Resolution order: the repo copy, then a copy in the working directory,
# and only if neither exists, a fresh download into the working directory.
if not os.path.exists(data_location):
    data_location = os.path.basename(data_location)
    if not os.path.exists(data_location):
        print("Downloading : ", data_url)
        # Download to a temporary name and move it into place only on success.
        # Writing straight to data_location would leave a truncated CSV behind
        # after an interrupted download, and the exists() check above would
        # then accept that broken file on the next run.
        partial_path = data_location + ".part"
        try:
            urllib.request.urlretrieve(data_url, partial_path)
            os.replace(partial_path, data_location)
        finally:
            # Only present if the download or the move failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
print('data_location:', data_location)

data_location: house-sales-full.csv


In [2]:
import pandas as pd

# Read the house-sales CSV resolved earlier in the notebook into a DataFrame.
data = pd.read_csv(data_location)

# Preview five random rows. The fixed seed keeps the preview identical across
# re-runs, so the displayed rows always match the saved output.
data.sample(5, random_state=42)

Unnamed: 0,DocumentID,Date,SalePrice,PropertyID,PropertyType,ym,zhvi_px,zhvi_idx,AdjSalePrice,NbrLivingUnits,...,Bathrooms,Bedrooms,BldgGrade,YrBuilt,YrRenovated,TrafficNoise,LandVal,ImpsVal,ZipCode,NewConstruction
18840,18841,8/24/09,550000,6979900280,Single Family,8/1/09,352600,0.810202,678843.0,1,...,2.5,4,8,1997,0,0,249000,303000,98053,False
15861,15862,2/18/15,175000,5272200005,Single Family,2/1/15,417900,0.960248,182245.0,1,...,1.0,2,7,1947,0,1,197000,122000,98125,False
6591,6592,6/19/08,628500,2141330560,Single Family,6/1/08,403200,0.926471,678381.0,1,...,2.25,4,8,1977,0,0,336000,270000,98006,False
19332,19333,3/1/07,490000,7202380350,Single Family,3/1/07,429600,0.987132,496387.0,1,...,2.5,3,7,2005,0,0,191000,240000,98053,False
3762,3763,2/19/08,650000,1342300040,Townhouse,2/1/08,418400,0.961397,676099.0,1,...,2.5,2,9,2012,0,1,254000,317000,-1,False


In [3]:
from sklearn.model_selection import train_test_split

# Model inputs (five numeric columns) and the regression target (sale price).
X = data[['Bedrooms', 'Bathrooms', 'SqFtTotLiving', 'SqFtLot',  'LandVal']]
y = data['SalePrice']

# Hold out 20% of the rows for evaluation. Pinning random_state makes the
# split, and therefore every score reported later, reproducible under
# Restart Kernel -> Run All. Without it each run shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("X_train :", X_train.shape)
print("X_test :", X_test.shape)
print("y_train :", y_train.shape)
print("y_test :", y_test.shape)

X_train : (21650, 5)
X_test : (5413, 5)
y_train : (21650,)
y_test : (5413,)


## Step-2: Build a Model

In [4]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted regression trees with default hyperparameters.
# random_state is fixed so that repeated fits on the same training data
# produce the same model; left unset, the estimator draws a fresh seed each run.
gb = GradientBoostingRegressor(random_state=42)

# fit() returns the estimator itself, so `model` and `gb` are the same object.
model = gb.fit(X_train, y_train)

In [5]:
# Compare fit quality on the data the model saw against the held-out rows;
# a large gap between the two numbers would point to overfitting.
train_score = model.score(X_train, y_train)
print("Training score: ", train_score)

test_score = model.score(X_test, y_test)
print("Test score: ", test_score)

Training score:  0.8387588991510906
Test score:  0.80778632705065


## Step-3: Save Model (Pickle or JobLib)

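Pickled models are tied to the environment that produced them: loading one back may require the same Python and scikit-learn versions that were used to save it.

JobLib is usually a better choice for scikit-learn models because it stores the large NumPy arrays inside them efficiently. JobLib is built on pickle, so the same version caveat applies, and neither format should be loaded from an untrusted source.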

In [6]:
# Output path for the serialized model, relative to the working directory.
model_file = "model.pkl"

In [7]:
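# ## Use Pickle (disabled alternative, kept for reference)
# ## Uncomment this cell to save with the standard-library pickle module
# ## instead of the JobLib cell below.
# import pickle

# with open(model_file,'wb') as f:
#     pickle.dump(model,f)
# print ("model saved to : ", model_file)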

In [8]:
## Use JobLib

import joblib

# Write the fitted estimator to disk at the path chosen in `model_file`.
joblib.dump(value=model, filename=model_file)
print("model saved to : ", model_file)

model saved to :  model.pkl
