In [1]:
import numpy as np
import pandas as pd
import pickle
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score


In [4]:
dataset_raw = pd.read_csv("sample-data/model-api/kc_house_data.csv")

In [5]:
dataset_raw.head()

Unnamed: 0,event_id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [6]:
# Dropping id and date for regression
dataset = dataset_raw.drop(['event_id', 'date'], axis=1)
dataset.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [7]:
#Split into independents and dependents for training the model
X = dataset.drop(['price'], axis=1)
y = dataset.loc[:,'price']

In [8]:
#Splitting the data into train,test data 
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.4,random_state=0)

In [9]:
tr_regressor = DecisionTreeRegressor(random_state=0)
tr_regressor.fit(X_train.values,y_train.values)
tr_regressor.score(X_test.values,y_test.values)
pred_tr = tr_regressor.predict(X_test.values)
decision_score=tr_regressor.score(X_test.values,y_test.values)
expl_tr = explained_variance_score(pred_tr,y_test)

In [10]:
models_score =pd.DataFrame({'Model':['Decision Tree Regressor'],
                            'Score':[decision_score],
                            'Explained Variance Score':[expl_tr]
                           })
models_score

Unnamed: 0,Model,Score,Explained Variance Score
0,Decision Tree Regressor,0.744052,0.701722


In [11]:
# Save the model to disk
filename = 'price_dt_py.sav'
pickle.dump(tr_regressor, open(filename, 'wb'))

In [12]:
# For testing
import pickle

# Load the pickle file
model_file_name = "price_dt_py.sav"
model = pickle.load(open(model_file_name, 'rb'))

X_new = [[1.417779575520277,1.2235334622079015,1376.2808414176138,-44282.09192720655,
1.0663144705090994,-0.016425213918793445,-0.12412711151320194,3.010860708305122,
6.446308650295107,1134.2601420161861,83.01687506148825,1950.9720074899026,38.33997291730212,
98187.71575450613,47.512356777412954,-122.29738607549102,1162.588195657737,-15814.149590757263]]

model.predict(X_new)

array([265000.])