In [20]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
import json

%matplotlib inline


In [21]:
print('DOTSCIENCE_INPUTS=["agent1", "agent2"]')
print('DOTSCIENCE_OUTPUTS=["model"]')
print('DOTSCIENCE_LABELS={"model_type": "random_forest"}')

DOTSCIENCE_INPUTS=["agent1", "agent2"]
DOTSCIENCE_OUTPUTS=["model"]
DOTSCIENCE_LABELS={"model_type": "random_forest"}


We are using a housing price dataset sourced from Bay Area Home Sales Database and Zillow. This dataset was based on the homes sold between January 2013 and December 2015. 

In [22]:
inputs = [pd.read_csv('./agent1/bay_area_zillow_agent1.csv'), pd.read_csv('./agent2/bay_area_zillow_agent2.csv')]
df = pd.concat(f for f in inputs)

In [23]:
# randomise my dataframe rows to remove any ordering in the data
# TODO fix seed to preserve reproducibility
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


In [24]:
df.describe(include = "all")

Unnamed: 0.1,Unnamed: 0,address,info,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,totalrooms,usecode,yearbuilt,zestimate,zindexvalue,zipcode,zpid
count,11330.0,11330,11330,11330,11330.0,11330.0,11330.0,11330,11330.0,11330.0,11330.0,11330,11330.0,11330,11330.0,11330.0,11330.0,11330.0,11330.0
unique,,10730,11232,10684,,,,954,,,,71,,10,,,,,
top,,Address: 1300 Eddy Street,San FranciscoSales price: 2800000Sales date: 0...,1300 Eddy St,,,,11/08/2013,,,,Mission,,SingleFamily,,,,,
freq,,5,3,5,,,,46,,,,540,,5803,,,,,
mean,9171.729214,,,,1.980229,2.614475,1585.420918,,1263928.0,37.759711,-122.436518,,6.111562,,1948.498147,1565695.0,1320205.0,94116.912004,36899730.0
std,4921.941074,,,,1.047358,1.299457,921.978245,,1042079.0,0.025578,0.030743,,12.125819,,37.911196,1229417.0,584817.0,9.400877,78007410.0
min,2.0,,,,0.5,0.0,1.0,,535.0,37.70817,-122.510726,,1.0,,1860.0,432385.0,688100.0,94102.0,15063290.0
25%,5039.75,,,,1.0,2.0,1019.0,,729250.0,37.739286,-122.455157,,4.0,,1916.0,905237.5,982900.0,94110.0,15108470.0
50%,9198.5,,,,2.0,2.0,1362.0,,990000.0,37.760513,-122.43251,,5.0,,1940.0,1230758.0,1211900.0,94115.0,15156970.0
75%,13374.75,,,,2.0,3.0,1876.0,,1450000.0,37.781386,-122.413359,,7.0,,1986.0,1731170.0,1480400.0,94123.0,59700400.0


In [25]:
df.columns

Index(['Unnamed: 0', 'address', 'info', 'z_address', 'bathrooms', 'bedrooms',
       'finishedsqft', 'lastsolddate', 'lastsoldprice', 'latitude',
       'longitude', 'neighborhood', 'totalrooms', 'usecode', 'yearbuilt',
       'zestimate', 'zindexvalue', 'zipcode', 'zpid'],
      dtype='object')

In [26]:
# drop unneeded columns
df.drop(df.columns[[0, 1, 2, 3, 11, 13, 14, 15, 16, 17, 18]], axis=1, inplace=True)

In [27]:
df.describe(include = "all")

Unnamed: 0,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,totalrooms
count,11330.0,11330.0,11330.0,11330,11330.0,11330.0,11330.0,11330.0
unique,,,,954,,,,
top,,,,11/08/2013,,,,
freq,,,,46,,,,
mean,1.980229,2.614475,1585.420918,,1263928.0,37.759711,-122.436518,6.111562
std,1.047358,1.299457,921.978245,,1042079.0,0.025578,0.030743,12.125819
min,0.5,0.0,1.0,,535.0,37.70817,-122.510726,1.0
25%,1.0,2.0,1019.0,,729250.0,37.739286,-122.455157,4.0
50%,2.0,2.0,1362.0,,990000.0,37.760513,-122.43251,5.0
75%,2.0,3.0,1876.0,,1450000.0,37.781386,-122.413359,7.0


In [28]:
# check none of our data is null or NaN
df.isnull().any()

bathrooms        False
bedrooms         False
finishedsqft     False
lastsolddate     False
lastsoldprice    False
latitude         False
longitude        False
totalrooms       False
dtype: bool

In [29]:
# check that we have sensible datatypes for our features
df.dtypes

bathrooms        float64
bedrooms           int64
finishedsqft       int64
lastsolddate      object
lastsoldprice      int64
latitude         float64
longitude        float64
totalrooms         int64
dtype: object

In [30]:
# get time data as datetime datatype. convert floats to ints

df['bathrooms'] = df['bathrooms'].astype('int64', copy=False)
df['lastsolddate'] = pd.to_datetime(df['lastsolddate'])


In [31]:
# datatypes look more consistent now, and dates will be interpreted correctlu

df.dtypes

bathrooms                 int64
bedrooms                  int64
finishedsqft              int64
lastsolddate     datetime64[ns]
lastsoldprice             int64
latitude                float64
longitude               float64
totalrooms                int64
dtype: object

Now let's try a random forest model on some of these features to predict `lastsoldprice`

In [32]:
X = df[['bathrooms', 'bedrooms', 'finishedsqft', 'totalrooms', 'longitude', 'latitude']]
Y = df['lastsoldprice']

In [33]:
# Split data into test and training set. Use random_state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# input a range of parameters to try. We will fit models with every possible combination of these parameters.
param_grid = [
    {'n_estimators': [3, 10, 12, 14], 'max_features': [1,2, 3, 4, 5, 6], 'bootstrap': [False]}
]

rand_forest_regressor = RandomForestRegressor()

In [34]:
rand_forest_regressor.get_params().keys()

dict_keys(['bootstrap', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [35]:
grid_search = GridSearchCV(rand_forest_regressor, param_grid, cv=5, scoring='neg_mean_squared_error')

In [36]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 12, 14], 'max_features': [1, 2, 3, 4, 5, 6], 'bootstrap': [False]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [37]:
# These are our runs

array_runs = grid_search.cv_results_

for mean_score, params in zip(array_runs["mean_test_score"], array_runs["params"]):
    print(np.sqrt(-mean_score), params)
    


615387.3510853354 {'bootstrap': False, 'max_features': 1, 'n_estimators': 3}
557446.7902663273 {'bootstrap': False, 'max_features': 1, 'n_estimators': 10}
548120.6930112323 {'bootstrap': False, 'max_features': 1, 'n_estimators': 12}
546756.5601205274 {'bootstrap': False, 'max_features': 1, 'n_estimators': 14}
613313.9693200146 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
549758.6396466137 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
550144.3930132751 {'bootstrap': False, 'max_features': 2, 'n_estimators': 12}
544421.3201317303 {'bootstrap': False, 'max_features': 2, 'n_estimators': 14}
599302.8665763041 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
552307.3788286528 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
559854.2793213297 {'bootstrap': False, 'max_features': 3, 'n_estimators': 12}
546502.6513734278 {'bootstrap': False, 'max_features': 3, 'n_estimators': 14}
620464.4483515944 {'bootstrap': False, 'max_features': 4, 'n_estima

In [38]:
# scikitlearn uses convention of 'higher score is better', so mean squared error is negated. We take square root and negate that to get rmse

for mean_score, params in zip(array_runs["mean_test_score"], array_runs["params"]):
    param_dict = dict((param_key, params[param_key]) for param_key in params)
    print('DOTSCIENCE_PARAMETERS=' + json.dumps(param_dict))
    print('DOTSCIENCE_SUMMARY=' + json.dumps({"rmse": "%.2f" % np.sqrt(-mean_score)}))

DOTSCIENCE_PARAMETERS={"bootstrap": false, "max_features": 1, "n_estimators": 3}
DOTSCIENCE_SUMMARY={"rmse": "615387.35"}
DOTSCIENCE_PARAMETERS={"bootstrap": false, "max_features": 1, "n_estimators": 10}
DOTSCIENCE_SUMMARY={"rmse": "557446.79"}
DOTSCIENCE_PARAMETERS={"bootstrap": false, "max_features": 1, "n_estimators": 12}
DOTSCIENCE_SUMMARY={"rmse": "548120.69"}
DOTSCIENCE_PARAMETERS={"bootstrap": false, "max_features": 1, "n_estimators": 14}
DOTSCIENCE_SUMMARY={"rmse": "546756.56"}
DOTSCIENCE_PARAMETERS={"bootstrap": false, "max_features": 2, "n_estimators": 3}
DOTSCIENCE_SUMMARY={"rmse": "613313.97"}
DOTSCIENCE_PARAMETERS={"bootstrap": false, "max_features": 2, "n_estimators": 10}
DOTSCIENCE_SUMMARY={"rmse": "549758.64"}
DOTSCIENCE_PARAMETERS={"bootstrap": false, "max_features": 2, "n_estimators": 12}
DOTSCIENCE_SUMMARY={"rmse": "550144.39"}
DOTSCIENCE_PARAMETERS={"bootstrap": false, "max_features": 2, "n_estimators": 14}
DOTSCIENCE_SUMMARY={"rmse": "544421.32"}
DOTSCIENCE_PARAMET

In [None]:
for mean_score, params in zip(array_runs["mean_test_score"], array_runs["params"]):
    param_dict = dict((param_key, params[param_key]) for param_key in params)
    ds.params(param_dict)
    ds.summary("rmse", "%.2f" % np.sqrt(-mean_score)
    ds.clear()