In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer sklearn.model_selection and only fall back on very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

%matplotlib inline


In [None]:
# Emit the DOTSCIENCE_* marker lines (inputs, outputs, labels) on stdout, one per line.
metadata_lines = (
    'DOTSCIENCE_INPUTS=["agent1", "agent2"]',
    'DOTSCIENCE_OUTPUTS=["model"]',
    'DOTSCIENCE_LABELS={"model_type": "random_forest"}',
)
for metadata_line in metadata_lines:
    print(metadata_line)

We are using a housing price dataset sourced from the Bay Area Home Sales Database and Zillow. The dataset is based on homes sold between January 2013 and December 2015.

In [None]:
# Read each agent's CSV file and stack the resulting frames row-wise into one frame.
# The row labels of the individual files are kept as-is by the concat.
csv_paths = [
    './agent1/bay_area_zillow_agent1.csv',
    './agent2/bay_area_zillow_agent2.csv',
]
inputs = [pd.read_csv(csv_path) for csv_path in csv_paths]
df = pd.concat(inputs)

In [None]:
# Shuffle the rows to remove any ordering present in the source data.
# The fixed random_state makes the shuffle reproducible between runs.
shuffled = df.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)


In [None]:
# Summary statistics for every column (include="all" covers non-numeric columns too).
df.describe(include = "all")

In [None]:
# Show the column labels in their current order.
df.columns

In [None]:
# Drop the columns we do not need, picked by their position in the current column order.
# NOTE: this is a positional, in-place drop. Re-running the cell on the already-trimmed
# frame would remove different columns (or fail), so run it once per fresh load.
unneeded_positions = [0, 1, 2, 3, 11, 13, 14, 15, 16, 17, 18]
unneeded_columns = df.columns[unneeded_positions]
df.drop(unneeded_columns, axis=1, inplace=True)

In [None]:
# Summary statistics again, now for the columns that remain in df.
df.describe(include = "all")

In [None]:
# Check that none of our data is null or NaN.
# The result has one boolean per column: True means that column has at least one missing value.
df.isnull().any()

In [None]:
# Check that we have sensible datatypes for our features (one dtype per column).
df.dtypes

In [None]:
# Normalise column types: bathrooms becomes int64 and lastsolddate becomes datetime.
# NOTE(review): if bathrooms holds floats, astype('int64') discards any fractional
# part (e.g. 2.5 -> 2) - confirm that is intended.
bathrooms_as_int = df['bathrooms'].astype('int64', copy=False)
sold_dates = pd.to_datetime(df['lastsolddate'])

df['bathrooms'] = bathrooms_as_int
df['lastsolddate'] = sold_dates


In [None]:
# datatypes look more consistent now, and dates will be interpreted correctly

df.dtypes

Now let's try a random forest model on some of these features to predict `lastsoldprice`

In [None]:
# Feature matrix (X) and prediction target (Y) for the model.
feature_columns = ['bathrooms', 'bedrooms', 'finishedsqft', 'totalrooms', 'longitude', 'latitude']
target_column = 'lastsoldprice'

X = df[feature_columns]
Y = df[target_column]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Split data into test and training sets (30% held out for testing).
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Range of hyperparameters to try. A model is fitted for every combination:
# 4 values of n_estimators x 6 values of max_features = 24 candidates, all with
# bootstrap disabled. max_features tops out at 6, the number of feature columns in X.
param_grid = [
    {'n_estimators': [3, 10, 12, 14], 'max_features': [1, 2, 3, 4, 5, 6], 'bootstrap': [False]}
]

# Seed the forest as well: without a random_state the per-split feature sampling
# differs on every run, so the grid-search scores would not be reproducible even
# though the shuffle and the train/test split are seeded.
rand_forest_regressor = RandomForestRegressor(random_state=0)

In [None]:
# List the parameter names the estimator exposes (the keys that param_grid may use).
rand_forest_regressor.get_params().keys()

In [None]:
# Grid search over param_grid with 5-fold cross-validation, scored by negated
# mean squared error.
grid_search = GridSearchCV(
    estimator=rand_forest_regressor,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

In [None]:
# Run the grid search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

In [None]:
# These are our runs: one entry per parameter combination that was tried.
array_runs = grid_search.cv_results_

mean_scores = array_runs["mean_test_score"]
tried_params = array_runs["params"]

# Scores are negated MSE values, so flip the sign before taking the square root.
for run_params, neg_mse in zip(tried_params, mean_scores):
    rmse = np.sqrt(-neg_mse)
    print(rmse, run_params)
    


In [None]:
# scikit-learn uses the convention "higher score is better", so the mean squared
# error is reported negated. Negate it back and take the square root to get the RMSE.
# For each run, emit its parameters and its RMSE (formatted to two decimals) as
# DOTSCIENCE_* marker lines.
for neg_mse, run_params in zip(array_runs["mean_test_score"], array_runs["params"]):
    rmse_text = "%.2f" % np.sqrt(-neg_mse)
    print('DOTSCIENCE_PARAMETERS=' + json.dumps(dict(run_params)))
    print('DOTSCIENCE_SUMMARY=' + json.dumps({"rmse": rmse_text}))