In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20 and is kept only as a fallback.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

%matplotlib inline


We are using a housing price dataset sourced from the Bay Area Home Sales Database and Zillow. It covers homes sold between January 2013 and December 2015.

In [None]:
# Read the two CSV exports and stack them row-wise into a single frame
csv_paths = ['./agent1/bay_area_zillow_agent1.csv', './agent2/bay_area_zillow_agent2.csv']
inputs = [pd.read_csv(path) for path in csv_paths]
df = pd.concat(inputs)

In [None]:
# Shuffle the rows to remove any ordering inherited from the source files.
# A fixed random_state makes the shuffle (and everything downstream of it)
# reproducible across kernel restarts.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)


In [None]:
# Summary statistics for every column, numeric and non-numeric alike
df.describe(include='all')

In [None]:
# Show the column labels of the combined frame
df.columns

In [None]:
# Remove the columns that are not needed, selected by position.
# NOTE(review): positional selection means re-running this cell drops a
# different set of columns (or raises) — run it once per fresh kernel.
unneeded_positions = [0, 1, 2, 3, 11, 13, 14, 15, 16, 17, 18]
df = df.drop(columns=df.columns[unneeded_positions])

In [None]:
# Summary statistics again, now for the reduced set of columns
df.describe(include='all')

In [None]:
# Per-column flag: True if the column holds at least one null / NaN value
df.isna().any()

In [None]:
# Inspect the dtype of each column
df.dtypes


In [None]:
# Cast bathrooms to integers and parse the sale date into datetimes.
# NOTE(review): if bathrooms is stored as float, an int64 cast truncates
# fractional counts (e.g. 1.5 -> 1) — confirm that is intended.
df = df.assign(
    bathrooms=df['bathrooms'].astype('int64'),
    lastsolddate=pd.to_datetime(df['lastsolddate']),
)


In [None]:
# Re-inspect the dtypes after the conversions
df.dtypes

We hypothesise that `finishedsqft`, `bathrooms` and `bedrooms` are positively correlated with `lastsoldprice`. Let's plot each of them, along with `totalrooms`, against `lastsoldprice` to check.

In [None]:
# Plot each candidate feature against the sale price, one figure per feature.
# Every figure is labelled so it can be read on its own.
for feature in ['finishedsqft', 'bathrooms', 'bedrooms', 'totalrooms']:
    ax = df.plot(x=feature, y='lastsoldprice', style='o', legend=False)
    ax.set_xlabel(feature)
    ax.set_ylabel('lastsoldprice')
    ax.set_title('lastsoldprice vs. %s' % feature)
# Render the figures without echoing the last Axes repr as cell output
plt.show()

Now let's fit a random forest model on those features, together with `longitude` and `latitude`, to predict `lastsoldprice`.

In [None]:
# Model inputs (X) and the prediction target (Y)
feature_columns = ['bathrooms', 'bedrooms', 'finishedsqft', 'totalrooms', 'longitude', 'latitude']
target_column = 'lastsoldprice'

X = df[feature_columns]
Y = df[target_column]

These are our features:

In [None]:
# Preview the first five rows of the feature matrix
X.head(n=5)

In [None]:
# Hold out 30% of the rows for testing; random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Two grids are searched: the first varies the forest size and the number of
# features considered per split; the second is a single candidate that turns
# bootstrapping off and leaves every other hyperparameter at its default.
param_grid = [
    {'n_estimators': [3, 10, 12, 14], 'max_features': [1, 2, 3, 4, 5, 6]},
    {'bootstrap': [False]},
]

# Seed the forest as well, so the search results are repeatable like the split
rand_forest_regressor = RandomForestRegressor(random_state=0)

In [None]:
# List the hyperparameter names the estimator exposes (valid param_grid keys)
rand_forest_regressor.get_params().keys()

In [None]:
# 5-fold cross-validated search over param_grid, ranked by negative MSE
grid_search = GridSearchCV(
    estimator=rand_forest_regressor,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)


In [None]:
#rand_forest_regressor.fit(X_train, y_train)

In [None]:
# Run the cross-validated search on the training split
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Hyperparameter combination selected by the search
grid_search.best_params_

In [None]:
# Predict sale prices for the held-out rows
y_pred = grid_search.predict(X=X_test)

In [None]:
#rand_forest_regressor.score(X_test, y_test)

In [None]:
# Score on the held-out rows using the search's own scorer. Because the search
# was created with scoring='neg_mean_squared_error', this value is the
# negative mean squared error, not R squared.
grid_search.score(X_test, y_test)

In [None]:
# Report R squared on the held-out rows. grid_search.score() would return the
# negative MSE here (the search was built with scoring='neg_mean_squared_error'),
# so score the refit best estimator directly: a regressor's .score() is R squared.
print('Random Forest coefficient of determination (R squared): %.4f' % grid_search.best_estimator_.score(X_test, y_test))

In [None]:
from sklearn.metrics import mean_squared_error

# Root mean squared error of the tuned random forest on the held-out rows.
# mean_squared_error takes (y_true, y_pred), in that order.
# The lin_* names are kept unchanged in case later cells refer to them; the
# values describe the random forest, not a linear model.
lin_mse = mean_squared_error(y_test, y_pred)
lin_rmse = np.sqrt(lin_mse)
print('Random Forest RMSE: %.4f' % lin_rmse)