### Non Hedonic Model

This notebook was used to generate and score the ML model used for the web-based calculator tool.

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
import math
from scipy import interpolate, stats
import joblib

In [3]:
df = pd.read_csv("../joined_combined_filtered_mined_soil_water_df_td_17_4.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [96]:
df.shape

(25342, 107)

In [97]:
# Parcel filtering and feature extraction

fdf = df[(df["pool"] != True) & (df["secondfloor"] < 1) & (df["mhome"] < 1)  & (df["pers_prop_val"] < 1) & (df["parval"] < 1000000) & (df["parval"] > 10000) & (df["lot_area"] < 500)].copy()
fdf["add_g_size"] = fdf["addsize"] + fdf["gize"]
feature_df = fdf[["lot_area", "TotalWater", "water_dist", "castorieindex", "firstfloor", "city_dist", "parval"]]
feature_df.shape

(15426, 7)

In [5]:
feature_df = feature_df.dropna()
n = (feature_df.shape[1] - 1)
X = feature_df[feature_df.columns[0:n]].values
y = feature_df[feature_df.columns[n]].values
feature_df.shape

(14439, 7)

In [6]:
n_split = int(0.85*len(X))
X,y = shuffle(X,y)
X_train = X[0:n_split]
y_train = y[0:n_split]
X_test = X[n_split:]
y_test = y[n_split:]

In [7]:
regr = RandomForestRegressor(n_estimators = 100, min_samples_split = 50, min_samples_leaf = 20)

In [8]:
regr.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=20,
                      min_samples_split=50, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [9]:
regr.score(X_test, y_test)

0.5317921647956995

In [10]:
regr.feature_importances_

array([0.60510705, 0.09430442, 0.02863637, 0.0185514 , 0.16421824,
       0.08918251])

In [11]:
y_pred = regr.predict(X_test)

In [None]:
plt.scatter(y_pred, y_test)

In [94]:
math.sqrt(mean_squared_error(y_test, y_pred))

139143.4989190485

In [95]:
feature_df.to_csv("fdf.csv")

In [None]:
joblib.dump(regr, 'regr.pkl') 
