In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib

%matplotlib inline
np.random.seed(42)

In [15]:
# Read the v14 train/test splits from the user's home directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "~/real_estate/data/v14/train_df.csv",
        "~/real_estate/data/v14/test_df.csv",
    )
)

# Every column except the last two is used as a model input.
# NOTE(review): this is positional, so it presumably relies on "target" being
# one of the two trailing columns -- confirm against the CSV layout.
features = train_df.columns[:-2]

# Design matrices and response vectors for each split.
X_train, y_train = train_df[features], train_df["target"]
X_test, y_test = test_df[features], test_df["target"]

In [16]:
# Forest of 500 trees, each limited to depth 3; every other hyperparameter
# keeps its library default.
# NOTE(review): random_state is not passed here, so reproducibility presumably
# depends on the np.random.seed call in the setup cell -- confirm.
rf = RandomForestRegressor(
    max_depth=3,
    n_estimators=500,
)

In [17]:
# Train the forest on the training split; the fitted estimator is the cell's
# displayed value.
rf.fit(X=X_train, y=y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [18]:
# Predictions from the fitted forest on both splits.
y_test_hat_1 = rf.predict(X_test)
y_train_hat_1 = rf.predict(X_train)

# Pearson correlation between observed and predicted targets (the
# off-diagonal entry of the 2x2 correlation matrix).
# NOTE(review): the value printed as "R^2" is this correlation squared, which
# can differ from the coefficient of determination (1 - SS_res / SS_tot) --
# confirm which metric is intended.
_r_train = np.corrcoef(y_train, y_train_hat_1)[0, 1]
_r_test = np.corrcoef(y_test, y_test_hat_1)[0, 1]

print("R^2 train: ", np.square(_r_train))
print("R^2 test: ", np.square(_r_test))

R^2 train:  0.4182280150225608
R^2 test:  0.2537948335884311


In [19]:
import os

# Persist the fitted forest to disk. The destination is resolved relative to
# the current user's home directory rather than hard-coding one user's
# absolute home path, so the notebook also runs on other machines/accounts.
# NOTE(review): assumes the original '/home/gnazareths' was the author's home
# directory ('~'), consistent with the '~'-relative data paths used when
# loading the CSVs -- confirm.
model_path = os.path.expanduser('~/real_estate/v14/models/rf_0215.joblib')

# joblib.dump returns the list of file names it wrote, which is displayed as
# the cell output.
joblib.dump(rf, model_path)

['/home/gnazareths/real_estate/v14/models/rf_0215.joblib']

In [20]:
# Pair each feature name with its importance score from the fitted forest.
feature_importances = list(zip(features, rf.feature_importances_))

In [21]:
# Show the (feature, importance) pairs ordered from least to most important.
sorted(feature_importances, key=lambda pair: pair[1])

[('year', 0.0),
 ('month', 0.0),
 ('90_day_treasury_bill_rate', 0.0),
 ('mean_mortgage_interest_hh_lag_log', 0.0),
 ('business_500_count_cbp_lag', 0.0),
 ('bar_count_cbp_lag', 0.0),
 ('mean_rating', 0.0),
 ('mean_fixed_mortgage_rate_mo', 0.0),
 ('zri_sqft_value_mo', 0.0),
 ('percent_increasing_mo', 0.0),
 ('90_day_treasury_bill_rate_mo', 0.0),
 ('total_nonfarm_payroll_mo', 0.0),
 ('mean_hourly_earnings_adj_mo', 0.0),
 ('mean_rating_mo', 0.0),
 ('review_count_mo', 0.0),
 ('restaurant_review_count_mo', 0.0),
 ('bar_review_count_mo', 0.0),
 ('price_2_review_count_mo', 0.0),
 ('median_county_hourly_earnings_adj_mo', 0.0),
 ('total_nonfarm_payroll_yr', 0.0),
 ('mean_hourly_earnings_adj_yr', 0.0),
 ('business_count_cbp_lag_yr', 0.0),
 ('employment_count_cbp_lag_yr', 0.0),
 ('restaurant_count_cbp_lag_yr', 0.0),
 ('business_500_count_cbp_lag_yr', 0.0),
 ('bar_count_cbp_lag_yr', 0.0),
 ('review_count_yr', 0.0),
 ('restaurant_review_count_yr', 0.0),
 ('bar_review_count_yr', 0.0),
 ('median_count