In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib

%matplotlib inline
np.random.seed(42)

In [2]:
# Load the training and test tables from their CSV files.
train_df = pd.read_csv("~/real_estate/data/v4/final/yelp/yelp_train_df.csv")
test_df = pd.read_csv("~/real_estate/data/v4/final/yelp/yelp_test_df.csv")

# Predictors are every column of the training table except its final two.
# NOTE(review): assumes "target" is one of those two trailing columns, otherwise
# it would leak into the predictors -- confirm against the CSV layout.
features = train_df.columns[:-2]

# The same predictor columns are selected from both tables; the response is
# the "target" column of each.
X_train, X_test = train_df[features], test_df[features]
y_train, y_test = train_df["target"], test_df["target"]

In [3]:
# Random forest of 500 trees, each limited to a depth of 3.
# random_state is pinned on the estimator itself so that refitting gives the
# same forest no matter how many times, or in what order, cells have been run;
# without it the fit draws from whatever state the global NumPy RNG is in.
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=3,
    random_state=42,
)

In [None]:
# Fit the forest on the training predictors and target.
rf.fit(X=X_train, y=y_train)

In [5]:
# Predict on both splits with the fitted forest.
y_test_hat_1 = rf.predict(X_test)
y_train_hat_1 = rf.predict(X_train)

# The printed figure is the squared Pearson correlation between actual and
# predicted values (the off-diagonal entry of the 2x2 correlation matrix).
# NOTE(review): this need not equal the coefficient of determination that
# rf.score() reports -- confirm which metric is intended.
train_corr = np.corrcoef(y_train, y_train_hat_1)[0, 1]
print("R^2 train: ", np.square(train_corr))
test_corr = np.corrcoef(y_test, y_test_hat_1)[0, 1]
print("R^2 test: ", np.square(test_corr))

R^2 train:  0.2601728031010158
R^2 test:  0.09313710894243785


In [9]:
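# Saving the fitted model is currently disabled. The output recorded below
# appears to come from a run made while this call was still active.
# Uncomment to write the model to this path again:
# joblib.dump(rf, '/home/gnazareths/real_estate/v11/models/rf_0215.joblib')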

['/home/gnazareths/real_estate/v11/models/rf_0215.joblib']

In [10]:
# Pair each predictor name with the importance the fitted forest assigned it.
feature_importances = list(zip(features, rf.feature_importances_))

In [11]:
# Show the (name, importance) pairs in ascending order of importance.
sorted(feature_importances, key=lambda pair: pair[1])

[('year', 0.0),
 ('month', 0.0),
 ('90_day_treasury_bill_rate', 0.0),
 ('mean_adjusted_income_lag_log', 0.0),
 ('mean_fixed_mortgage_rate', 0.0),
 ('mean_mortgage_interest_hh_lag_log', 0.0),
 ('median_sqft_value_log', 0.0),
 ('people_count_lag_log', 0.0),
 ('percent_decreasing', 0.0),
 ('zhvi_condo_log', 0.0),
 ('zhvi_singlefam_log', 0.0),
 ('zri_sqft_value_log', 0.0),
 ('business_count_cbp_lag_log', 0.0),
 ('business_500_count_cbp_lag', 0.0),
 ('bar_count_cbp_lag', 0.0),
 ('mean_county_zhvi_condo_log', 0.0),
 ('median_county_zhvi_condo_log', 0.0),
 ('mean_county_sqft_value_log', 0.0),
 ('median_county_sqft_value_log', 0.0),
 ('mean_fixed_mortgage_rate_mo', 0.0),
 ('percent_decreasing_mo', 0.0),
 ('zhvi_singlefam_mo', 0.0),
 ('zri_sqft_value_mo', 0.0),
 ('percent_increasing_mo', 0.0),
 ('90_day_treasury_bill_rate_mo', 0.0),
 ('total_nonfarm_payroll_mo', 0.0),
 ('mean_hourly_earnings_adj_mo', 0.0),
 ('median_sqft_value_mo', 0.0),
 ('percent_increasing_yr', 0.0),
 ('adult_count_lag_yr', 