In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib

%matplotlib inline
np.random.seed(42)

In [2]:
train_df = pd.read_csv("~/real_estate/data/v4/final/yelp/yelp_train_df.csv")
test_df = pd.read_csv("~/real_estate/data/v4/final/yelp/yelp_test_df.csv")

features = train_df.columns[:-2]

X_train = train_df[features]
X_test = test_df[features]
y_train = train_df["target"]
y_test = test_df["target"]

In [3]:
rf = RandomForestRegressor(n_estimators=500, max_depth=3)

In [4]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [5]:
y_test_hat_1 = rf.predict(X_test)
y_train_hat_1 = rf.predict(X_train)
print("R^2 train: ", np.square(np.corrcoef(y_train, y_train_hat_1)[0,1]))
print("R^2 test: ", np.square(np.corrcoef(y_test, y_test_hat_1)[0,1]))

R^2 train:  0.402976964175269
R^2 test:  0.24371738611934468


In [7]:
joblib.dump(rf, '/home/gnazareths/real_estate/v4/models/rf_0215.joblib')

['/home/gnazareths/real_estate/v4/models/rf_0215.joblib']

In [8]:
feature_importances = [(i,j) for i,j in zip(features,rf.feature_importances_)]

In [9]:
sorted(feature_importances, key=lambda tup: tup[1])

[('90_day_treasury_bill_rate', 0.0),
 ('mean_mortgage_interest_hh_lag_log', 0.0),
 ('mean_rating', 0.0),
 ('month', 0.0),
 ('price_2_review_count', 0.0),
 ('coffee_review_count', 0.0),
 ('adult_count_lag_mo', 0.0),
 ('bar_review_count_mo', 0.0),
 ('mean_mortgage_interest_hh_lag_mo', 0.0),
 ('mean_fixed_mortgage_rate_mo', 0.0),
 ('mean_adjusted_income_with_dep_lag_mo', 0.0),
 ('restaurant_review_count_mo', 0.0),
 ('percent_increasing_mo', 0.0),
 ('90_day_treasury_bill_rate_mo', 0.0),
 ('total_nonfarm_payroll_mo', 0.0),
 ('mean_adjusted_income_lag_mo', 0.0),
 ('mean_real_estate_taxes_hh_lag_mo', 0.0),
 ('mean_hourly_earnings_adj_mo', 0.0),
 ('review_count_mo', 0.0),
 ('mean_rating_mo', 0.0),
 ('people_count_lag_mo', 0.0),
 ('price_3_review_count_mo', 0.0),
 ('coffee_review_count_mo', 0.0),
 ('price_2_review_count_mo', 0.0),
 ('coffee_review_count_yr', 0.0),
 ('price_3_review_count_yr', 0.0),
 ('price_2_review_count_yr', 0.0),
 ('zhvi_singlefam_yr', 0.0),
 ('restaurant_review_count_yr', 0