In [1]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt

from helpers import *
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.externals import joblib

% matplotlib inline

In [2]:
# Load data
df = pd.read_csv("~/real_estate/data/v4/unsupervised/yelp_unsupervised_df.csv")

# Remove null values
df_full = df.dropna()

In [3]:
# Scale columns
log_columns = ["median_sqft_value","zri_sqft_value","zhvi_condo","zhvi_singlefam","people_count_lag",
               "adult_count_lag","mean_adjusted_income_lag","mean_adjusted_income_with_dep_lag",
               "mean_real_estate_taxes_hh_lag","mean_mortgage_interest_hh_lag"]

for i in log_columns:
    df_full.loc[:,i] = log_with_zeros(df_full.loc[:,i].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [4]:
# Get train and test sets
train_df, test_df = train_test_split(df_full, test_size=.2)

In [5]:
# Store columns to perform MinMax on
scaling_columns = ['median_sqft_value','percent_decreasing','percent_increasing','zri_sqft_value','zhvi_condo',
                   'zhvi_singlefam','month','year','mean_rating','bar_review_count','restaurant_review_count',
                   'coffee_review_count','price_3_review_count','price_2_review_count','review_count',
                   'dependent_count_lag','sum_adjusted_income_lag','sum_real_estate_taxes_lag',
                   'sum_mortgage_interest_paid_lag','tax_return_count_lag','joint_return_count_lag','adult_count_lag',
                   'people_count_lag','mean_adjusted_income_lag','mean_real_estate_taxes_hh_lag',
                   'mean_mortgage_interest_hh_lag','mean_adjusted_income_with_dep_lag','90_day_treasury_bill_rate',
                   'mean_hourly_earnings_adj', 'mean_fixed_mortgage_rate','total_nonfarm_payroll']

# MinMax scale df
scaler = MinMaxScaler()
scaler.fit(train_df[scaling_columns].values)

train_df_scaled = scaler.transform(train_df[scaling_columns].values)
train_df_scaled = pd.DataFrame(train_df_scaled, columns=scaling_columns)

test_df_scaled = scaler.transform(test_df[scaling_columns].values)
test_df_scaled = pd.DataFrame(test_df_scaled, columns=scaling_columns)

df_full_scaled = scaler.transform(df_full[scaling_columns].values)
df_full_scaled = pd.DataFrame(df_full_scaled, columns=scaling_columns)

In [6]:
# Store columns for individual regressions
pred_columns = [
    "90_day_treasury_bill_rate","adult_count_lag","bar_review_count","coffee_review_count", 
    "mean_adjusted_income_lag","mean_adjusted_income_with_dep_lag","mean_fixed_mortgage_rate",
    "mean_hourly_earnings_adj","mean_mortgage_interest_hh_lag","mean_rating","mean_real_estate_taxes_hh_lag",
    "people_count_lag","price_2_review_count","price_3_review_count","restaurant_review_count","review_count", 
    "total_nonfarm_payroll","zhvi_condo"]

# Percent increasing
X_pct_increasing_train = train_df_scaled[pred_columns].values
y_pct_increasing_train = train_df_scaled["percent_increasing"].values
X_pct_increasing_test = test_df_scaled[pred_columns].values
y_pct_increasing_test = test_df_scaled["percent_increasing"].values

# Median sqft value
X_median_sqft_value_train = train_df_scaled[pred_columns].values
y_median_sqft_value_train = train_df_scaled["median_sqft_value"].values
X_median_sqft_value_test = test_df_scaled[pred_columns].values
y_median_sqft_value_test = test_df_scaled["median_sqft_value"].values

# ZHVI singlefam
X_zhvi_singlefam_train = train_df_scaled[pred_columns].values
y_zhvi_singlefam_train = train_df_scaled["zhvi_singlefam"].values
X_zhvi_singlefam_test = test_df_scaled[pred_columns].values
y_zhvi_singlefam_test = test_df_scaled["zhvi_singlefam"].values

# ZRI sqft value
X_zri_sqft_value_train = train_df_scaled[pred_columns].values
y_zri_sqft_value_train = train_df_scaled["zri_sqft_value"].values
X_zri_sqft_value_test = test_df_scaled[pred_columns].values
y_zri_sqft_value_test = test_df_scaled["zri_sqft_value"].values

### Evaluate Predictive Power over percent_increasing

In [7]:
# Fit
svr_pct_increasing = SVR(gamma='auto')
svr_pct_increasing.fit(X_pct_increasing_train, y_pct_increasing_train)

# Predict
yhat_pct_increasing_train = svr_pct_increasing.predict(X_pct_increasing_train)
yhat_pct_increasing_test = svr_pct_increasing.predict(X_pct_increasing_test)

# Fetch errors
train_errors_pct_increasing = yhat_pct_increasing_train - y_pct_increasing_train
test_errors_pct_increasing = yhat_pct_increasing_test - y_pct_increasing_test

In [8]:
print("Mean test error: ", np.mean(abs(test_errors_pct_increasing)))
print("Mean test value: ", np.mean(abs(y_pct_increasing_test)))
print("R^2 test: ", np.square(np.corrcoef(yhat_pct_increasing_test, y_pct_increasing_test)[0,1]))
print('---')
print("Mean train error: ", np.mean(abs(train_errors_pct_increasing)))
print("Mean train value: ", np.mean(abs(y_pct_increasing_train)))
print("R^2 train: ", np.square(np.corrcoef(yhat_pct_increasing_train, y_pct_increasing_train)[0,1]))

Mean test error:  0.1286002808739331
Mean test value:  0.6543375944303251
R^2 test:  0.6100134406213807
---
Mean train error:  0.1267129616692024
Mean train value:  0.6564860304826241
R^2 train:  0.6148334503411433


### Evaluate Predictive Power over median_sqft_value

In [9]:
# Fit
svr_median_sqft_value = SVR(gamma='auto')
svr_median_sqft_value.fit(X_median_sqft_value_train, y_median_sqft_value_train)

# Predict
yhat_median_sqft_value_train = svr_median_sqft_value.predict(X_median_sqft_value_train)
yhat_median_sqft_value_test = svr_median_sqft_value.predict(X_median_sqft_value_test)

# Fetch errors
train_errors_median_sqft_value = yhat_median_sqft_value_train - y_median_sqft_value_train
test_errors_median_sqft_value = yhat_median_sqft_value_test - y_median_sqft_value_test

In [10]:
print("Mean test error: ", np.mean(abs(test_errors_median_sqft_value)))
print("Mean test value: ", np.mean(abs(y_median_sqft_value_test)))
print("R^2 test: ", np.square(np.corrcoef(yhat_median_sqft_value_test, y_median_sqft_value_test)[0,1]))
print('---')
print("Mean train error: ", np.mean(abs(train_errors_median_sqft_value)))
print("Mean train value: ", np.mean(abs(y_median_sqft_value_train)))
print("R^2 train: ", np.square(np.corrcoef(yhat_median_sqft_value_train, y_median_sqft_value_train)[0,1]))

Mean test error:  0.04591374789875634
Mean test value:  0.32311844686105073
R^2 test:  0.7318964899304715
---
Mean train error:  0.04546816436794125
Mean train value:  0.32155248393150637
R^2 train:  0.7179956334282837


### Evaluate Predictive Power over zhvi_singlefam

In [11]:
# Fit
svr_zhvi_singlefam = SVR(gamma='auto')
svr_zhvi_singlefam.fit(X_zhvi_singlefam_train, y_zhvi_singlefam_train)

# Predict
yhat_zhvi_singlefam_train = svr_zhvi_singlefam.predict(X_zhvi_singlefam_train)
yhat_zhvi_singlefam_test = svr_zhvi_singlefam.predict(X_zhvi_singlefam_test)

# Fetch errors
train_errors_zhvi_singlefam = yhat_zhvi_singlefam_train - y_zhvi_singlefam_train
test_errors_zhvi_singlefam = yhat_zhvi_singlefam_test - y_zhvi_singlefam_test

In [12]:
print("Mean test error: ", np.mean(abs(test_errors_zhvi_singlefam)))
print("Mean test value: ", np.mean(abs(y_zhvi_singlefam_test)))
print("R^2 test: ", np.square(np.corrcoef(yhat_zhvi_singlefam_test, y_zhvi_singlefam_test)[0,1]))
print('---')
print("Mean train error: ", np.mean(abs(train_errors_zhvi_singlefam)))
print("Mean train value: ", np.mean(abs(y_zhvi_singlefam_train)))
print("R^2 train: ", np.square(np.corrcoef(yhat_zhvi_singlefam_train, y_zhvi_singlefam_train)[0,1]))

Mean test error:  0.037838468171962306
Mean test value:  0.2908709207475014
R^2 test:  0.8240981811673462
---
Mean train error:  0.037997465991255364
Mean train value:  0.28967308486291954
R^2 train:  0.8097365499641588


### Evaluate Predictive Power over zri_sqft_value

In [13]:
# Fit
svr_zri_sqft_value = SVR(gamma='auto')
svr_zri_sqft_value.fit(X_zri_sqft_value_train, y_zri_sqft_value_train)

# Predict
yhat_zri_sqft_value_train = svr_zri_sqft_value.predict(X_zri_sqft_value_train)
yhat_zri_sqft_value_test = svr_zri_sqft_value.predict(X_zri_sqft_value_test)

# Fetch errors
train_errors_zri_sqft_value = yhat_zri_sqft_value_train - y_zri_sqft_value_train
test_errors_zri_sqft_value = yhat_zri_sqft_value_test - y_zri_sqft_value_test

In [14]:
print("Mean test error: ", np.mean(abs(test_errors_zri_sqft_value)))
print("Mean test value: ", np.mean(abs(y_zri_sqft_value_test)))
print("R^2 test: ", np.square(np.corrcoef(yhat_zri_sqft_value_test, y_zri_sqft_value_test)[0,1]))
print('---')
print("Mean train error: ", np.mean(abs(train_errors_zri_sqft_value)))
print("Mean train value: ", np.mean(abs(y_zri_sqft_value_train)))
print("R^2 train: ", np.square(np.corrcoef(yhat_zri_sqft_value_train, y_zri_sqft_value_train)[0,1]))

Mean test error:  0.051165655990302907
Mean test value:  0.22227839266298577
R^2 test:  0.6479689076875486
---
Mean train error:  0.05089579684138086
Mean train value:  0.22031979535099228
R^2 train:  0.629714240852214


### Training Predictors with Full Dataset

In [18]:
# Create SVRs
svr_pct_increasing_ = SVR(gamma='auto')
svr_pct_decreasing_ = SVR(gamma='auto')
svr_median_sqft_value_ = SVR(gamma='auto')
svr_zhvi_singlefam_ = SVR(gamma='auto')
svr_zri_sqft_value_ = SVR(gamma='auto')

# Fit
svr_pct_increasing_.fit(df_full_scaled[pred_columns], df_full_scaled["percent_increasing"])
svr_pct_decreasing_.fit(df_full_scaled[pred_columns], df_full_scaled["percent_decreasing"])
svr_median_sqft_value_.fit(df_full_scaled[pred_columns], df_full_scaled["median_sqft_value"])
svr_zhvi_singlefam_.fit(df_full_scaled[pred_columns], df_full_scaled["zhvi_singlefam"])
svr_zri_sqft_value_.fit(df_full_scaled[pred_columns], df_full_scaled["zri_sqft_value"])

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

### Exporting predictors

In [19]:
with open("../../real_estate/data/v6/predicting_nulls/scaler.pickle", "wb") as output_file:
    pickle.dump(scaler, output_file)

In [20]:
with open("../../real_estate/data/v6/predicting_nulls/svr_pct_increasing.pickle", "wb") as output_file:
    pickle.dump(svr_pct_increasing_, output_file)

In [21]:
with open("../../real_estate/data/v6/predicting_nulls/svr_pct_decreasing.pickle", "wb") as output_file:
    pickle.dump(svr_pct_decreasing_, output_file)

In [22]:
with open("../../real_estate/data/v6/predicting_nulls/svr_median_sqft_value.pickle", "wb") as output_file:
    pickle.dump(svr_median_sqft_value_, output_file)

In [23]:
with open("../../real_estate/data/v6/predicting_nulls/svr_zhvi_singlefam.pickle", "wb") as output_file:
    pickle.dump(svr_zhvi_singlefam_, output_file)

In [24]:
with open("../../real_estate/data/v6/predicting_nulls/svr_zri_sqft_value.pickle", "wb") as output_file:
    pickle.dump(svr_zri_sqft_value_, output_file)