In [1]:
import numpy as np
import pandas as pd
import pickle

from helpers import *

In [2]:
# Read the source table. df_ is kept untouched for later reference; df is the working copy.
df_ = pd.read_csv("~/real_estate/data/v4/unsupervised/yelp_unsupervised_df.csv")
df = df_.copy(deep=True)


def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Previously saved scaler plus one svr_* predictor per column whose nulls get filled below
scaler = _unpickle("../../real_estate/data/v6/predicting_nulls/scaler.pickle")
svr_pct_increasing = _unpickle("../../real_estate/data/v6/predicting_nulls/svr_pct_increasing.pickle")
svr_pct_decreasing = _unpickle("../../real_estate/data/v6/predicting_nulls/svr_pct_decreasing.pickle")
svr_median_sqft_value = _unpickle("../../real_estate/data/v6/predicting_nulls/svr_median_sqft_value.pickle")
svr_zhvi_singlefam = _unpickle("../../real_estate/data/v6/predicting_nulls/svr_zhvi_singlefam.pickle")
svr_zri_sqft_value = _unpickle("../../real_estate/data/v6/predicting_nulls/svr_zri_sqft_value.pickle")

In [3]:
# Log-transform the columns listed in log_columns, then apply the pre-fitted scaler.
log_columns = ["zhvi_condo","people_count_lag","adult_count_lag","mean_adjusted_income_lag",
               "mean_adjusted_income_with_dep_lag","mean_real_estate_taxes_hh_lag","mean_mortgage_interest_hh_lag",
               "zhvi_singlefam","median_sqft_value","zri_sqft_value"]

# Rebuild the working frame from the untouched df_ so that re-running this cell
# never applies the log transform a second time.
df = df_.copy(deep=True)
for col in log_columns:
    # Whole-column assignment replaces the column (and its dtype) outright;
    # .loc[:, col] = ... would instead write into the existing column in place,
    # which newer pandas rejects/warns about if the column is stored as integer.
    df[col] = log_with_zeros(df_[col].values)

# Column order passed to the scaler.
# NOTE(review): presumably this must match the order the scaler was fitted with - confirm.
scaling_columns = ['median_sqft_value','percent_decreasing','percent_increasing','zri_sqft_value','zhvi_condo',
                   'zhvi_singlefam','month','year','mean_rating','bar_review_count','restaurant_review_count',
                   'coffee_review_count','price_3_review_count','price_2_review_count','review_count',
                   'dependent_count_lag','sum_adjusted_income_lag','sum_real_estate_taxes_lag',
                   'sum_mortgage_interest_paid_lag','tax_return_count_lag','joint_return_count_lag','adult_count_lag',
                   'people_count_lag','mean_adjusted_income_lag','mean_real_estate_taxes_hh_lag',
                   'mean_mortgage_interest_hh_lag','mean_adjusted_income_with_dep_lag','90_day_treasury_bill_rate',
                   'mean_hourly_earnings_adj', 'mean_fixed_mortgage_rate','total_nonfarm_payroll']

# Scale the log-transformed frame; the result gets a fresh default (0..n-1) index
df_scaled = scaler.transform(df[scaling_columns].values)
df_scaled = pd.DataFrame(df_scaled, columns=scaling_columns)

In [4]:
# For each of the five columns we predict, remember which rows of df_scaled are null.
# Keys are the short names used for the matching svr_* predictors.
null_indices = {
    key: df_scaled.index[df_scaled[column].isnull().values]
    for key, column in [
        ("pct_increasing", "percent_increasing"),
        ("pct_decreasing", "percent_decreasing"),
        ("median_sqft_value", "median_sqft_value"),
        ("zhvi_singlefam", "zhvi_singlefam"),
        ("zri_sqft_value", "zri_sqft_value"),
    ]
}

In [5]:
# Feature columns handed to every svr_* predictor
columns_pred = [
    "90_day_treasury_bill_rate","adult_count_lag","bar_review_count","coffee_review_count", 
    "mean_adjusted_income_lag","mean_adjusted_income_with_dep_lag","mean_fixed_mortgage_rate",
    "mean_hourly_earnings_adj","mean_mortgage_interest_hh_lag","mean_rating","mean_real_estate_taxes_hh_lag",
    "people_count_lag","price_2_review_count","price_3_review_count","restaurant_review_count","review_count", 
    "total_nonfarm_payroll","zhvi_condo"]


def _predict_missing(model, target_key):
    """Predict the scaled target for the rows recorded in ``null_indices[target_key]``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    target_key : str
        Key into ``null_indices`` selecting the rows whose target is null.

    Returns
    -------
    numpy.ndarray
        One prediction per null row; an empty array when there are no null rows.
    """
    features = df_scaled.loc[null_indices[target_key], columns_pred].values
    if features.shape[0] == 0:
        # scikit-learn estimators raise ValueError on a zero-row input,
        # so short-circuit when this target has nothing to fill.
        return np.empty(0)
    return model.predict(features)


pct_increasing_pred = _predict_missing(svr_pct_increasing, "pct_increasing")
pct_decreasing_pred = _predict_missing(svr_pct_decreasing, "pct_decreasing")
median_sqft_value_pred = _predict_missing(svr_median_sqft_value, "median_sqft_value")
zhvi_singlefam_pred = _predict_missing(svr_zhvi_singlefam, "zhvi_singlefam")
zri_sqft_value_pred = _predict_missing(svr_zri_sqft_value, "zri_sqft_value")

In [6]:
# Fill the null targets on a copy, leaving df_scaled (with its nulls) intact
df_scaled_ = df_scaled.copy(deep=True)

# (null_indices key, column to fill, predictions for those rows)
for key, column, predictions in (
    ("pct_increasing", "percent_increasing", pct_increasing_pred),
    ("pct_decreasing", "percent_decreasing", pct_decreasing_pred),
    ("median_sqft_value", "median_sqft_value", median_sqft_value_pred),
    ("zhvi_singlefam", "zhvi_singlefam", zhvi_singlefam_pred),
    ("zri_sqft_value", "zri_sqft_value", zri_sqft_value_pred),
):
    df_scaled_.loc[null_indices[key], column] = predictions

In [7]:
# Undo the scaler (minmax) to get back to log-space values
df_rescaled = pd.DataFrame(scaler.inverse_transform(df_scaled_), columns=scaling_columns)

# Undo the log transform on the same columns that were logged before scaling
log_columns_ = ["zhvi_condo","people_count_lag","adult_count_lag","mean_adjusted_income_lag",
               "mean_adjusted_income_with_dep_lag","mean_real_estate_taxes_hh_lag","mean_mortgage_interest_hh_lag",
               "zhvi_singlefam","median_sqft_value","zri_sqft_value"]

# NOTE(review): np.exp(0) == 1, so if log_with_zeros leaves zeros as zeros they will
# come back as 1 here rather than 0 - confirm against helpers.log_with_zeros.
df_rescaled[log_columns_] = np.exp(df_rescaled[log_columns_].values)

In [8]:
# Re-attach postal_code from the untouched source frame; rows are matched by position
df_rescaled = df_rescaled.assign(postal_code=df_["postal_code"].values)

In [10]:
# Write the filled-in table to disk without the index column
export_path = "~/real_estate/data/v6/semisupervised_df.csv"
df_rescaled.to_csv(export_path, index=False)

# Preview the first rows of what was written
df_rescaled.head()

Unnamed: 0,median_sqft_value,percent_decreasing,percent_increasing,zri_sqft_value,zhvi_condo,zhvi_singlefam,month,year,mean_rating,bar_review_count,...,people_count_lag,mean_adjusted_income_lag,mean_real_estate_taxes_hh_lag,mean_mortgage_interest_hh_lag,mean_adjusted_income_with_dep_lag,90_day_treasury_bill_rate,mean_hourly_earnings_adj,mean_fixed_mortgage_rate,total_nonfarm_payroll,postal_code
0,582.0,16.79,75.38,1.979505,534000.0,726474.826774,1.0,2010.0,5.0,0.0,...,4145.0,232.582764,4.039578,5.202903,210.025883,0.06,22.42,5.03,129799.0,2109
1,583.0,12.27,81.85,1.961174,536700.0,718840.207313,2.0,2010.0,5.0,0.0,...,4145.0,232.582764,4.039578,5.202903,210.025883,0.11,22.46,4.99,129726.0,2109
2,585.0,12.35,82.61,1.947382,533200.0,713926.511719,3.0,2010.0,5.0,0.0,...,4145.0,232.582764,4.039578,5.202903,210.025883,0.15,22.46,4.9675,129919.0,2109
3,582.0,12.12,82.23,1.961595,526600.0,730730.799716,4.0,2010.0,5.0,0.0,...,4145.0,232.582764,4.039578,5.202903,210.025883,0.16,22.5,5.098,130140.0,2109
4,578.0,17.09,74.17,1.936078,523400.0,714472.362867,5.0,2010.0,5.0,0.0,...,4145.0,232.582764,4.039578,5.202903,210.025883,0.16,22.53,4.8875,130662.0,2109
