In [21]:
import os

import pandas as pd
import matplotlib.pyplot as plt

from helpers import *
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split

% matplotlib inline

In [22]:
# Import
# Load the v9 supervised-learning table from the user's home directory into `df`.
# Every later cell reads and reassigns this same `df`, so the cells must be run
# top to bottom from a fresh load.
df = pd.read_csv("~/real_estate/data/v9/supervised_df.csv")

In [23]:
# Base columns that each have a "<name>_mo" and "<name>_yr" companion column in
# the frame; the next step rewrites those companions as relative changes.
current_time_features = [
    "90_day_treasury_bill_rate",
    "adult_count_lag",
    "mean_adjusted_income_lag",
    "mean_adjusted_income_with_dep_lag",
    "mean_fixed_mortgage_rate",
    "mean_hourly_earnings_adj",
    "mean_mortgage_interest_hh_lag",
    "mean_real_estate_taxes_hh_lag",
    "median_sqft_value",
    "people_count_lag",
    "percent_decreasing",
    "percent_increasing",
    "total_nonfarm_payroll",
    "zhvi_singlefam",
    "zri_sqft_value",
    "zhvi_condo",
]

# Column the model is trained to predict.
target = "zhvi_condo_12_months"

In [24]:
# Keep only rows with no missing value in any column. `df` is rebound to the
# filtered frame, so the unfiltered data is only recoverable by re-running the
# load cell.
df = df.dropna()

In [25]:
# Overwrite every "<feature>_mo" / "<feature>_yr" column with the relative change
# of the current value against it: (current + 0.1) / (reference + 0.1) - 1.
# The same 0.1 offset is added to numerator and denominator.
# NOTE: this rewrites the reference columns in place, so running the cell twice
# would compute a change against an already-converted column.
for feature in current_time_features:
    shifted_current = df[feature] + 0.1
    # Compute both changes before writing either one back.
    changes = {
        feature + suffix: shifted_current / (df[feature + suffix] + 0.1) - 1
        for suffix in ("_mo", "_yr")
    }
    for column_name, change in changes.items():
        df[column_name] = change

# Express the target as a relative change against the current condo index
# (no 0.1 offset is applied here).
df[target] = df[target] / df["zhvi_condo"] - 1

In [26]:
# Sanity check: (rows, columns) of the frame after the missing-value filter and
# the relative-change conversion above.
print(df.shape)

(17256, 71)


In [27]:
# Level columns that are replaced by their log transform and renamed "<name>_log".
log_columns = ["median_sqft_value","zri_sqft_value","zhvi_condo","zhvi_singlefam","people_count_lag",
               "adult_count_lag","mean_adjusted_income_lag","mean_adjusted_income_with_dep_lag",
               "mean_real_estate_taxes_hh_lag","mean_mortgage_interest_hh_lag"]

# `log_with_zeros` is not defined in this notebook; presumably it comes from
# `from helpers import *` and tolerates zero entries -- confirm in helpers.
for column in log_columns:
    df[column] = log_with_zeros(df[column].values)

# Rename all transformed columns in a single pass. Calling df.rename inside the
# loop copied the whole frame once per column for the same end result.
# NOTE: after this cell the un-suffixed names no longer exist, so re-running the
# cell without reloading `df` raises KeyError.
df = df.rename(columns={column: column + "_log" for column in log_columns})

In [28]:
# Final column selection and order for the modelling frame.
# Order matters: the split cell takes df.columns[1:-1] as the feature set, so the
# identifier ('postal_code') must stay first and the target must stay last.
# NOTE(review): 'adult_count_lag' is log-transformed and renamed to
# 'adult_count_lag_log' earlier, but that column is not selected here (only its
# _mo/_yr changes are) -- confirm this omission is intentional.
columns = ['postal_code',
           # Current-period values ("_log" columns were log-transformed above),
           # plus the calendar fields 'month' and 'year'.
           '90_day_treasury_bill_rate',
           'mean_adjusted_income_lag_log', 
           'mean_adjusted_income_with_dep_lag_log', 
           'mean_fixed_mortgage_rate', 
           'mean_hourly_earnings_adj', 
           'mean_mortgage_interest_hh_lag_log',
           'mean_real_estate_taxes_hh_lag_log', 
           'median_sqft_value_log', 
           'month', 
           'people_count_lag_log', 
           'percent_decreasing', 
           'percent_increasing', 
           'total_nonfarm_payroll', 
           'year', 
           'zhvi_condo_log', 
           'zhvi_singlefam_log', 
           'zri_sqft_value_log', 
           # Relative change against the "_mo" reference column
           # (presumably the value one month earlier -- confirm upstream).
           'adult_count_lag_mo', 
           'mean_mortgage_interest_hh_lag_mo', 
           'mean_fixed_mortgage_rate_mo', 
           'percent_decreasing_mo', 
           'zhvi_singlefam_mo', 
           'mean_adjusted_income_with_dep_lag_mo', 
           'zri_sqft_value_mo', 
           'percent_increasing_mo', 
           '90_day_treasury_bill_rate_mo', 
           'total_nonfarm_payroll_mo', 
           'mean_adjusted_income_lag_mo', 
           'mean_real_estate_taxes_hh_lag_mo', 
           'mean_hourly_earnings_adj_mo', 
           'people_count_lag_mo', 
           'zhvi_condo_mo', 
           'median_sqft_value_mo', 
           # Relative change against the "_yr" reference column
           # (presumably the value one year earlier -- confirm upstream).
           'percent_decreasing_yr', 
           'percent_increasing_yr', 
           'adult_count_lag_yr', 
           'mean_fixed_mortgage_rate_yr', 
           'mean_adjusted_income_lag_yr', 
           'zhvi_singlefam_yr', 
           'zri_sqft_value_yr', 
           'zhvi_condo_yr', 
           'mean_mortgage_interest_hh_lag_yr', 
           'total_nonfarm_payroll_yr', 
           'people_count_lag_yr', 
           'mean_hourly_earnings_adj_yr', 
           'median_sqft_value_yr', 
           '90_day_treasury_bill_rate_yr', 
           'mean_real_estate_taxes_hh_lag_yr', 
           'mean_adjusted_income_with_dep_lag_yr', 
           # Target: relative change of the 12-month value against current zhvi_condo.
           'zhvi_condo_12_months']

# Drop every column not listed above and apply the listed order.
df = df[columns]

In [31]:
# Column to predict, and the model inputs: every column except the first
# (postal_code) and the last (the target), taken by position.
target = "zhvi_condo_12_months"
features = df.columns[1:-1]

# Time-based hold-out: years before 2017 train the model, 2017 is the test set.
# Rows with a year after 2017, if any exist, end up in neither split.
in_training_years = df["year"] < 2017
in_test_year = df["year"] == 2017
train_df = df.loc[in_training_years]
test_df = df.loc[in_test_year]

In [32]:
# Preview the first rows of the selected and reordered frame (before scaling).
df.head()

Unnamed: 0,postal_code,90_day_treasury_bill_rate,mean_adjusted_income_lag_log,mean_adjusted_income_with_dep_lag_log,mean_fixed_mortgage_rate,mean_hourly_earnings_adj,mean_mortgage_interest_hh_lag_log,mean_real_estate_taxes_hh_lag_log,median_sqft_value_log,month,...,zhvi_condo_yr,mean_mortgage_interest_hh_lag_yr,total_nonfarm_payroll_yr,people_count_lag_yr,mean_hourly_earnings_adj_yr,median_sqft_value_yr,90_day_treasury_bill_rate_yr,mean_real_estate_taxes_hh_lag_yr,mean_adjusted_income_with_dep_lag_yr,zhvi_condo_12_months
128,8054,0.01,4.016298,3.687483,3.9925,23.19,1.591174,1.23395,4.859812,11,...,-0.076402,-0.038169,0.015004,0.118929,0.019702,-0.06517,-0.541667,0.039071,-0.141369,-0.086057
129,8054,0.01,4.016298,3.687483,3.958,23.23,1.591174,1.23395,4.859812,12,...,-0.070625,-0.038169,0.015974,0.118929,0.020114,-0.051433,-0.541667,0.039071,-0.141369,-0.075319
130,8054,0.03,4.05328,3.726045,3.915,23.26,1.52845,1.2919,4.85203,1,...,-0.073232,-0.059583,0.018292,0.004247,0.017422,-0.051813,-0.48,0.057974,0.039217,-0.063351
131,8054,0.09,4.05328,3.726045,3.89,23.28,1.52845,1.2919,4.844187,2,...,-0.082911,-0.059583,0.018656,0.004247,0.017849,-0.0522,-0.173913,0.057974,0.039217,-0.057281
132,8054,0.08,4.05328,3.726045,3.954,23.36,1.52845,1.2919,4.844187,3,...,-0.094125,-0.059583,0.018696,0.004247,0.021332,-0.0522,-0.1,0.057974,0.039217,-0.051604


In [33]:
# MinMax scale df
def _scaled_split(split_df, fitted_scaler, feature_columns, target_column):
    """Return a new frame with scaled features plus unscaled target and postal code.

    The feature columns of `split_df` are passed through `fitted_scaler.transform`;
    the target is stored under the column name "target" and "postal_code" is
    appended last. The result carries a fresh default index.
    """
    scaled_values = fitted_scaler.transform(split_df[feature_columns].values)
    scaled_split = pd.DataFrame(scaled_values, columns=feature_columns)
    scaled_split["target"] = split_df[target_column].values
    scaled_split["postal_code"] = split_df["postal_code"].values
    return scaled_split


# The scaler is fitted on the training rows only; the test rows are transformed
# with the training ranges.
scaler = MinMaxScaler()
scaler.fit(train_df[features].values)

train_df_scaled = _scaled_split(train_df, scaler, features, target)
test_df_scaled = _scaled_split(test_df, scaler, features, target)

In [34]:
# Export
# Write the scaled train/test frames and the fitted scaler to the v9 data
# directory under the current user's home.
train_df_scaled.to_csv("~/real_estate/data/v9/train_df.csv",index=False)
test_df_scaled.to_csv("~/real_estate/data/v9/test_df.csv",index=False)
# Expand "~" explicitly so the scaler is saved next to the CSVs for whichever user
# runs the notebook, instead of relying on one hard-coded absolute home directory.
joblib.dump(scaler, os.path.expanduser("~/real_estate/data/v9/scaler.pkl"))

['/home/gnazareths/real_estate/data/v9/scaler.pkl']