In [5]:
import os

import pandas as pd
import matplotlib.pyplot as plt

from helpers import *
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split

% matplotlib inline

In [6]:
# Import
# Read the two v9 input tables: the supervised modelling frame and the
# postal-code lookup that is later joined on "postal_code" to obtain "county".
supervised_csv = "~/real_estate/data/v9/supervised_df.csv"
zipcode_csv = "~/real_estate/data/v9/zipcode.csv"

df = pd.read_csv(supervised_csv)
postal_code_df = pd.read_csv(zipcode_csv)

In [8]:
# Base features that have a "<name>_yr" companion column in df.
# The order is kept stable because later cells iterate over this list.
year_features = [
    "90_day_treasury_bill_rate",
    "adult_count_lag",
    "mean_adjusted_income_lag",
    "mean_adjusted_income_with_dep_lag",
    "mean_fixed_mortgage_rate",
    "mean_hourly_earnings_adj",
    "mean_mortgage_interest_hh_lag",
    "mean_real_estate_taxes_hh_lag",
    "median_sqft_value",
    "people_count_lag",
    "business_count_cbp_lag",
    "employment_count_cbp_lag",
    "business_100_count_cbp_lag",
    "business_500_count_cbp_lag",
    "restaurant_count_cbp_lag",
    "bar_count_cbp_lag",
    "percent_decreasing",
    "percent_increasing",
    "total_nonfarm_payroll",
    "zhvi_singlefam",
    "zri_sqft_value",
    "zhvi_condo",
]

# Base features that have a "<name>_mo" companion column in df.
month_features = [
    "90_day_treasury_bill_rate",
    "mean_fixed_mortgage_rate",
    "mean_hourly_earnings_adj",
    "median_sqft_value",
    "percent_decreasing",
    "percent_increasing",
    "total_nonfarm_payroll",
    "zhvi_singlefam",
    "zri_sqft_value",
    "zhvi_condo",
]

# Name of the column used as the prediction target.
target = "zhvi_condo_12_months"

In [9]:
# Drop every row that contains at least one missing value and report the
# frame's shape before and after, so the amount of discarded data is visible.
shape_before = df.shape
df = df.dropna()
print(shape_before, df.shape)

(247170, 71) (144788, 71)


In [10]:
# Overwrite each "<feature>_yr" / "<feature>_mo" companion column with
#     (feature + 0.1) / (companion + 0.1) - 1
# i.e. the relative change of the feature versus the value the companion
# column held on entry. The same 0.1 offset is added to numerator and
# denominator (presumably to keep zero denominators finite -- TODO confirm).
# NOTE(review): the companion columns are both input and output here, so
# re-running this cell without reloading df feeds the ratios back in.
for suffix, base_features in (("_yr", year_features), ("_mo", month_features)):
    for feature in base_features:
        companion = feature + suffix
        df[companion] = (df[feature] + 0.1) / (df[companion] + 0.1) - 1

# Re-express the target as a relative change against the current zhvi_condo
# value (no offset is applied on this line).
df[target] = df[target] / df["zhvi_condo"] - 1

In [6]:
# Add county
# Left join: every row of df is kept; rows whose postal_code has no match in
# postal_code_df end up with a missing county.
df = df.merge(postal_code_df[["postal_code", "county"]], on=["postal_code"], how="left")

# Merge
# Per (county, state, year, month) group, compute the median and the mean of
# the nine columns below and attach them back to every row of that group.
county_keys = ["county", "state", "year", "month"]
county_value_columns = [
    "zhvi_condo", "zhvi_condo_yr", "zhvi_condo_mo",
    "median_sqft_value", "median_sqft_value_mo", "median_sqft_value_yr",
    "mean_hourly_earnings_adj", "mean_hourly_earnings_adj_mo", "mean_hourly_earnings_adj_yr",
]
# Source column -> stem used in the aggregated column name,
# e.g. "median_sqft_value_mo" becomes "<stat>_county_sqft_value_mo".
county_stems = {
    "zhvi_condo": "zhvi_condo",
    "median_sqft_value": "sqft_value",
    "mean_hourly_earnings_adj": "hourly_earnings_adj",
}


def _county_aggregate(frame, statistic):
    """Aggregate the county value columns of ``frame`` with ``statistic``.

    ``statistic`` is the aggregation name handed to ``GroupBy.agg`` ("median"
    or "mean" in this notebook). The result has one row per combination of
    ``county_keys`` and its value columns are renamed to
    ``<statistic>_county_<stem><suffix>``.
    """
    renames = {}
    for source, stem in county_stems.items():
        for suffix in ("", "_mo", "_yr"):
            renames[source + suffix] = statistic + "_county_" + stem + suffix
    grouped = frame[county_keys + county_value_columns].groupby(by=county_keys, as_index=False)
    return grouped.agg(statistic).rename(columns=renames)


# Both aggregates are computed from the same df before either is merged in.
df_median = _county_aggregate(df, "median")
df_mean = _county_aggregate(df, "mean")

df = df.merge(df_median, on=county_keys, how="left")
df = df.merge(df_mean, on=county_keys, how="left")

In [7]:
# Columns whose values are replaced by log_with_zeros(...) and which are then
# renamed to "<name>_log".
log_columns = ["median_sqft_value","zri_sqft_value","zhvi_condo","zhvi_singlefam","people_count_lag",
               "adult_count_lag","mean_adjusted_income_lag","mean_adjusted_income_with_dep_lag",
               "mean_real_estate_taxes_hh_lag","mean_mortgage_interest_hh_lag","business_count_cbp_lag",
               "business_100_count_cbp_lag","restaurant_count_cbp_lag","employment_count_cbp_lag",
               "median_county_zhvi_condo","mean_county_zhvi_condo","mean_county_sqft_value",
               "median_county_sqft_value"]

# Columns whose values are capped at an upper bound of 5.
bound_columns = ["business_100_count_cbp_lag_yr","business_count_cbp_lag_yr","business_500_count_cbp_lag_yr",
                 "restaurant_count_cbp_lag_yr","bar_count_cbp_lag_yr","employment_count_cbp_lag_yr"]

# log_with_zeros comes from the `helpers` star import; presumably a log
# transform that tolerates zeros -- TODO confirm against helpers.py.
for column in log_columns:
    df[column] = log_with_zeros(df[column].values)
# Rename once after the loop: renaming inside the loop copied the whole frame
# on every iteration and yields the same columns in the same positions.
df = df.rename(columns={column: column + "_log" for column in log_columns})

# Vectorised cap. `where` keeps a value only when `value < 5` holds and
# substitutes 5 otherwise, which is the rule the previous per-element list
# comprehension applied (values that do not compare as `< 5`, including NaN,
# become 5). It also removes the comprehension variable that shadowed the
# loop variable.
for column in bound_columns:
    df[column] = df[column].where(df[column] < 5, 5)

In [8]:
# Columns kept for modelling, in export order. The first two entries
# (postal code and target) are positional: a later cell takes everything from
# index 2 onwards as the feature set.
key_columns = [
    "postal_code",
    "zhvi_condo_12_months",
]

calendar_columns = [
    "year",
    "month",
]

# Level features (some already carry the "_log" suffix).
level_columns = [
    "90_day_treasury_bill_rate",
    "mean_adjusted_income_lag_log",
    "mean_adjusted_income_with_dep_lag_log",
    "mean_fixed_mortgage_rate",
    "mean_hourly_earnings_adj",
    "mean_mortgage_interest_hh_lag_log",
    "mean_real_estate_taxes_hh_lag_log",
    "median_sqft_value_log",
    "people_count_lag_log",
    "percent_decreasing",
    "percent_increasing",
    "total_nonfarm_payroll",
    "zhvi_condo_log",
    "zhvi_singlefam_log",
    "zri_sqft_value_log",
    "business_count_cbp_lag_log",
    "business_500_count_cbp_lag",
    "bar_count_cbp_lag",
]

# County-level aggregates of the level features.
county_level_columns = [
    "mean_county_zhvi_condo_log",
    "median_county_zhvi_condo_log",
    "mean_county_sqft_value_log",
    "median_county_sqft_value_log",
    "mean_county_hourly_earnings_adj",
]

# "_mo" change columns.
monthly_change_columns = [
    "mean_fixed_mortgage_rate_mo",
    "percent_decreasing_mo",
    "zhvi_singlefam_mo",
    "zri_sqft_value_mo",
    "percent_increasing_mo",
    "90_day_treasury_bill_rate_mo",
    "total_nonfarm_payroll_mo",
    "mean_hourly_earnings_adj_mo",
    "zhvi_condo_mo",
    "median_sqft_value_mo",
]

# "_yr" change columns.
yearly_change_columns = [
    "percent_decreasing_yr",
    "percent_increasing_yr",
    "adult_count_lag_yr",
    "mean_fixed_mortgage_rate_yr",
    "mean_adjusted_income_lag_yr",
    "zhvi_singlefam_yr",
    "zri_sqft_value_yr",
    "zhvi_condo_yr",
    "mean_mortgage_interest_hh_lag_yr",
    "total_nonfarm_payroll_yr",
    "people_count_lag_yr",
    "mean_hourly_earnings_adj_yr",
    "median_sqft_value_yr",
    "90_day_treasury_bill_rate_yr",
    "mean_real_estate_taxes_hh_lag_yr",
    "mean_adjusted_income_with_dep_lag_yr",
    "business_count_cbp_lag_yr",
    "employment_count_cbp_lag_yr",
    "business_100_count_cbp_lag_yr",
    "restaurant_count_cbp_lag_yr",
    "business_500_count_cbp_lag_yr",
    "bar_count_cbp_lag_yr",
]

# County-level aggregates of the "_yr" / "_mo" change columns.
county_change_columns = [
    "mean_county_zhvi_condo_yr",
    "mean_county_zhvi_condo_mo",
    "median_county_zhvi_condo_yr",
    "median_county_zhvi_condo_mo",
    "mean_county_sqft_value_mo",
    "median_county_sqft_value_mo",
    "mean_county_sqft_value_yr",
    "median_county_sqft_value_yr",
    "mean_county_hourly_earnings_adj_mo",
    "median_county_hourly_earnings_adj_mo",
    "mean_county_hourly_earnings_adj_yr",
    "median_county_hourly_earnings_adj_yr",
]

columns = (
    key_columns
    + calendar_columns
    + level_columns
    + county_level_columns
    + monthly_change_columns
    + yearly_change_columns
    + county_change_columns
)

# Keep only the modelling columns, in the order given by `columns`
# (a KeyError here means an upstream cell did not produce an expected column).
df = df.loc[:, columns]

In [10]:
# Time-based split: rows with year before 2017 are used for training and rows
# from 2017 for testing; rows with a later year fall into neither set.
target = "zhvi_condo_12_months"

is_train_year = df["year"] < 2017
is_test_year = df["year"] == 2017
train_df = df.loc[is_train_year]
test_df = df.loc[is_test_year]

# Everything after the first two columns is treated as a feature; this relies
# on the column order established when df was restricted to `columns`.
features = df.columns[2:]

In [11]:
# MinMax scale df
# The scaler is fitted on the training rows only and then applied unchanged to
# both splits, so the test split is transformed with training-set min/max.
scaler = MinMaxScaler()
scaler.fit(train_df[features].values)


def _scaled_frame(split_df):
    """Return ``split_df`` with its feature columns passed through ``scaler``.

    The result has the scaled feature columns (named as in ``features``) plus
    two unscaled columns copied over positionally: "target" (taken from the
    ``target`` column) and "postal_code".
    """
    scaled = pd.DataFrame(scaler.transform(split_df[features].values), columns=features)
    scaled["target"] = split_df[target].values
    scaled["postal_code"] = split_df["postal_code"].values
    return scaled


train_df_scaled = _scaled_frame(train_df)
test_df_scaled = _scaled_frame(test_df)

In [12]:
# Export
# Write the scaled train/test tables as CSV (without the index) and persist
# the fitted scaler alongside them.
# All three paths are derived from one home-relative directory. The scaler
# path used to be a hard-coded absolute path inside one user's home directory,
# which only worked on that account; expanding "~" explicitly resolves to the
# same location there and stays portable elsewhere.
export_dir = os.path.expanduser("~/real_estate/data/v11")

train_df_scaled.to_csv(os.path.join(export_dir, "train_df.csv"), index=False)
test_df_scaled.to_csv(os.path.join(export_dir, "test_df.csv"), index=False)
joblib.dump(scaler, os.path.join(export_dir, "scaler.pkl"))

['/home/gnazareths/real_estate/data/v11/scaler.pkl']