In [1]:
import os

import pandas as pd
import matplotlib.pyplot as plt

from helpers import *
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.externals import joblib

% matplotlib inline

In [10]:
# Load the supervised-learning table that every later cell transforms.
supervised_csv_path = "~/real_estate/data/v3/supervised_df.csv"
df = pd.read_csv(supervised_csv_path)

In [11]:
# Columns that have "<name>_mo" and "<name>_yr" companion columns in df.
# Presumably the companions hold the value one month / one year earlier —
# TODO confirm against the pipeline that builds supervised_df.csv.
current_time_features = [
    "median_sqft_value",
    "zhvi_1bed",
    "zhvi_2bed",
    "zhvi_3bed",
    "zri_sqft_value",
    "zhvi_condo",
    "zhvi_singlefam",
    "90_day_treasury_bill_rate",
    "mean_hourly_earnings_adj",
    "mean_fixed_mortgage_rate",
    "total_nonfarm_payroll",
    "people_count_lag",
    "adult_count_lag",
    "mean_adjusted_income_lag",
    "mean_adjusted_income_with_dep_lag",
    "mean_real_estate_taxes_hh_lag",
    "mean_mortgage_interest_hh_lag",
]

target = "zhvi_condo_12_months"

# Overwrite each companion column with the relative change of the current
# value against it: current / companion - 1.
for feature in current_time_features:
    companion_columns = [feature + suffix for suffix in ("_mo", "_yr")]
    # Compute both ratios before writing either one back.
    relative_changes = {
        column: df[feature] / df[column] - 1 for column in companion_columns
    }
    for column, change in relative_changes.items():
        df[column] = change

# Express the target relative to the current condo value in the same way.
df[target] = df[target] / df["zhvi_condo"] - 1

In [12]:
# Columns that are passed through log_with_zeros and then carry a "_log" suffix.
log_columns = [
    "median_sqft_value",
    "zhvi_1bed",
    "zhvi_2bed",
    "zhvi_3bed",
    "zri_sqft_value",
    "zhvi_condo",
    "zhvi_singlefam",
    "people_count_lag",
    "adult_count_lag",
    "mean_adjusted_income_lag",
    "mean_adjusted_income_with_dep_lag",
    "mean_real_estate_taxes_hh_lag",
    "mean_mortgage_interest_hh_lag",
]

# log_with_zeros comes from the helpers module; presumably a logarithm that
# tolerates zero entries — TODO confirm in helpers.
for column in log_columns:
    df[column] = log_with_zeros(df[column].values)

# Rename all transformed columns in one pass so their names reflect the new scale.
df = df.rename(columns={column: column + "_log" for column in log_columns})

In [13]:
# Discard the percent-increasing / percent-decreasing change columns.
unused_columns = [
    "percent_increasing_mo",
    "percent_decreasing_mo",
    "percent_increasing_yr",
    "percent_decreasing_yr",
]
df = df.drop(unused_columns, axis=1)

In [14]:
# Split by the "year" column: rows before 2017 form the training split and
# rows from 2017 form the test split (any other rows land in neither).
year_column = df["year"]
train_df = df.loc[year_column < 2017]
test_df = df.loc[year_column == 2017]

target = "zhvi_condo_12_months"
# Feature columns are chosen by position: everything except the first and the
# last column of df.
# NOTE(review): this relies on column order — presumably the first column is
# the region identifier and the last is the target; confirm against the CSV.
features = df.columns[1:-1]

In [15]:
# Fit the MinMax scaler on the training rows only, then apply it to both splits.
scaler = MinMaxScaler()
scaler.fit(train_df[features].values)


def _scaled_frame(split_df):
    """Return the scaled feature columns of split_df with "target" and "RegionName" appended."""
    scaled_values = scaler.transform(split_df[features].values)
    scaled = pd.DataFrame(scaled_values, columns=features)
    # The target and the region label are attached unscaled.
    scaled["target"] = split_df[target].values
    scaled["RegionName"] = split_df["RegionName"].values
    return scaled


train_df_scaled = _scaled_frame(train_df)
test_df_scaled = _scaled_frame(test_df)

In [17]:
# Export
# Write the scaled train/test splits and the fitted scaler to the v3 data directory.
train_df_scaled.to_csv("~/real_estate/data/v3/train_df.csv",index=False)
test_df_scaled.to_csv("~/real_estate/data/v3/test_df.csv",index=False)
# Resolve "~" explicitly so the scaler lands next to the CSVs for whichever
# user runs the notebook, instead of a path hard-coded to one home directory.
joblib.dump(scaler, os.path.expanduser("~/real_estate/data/v3/scaler.pkl"))

['/home/gnazareths/real_estate/data/v3/scaler.pkl']