In [1]:
import os

import pandas as pd
import matplotlib.pyplot as plt

from helpers import *
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): sklearn.externals.joblib was removed in scikit-learn 0.23;
# on newer versions this line has to become `import joblib`.
from sklearn.externals import joblib

% matplotlib inline

In [2]:
# Read the input CSV into `df`; the cells below transform this frame step by step.
snapshot_csv = "~/real_estate/data/supervised_dfs/df_20181210.csv"
df = pd.read_csv(snapshot_csv)

In [3]:
# Rescale the zhvi_sqft_value column family by a constant factor.
# NOTE(review): the list is called `columns_x100` but the factor applied is
# 1000 -- confirm which of the two is intended.
columns_x100 = [
    "zhvi_sqft_value",
    "zhvi_sqft_value_mo",
    "zhvi_sqft_value_yr",
    "zhvi_sqft_value_12_months",
]

for scaled_col in columns_x100:
    df[scaled_col] = 1000 * df[scaled_col].values

In [4]:
# Base features; each one has "<name>_mo" and "<name>_yr" companion columns in df
# (presumably the month-ago and year-ago values -- confirm against the data dictionary).
current_time_features = [
    "median_sqft_value",
    "percent_decreasing",
    "percent_increasing",
    "zhvi_1bed",
    "zhvi_2bed",
    "zhvi_3bed",
    "zhvi_sqft_value",
    "zhvi_condo",
    "zhvi_singlefam",
    "90_day_treasury_bill_rate",
    "mean_hourly_earnings_adj",
    "mean_fixed_mortgage_rate",
    "total_nonfarm_payroll",
]

target = "zhvi_sqft_value_12_months"

# Overwrite every companion column with the relative change  base / companion - 1.
# The base column itself is left untouched, so the two suffixes are independent.
# NOTE(review): a zero in a companion column produces inf/NaN here; nothing in
# this cell guards against that.
for feature in current_time_features:
    for suffix in ("_mo", "_yr"):
        companion = feature + suffix
        df[companion] = df[feature] / df[companion] - 1

# Express the target the same way, relative to the current zhvi_sqft_value.
df[target] = df[target] / df["zhvi_sqft_value"] - 1

In [5]:
# Columns that are passed through log_with_zeros and then renamed with a "_log" suffix.
log_columns = [
    "median_sqft_value",
    "zhvi_1bed",
    "zhvi_2bed",
    "zhvi_3bed",
    "zhvi_sqft_value",
    "zhvi_condo",
    "zhvi_singlefam",
    "percent_decreasing_mo",
    "percent_increasing_mo",
    "percent_decreasing_yr",
    "percent_increasing_yr",
]

# log_with_zeros is not defined in this notebook (presumably it comes from the
# `helpers` star-import); how it treats zero or negative inputs is not visible here.
# NOTE(review): the percent_*_mo / percent_*_yr columns hold relative changes
# after the previous cell and may therefore be negative -- confirm the helper copes.
for column in log_columns:
    df[column] = log_with_zeros(df[column].values)

# Apply all "_log" renames in one pass once every column has been transformed.
df = df.rename(columns={column: column + "_log" for column in log_columns})

In [6]:
# Split by calendar year: rows before 2017 go to training, rows from 2017 to test.
# Rows with year > 2017, if any exist, end up in neither split.
is_train_year = df["year"] < 2017
is_test_year = df["year"] == 2017
train_df = df.loc[is_train_year]
test_df = df.loc[is_test_year]

# Positional selection: every column except the first and the last one.
# NOTE(review): presumably the first column is RegionName and the last one is
# the target -- confirm, otherwise the target could leak into `features`.
features = df.columns[1:-1]
target = "zhvi_sqft_value_12_months"

In [7]:
# Fit the min-max scaler on the training rows only; the cell below reuses the
# fitted scaler for both the training and the test rows.
scaler = MinMaxScaler()
train_feature_matrix = train_df[features].values
scaler.fit(train_feature_matrix)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [9]:
def _scaled_frame(split_df):
    """Return a new frame with the scaled features of `split_df`.

    The feature columns are transformed with the already-fitted global `scaler`
    and keep their names; the unscaled target is appended as "target" and the
    region identifier as "RegionName". The result has a fresh default index.
    """
    scaled = pd.DataFrame(scaler.transform(split_df[features].values), columns=features)
    scaled["target"] = split_df[target].values
    scaled["RegionName"] = split_df["RegionName"].values
    return scaled


train_df_scaled = _scaled_frame(train_df)
test_df_scaled = _scaled_frame(test_df)

In [10]:
# Write both scaled splits to CSV without the index column.
for scaled_split, csv_path in (
    (train_df_scaled, "~/real_estate/data/supervised_dfs/train_df.csv"),
    (test_df_scaled, "~/real_estate/data/supervised_dfs/test_df.csv"),
):
    scaled_split.to_csv(csv_path, index=False)

In [11]:
# Persist the fitted scaler in the same directory as the train/test CSVs.
# "~" is expanded explicitly so the path resolves under the current user's home
# directory instead of a hard-coded /home/<user> prefix, matching the "~/..."
# paths used by the other cells.
joblib.dump(scaler, os.path.expanduser("~/real_estate/data/supervised_dfs/scaler.pkl"))

['/home/gnazareths/real_estate/data/supervised_dfs/scaler.pkl']