In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from helpers import *
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split

% matplotlib inline

In [2]:
# Import
df = pd.read_csv("~/real_estate/data/v4/supervised/yelp_supervised_df.csv")

In [3]:
current_time_features = [
    "90_day_treasury_bill_rate", "adult_count_lag", "bar_review_count", "coffee_review_count", 
    "mean_adjusted_income_lag", "mean_adjusted_income_with_dep_lag", "mean_fixed_mortgage_rate",
    "mean_hourly_earnings_adj", "mean_mortgage_interest_hh_lag", "mean_rating", "mean_real_estate_taxes_hh_lag",
    "median_sqft_value", "people_count_lag", "percent_decreasing", "percent_increasing", 
    "price_2_review_count", "price_3_review_count", "restaurant_review_count", "review_count", 
    "total_nonfarm_payroll", "zhvi_singlefam", "zri_sqft_value", "zhvi_condo"
]

target = "zhvi_condo_12_months"

In [5]:
print(df.shape, df.dropna().shape)

(25997, 92) (15527, 92)


In [6]:
df = df.dropna()

In [7]:
for i in current_time_features:
    mo = i+"_mo"
    yr = i+"_yr"
    change_mo = (df[i] + 0.1)/(df[mo] + 0.1) - 1
    change_yr = (df[i] + 0.1)/(df[yr] + 0.1) - 1
    df[mo] = change_mo
    df[yr] = change_yr

df[target] = df[target]/df["zhvi_condo"] - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
log_columns = ["median_sqft_value","zri_sqft_value","zhvi_condo","zhvi_singlefam","people_count_lag",
               "adult_count_lag","mean_adjusted_income_lag","mean_adjusted_income_with_dep_lag",
               "mean_real_estate_taxes_hh_lag","mean_mortgage_interest_hh_lag"]

for i in log_columns:
    df[i] = log_with_zeros(df[i].values)
    df = df.rename(columns={i:i+"_log"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
columns = ['postal_code',
           '90_day_treasury_bill_rate',
           'mean_adjusted_income_lag_log', 
           'mean_adjusted_income_with_dep_lag_log', 
           'mean_fixed_mortgage_rate', 
           'mean_hourly_earnings_adj', 
           'mean_mortgage_interest_hh_lag_log',
           'mean_real_estate_taxes_hh_lag_log', 
           'mean_rating',
           'median_sqft_value_log', 
           'month', 
           'people_count_lag_log', 
           'percent_decreasing', 
           'percent_increasing', 
           'price_2_review_count', 
           'price_3_review_count', 
           'bar_review_count',
           'coffee_review_count',
           'restaurant_review_count', 
           'review_count', 
           'total_nonfarm_payroll', 
           'year', 
           'zhvi_condo_log', 
           'zhvi_singlefam_log', 
           'zri_sqft_value_log', 
           'adult_count_lag_mo', 
           'bar_review_count_mo', 
           'mean_mortgage_interest_hh_lag_mo', 
           'mean_fixed_mortgage_rate_mo', 
           'percent_decreasing_mo', 
           'zhvi_singlefam_mo', 
           'mean_adjusted_income_with_dep_lag_mo', 
           'restaurant_review_count_mo', 
           'zri_sqft_value_mo', 
           'percent_increasing_mo', 
           '90_day_treasury_bill_rate_mo', 
           'total_nonfarm_payroll_mo', 
           'mean_adjusted_income_lag_mo', 
           'mean_real_estate_taxes_hh_lag_mo', 
           'mean_hourly_earnings_adj_mo', 
           'review_count_mo', 
           'mean_rating_mo', 
           'people_count_lag_mo', 
           'price_3_review_count_mo', 
           'zhvi_condo_mo', 
           'coffee_review_count_mo', 
           'median_sqft_value_mo', 
           'price_2_review_count_mo', 
           'percent_decreasing_yr', 
           'coffee_review_count_yr', 
           'price_3_review_count_yr', 
           'price_2_review_count_yr', 
           'percent_increasing_yr', 
           'adult_count_lag_yr', 
           'mean_fixed_mortgage_rate_yr', 
           'mean_adjusted_income_lag_yr', 
           'zhvi_singlefam_yr', 
           'zri_sqft_value_yr', 
           'restaurant_review_count_yr', 
           'zhvi_condo_yr', 
           'mean_mortgage_interest_hh_lag_yr', 
           'total_nonfarm_payroll_yr', 
           'people_count_lag_yr', 
           'review_count_yr', 
           'mean_hourly_earnings_adj_yr', 
           'mean_rating_yr', 
           'bar_review_count_yr', 
           'median_sqft_value_yr', 
           '90_day_treasury_bill_rate_yr', 
           'mean_real_estate_taxes_hh_lag_yr', 
           'mean_adjusted_income_with_dep_lag_yr', 
           'zhvi_condo_12_months']

df = df[columns]

In [10]:
train_df = df.loc[df["year"] < 2017]
test_df = df.loc[df["year"] == 2017]

features = df.columns[1:-1]
target = "zhvi_condo_12_months"

In [11]:
df.head()

Unnamed: 0,postal_code,90_day_treasury_bill_rate,mean_adjusted_income_lag_log,mean_adjusted_income_with_dep_lag_log,mean_fixed_mortgage_rate,mean_hourly_earnings_adj,mean_mortgage_interest_hh_lag_log,mean_real_estate_taxes_hh_lag_log,mean_rating,median_sqft_value_log,...,people_count_lag_yr,review_count_yr,mean_hourly_earnings_adj_yr,mean_rating_yr,bar_review_count_yr,median_sqft_value_yr,90_day_treasury_bill_rate_yr,mean_real_estate_taxes_hh_lag_yr,mean_adjusted_income_with_dep_lag_yr,zhvi_condo_12_months
115,8054,0.04,4.076772,3.74638,3.536,23.89,1.428389,1.261144,3.0,4.812184,...,0.010503,0.909091,0.021286,-0.392157,0.0,-0.023791,-0.263158,-0.029478,0.020493,-0.010332
116,8054,0.05,4.076772,3.74638,4.07,23.96,1.428389,1.261144,3.0,4.812184,...,0.010503,0.909091,0.021222,-0.392157,0.0,-0.015987,-0.210526,-0.029478,0.020493,-0.005904
117,8054,0.04,4.076772,3.74638,4.37,23.99,1.428389,1.261144,3.0,4.820282,...,0.010503,0.909091,0.020763,-0.392157,0.0,0.0,-0.3,-0.029478,0.020493,-0.008798
118,8054,0.04,4.076772,3.74638,4.456,24.02,1.428389,1.261144,3.0,4.828314,...,0.010503,0.909091,0.022467,-0.392157,0.0,0.008058,-0.3,-0.029478,0.020493,-0.008047
119,8054,0.02,4.076772,3.74638,4.49,24.05,1.428389,1.261144,3.0,4.836282,...,0.010503,0.909091,0.020279,-0.392157,0.0,0.016116,-0.428571,-0.029478,0.020493,-0.005857


In [12]:
# MinMax scale df
scaler = MinMaxScaler()
scaler.fit(train_df[features].values)

train_df_scaled = scaler.transform(train_df[features].values)
train_df_scaled = pd.DataFrame(train_df_scaled, columns=features)
train_df_scaled["target"] = train_df[target].values
train_df_scaled["postal_code"] = train_df["postal_code"].values

test_df_scaled = scaler.transform(test_df[features].values)
test_df_scaled = pd.DataFrame(test_df_scaled, columns=features)
test_df_scaled["target"] = test_df[target].values
test_df_scaled["postal_code"] = test_df["postal_code"].values

In [14]:
# Export
train_df_scaled.to_csv("~/real_estate/data/v4/final/yelp/yelp_train_df.csv",index=False)
test_df_scaled.to_csv("~/real_estate/data/v4/final/yelp/yelp_test_df.csv",index=False)
joblib.dump(scaler, "/home/gnazareths/real_estate/data/v4/final/yelp/yelp_scaler.pkl")

['/home/gnazareths/real_estate/data/v4/final/yelp/yelp_scaler.pkl']