In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import config
import pickle

In [None]:
# Register the BigQuery IPython extension; it provides the %%bigquery cell
# magic used by the query cell below.
%load_ext google.cloud.bigquery

In [None]:
# Point GOOGLE_APPLICATION_CREDENTIALS at the credentials path. The value is
# read from the local `config` module, so no path is hard-coded in the notebook.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=config.GOOGLE_APPLICATION_CREDENTIALS

In [None]:
%%bigquery --use_rest_api ZRI_MF
SELECT *
FROM `high-empire-220313.ZRI.Multi_Family`

In [None]:
# Load the pre-filtered ZRI object from the local pickle cache.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load pickle files that were produced by this project.
# The context manager closes the file handle even if unpickling raises.
with open('pickles/ZRI_filtered.p', 'rb') as file:
    ZRI_filtered = pickle.load(file)

In [None]:
from data_setup import ZRI_format

In [None]:
%%time
time_unit = 'Month'
window_size = 6
future_time = 1

ZRI_diff = ZRI_format(ZRI_filtered, time_unit = time_unit, window_size = window_size, future_time = future_time, percent_change=True)
ZRI_actual = ZRI_format(ZRI_filtered, time_unit = time_unit, window_size = window_size, future_time = future_time)

In [None]:
# Feature columns are the ones whose name contains 'minus'.
feature_columns = [col for col in ZRI_diff.columns if 'minus' in col]
# Suffix the percent-change versions so the real ZRI values can be added
# under the original column names without a clash.
suffixed_names = {col: col + '_%difference' for col in feature_columns}
ZRI_diff = ZRI_diff.rename(columns=suffixed_names)

In [None]:
# Add the real ZRI feature values next to the %-difference ones, matched on
# Target_index (left join: every row of ZRI_diff is kept).
actual_features = ZRI_actual[feature_columns + ['Target_index']]
ZRI_new = ZRI_diff.merge(actual_features, on='Target_index', how='left')

In [None]:
# Preview the first rows of the merged frame.
ZRI_new.head()

In [None]:
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [None]:
# Find feature columns: every column with 'minus' in its name. After the merge
# this covers both the '_%difference' columns and the real-ZRI columns, since
# the suffixed names still contain 'minus'.
feature_columns = [x for x in ZRI_new.columns if 'minus' in x]

In [None]:
# Drop nan values generated from the difference. dropna() removes every row
# that has at least one missing value in any column.
ZRI_new = ZRI_new.dropna()

In [None]:
# Time-based hold-out: rows before `test_year` are used for training, rows
# from `test_year` onward are kept aside as the final test set.
test_year = 2019
training_data = ZRI_new.loc[ZRI_new['Year'] < test_year]
final_test_data = ZRI_new.loc[ZRI_new['Year'] >= test_year]

In [None]:
# No random train/validation split here: the model is fit on all pre-test-year
# rows and evaluated on the time-based hold-out in `final_test_data`.
X_train = training_data[feature_columns]
y_train = training_data['Target_ZRI']

In [None]:
# Ordinary least-squares linear regression as the baseline model.
lr = LinearRegression()

In [None]:
# Fit the baseline on the pre-test-year rows.
lr.fit(X_train,y_train)

In [None]:
# R^2 on the training data itself; this is an in-sample fit, not an estimate
# of out-of-sample performance.
lr.score(X_train, y_train)

In [None]:
# Residuals (actual minus predicted) on the held-out years.
held_out_actual = final_test_data['Target_ZRI']
held_out_predicted = lr.predict(final_test_data[feature_columns])
prediction_error = held_out_actual - held_out_predicted

In [None]:
# Summary statistics of the held-out residuals.
prediction_error.describe()

In [None]:
# Fitted coefficients, in the same order as `feature_columns`.
lr.coef_

### Classification Problem
To make this easier, we can reframe the problem as a classification problem: does the rent go up or down? (With the labeling used below, only a strictly positive target counts as "up", so a rent that stays the same counts as going down.)

In [None]:
# Binary label: 1 when Target_ZRI is strictly positive, otherwise -1.
is_increase = ZRI_new['Target_ZRI'] > 0
ZRI_new['ZRI_class'] = is_increase.map({True: 1, False: -1})
ZRI_new = ZRI_new.dropna()
# Feature columns are those with 'minus' in their name.
feature_columns = [col for col in ZRI_new.columns if 'minus' in col]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix

In [None]:
# class_weight='balanced' re-weights the classes so an uneven up/down split
# does not dominate the fit.
logistic_regression = LogisticRegression(class_weight='balanced')
# Standardize the features before the logistic regression; the pipeline fits
# the scaler on the training data only and reuses it at predict time.
binary_model = make_pipeline(StandardScaler(),logistic_regression)

In [None]:
# Time-based hold-out for the classifier: rows before `test_year` train the
# model, rows from `test_year` onward form the test set.
test_year = 2019
training_data = ZRI_new.loc[ZRI_new['Year'] < test_year]
final_test_data = ZRI_new.loc[ZRI_new['Year'] >= test_year]

X_train = training_data[feature_columns]
y_train = training_data['ZRI_class']
X_test = final_test_data[feature_columns]
y_test = final_test_data['ZRI_class']

In [None]:
# Fit scaler + logistic regression on the pre-test-year rows.
binary_model.fit(X_train,y_train)

In [None]:
# Mean accuracy as a (train, test) pair.
binary_model.score(X_train,y_train), binary_model.score(X_test,y_test)

In [None]:
# Confusion matrix on the held-out years: rows are the true classes, columns
# the predicted classes.
confusion_matrix(y_test, binary_model.predict(X_test))

### Model Tuning
See the effect of window size and future time on the predictive power of the model:

In [None]:
# Sweep the look-back window (1-12 months) for each forecast horizon and
# record, per window size: number of usable rows, relative errors on the
# held-out years, (test, train) R^2, and the fitted coefficients.
window_sizes = list(range(1, 13))
time_unit = 'Month'
test_year = 2019  # rows from this year onward form the held-out test set

# (test R^2, train R^2) per window size, one table per forecast horizon in
# months. `scores` is the 3-year table and `scores1` the 1-year table that the
# 'Forecast R^2' plot further down labels '*_3_year' / '*_1_year'; without the
# 36-month pass `scores` is never defined and that plot fails on a fresh
# kernel. The 12-month horizon runs last so every name left behind by the loop
# (future_time, ZRI_new, lr, final_test_data, feature_columns, errors,
# num_obs, coefficients) describes the 12-month run.
scores, scores1 = {}, {}
scores_by_horizon = {36: scores, 12: scores1}

for future_time, horizon_scores in scores_by_horizon.items():
    # Column used to normalise the errors. NOTE(review): the name follows
    # ZRI_format's column scheme and is presumably the most recent observed
    # ZRI value for each row -- confirm against data_setup.
    most_recent_feature = f'ZRI_minus_{future_time}{time_unit[0]}'
    num_obs, errors, coefficients = {}, {}, {}

    for window_size in window_sizes:
        ZRI_new = ZRI_format(ZRI_filtered, time_unit=time_unit,
                             window_size=window_size,
                             future_time=future_time)
        ZRI_new = ZRI_new.dropna()
        num_obs[window_size] = ZRI_new.shape[0]
        feature_columns = [x for x in ZRI_new.columns if 'minus' in x]

        training_data = ZRI_new[ZRI_new.Year < test_year]
        final_test_data = ZRI_new[ZRI_new.Year >= test_year]
        X_train, y_train = training_data[feature_columns], training_data['Target_ZRI']
        X_test, y_test = final_test_data[feature_columns], final_test_data['Target_ZRI']

        lr = LinearRegression()
        lr.fit(X_train, y_train)
        # Keep the fitted weights by feature name so window sizes can be compared.
        coefficients[window_size] = dict(zip(feature_columns, lr.coef_))
        horizon_scores[window_size] = (lr.score(X_test, y_test),
                                       lr.score(X_train, y_train))
        # Held-out residuals expressed relative to `most_recent_feature`.
        errors[window_size] = (y_test -
                               lr.predict(X_test)).div(final_test_data[most_recent_feature])
    

In [None]:
# One box per window size. The dict view is materialised into a list so
# boxplot receives a plain sequence of datasets. Default box positions are
# 1..N, which coincide with the window sizes 1..12 used as keys.
plt.boxplot(list(errors.values()))
plt.title('Relative forecast error by window size')
plt.xlabel('Time Window (Months)')
plt.ylabel('Relative error');

In [None]:
# Same boxplot on a log10 scale. The +1250 offset is applied before the log;
# NOTE(review): presumably chosen to keep the argument positive -- confirm
# that it is still appropriate for the current error scale.
log_shifted_errors = [np.log10(window_errors + 1250) for window_errors in errors.values()]
plt.boxplot(log_shifted_errors)

In [None]:
# Summary statistics of the errors, one column per window size.
pd.DataFrame(errors).describe()

In [None]:
# Split the (test R^2, train R^2) pairs in `scores` into two parallel lists;
# these are the curves plotted below as 'test_3_year' / 'train_3_year'.
test = [test_r2 for test_r2, _ in scores.values()]
train = [train_r2 for _, train_r2 in scores.values()]

In [None]:
# Split the (test R^2, train R^2) pairs in `scores1` into two parallel lists;
# these are the curves plotted below as 'test_1_year' / 'train_1_year'.
test1 = [test_r2 for test_r2, _ in scores1.values()]
train1 = [train_r2 for _, train_r2 in scores1.values()]

In [None]:
# Held-out and in-sample R^2 against window size for both horizons.
# The x-values are the actual window sizes (the score-dict keys). A bare
# plt.plot(y) would use the positions 0..N-1 and shift every point one month
# to the left of the window size it belongs to.
windows_3_year = list(scores.keys())
windows_1_year = list(scores1.keys())
plt.plot(windows_3_year, test, label = 'test_3_year')
plt.plot(windows_3_year, train, label = 'train_3_year')
plt.plot(windows_1_year, test1, label = 'test_1_year')
plt.plot(windows_1_year, train1, label = 'train_1_year')
plt.title('Forecast R^2')
plt.xlabel('Time Window (Months)')
plt.ylabel('R^2')
plt.legend(loc = 'upper left');

In [None]:
# Raw (un-normalised) residuals on the held-out years. `lr`, `final_test_data`
# and `feature_columns` are whatever the last iteration of the tuning loop
# above left behind (window_size = 12, future_time = 12), not the earlier
# baseline or classification cells.
(final_test_data['Target_ZRI'] - lr.predict(final_test_data[feature_columns]))#.div(final_test_data[most_recent_feature])