In [7]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import functions as fc
import sqlalchemy as db

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

from sodapy import Socrata

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score

In [3]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_model import ARIMA, ARMA, AR
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

import matplotlib.pyplot as plt
%matplotlib inline

# Data Query

In [5]:
# Load your PostgreSQL credentials into `database_creds`.
# The context manager closes the file handle as soon as the JSON is parsed,
# instead of leaving it open for the lifetime of the kernel.
with open('./database_cred.json', 'r') as database_creds_file:
    database_creds = json.load(database_creds_file)

In [8]:
# Sign into your database
# The engine is built from the connection URL stored under the 'url' key of the credentials.
engine = db.create_engine(database_creds['url'])

# NOTE(review): neither `connection` nor `metadata` is used by the visible cells
# (the query cell passes `engine` directly) — confirm they are still needed, and
# close `connection` once finished if so.
connection = engine.connect()
metadata = db.MetaData()

In [9]:
# Pull every row of the `ny_dhs_weekly` table into a pandas DataFrame
sql = """
SELECT *
FROM ny_dhs_weekly
"""

weekly_df = pd.read_sql_query(sql = sql, con = engine)

In [10]:
# Promote 'date_of_census' to a datetime index and remove it from the columns
weekly_df = (
    weekly_df
    .set_index(pd.to_datetime(weekly_df['date_of_census']))
    .drop(columns = 'date_of_census')
)

In [11]:
# Preview the first two rows to confirm the datetime index and column set
weekly_df.head(2)

Unnamed: 0_level_0,total_adults_in_shelter,total_children_in_shelter,total_individuals_in_shelter,single_adult_men_in_shelter,single_adult_women_in_shelter,total_single_adults_in_shelter,families_with_children_in_shelter,adults_in_families_with_children_in_shelter,children_in_families_with_children_in_shelter,total_individuals_in_families_with_children_in_shelter_,adult_families_in_shelter,individuals_in_adult_families_in_shelter
date_of_census,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013-08-25,28342,21334,49677,7184,2698,9882,10283,14636,21334,35971,1801,3823
2013-09-01,28327,21361,49689,7117,2690,9807,10296,14671,21361,36033,1811,3848


# RNN Modeling (Multivariate timeseries)
- `weekly_comb.csv` dataset

In [16]:
# Load the combined weekly dataset.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for this
# relative path — confirm where `weekly_comb.csv` lives relative to the notebook.
weekly_comb = pd.read_csv("./homeless/weekly_comb.csv")

FileNotFoundError: [Errno 2] File b'./homeless/weekly_comb.csv' does not exist: b'./homeless/weekly_comb.csv'

In [None]:
# Preview the raw rows before the index is set
weekly_comb.head()

In [None]:
# Parse 'project_start_date' (formatted as YYYY/MM/DD) into a datetime index,
# then remove it from the columns
weekly_comb = (
    weekly_comb
    .set_index(pd.to_datetime(weekly_comb['project_start_date'], format = "%Y/%m/%d"))
    .drop(columns = 'project_start_date')
)

In [None]:
# Preview the first two rows after re-indexing by date
weekly_comb.head(2)

In [None]:
# Keep only the two columns used in the rest of this section
df = weekly_comb.loc[:, ['total_individuals_in_shelter', 'all_counted_units']]

In [None]:
# Create X and y
# X keeps every column of `df` except 'total_individuals_in_shelter'
# (only 'all_counted_units', given how `df` is built).
X = df.drop(columns = 'total_individuals_in_shelter')
# NOTE(review): y is the same 'all_counted_units' column that X contains, while the
# later reconstruction and plot cells compare predictions against
# 'total_individuals_in_shelter' — confirm which column is the intended target.
y = df['all_counted_units'].values # with Keras, target should be `array`!!

In [None]:
# Train/Test Split
# shuffle = False keeps the rows in their original order, so the test set is the
# tail of the series; test_size is left at scikit-learn's default.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle =False)

In [None]:
# Standardize the features: statistics are learned from the training rows only,
# then the same transform is applied to both splits
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [None]:
# Training windows: each sample is 3 consecutive scaled rows,
# paired with the target value that immediately follows the window
train_sequences = TimeseriesGenerator(data = X_train_sc,
                                      targets = y_train,
                                      length = 3,
                                      batch_size = 128)

In [None]:
# Test windows, built with the same window length and batch size as training
test_sequences = TimeseriesGenerator(data = X_test_sc,
                                     targets = y_test,
                                     length = 3,
                                     batch_size = 128)

In [None]:
# Shape of the input array in the first training batch
# (index [0] selects the first batch, the second [0] its inputs rather than its targets)
train_sequences[0][0].shape

In [None]:
# Design RNN
model = Sequential()

# Don't set the activation fn for RNN (tanh is default)
model.add(GRU(8,
              dropout = 0.2, # dropout applied to the layer inputs
              recurrent_dropout = 0.2, # dropout applied to the state from the previous step
              input_shape = (3, 1), # (timesteps, features); implicitly sets the input layer, 3 matches the generators' `length`
              return_sequences = True)) # True if the NEXT layer is an RNN

model.add(GRU(8,
              return_sequences = False)) # False if the NEXT layer is DENSE

model.add(Dropout(0.2)) # bad AFTER RNNs (set dropout and recurrent_dropout instead)

model.add(Dense(4,
                activation = 'relu'))

model.add(Dropout(0.2)) # good AFTER Dense

# Output layer: 1 neuron with no activation (linear output).
# NOTE(review): an earlier comment here described a sigmoid binary-classification
# head, which does not match `activation = None` — confirm the intended task.
model.add(Dense(1,
                activation = None))

In [None]:
# The network ends in a single linear unit (Dense(1, activation = None)) and is
# trained on raw numeric targets, so a regression loss is used. Binary
# cross-entropy expects labels/probabilities in [0, 1] and is not meaningful here.
# 'accuracy' is kept only because a later cell plots hist.history['accuracy'];
# it is not an informative metric for a regression target.
model.compile(loss = 'mean_squared_error',
              optimizer = Adam(learning_rate = 0.001), # `lr` is the deprecated alias
              metrics = ['accuracy'])

In [None]:
# Train on the windowed training batches and validate on the test windows.
# `Model.fit` accepts the generators directly; `fit_generator` is deprecated.
hist = model.fit(train_sequences,
                 epochs = 20,
                 validation_data = test_sequences,
                 verbose = 0)

In [None]:
# Train vs. test loss per epoch

for history_key, curve_label in (('loss', 'Train loss'), ('val_loss', 'Test loss')):
    plt.plot(hist.history[history_key], label = curve_label)
plt.legend();

In [None]:
# Plot out our train/test loss curves
# NOTE(review): this cell repeats the loss plot from the preceding cell — confirm
# whether both are needed.

plt.plot(hist.history['loss'], label = 'Train loss')
plt.plot(hist.history['val_loss'], label = 'Test loss')
plt.legend();

In [None]:
# Plot out our train/test accuracy curves

plt.plot(hist.history['accuracy'], label = 'Train accuracy')
plt.plot(hist.history['val_accuracy'], label = 'Test accuracy')
plt.legend();

In [None]:
# `evaluate_generator` / `predict_generator` are deprecated; `evaluate` and
# `predict` accept the generators directly. verbose = 0 keeps the cell output quiet.
test_scores = model.evaluate(test_sequences, verbose = 0)
trainPredict = model.predict(train_sequences, verbose = 0)
testPredict = model.predict(test_sequences, verbose = 0)

# Display the loss and metric values on the test windows. The evaluation result
# must be the last expression in the cell to be shown.
test_scores

In [None]:
# Apply the scaler's inverse transform to the model predictions.
# NOTE(review): `ss` was fit on X_train (the feature column), while the targets
# passed to the generators (y_train / y_test) were not scaled — confirm that
# inverse-transforming the predictions with this scaler is intended.
inverse_sc_trainPredict = ss.inverse_transform(trainPredict)
inverse_sc_testPredict = ss.inverse_transform(testPredict)

In [None]:
# Flatten the (n, 1)-style prediction rows into a plain list of scalars
test_predict = [row[0] for row in inverse_sc_testPredict]

In [None]:
# Flatten the (n, 1)-style prediction rows into a plain list of scalars
train_predict = [row[0] for row in inverse_sc_trainPredict]

In [None]:
# Rebuild a level series by cumulatively summing the predictions on top of a seed
# value taken from 'total_individuals_in_shelter'. The seed index is `3-1` for
# train and `len(y_train)+3-1` for test, where 3 matches the generators' `length`.
# NOTE(review): a cumulative sum treats the predictions as week-over-week changes,
# but the visible target is not differenced — confirm this reconstruction.
original_train_preds = np.cumsum([df['total_individuals_in_shelter'].iloc[3-1]] + train_predict)
original_test_preds = np.cumsum([df['total_individuals_in_shelter'].iloc[len(y_train)+3-1]] + test_predict)

In [None]:
# Overlay the reconstructed train/test predictions on the observed series.
# The date slices are sized from the prediction arrays instead of a hard-coded
# row count, so the plot keeps working if the split sizes change.
train_start = 4 # first plotted training position (same offset as the original slice)
test_dates = df.index[-len(original_test_preds):]
train_dates = df.index[train_start:train_start + len(original_train_preds)]

plt.figure(figsize = (15,5))
plt.plot(df['total_individuals_in_shelter'], label = 'Observed')
plt.plot(test_dates, original_test_preds, label = 'Test predictions')
plt.plot(train_dates, original_train_preds, label = 'Train predictions')
plt.legend();

# RNN Modeling (Univariate timeseries)

In [None]:
# Augmented Dickey-Fuller test on the first-differenced shelter totals.
# No `adf` name is defined in the notebook; `adfuller` is the statsmodels
# function imported in the statsmodels import cell.
adfuller(weekly_df['total_individuals_in_shelter'].diff(1).dropna())

In [None]:
# First difference of the shelter totals (kept as a one-column DataFrame);
# the first row is dropped because it has no previous week to subtract
weekly_diff = (
    weekly_df[['total_individuals_in_shelter']]
    .diff(periods = 1)
    .dropna()
)

In [None]:
# Hold out the last 25% of the differenced series as the test set;
# shuffle = False preserves the original row order.
train, test = train_test_split(weekly_diff[['total_individuals_in_shelter']],
                               test_size = 0.25,
                               shuffle = False)

In [None]:
# Check how many rows and columns ended up in the training split
train.shape

In [None]:
# Standardize the differenced series: statistics come from the training split only
ss = StandardScaler().fit(train)
X_train_sc = ss.transform(train)
X_test_sc = ss.transform(test)

In [None]:
# Windowed training data for the univariate model: each sample is 3 consecutive
# scaled values and the target is the scaled value that follows the window.
# `data` and `targets` must have the same length, so both come from the scaled
# training array; using the scaled values keeps inputs and targets on one scale.
train_sequences = TimeseriesGenerator(X_train_sc,
                                      targets = X_train_sc[:, 0],
                                      length = 3,
                                      batch_size = 128)