In [1]:
import datetime
import math
import os
import site
import sqlite3
import sys

import logzero
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from logzero import logger
from tqdm import tqdm
from tqdm.notebook import tqdm
from yaml import dump, load, safe_load

In [2]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below — consider a narrower filter.
warnings.filterwarnings("ignore")

In [3]:
import tensorflow
import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, RNN, Conv1D, Dense, Lambda, LSTMCell, Reshape
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import MeanAbsoluteError
from tensorflow.keras.optimizers import Adam

# print(tf.__version__)

In [4]:
# Make the project's SQL query module importable; the path is relative to the
# notebook's working directory.
sys.path.append("../../sql")
import queries

# Same for the local helper modules used by the cells below.
sys.path.append("../source")
import dl_tools
import ts_tools

### Resources  
1) https://machinelearningmastery.com/time-series-forecasting-long-short-term-memory-network-python/

In [5]:
# Notebook-wide matplotlib defaults, applied in a single update: figure size,
# tick and axis label sizes, grid off, label padding, and axis margins.
plt.rcParams.update(
    {
        "figure.figsize": (30, 25),
        "ytick.labelsize": 20,
        "xtick.labelsize": 20,
        "axes.grid": False,
        "axes.labelsize": 20,
        "axes.labelpad": 16,
        "axes.xmargin": 0.05,
        "axes.ymargin": 0.05,
    }
)

In [6]:
# Seed NumPy's and TensorFlow's global random generators with the same value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [7]:
# Rotating file log for this notebook: about 1 MB per file, five backups kept,
# and logzero's default stderr logger disabled.
log_path = "logs/"
log_file = "ts_lstm.log"

# The log file cannot be opened if its directory is missing (e.g. on a fresh
# checkout), so create the directory before attaching the file handler.
os.makedirs(log_path, exist_ok=True)

logzero.logfile(
    log_path + log_file,
    maxBytes=1_000_000,  # integer byte count instead of the float 1e6
    backupCount=5,
    disableStderrLogger=True,
)
logger.info(f"{log_path}, {log_file}\n")

In [8]:
# Load the YAML project configuration and unpack the settings used by the
# cells below (database location, table names, LSTM split sizes, scaler range).
config_file = "../configs/config.yml"

configs = None
try:
    with open(config_file, "r") as config_in:
        # SafeLoader only builds plain Python objects from the YAML document.
        configs = load(config_in, Loader=yaml.SafeLoader)
except (OSError, yaml.YAMLError):
    # Catch only read/parse failures, record the traceback in the log, and
    # re-raise so the cell stops with the real cause instead of calling exit().
    logger.exception("config file open failure.")
    raise

# An empty or non-mapping document would otherwise fail later with an opaque
# TypeError on the first key lookup.
if not isinstance(configs, dict):
    logger.error("config file open failure.")
    raise ValueError(f"{config_file} did not contain a YAML mapping")

logger.info(f"{configs}\n")

cfg_vars = configs["url_variables"]
logger.info(f"variables: {cfg_vars}\n")

years = configs["request_years"]
logger.info(f"years: {years}\n")

db_path = configs["file_paths"]["db_path"]

# The database file is named "<city>-<state>.db".
city = configs["location_info"]["city"]
state = configs["location_info"]["state"]
db_file = city + "-" + state + ".db"

db_table1 = configs["table_names"]["db_table1"]
db_table2 = configs["table_names"]["db_table2"]

# LSTM settings: resampling unit, period length, and the number of periods
# assigned to the train / validation / test splits.
lstm_cfg = configs["lstm_cfg"]
data_units = lstm_cfg["data_units"]
period = lstm_cfg["period"]
periods_train = lstm_cfg["periods_train"]
periods_val = lstm_cfg["periods_val"]
periods_test = lstm_cfg["periods_test"]

# Bounds passed to the min-max scaler's feature_range further down.
lower = lstm_cfg["min_max_lower"]
upper = lstm_cfg["min_max_upper"]

logger.info(f"{db_path}, {db_file}")

nrows = configs["num_rows"][0]
logger.info(f"number of rows: {nrows}\n")

In [9]:
# Open the SQLite database named in the config and keep a cursor for raw queries.
db_location = db_path + db_file
conn = sqlite3.connect(db_location)
cursor = conn.cursor()

In [10]:
# Run the distinct-zip-code query and keep the first field of each returned row.
cursor.execute(queries.select_distinct_zips)
zip_rows = cursor.fetchall()
distinct_zipcodes = [row[0] for row in zip_rows]
logger.info(f"distinct zip codes:\n{distinct_zipcodes}")
print(distinct_zipcodes)

['91708']


In [11]:
# Read the selected zip code's rows from the nsrdb table into a DataFrame
# indexed (and sorted) by the parsed date_time column.
zipcode_index = 0
params = dict(zipcode=distinct_zipcodes[zipcode_index])

# The zip code is bound through the :zipcode named parameter rather than being
# formatted into the SQL text.
select_nsr_rows = """
SELECT date_time,
-- year, month, day, 
-- zipcode,
-- Clearsky_DHI, DHI,
Clearsky_DNI, DNI,
Clearsky_GHI, GHI,
Temperature,
Relative_Humidity,
Precipitable_Water,
-- Wind_Direction,
Wind_Speed
from nsrdb
where zipcode = :zipcode
-- and not (month = 2 and day = 29)
-- and year = 2000
;
"""

df = pd.read_sql(
    select_nsr_rows,
    conn,
    params=params,
    index_col="date_time",
    parse_dates=["date_time"],
).sort_index(axis=0)

In [12]:
# Daily mean of every column; the index is moved out to a column and then
# restored as "date_time".
daily_means = df.resample("D").mean()
df_rsd = daily_means.reset_index(drop=False).set_index("date_time")
df_rsd.head()

Unnamed: 0_level_0,Clearsky_DNI,DNI,Clearsky_GHI,GHI,Temperature,Relative_Humidity,Precipitable_Water,Wind_Speed
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1998-01-01,293.458333,293.458333,142.541667,142.541667,15.416667,66.250417,1.320042,1.375
1998-01-02,289.833333,55.583333,143.458333,61.666667,12.125,93.799167,1.329208,2.245833
1998-01-03,300.625,18.25,144.291667,42.0,11.208333,93.456667,1.365375,1.779167
1998-01-04,291.833333,65.916667,144.0,47.875,8.25,94.297917,1.054083,2.008333
1998-01-05,344.25,311.333333,154.666667,147.916667,6.041667,63.997917,0.4035,1.620833


In [13]:
# Monthly mean of every column; the index is moved out to a column and then
# restored as "date_time".
monthly_means = df.resample("M").mean()
df_rsm = monthly_means.reset_index(drop=False).set_index("date_time")
df_rsm.head()

Unnamed: 0_level_0,Clearsky_DNI,DNI,Clearsky_GHI,GHI,Temperature,Relative_Humidity,Precipitable_Water,Wind_Speed
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1998-01-31,312.725806,193.25672,158.55914,116.268817,11.461022,74.901344,1.196043,1.408737
1998-02-28,330.165179,171.066964,201.436012,136.39881,10.00744,86.479226,1.425155,2.150149
1998-03-31,365.642473,219.232527,261.442204,187.193548,12.858871,76.266237,1.270946,1.910887
1998-04-30,384.641667,292.604167,313.873611,268.347222,13.668056,72.995972,1.137753,1.906944
1998-05-31,399.022849,244.490591,345.922043,257.693548,14.939516,79.635323,1.470833,2.055242


In [14]:
# Pick the column to forecast, choose the resampled frame that matches the
# configured unit, and convert the configured period counts into row counts.
columns = list(df.columns)
print(columns, "\n")
forecast_on_idx = 4  # position within `columns` of the series to forecast

# Any unit other than "D" falls back to the monthly frame.
df_rs = (df_rsd if data_units == "D" else df_rsm).copy()

# Rows per split = rows per period * number of periods in that split.
train_len, val_len, test_len = (
    period * n_periods for n_periods in (periods_train, periods_val, periods_test)
)

# Index at which the validation slice ends.
val_end = train_len + val_len

print(
    f"period type: {data_units}, period: {period},\nperiods_train: {periods_train}, periods_val: {periods_val}, periods_test: {periods_test}"
)

print(f"train_len: {train_len}, validate_len: {val_len}, test_len: {test_len}")

['Clearsky_DNI', 'DNI', 'Clearsky_GHI', 'GHI', 'Temperature', 'Relative_Humidity', 'Precipitable_Water', 'Wind_Speed'] 

period type: M, period: 12,
periods_train: 16, periods_val: 5, periods_test: 2
train_len: 192, validate_len: 60, test_len: 24


In [15]:
# Isolate the column being forecast and expose its values as the array X.
target_name = columns[forecast_on_idx]
df_lstm = df_rs[target_name]
X = df_lstm.values

In [16]:
# Chronological split at the two boundaries: [0, train_len), [train_len, val_end),
# and everything from val_end onward.
# NOTE(review): the test slice runs to the end of X; test_len is not applied here — confirm that is intended.
X_train, X_val, X_test = np.split(X, [train_len, val_end])

In [22]:
# dl_tools.persistence_forecast_plot(X_train, X_val)

In [None]:
# Fit a min-max scaler on the training split only, then apply the same fitted
# scaler to the train, validation, and test splits.
# NOTE(review): MinMaxScaler is not imported in any visible cell — presumably
# sklearn.preprocessing.MinMaxScaler; confirm and add the import before running.
# NOTE(review): X_train is sliced from a single column's `.values`, so it looks
# 1-D here — confirm the scaler accepts that shape or reshape to (n, 1) first.
scaler = MinMaxScaler(feature_range=(lower, upper))
scaler = scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [17]:
# difference transform
# Difference transform of the unscaled test split with an interval of 1.
# NOTE(review): dl_tools.difference is defined outside this notebook — presumably
# it returns value[i] - value[i - interval]; confirm against the helper.
test_diff = dl_tools.difference(X_test, interval=1)
# print(test_diff.values)

# Invert the transform from the original series and its differences.
# `orig` is not used by any later visible cell; it appears to be a round-trip check.
orig = dl_tools.inverse_difference(X_test, test_diff)
# print(orig.values)

In [18]:
# Build a supervised-learning frame from the differenced test series using a lag of 1.
# NOTE(review): dl_tools.make_supervised is defined outside this notebook — presumably
# it pairs each value with its lagged predecessor; confirm the column layout.
df_test_sup = dl_tools.make_supervised(test_diff, lag=1)
# df_test_sup

In [None]:
# Separate the last column of `train` (target y) from the remaining columns
# (inputs), then insert a length-1 middle axis into the inputs.
# NOTE: this rebinds X, which an earlier cell used for the raw series values.
# NOTE(review): `train` is not defined in any visible cell — confirm where it comes from.
inputs, y = train[:, :-1], train[:, -1]
X = inputs.reshape(inputs.shape[0], 1, inputs.shape[1])

In [None]:
# Stateful LSTM followed by a single-unit Dense output, compiled with mean
# squared error loss and the Adam optimizer.
# NOTE(review): `neurons` and `batch_size` are not defined in any visible cell — confirm their source.
lstm_input_shape = (batch_size, X.shape[1], X.shape[2])

model = Sequential()
model.add(LSTM(neurons, batch_input_shape=lstm_input_shape, stateful=True))
model.add(Dense(1))
model.compile(loss="mean_squared_error", optimizer="adam")

# A second, separate LSTM layer with the same settings; it is not added to `model` here.
layer = LSTM(neurons, batch_input_shape=lstm_input_shape, stateful=True)