# Description
I will use several deep learning models for my time series predictions. 
* LSTM
* Transformer
* dialated CNN

In all cases I will include daily snowfall as an exogenous variable.

## Environment
For $ reasons I will use Colab

In [6]:
# get colab status
try:
    import google.colab
    IN_COLAB = True
    %tensorflow_version 2.x
except:
    IN_COLAB = False

In [7]:
# data wrangling
import numpy as np
import pandas as pd
import os.path

# viz
import altair as alt
import matplotlib.pyplot as plt
%matplotlib inline
from vapeplot import vapeplot
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

In [8]:
# local code with hack to avoid cloing full repo each time colab is run
if IN_COLAB:
    projectcode = r"thttps://github.com/chrisoyer/ski-snow-modeling/blob/master/src/analysis/project_utils/project_utils.py"
    ! wget $projectcode
from project_utils.project_utils import *

In [9]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

# Parameters

In [17]:
alt.renderers.enable(embed_options={'theme': 'vox'})
alt.data_transformers.disable_max_rows()
#plt_style = r'https://github.com/dhaitz/matplotlib-stylesheets/blob/master/pitayasmoothie-light.mplstyle'
#plt.style.use(plt_style)
plt.rc('figure', figsize=(11.0, 7.0))
batch_size = 30
logs_path = "./logs/visualize_graph"

# Load Data

In [18]:
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')
    os.chdir(r'/content/gdrive/My Drive/data_sci/colab_datasets/ski/')
    all_data_path = r'./data/snow_data_clean.parquet'
    mirrored_strategy = tf.distribute.MirroredStrategy()
else:
    all_data_path = r'../../data/snow_data_clean.parquet'
!pwd

/c/Users/User/Documents/GitHub/ski-snow-modeling/src/analysis


In [19]:
snow_df = pd.read_parquet(all_data_path)

### Reshape for TF input
Shape should match (__samples__, __time steps__, __features__)

In [20]:
## Quick& dirty
def data_slim(source=snow_df, station=None, region=None):
    """filters data for station OR region, relevant features, and returns np"""
    if station:
        source = source.query('station==@station')
    if region:
        source = source.query('region==@region')

    data_arr = (source
                .reset_index()
                [['dayofyr', 'base', 'snowfall']]
                .to_numpy()
                )
    return data_arr

copper = data_slim(station="Copper Mountain")
copper

array([[ 67.        ,  18.        ,   0.        ],
       [ 68.        ,  18.        ,   0.        ],
       [ 69.        ,  18.        ,   0.        ],
       ...,
       [432.        ,  35.43418834,   0.        ],
       [433.        ,  35.56049869,   0.        ],
       [434.        ,  35.6400577 ,   0.        ]])

In [21]:
def data_split(data=None, test_frac=.2, lookback=30, batch_size=batch_size):
    """split into train and test sets
    Params:
        data: input data
        test_size: fraction of data for test
    returns: (training data generator, test data generator)
    """
    data_rows = data.shape[0] 
    test_size = int(data_rows * test_frac)
    train_size = data_rows - test_size
    train, test = data[:train_size, :], data[train_size:, :]
    train_data_gen = sequence.TimeseriesGenerator(train, train,
                                   length=lookback, sampling_rate=1, stride=1,
                                   batch_size=batch_size)
    test_data_gen = sequence.TimeseriesGenerator(test, test,
                                   length=lookback, sampling_rate=1, stride=1,
                                   batch_size=batch_size)
    return train_data_gen, test_data_gen
X_train, X_test = data_split(data=copper)

# Plotting Functions

In [22]:
def error_plotter(history):
    """plots train and validation scores by epoch"""
    pal =  sns.blend_palette(vapeplot.palette('macplus'))
    sns.set_palette(pal)
    
    loss_train = history.history['train_loss']
    loss_val = history.history['val_loss']
    plt.plot(epochs, loss_train, 'g', label='Training loss')
    plt.plot(epochs, loss_val, 'b', label='Validation loss')
    plt.title('Training and Validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

# Timeseries Modeling

The evolution of snow base depth over time depends (not 1:1; a foot of powder is only a few inches of packed powder) on new snowfall and melting of old snow. I will start by modeling as a simple timeseries, and then include new snowfall as a predictor variable.

## Modeling Setup
I will use supersetting crossvalidation (walk-forward CV) since this is a time series problem.

# TF LSTM models
 

In [23]:
# scale data
def scaler_factory(X):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = scaler.fit(X)
    scaled_X = scaler.transform(X)

    # invert transform
    inverted_X = scaler.inverse_transform(scaled_X)
    return scaler, inverse_scaler

In [24]:
def make_lstm(neurons, layers, batch_size, x_shape):
    """
    Parameters:
        neurons: width of layers, eg (4,5,6) implies first hidden layer has 4
            neuron, 2nd layer has 5, third layer has 6
        batch size: ...
        x_shape: (rows, features)
    Returns: unfitted model
    """
    input_shape = (x_shape[0], x_shape[1])
    xlayer = inputs = Input(shape=input_shape, batch_size=batch_size)
    for layer in range(layers):
        xlayer = LSTM(units=neurons, batch_input_shape=input_shape, 
                   stateful=True, dropout=0.2, recurrent_dropout=0.2,)(xlayer)
    outputs = Dense(1)(xlayer)
    model = Model(inputs=inputs, outputs=outputs)
    metrics = ['mean_absolute_error']  # TODO: custom r2 func
    model.compile(loss="mse", metrics=metrics,
                  optimizer='adam')
    return model

def fit_model(model, X, batch_size, n_epoch):
    """runs the training; returns model and history"""
    for i in range(n_epoch):
        history = model.fit(X, epochs=n_epoch, batch_size=batch_size, 
                            verbose=0, shuffle=False, callbacks=[])
        model.reset_states()
    return model, history

# Vanilla LSTM Model

In [28]:
lstm_100x1 = make_lstm(neurons=100, layers=1,
                       batch_size=batch_size, x_shape=copper.shape)
lstm_100x1.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(30, 2925, 3)]           0         
_________________________________________________________________
lstm_2 (LSTM)                (30, 100)                 41600     
_________________________________________________________________
dense_2 (Dense)              (30, 1)                   101       
Total params: 41,701
Trainable params: 41,701
Non-trainable params: 0
_________________________________________________________________


In [29]:
lstm_100x1, lstm_100x1_hst = fit_model(model=lstm_100x1, X=X_train,
                                       batch_size=batch_size, n_epoch=10)

plot_model(lstm_100, show_shapes=True, to_file='model.png')

InvalidArgumentError:  Trying to access element 31 in a list with 30 elements.
	 [[{{node model_2/lstm_2/while/body/_1/TensorArrayV2Read/TensorListGetItem}}]] [Op:__inference_train_function_6801]

Function call stack:
train_function


Batch Size: 1
Epochs: 3000
Neurons: 4
--------------
or 1 neuron

# Compare Models