In [None]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop

In [None]:
# Read the Jena climate CSV into a DataFrame and preview its first rows.
jena_weather_data = pd.read_csv(filepath_or_buffer='data/jena_climate_2009_2016.csv')
jena_weather_data.head(n=5)

In [None]:
# Keep every column except the first one (the date attribute) and expose the
# remaining columns as a plain NumPy array.
weather_features = jena_weather_data.iloc[:, 1:]
jena_weather_data_array = weather_features.values

# (number of timesteps, number of features)
jena_weather_data_array.shape

### Explore data

- Temperature has a periodic pattern
- The second plot shows the temperature values for the first 10 days
    - the dataset records one sample every 10 minutes (one timestep)
    - a day therefore contains 6 * 24 = 144 timesteps
    - and the first 10 days contain 144 * 10 = 1440 timesteps

In [None]:
# Column 1 of the feature array is the series treated as temperature
# throughout this notebook; plot it over the whole recording period.
temp = jena_weather_data_array[:, 1]
plt.plot(range(temp.shape[0]), temp)

In [None]:
# First 10 days only: each timestep is 10 minutes, i.e. 144 timesteps a day,
# so 10 days correspond to the first 1440 samples.
plt.plot(range(1440), temp[:1440])

### Problem formulation

* Given data going back `lookback` timesteps (one timestep is 10 minutes) and sampled every `step` timesteps, can we predict the temperature `delay` timesteps into the future?

    - lookback = **720** — Observations will go back ___5 days___.
    - step = **6** — Observations will be sampled at one data point per hour.
    - delay = **144** — Targets will be 24 hours in the future.
 
 
* Normalize the data: each feature/attribute is on a different scale, which makes learning harder for the network

In [None]:
def normalize_features(array_like_data, std=None, mean=None):
    """Standardise each feature (column) to zero mean and unit variance.

    Parameters
    ----------
    array_like_data : array-like
        Data whose statistics are taken along axis 0.
    std, mean : array-like or None
        Statistics to normalise with. When ``None`` they are computed from
        ``array_like_data`` itself. Pass the training statistics here to
        normalise validation/test data consistently.

    Returns
    -------
    (normalized, std, mean)
        ``normalized`` is a NEW array; the input is left untouched.

    Notes
    -----
    The previous implementation used ``-=`` / ``/=``, which wrote the result
    back into the caller's array. When a slice (view) of a larger array was
    passed, this silently overwrote the raw data, made re-running the cell
    normalise the data a second time, and raised for integer arrays.
    A feature with zero standard deviation still yields inf/nan values.
    """
    if std is None:
        std = np.std(array_like_data, axis=0)
    if mean is None:
        mean = np.mean(array_like_data, axis=0)

    # Out-of-place arithmetic: allocates the result instead of mutating the input.
    normalized = (array_like_data - mean) / std

    return normalized, std, mean


# The statistics are computed on the training slice only and then reused for
# the validation and test slices.
data, data_std, data_mean = normalize_features(jena_weather_data_array[:20000])
val_data, val_data_std, val_data_mean = normalize_features(
    jena_weather_data_array[280000:300000], std=data_std, mean=data_mean)
test_data, test_data_std, test_data_mean = normalize_features(
    jena_weather_data_array[380000:400000], std=data_std, mean=data_mean)


In [None]:
# Data Generator
def data_generator(data, lookback, delay, min_index, max_index,shuffle=False, batch_size=128, step=6):
    """Endlessly yield ``(samples, targets)`` batches cut from ``data``.

    For every selected row ``r`` the sample is ``data[r - lookback:r:step]``
    (all features) and the target is feature column 1 of row ``r + delay``.

    Parameters
    ----------
    data : 2-D array, indexed as (timestep, feature).
    lookback : how many timesteps back each input window reaches.
    delay : how many timesteps ahead of the selected row the target lies.
    min_index, max_index : rows are drawn from
        ``[min_index + lookback, max_index)``; ``max_index=None`` means
        ``len(data) - delay - 1``.
    shuffle : draw rows at random (with replacement) instead of walking
        through them in order.
    batch_size : number of rows per batch.
    step : sampling period inside a window, in timesteps.

    Yields
    ------
    samples : array of shape ``(len(rows), lookback // step, n_features)``
    targets : array of shape ``(len(rows),)``
    """
    if max_index is None:
        max_index = len(data) - delay - 1

    first_row = min_index + lookback
    cursor = first_row

    while True:
        if shuffle:
            rows = np.random.randint(first_row, max_index, size=batch_size)
        else:
            # Start over once a full batch no longer fits before max_index.
            if cursor + batch_size >= max_index:
                cursor = first_row
            rows = np.arange(cursor, min(cursor + batch_size, max_index))
            cursor += len(rows)

        n_rows = len(rows)
        samples = np.zeros((n_rows, lookback // step, data.shape[-1]))
        targets = np.zeros((n_rows,))

        for slot, row in enumerate(rows):
            window = range(row - lookback, row, step)
            samples[slot] = data[window]
            targets[slot] = data[row + delay][1]

        yield samples, targets


# Training / validation / test generators

lookback = 720     # input window: 720 timesteps of 10 min = 5 days
delay = 144        # target lies 144 timesteps = 24 hours ahead
step = 6           # keep one observation per hour inside the window
batch_size = 128


train_gen = data_generator(data,lookback=lookback,delay=delay,min_index=0,max_index=None,shuffle=True,step=step,
                      batch_size=batch_size)

val_gen = data_generator(val_data,lookback=lookback,delay=delay,min_index=0,max_index=None,step=step,
                    batch_size=batch_size)

test_gen = data_generator(test_data,lookback=lookback,delay=delay,min_index=0,max_index=None,step=step,
                     batch_size=batch_size)

# `validation_steps` (and the baseline loop) count BATCHES drawn from the
# generator, not individual samples. The generator walks rows
# [lookback, len(data) - delay - 1) in chunks of `batch_size`, so one pass over
# a split is that row count divided by the batch size. Using the raw sample
# count here made every evaluation loop over the split ~batch_size times.
val_steps = (len(val_data) - delay - 1 - lookback) // batch_size
test_steps = (len(test_data) - delay - 1 - lookback) // batch_size

In [None]:
# Common-sense baseline, scored with MAE

def evaluate_naive_method():
    """Print the mean per-batch MAE of a "no change" forecast.

    The forecast for each sample is feature 1 at the last timestep of its
    input window. `val_steps` batches are pulled from the module-level
    `val_gen`, which advances that shared generator.
    """
    def next_batch_mae():
        samples, targets = next(val_gen)
        last_observed = samples[:, -1, 1]
        return np.mean(np.abs(last_observed - targets))

    print(np.mean([next_batch_mae() for _ in range(val_steps)]))

evaluate_naive_method()

In [None]:
# Densely connected baseline: flatten the (timesteps, features) window and
# regress a single value from it.
basic_model = Sequential([
    layers.Flatten(input_shape=(lookback // step, data.shape[-1])),
    layers.Dense(32, activation='relu'),
    layers.Dense(1),
])

basic_model.summary()

In [None]:
# Train with mean absolute error; 10 batches per epoch for a single epoch
# keeps this run short.
basic_model.compile(loss='mae', optimizer=RMSprop())

basic_model_fit = basic_model.fit_generator(
    train_gen,
    steps_per_epoch=10,
    epochs=1,
    validation_data=val_gen,
    validation_steps=val_steps,
)

In [None]:
# Per-epoch training and validation loss recorded by Keras during fitting.
loss, val_loss = (basic_model_fit.history[key] for key in ('loss', 'val_loss'))

# Epochs are numbered from 1 on the x axis.
epochs = range(1, 1 + len(loss))

plt.plot(epochs, loss, 'b-', label='Training Loss')
plt.plot(epochs, val_loss, 'r-', label='Validation Loss')
plt.title('Basic ML Model: Training and Validation Loss')
plt.legend()

In [None]:
# Recurrent baseline: a single GRU layer followed by a one-unit regressor.
# The time dimension is left as None, so any sequence length is accepted.

basic_recurrent_model = Sequential([
    layers.GRU(32, input_shape=(None, data.shape[-1])),
    layers.Dense(1),
])

basic_recurrent_model.summary()

In [None]:
# Same short training setup as the dense baseline (MAE loss, RMSprop).
basic_recurrent_model.compile(loss='mae', optimizer=RMSprop())

basic_recurrent_model_fit = basic_recurrent_model.fit_generator(
    train_gen,
    steps_per_epoch=10,
    epochs=1,
    validation_data=val_gen,
    validation_steps=val_steps,
)


In [None]:
# Per-epoch training and validation loss of the recurrent baseline.
loss, val_loss = (basic_recurrent_model_fit.history[key] for key in ('loss', 'val_loss'))

# Epochs are numbered from 1 on the x axis.
epochs = range(1, 1 + len(loss))

plt.plot(epochs, loss, 'b-', label='Training Loss')
plt.plot(epochs, val_loss, 'r-', label='Validation Loss')
plt.title('Basic Recurrent Model: Training and Validation Loss')
plt.legend()

### Recurrent Layer with dropout
- For regularization
- In recurrent layers, dropout has to be handled differently for input units and recurrent units, because the recurrent units have different representations at each timestep. Therefore, unless handled properly, the recurrent layer won't be able to learn properly
- Achieved in Keras through a separate, additional argument - `recurrent_dropout` ; which could be managed separately from `dropout` parameter that caters only to the input units of the layer

In [None]:
# Same GRU baseline, now regularised: `dropout` applies to the layer inputs,
# `recurrent_dropout` to the recurrent connections.
# Note: this rebinds `basic_recurrent_model` to a fresh, untrained model.
basic_recurrent_model = Sequential([
    layers.GRU(32,
               input_shape=(None, data.shape[-1]),
               dropout=0.2,
               recurrent_dropout=0.5),
    layers.Dense(1),
])

basic_recurrent_model.summary()

In [None]:
# Compile and briefly train the dropout-regularised GRU model.
basic_recurrent_model.compile(loss='mae', optimizer=RMSprop())

basic_recurrent_model_fit = basic_recurrent_model.fit_generator(
    train_gen,
    steps_per_epoch=10,
    epochs=1,
    validation_data=val_gen,
    validation_steps=val_steps,
)

In [None]:
# Per-epoch training and validation loss of the GRU model with dropout.
loss, val_loss = (basic_recurrent_model_fit.history[key] for key in ('loss', 'val_loss'))

# Epochs are numbered from 1 on the x axis.
epochs = range(1, 1 + len(loss))

plt.plot(epochs, loss, 'b-', label='Training Loss')
plt.plot(epochs, val_loss, 'r-', label='Validation Loss')
plt.title('Basic Recurrent Model (with Dropout): Training and Validation Loss')
plt.legend()

### Stacked Recurrent layers

- Stacking is computationally expensive
- However, could work in favour if the additional representations are useful
- One thing to ensure is that all the internal representation across each timestep of the first recurrent layer has to be returned so that it's available to the stacked recurrent layer on top of it
- Possible in Keras using the argument `return_sequences`
- It can be noted that in the stacked layer (second GRU), `relu` is used as `activation` and `input_shape` is not defined

In [None]:
# Two stacked GRU layers. The first returns its output at every timestep
# (return_sequences=True) so the second GRU receives a full sequence.
stacked_recurrent_model = Sequential([
    layers.GRU(32,
               input_shape=(None, data.shape[-1]),
               dropout=0.2,
               recurrent_dropout=0.5,
               return_sequences=True),
    layers.GRU(64, activation='relu', dropout=0.2, recurrent_dropout=0.5),
    layers.Dense(1),
])

stacked_recurrent_model.summary()

In [None]:
stacked_recurrent_model.compile(optimizer=RMSprop(),loss='mae')

# Train the stacked model itself (the earlier code called fit_generator on
# `basic_recurrent_model`) and keep the returned History in a separate `_fit`
# variable, as the other cells do, so the model object is not overwritten.
stacked_recurrent_model_fit = stacked_recurrent_model.fit_generator(train_gen, steps_per_epoch=10, epochs=1,
                                                                    validation_data=val_gen, validation_steps=val_steps)

### Bidirectional RNNs

In [None]:
bidirectional_gru_model = Sequential()

# The Bidirectional wrapper is the first layer of the Sequential model, so the
# input shape has to be declared on the wrapper; giving it to the wrapped GRU
# leaves the model itself without a known input shape.
bidirectional_gru_model.add(
    layers.Bidirectional(
        layers.GRU(32),
        input_shape=(None, data.shape[-1])
    ))
bidirectional_gru_model.add(layers.Dense(1))

In [None]:
# NOTE(review): this cell was originally flagged as raising an unresolved error.
# If it still fails, check that the model definition passes `input_shape` to the
# Bidirectional wrapper rather than to the wrapped GRU.

bidirectional_gru_model.compile(optimizer=RMSprop(),loss='mae')
# bidirectional_gru_model.summary()

# Store the History under its own name. Assigning it back to
# `bidirectional_gru_model` replaced the model with a History object, so
# re-running this cell (or using the model afterwards) could not work.
bidirectional_gru_model_fit = bidirectional_gru_model.fit_generator(train_gen, steps_per_epoch=10, epochs=1,
                                                                    validation_data=val_gen, validation_steps=val_steps)

### Things to try out
* There are many other things you could try, in order to improve performance on the temperature-forecasting problem:
    - Adjust the number of units in each recurrent layer in the stacked setup. The current choices are largely arbitrary and thus probably suboptimal.
    - Adjust the learning rate used by the RMSprop optimizer.
    - Try using LSTM layers instead of GRU layers.
    - Try using a bigger densely connected regressor on top of the recurrent layers: 
        that is, a bigger Dense layer or even a stack of Dense layers.
    - Don’t forget to eventually run the best-performing models (in terms of validation MAE ) on the test set! Otherwise, you’ll develop architectures that are overfitting to the validation set.

### Learnings

- it’s good to first establish common-sense baselines for your metric of choice. If you don’t have a baseline to beat, you can’t tell whether you’re making real progress
- Try simple models before expensive ones, to justify the additional expense. Sometimes a simple model will turn out to be your best option.
- When you have data where temporal ordering matters, recurrent networks are a great fit and easily outperform models that first flatten the temporal data.
- To use dropout with recurrent networks, you should use a time-constant dropout mask and recurrent dropout mask. These are built into Keras recurrent layers, so all you have to do is use the `dropout` and `recurrent_dropout` arguments of recurrent layers.
- Stacked RNNs provide more representational power than a single RNN layer. They’re also much more expensive and thus not always worth it. Although they offer clear gains on complex problems (such as machine translation), they may not always be relevant to smaller, simpler problems.
- Bidirectional RNNs, which look at a sequence both ways, are useful on natural-language processing problems. But they aren’t strong performers on sequence data where the recent past is much more informative than the beginning of the sequence.

In [None]:
# 1D convolution + pooling front end that shortens the sequence, followed by
# a GRU (with dropout) and a one-unit regressor.
rnn_1Dconvnet_combination_model = Sequential([
    layers.Conv1D(32, kernel_size=5, activation='relu', input_shape=(None, data.shape[-1])),
    layers.MaxPool1D(3),
    layers.Conv1D(32, 5, activation='relu'),
    layers.GRU(32, dropout=0.2, recurrent_dropout=0.5),
    layers.Dense(1),
])

rnn_1Dconvnet_combination_model.summary()

In [None]:
# Compile with MAE loss and train for two short epochs of 10 batches each.
rnn_1Dconvnet_combination_model.compile(loss='mae', optimizer=RMSprop())

rnn_1Dconvnet_combination_model_fit = rnn_1Dconvnet_combination_model.fit_generator(
    train_gen,
    steps_per_epoch=10,
    epochs=2,
    validation_data=val_gen,
    validation_steps=val_steps,
)

### Key take aways about 1D Convnets

- In the same way that 2D convnets perform well for processing visual patterns in 2D space, 1D convnets perform well for processing temporal patterns. They offer a faster alternative to RNNs on some problems, in particular natural-language processing tasks.
- Typically, 1D convnets are structured much like their 2D equivalents from the world of computer vision: they consist of stacks of `Conv1D` layers and `MaxPooling1D` layers, ending in a global pooling operation or flattening operation.
- Because RNNs are extremely expensive for processing very long sequences, but 1D convnets are cheap, it can be a good idea to use a 1D convnet as a preprocessing step before an RNN, shortening the sequence and extracting useful representations for the RNN to process.

## Sequence processing - Summary

* In this chapter, you learned the following techniques, which are widely applicable to any dataset of sequence data, from text to timeseries:
    * How to tokenize text
    * What word embeddings are, and how to use them
    * What recurrent networks are, and how to use them
    * How to stack RNN layers and use bidirectional RNNs to build more-powerful sequence-processing models
    * How to use 1D convnets for sequence processing
    * How to combine 1D convnets and RNNs to process long sequences
    
    
* You can use RNNs for timeseries regression (“predicting the future”), timeseries classification, anomaly detection in timeseries, and sequence labeling (such as identifying names or dates in sentences).


* Similarly, you can use 1D convnets for machine translation (sequence-to-sequence convolutional models, like SliceNet), document classification, and spelling correction.


* If global order matters in your sequence data, then it’s preferable to use a recurrent network to process it. This is typically the case for timeseries, where the recent past is likely to be more informative than the distant past.


* If global ordering isn’t fundamentally meaningful, then 1D convnets will turn out to work at least as well and are cheaper. This is often the case for text data, where a keyword found at the beginning of a sentence is just as meaningful as a keyword found at the end.