In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from tensorflow import keras
from tensorflow.keras import layers

print("pandas version: " + pd.__version__)
print("numpy version: " + np.__version__)
print("tensorflow version: " + tf.__version__)

pandas version: 1.2.4
numpy version: 1.18.5
tensorflow version: 2.3.1


In [2]:
combined_df = pd.read_csv("../data/clean/station_trips_weather.csv", parse_dates=["datetime"])
combined_df.set_index("datetime", inplace=True)
combined_df["station_name"] = combined_df["station_name"].astype("category")

In [3]:
combined_df.head(3)

Unnamed: 0_level_0,station_name,check_ins,check_outs,diff,precip_intensity,precip_probability,precip_type,temperature,humidity,wind_speed,wind_bearing,uv_index,visibility
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2014-01-01,Allende-Platz/Grindelhof,0,5,-5,0.0,0.0,none,1.93,0.81,3.91,161.0,0.0,9.988
2014-01-01,Alsenstraße/Düppelstraße,3,0,3,0.0,0.0,none,1.93,0.81,3.91,161.0,0.0,9.988
2014-01-01,Alsterdorf Markt/Evangelische Stiftung,0,0,0,0.0,0.0,none,1.93,0.81,3.91,161.0,0.0,9.988


In [4]:
combined_df.tail(3)

Unnamed: 0_level_0,station_name,check_ins,check_outs,diff,precip_intensity,precip_probability,precip_type,temperature,humidity,wind_speed,wind_bearing,uv_index,visibility
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-05-15 11:00:00,Winterhuder Weg/ Zimmerstraße,2,4,-2,0.0,0.0,none,14.02,0.71,3.49,276.0,4.0,10.003
2017-05-15 11:00:00,Zentralbibliothek / Münzstraße,0,2,-2,0.0,0.0,none,14.02,0.71,3.49,276.0,4.0,10.003
2017-05-15 11:00:00,Überseering/Mexikoring,1,1,0,0.0,0.0,none,14.02,0.71,3.49,276.0,4.0,10.003


In [16]:
sample_df = combined_df[["check_ins", "temperature", "humidity"]].resample("H").mean()
sample_df.head(3)

Unnamed: 0_level_0,check_ins,temperature,humidity
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-01-01 00:00:00,0.254808,1.93,0.81
2014-01-01 01:00:00,0.490385,1.94,0.85
2014-01-01 02:00:00,0.480769,1.81,0.85


In [17]:
sample_df_scaled = sample_df.copy()
sample_df_scaled["temperature"] = MinMaxScaler().fit_transform(sample_df_scaled["temperature"].values.reshape(-1, 1))
sample_df_scaled["humidity"] = MinMaxScaler().fit_transform(sample_df_scaled["humidity"].values.reshape(-1, 1))
sample_df_scaled.head(3)

Unnamed: 0_level_0,check_ins,temperature,humidity
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-01-01 00:00:00,0.254808,0.288866,0.753247
2014-01-01 01:00:00,0.490385,0.289072,0.805195
2014-01-01 02:00:00,0.480769,0.286392,0.805195


In [5]:
combined_df_mean = combined_df.resample("H").mean()
simple_df_mean = combined_df_mean[["check_ins"]].copy()

In [6]:
# Define percentage of training, testing and validation data sets

sequence_length = 10
valid_set_size_percentage = 10
test_set_size_percentage = 10
train_set_size_percentage = 80

In [82]:
# Create every possible sequence with the defined sequence length

as_numpy = simple_df_mean.to_numpy()
data = []

for index in range(len(as_numpy) - sequence_length):
    data.append(as_numpy[index: index + sequence_length])

data_size = len(data)
data = np.array(data)

In [83]:
# Calculate size of training, testing and validation data sets

valid_set_size = int(np.round(valid_set_size_percentage / 100 * data_size))
test_set_size = int(np.round(test_set_size_percentage / 100 * data_size))
train_set_size = data_size - (valid_set_size + test_set_size)

In [84]:
# Create actual training, testing and validation data sets

x_train = data[:train_set_size,:-1,:]
y_train = data[:train_set_size,-1,:]

x_valid = data[train_set_size:train_set_size+valid_set_size,:-1,:]
y_valid = data[train_set_size:train_set_size+valid_set_size,-1,:]

x_test = data[train_set_size+valid_set_size:,:-1,:]
y_test = data[train_set_size+valid_set_size:,-1,:]

In [73]:
# Normalise the data so everything is on the same scale

normalised_data = StandardScaler().fit_transform(data.reshape(-1,1))

In [75]:
normalised_data.shape

(295220, 1)

In [85]:
steps = sequence_length - 1
inputs = 4 
neurons = 200 
outputs = 4
layers = 2
learning_rate = 0.001
batch_size = 50
epochs = 100 
train_set_size = len(x_train)
test_set_size = len(x_test)

In [80]:
tf.keras.layers.RNN(
    cell, return_sequences=False, return_state=False, go_backwards=False,
    stateful=False, unroll=False, time_major=False, **kwargs
)
tf.keras.layers.LSTM(
    units, activation='tanh', recurrent_activation='sigmoid',
    use_bias=True, kernel_initializer='glorot_uniform',
    recurrent_initializer='orthogonal',
    bias_initializer='zeros', unit_forget_bias=True,
    kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None,
    activity_regularizer=None, kernel_constraint=None, recurrent_constraint=None,
    bias_constraint=None, dropout=0.0, recurrent_dropout=0.0,
    return_sequences=False, return_state=False, go_backwards=False, stateful=False,
    time_major=False, unroll=False, **kwargs
)

NameError: name 'cell' is not defined