<a href="https://colab.research.google.com/github/capitaineoblivious/Grab_AIforSEA/blob/master/Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install python-geohash
#!pip install pandas
#!pip install tensorflow
#!pip install numpy
#!pip install matplotlib

Collecting python-geohash
  Downloading https://files.pythonhosted.org/packages/9c/e2/1a3507af7c8f91f8a4975d651d4aeb6a846dfdf74713954186ade4205850/python-geohash-0.8.5.tar.gz
Building wheels for collected packages: python-geohash
  Building wheel for python-geohash (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/b5/64/5a/6a286481fc7c2a698d2f297d4c90af19946be430b23eba9a33
Successfully built python-geohash
Installing collected packages: python-geohash
Successfully installed python-geohash-0.8.5


In [2]:
import sys
import os

import pandas as pd
import numpy as np
import random

import geohash

import folium
from folium import plugins
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import tensorflow as tf
tf.enable_eager_execution()

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print('GPU not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


Please change the link below to use your data

In [3]:
dataset_link = 'https://s3-ap-southeast-1.amazonaws.com/grab-aiforsea-dataset/traffic-management.zip'
df = pd.read_csv(dataset_link, compression='zip', header=0, sep=',', quotechar='"')
df.head()

Unnamed: 0,geohash6,day,timestamp,demand
0,qp03wc,18,20:0,0.020072
1,qp03pn,10,14:30,0.024721
2,qp09sw,9,6:15,0.102821
3,qp0991,32,5:0,0.088755
4,qp090q,15,4:0,0.074468


We recover the scaling values from the set of geohashes used for the training process

In [0]:
std_lat=0.07292679591440783 
mean_lat=-5.361328125  
std_long=0.11412921417661685
mean_long=90.780029296875

geohashes = df.geohash6.unique().tolist()

In [5]:
max_d = int(max(df.day))
all_ts = []
for d in range(1, max_d +1):
    for h in range(0, 24):
        for m in [0,15,30,45]:
            ts = str(d).zfill(2) + '-' + str(h).zfill(2) + ':' + str(m).zfill(2)
            all_ts.append(ts)

print('The first timestamp of the data set is:', all_ts[0])
print('The last timestamp of the data set is:', all_ts[-1])
print('There are %d timestamps in the dataset' %(len(all_ts)))

The first timestamp of the data set is: 01-00:00
The last timestamp of the data set is: 61-23:45
There are 5856 timestamps in the dataset


In [0]:
def formatted(string):
    hour, minute = string.split(':')
    return hour.zfill(2) + ":" + minute.zfill(2)

def hour(string):
    hour, minute = string.split(':')
    return int(hour)

df['hour'] = df['timestamp'].apply(hour)    
df['timestamp'] = df.day.apply(lambda x: str(x).zfill(2)) + '-' + df['timestamp'].apply(formatted)

In [7]:
data = df.pivot(index='timestamp', columns='geohash6', values='demand')
data = data.reindex(all_ts).fillna(0)

Tmax = 14 * 24 * 4 # maximum 14 days input
if len(data.index) > Tmax:
    data = data.iloc[-Tmax:]
data.shape

(1344, 1329)

We build the model inputs from the data

In [8]:
def generate_input(data):
    
    Tx, m = data.shape
    X = np.zeros((m, Tx, 29)) # 29 features : demand, one hot vector of hours (24), long, lat, long2, lat2

    count = 0
    for ghash in data.columns:
        sample = data[ghash]
        ts = sample.iloc[:-5].index
        sample = sample.iloc[:-5].values

        lat, long = geohash.decode(ghash)
        lat = (lat - mean_lat) / std_lat
        long = (long - mean_long) / std_long

        matrix = np.zeros((Tx, 29))
        for j, s in enumerate(ts):
            hour = int(s.split('-')[1].split(':')[0])
            matrix[j, 0] = sample[j]
            matrix[j, hour+1] = 1
            matrix[j, 25:] = np.array([lat, long, lat**2, long**2])
        X[count] = matrix
        count +=1
    return X

A = generate_input(data)
A.shape

(1329, 1344, 29)

We load the pretrained model

In [9]:
if tf.test.is_gpu_available():
  print(' GPU acceleration available')
  rnn = tf.keras.layers.CuDNNLSTM
else:
  print(' NO GPU detected')
  rnn = tf.keras.layers.LSTM

# define model where LSTM is also output layer

def build_model(batch_size=16):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.BatchNormalization(input_shape=(None, 29),batch_size=batch_size))
    model.add(rnn(256, return_sequences=True, stateful=True))
    model.add(rnn(128, return_sequences=True, stateful=True))
    model.add(rnn(64, return_sequences=True, stateful=True))
    model.add(rnn(64, return_sequences=True, stateful=True))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    return model

model = build_model(batch_size=1)
opt = tf.keras.optimizers.Adam(lr=0.015, beta_1=0.9, beta_2=0.999, decay=0.005)
model.compile(optimizer=opt, loss='mean_squared_error')

 GPU acceleration available


In [15]:
! git clone https://github.com/capitaineoblivious/Grab_AIforSEA

Cloning into 'Grab_AIforSEA'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 9 (delta 1), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (9/9), done.


In [17]:
!ls Grab_AIforSEA/

my_model.h5  README.md


In [18]:
model.load_weights('Grab_AIforSEA/my_model.h5')
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization (BatchNo (1, None, 29)             116       
_________________________________________________________________
cu_dnnlstm (CuDNNLSTM)       (1, None, 256)            293888    
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (1, None, 128)            197632    
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (1, None, 64)             49664     
_________________________________________________________________
cu_dnnlstm_3 (CuDNNLSTM)     (1, None, 64)             33280     
_________________________________________________________________
dense (Dense)                (1, None, 1)              65        
Total params: 574,645
Trainable params: 574,587
Non-trainable params: 58
_________________________________________________

We define the function that will predict the demand at time T+1 to T+5, given the history up to time T

In [0]:
def make_predictions(input, model2):
    seq = input.copy()
    a, b, c = seq.shape
    pred = np.zeros((a, b))
    out = None
    
    model2.reset_states()

    for i in range(b):
        val = seq[:,i,:].reshape((a,1,c))
        if i >= b-5:
            val[:,0,0] = out
        
        pred[:,i] = val[:,0,0]
        out = model2.predict(val)
        out = tf.squeeze(out).numpy()

    return pred

The prediction phase takes in the data from 0 to T-5 and compares the predictions from T-5 to T with the expected values from T-5 to T
We print the RMSE below. The whole computation might take a while depending on the size of the data

In [0]:
X = generate_input(data)
a, b, c = X.shape
predictions_dic = {}
error_dic = {}
pred = make_predictions(X[:,:,:], model)

In [34]:
forecast = pred[:,-5:]
target = X[:,-5:,0]

rmse = np.mean(np.sqrt((forecast-target)**2))

print("The RMSE is : %.4f" %(rmse))

The RMSE is : 0.0246


In [0]:
rmse