In [1]:
import sys
sys.path.append('..')

import tensorflow as tf
import pandas as pd
import numpy as np

from geohash2 import decode_exactly
from src.data.prepare import prepare_data
from src.features.create_features import create_features
from src.models.loss import calc_rmse

# Turn off pandas' chained-assignment check for the whole session
# (None = neither warn nor raise).
pd.set_option("mode.chained_assignment", None)

from src.models.model import Model
from src.data.dataset import create_dataset

In [2]:
# Seed NumPy's global (legacy) RNG so NumPy-driven randomness repeats across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# 1. Data

In [3]:
# Load the raw training data; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/training.csv")

In [4]:
# Preview the first rows of the frame exactly as read from the CSV.
df.head()

Unnamed: 0,geohash6,day,timestamp,demand
0,qp03wc,18,20:0,0.020072
1,qp03pn,10,14:30,0.024721
2,qp09sw,9,6:15,0.102821
3,qp0991,32,5:0,0.088755
4,qp090q,15,4:0,0.074468


In [5]:
# Run the project's preparation step (src.data.prepare.prepare_data).
# NOTE(review): judging by the previews before/after this cell, it appears to
# replace `timestamp` with `hour`/`minute` and add `lat`/`long` — confirm in
# src/data/prepare.py.
# `df` is rebound here, so re-running this cell alone feeds the already
# prepared frame back in; re-run from the read_csv cell instead.
df = prepare_data(df)

In [6]:
# Preview the frame returned by prepare_data.
df.head()

Unnamed: 0,geohash6,lat,long,day,hour,minute,demand
0,qp03wc,-5.353088,90.653687,1,0,0,0.054858
1,qp03wc,-5.353088,90.653687,1,0,15,0.086209
2,qp03wc,-5.353088,90.653687,1,0,30,0.050739
3,qp03wc,-5.353088,90.653687,1,0,45,0.075174
4,qp03wc,-5.353088,90.653687,1,1,0,0.062867


# 2. Feature Engineering

In [7]:
# Run the project's feature step (src.features.create_features.create_features).
# NOTE(review): the preview after this cell shows new `recur_day`, `time_bin`,
# `recur_day_1..7` and `time_bin_*` columns — confirm their exact definitions in
# src/features/create_features.py.
# `df` is rebound here, so re-running this cell alone feeds the already
# featurised frame back in.
df = create_features(df)

In [8]:
# Preview the frame returned by create_features.
df.head()

Unnamed: 0,geohash6,lat,long,day,hour,minute,demand,recur_day,time_bin,recur_day_1,recur_day_2,recur_day_3,recur_day_4,recur_day_5,recur_day_6,recur_day_7,time_bin_morning,time_bin_afternoon,time_bin_evening,time_bin_night
0,qp03wc,-5.353088,90.653687,1,0,0,0.054858,1,night,1,0,0,0,0,0,0,0,0,0,1
1,qp03wc,-5.353088,90.653687,1,0,15,0.086209,1,night,1,0,0,0,0,0,0,0,0,0,1
2,qp03wc,-5.353088,90.653687,1,0,30,0.050739,1,night,1,0,0,0,0,0,0,0,0,0,1
3,qp03wc,-5.353088,90.653687,1,0,45,0.075174,1,night,1,0,0,0,0,0,0,0,0,0,1
4,qp03wc,-5.353088,90.653687,1,1,0,0.062867,1,night,1,0,0,0,0,0,0,0,0,0,1


# 3. Baseline

In [9]:
# Day 61 is held out for validating the baseline; every earlier day is used to fit it.
BASELINE_VAL_DAY = 61

is_before_val_day = df["day"].lt(BASELINE_VAL_DAY)
is_val_day = df["day"].eq(BASELINE_VAL_DAY)

baseline_train = df.loc[is_before_val_day]
baseline_val = df.loc[is_val_day]

In [10]:
# The baseline "model" is a lookup table: the median demand observed in the
# training window for each (geohash6, recur_day, hour, minute) combination.
baseline_group_cols = ["geohash6", "recur_day", "hour", "minute"]
baseline_groups = baseline_train.groupby(baseline_group_cols)
baseline_model = baseline_groups["demand"].median()

In [11]:
# Ground truth for the held-out day: one demand value per validation row.
baseline_actual = baseline_val["demand"].to_numpy()

# One lookup key per validation row, in validation-row order. The columns must
# match the groupby keys used to build `baseline_model`.
baseline_lookup = pd.MultiIndex.from_frame(
    baseline_val[["geohash6", "recur_day", "hour", "minute"]]
)

# Check for keys the training window never produced *before* looking them up.
# `.loc[list_of_tuples]` handles this inconsistently across pandas versions
# (a bare KeyError on recent releases, silent NaN on older ones); an explicit
# check gives the same failure everywhere, with a usable message.
unseen_keys = ~baseline_lookup.isin(baseline_model.index)
if unseen_keys.any():
    raise KeyError(
        "{} validation rows have a (geohash6, recur_day, hour, minute) key "
        "that is absent from the baseline training window".format(int(unseen_keys.sum()))
    )

# `reindex` returns exactly one median per requested key, preserving the order
# of `baseline_lookup`, so predictions line up row-for-row with `baseline_actual`.
baseline_pred = baseline_model.reindex(baseline_lookup).to_numpy()

In [12]:
# Score the baseline on the held-out day with the project's calc_rmse
# (presumably root-mean-squared error — confirm in src/models/loss.py).
calc_rmse(baseline_actual, baseline_pred)

0.04544460790419308

# 4. Model

In [13]:
# Instantiate the project's Model (src.models.model) with its default constructor arguments.
model = Model()

In [14]:
# Build the training dataset from the featurised frame via the project's create_dataset.
# NOTE(review): the full `df` is passed in, including day 61 that the baseline
# holds out for validation — confirm create_dataset performs its own split
# before comparing model loss against the baseline RMSE.
dataset = create_dataset(df, epochs=10)

In [15]:
# Train the model on the dataset built above.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, i.e. the
# run was stopped by hand; `model` in the saved notebook state is therefore only
# partially trained. The "(32, 1) (32, 1)" lines in the output look like leftover
# debug prints from the training code — TODO confirm and remove at the source.
model.train(dataset)

(32, 1) (32, 1)
(32, 1) (32, 1)
Iter 1 Batch 0 Loss 0.019743815064430237
(32, 1) (32, 1)
(32, 1) (32, 1)
Iter 1 Batch 100 Loss 0.001121237175539136
(32, 1) (32, 1)
(32, 1) (32, 1)
Iter 1 Batch 200 Loss 1.6255551599897444e-05
(32, 1) (32, 1)


KeyboardInterrupt: 