# Modeling with skflow

### Imports and options

In [None]:
from IPython.core.display import HTML


def _read_stylesheet(path):
    """Return the text of the stylesheet at *path*, closing the file afterwards."""
    with open(path) as stylesheet:
        return stylesheet.read()


# Concatenate both stylesheets (table rules first) and inject them into the
# notebook. HTML(...) stays the last expression so the cell renders it.
css = _read_stylesheet('style-table.css') + _read_stylesheet('style-notebook.css')
HTML('<style>{}</style>'.format(css))

In [None]:
import h5py
import dask
# from dask import array as da
from dask import dataframe as dd
# from dask import delayed
from dask.multiprocessing import get
import pandas as pd
import pathlib2 as pl
import mmh3  # The hash function used to hash sites. See the preprocessor script.

In [None]:
# Notebook-wide pandas display settings, applied in the order listed.
for option_name, option_value in (
    ('display.max_colwidth', 100),
    ('display.max_rows', 200),
    ('display.max_columns', 6),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# dask.set_options(get=get);  # Due to a bug we can't read files in different processes so set this option after reading.

### Loading data into dask dataframes

Our preprocessor can write its output either as `numpy` arrays or as `pandas` `DataFrame`s; `skflow` supports the latter.

In [None]:
# Chunk size handed to the (currently disabled) dask HDF reader.
CHUNK_SIZE = 10 ** 5

# Glob pattern covering every HDF file in the preprocessed-data directory.
DF_DIR = pl.Path('/Volumes/CompanionEx/Data/dfs_pandas/*.hdf')

# Show the pattern as a plain string.
str(DF_DIR)

In [None]:
# The dask reader over every file matched by DF_DIR is disabled below; instead a
# single preprocessed HDF file is read with pandas.
# NOTE(review): `data` is therefore a pandas object here, so cells that rely on
# dask-only behaviour (lazy loading, partitions) need the dask reader re-enabled.
# data = dd.read_hdf(str(DF_DIR), key='dataset', chunksize=CHUNK_SIZE)
data = pd.read_hdf('/Volumes/CompanionEx/Data/dfs_pandas/PP_TS_2016-01-01-06_2016-01-01-13.hdf')
# Preview the first rows.
data.head()

We can apply a query at this stage to limit the dataset.

In [None]:
# Build a query expression that selects a single site by its hash. The hash is
# the last element of what `mmh3.hash64` returns for the site name.
_site_hash_value = mmh3.hash64('rws01_monibas_0010vwa0056ra')[-1]
QUERY = 'site_hash == %i' % _site_hash_value
QUERY

In [None]:
# The site filter is currently disabled, so the full `data` object is displayed.
# Uncomment the next line to restrict `data` to the rows matching QUERY.
# data = data.query(QUERY)
data

#### Split into train, test and validation sets

In [None]:
# Call the method: a bare `data.count` only displays the bound method object.
# `count()` reports the number of non-missing values per column.
data.count()

In [None]:
# Columns used as model inputs.
feature_columns = [
    'site_hash',
    'timestamp_start',
    'precipitation mm/h',
    'temperature C',
    'windspeed m/s',
]
features = data[feature_columns]

In [None]:
# Preview the first rows of the selected input columns.
features.head()

Note that `site_hash` is derived from the `site` column: it is the last component of the value returned by `mmh3.hash64` for the site name:

In [None]:
# Example: the hash of one site name, taking the last element of the hash64 result.
mmh3.hash64('rws01_monibas_0010vwa0056ra')[-1]

In [None]:
# `npartitions` exists only on dask DataFrames. A pandas DataFrame (as returned
# by `pd.read_hdf`) is a single in-memory table, so report 1 in that case
# instead of raising AttributeError.
getattr(features, 'npartitions', 1)

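As you can see, we lazily loaded the entire dataset. It has been distributed over the number of partitions shown above. (This applies when the data is read with `dask`; a `pandas` `DataFrame` is loaded eagerly and has no partitions.)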

In [None]:
# Columns the models are trained to predict.
target_columns = ['trafficspeed km/h', 'trafficflow counts/h']
target = data[target_columns]

# Preview the first rows of the targets.
target.head()

### Modeling

In [None]:
import tensorflow.contrib.learn as skflow

Note that we can use `scikit-learn` objects with `skflow`.

In [None]:
from sklearn import metrics

#### Baseline

We start with a simple fully connected NN with three hidden layers, trained on a limited set of highways.

In [None]:
# Baseline regressor: three hidden layers of 20, 40 and 20 units, trained with
# Adagrad for 10 steps on batches of 500 rows, without dropout.
dnn_reg = skflow.TensorFlowDNNRegressor(
    hidden_units=[20, 40, 20],
    batch_size=500,
    steps=10,
    learning_rate=0.1,
    dropout=None,
    optimizer='Adagrad',
    continue_training=False,
    verbose=1,
)

Note that we can pass the `dask` `DataFrame` directly to the `fit` function:

In [None]:
# Train the baseline on all selected input columns, using only the traffic
# speed column as the target; `logdir` is set to '../tf_logs/baseline/'.
# NOTE(review): no train/test split has been made at this point, so the model
# is fitted on the full `features` table.
dnn_reg.fit(features, target['trafficspeed km/h'], logdir='../tf_logs/baseline/')

In [None]:
# `accuracy_score` is a classification metric and cannot be called without
# (y_true, y_pred). For the speed regressor, report the mean squared error.
# NOTE: this is measured on the same data the model was fitted on, so it is a
# training error, not a held-out score.
baseline_predictions = dnn_reg.predict(features)
metrics.mean_squared_error(target['trafficspeed km/h'], baseline_predictions)

#### A simple RNN.

In [None]:
# Single-layer, unidirectional GRU classifier over 15 classes, trained with
# Adam (learning rate 0.01) for 1000 steps; `continue_training=True` is set.
# NOTE(review): `EMBEDDING_SIZE` and `input_op_fn` are not defined in the part
# of the notebook visible here -- confirm they are defined before this cell runs.
classifier = skflow.TensorFlowRNNClassifier(rnn_size=EMBEDDING_SIZE, 
    n_classes=15, cell_type='gru', input_op_fn=input_op_fn,
    num_layers=1, bidirectional=False, sequence_length=None,
    steps=1000, optimizer='Adam', learning_rate=0.01, continue_training=True)