# Example 4: Prepare a dataset and train an AutoEncoder model

In this example, we will learn how to create a dataset containing acceleration data for training a simple AutoEncoder model for anomaly detection.  

In [1]:
#!/usr/bin/env python3

import os

## Get and check data list

In [2]:
from dwtk.db import V3DBHandler as DBHandler

# Root directory of the database; the metadata DB file (dwtk.db) sits directly inside it.
# Defined once so the two paths handed to the handler cannot drift apart.
base_dir_path = '/data_pool_1/small_DrivingBehaviorDatabase'

# read_on_init=False is passed here; records are fetched by the explicit read() call below.
db_handler = DBHandler(
    db_class='meta',
    db_host=os.path.join(base_dir_path, 'dwtk.db'),
    base_dir_path=base_dir_path,
    read_on_init=False
)

# Filter: tags must mention acceleration, must not mention gps,
# and the record path must not mention lidar.
db_handler.read(where=
                'tags like "%acceleration%" '
                'and tags not like "%gps%" '
                'and path not like "%lidar%"'
               )

# Report how many records matched and list their IDs.
print('# of metadata: {}'.format(len(db_handler)))
for meta in db_handler:
    print("record_id: {}".format(meta["record_id"]))

# of metadata: 4
record_id: 016_00000000030000000215
record_id: 016_00000000030000000240
record_id: 253_16000000080000000747
record_id: W07_17000000020000001255


For now, we collect acceleration data from only 4 trips, since we just want to walk through the workflow.

## Make Dataset

In [3]:
from dwtk.io import BaseFileReader
from dwtk.preprocesses.downsampling import Downsample
import numpy as np
from tqdm.notebook import tqdm

# Output layout: features go under dump/down, indexed by a feats.scp file.
dump_dir = "dump"
data_dir = os.path.join(dump_dir, "down")
feats_scp = os.path.join(data_dir, "feats.scp")
os.makedirs(data_dir, exist_ok=True)

# File reader with a single preprocess attached: Downsample with target_frame_rate=1.
reader = BaseFileReader()
reader.add_preprocess(
    Downsample(target_frame_rate=1)
)

# Save one float32 .npy file per record and list them in feats.scp,
# one "<record_id> <path>" entry per line.
# The context manager closes the index file even if reading or saving a record fails.
dim = None
with open(feats_scp, mode='w') as f:
    for meta in tqdm(db_handler):
        timestamps, data, columns = reader.read(meta)
        feats_path = os.path.join(data_dir, meta["record_id"] + ".npy")
        np.save(feats_path, data.astype(np.float32))
        f.write("{0} {1}\n".format(meta["record_id"], feats_path))
        # Size of the second axis of the array just saved; the value from the
        # last record is the one that remains after the loop.
        dim = data.shape[1]

# Fail with a clear message (instead of a NameError) when the query matched no records.
if dim is None:
    raise RuntimeError(
        "No records were read from db_handler; cannot determine the data dimension."
    )

print("Data dimemsion: {}".format(dim))

Failed to load Python extension for LZ4 support. LZ4 compression will not be available.


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Data dimemsion: 3


## Train anomaly-detection model
Install the [nadt](https://github.com/TakedaLab/neural-anomaly-detection-toolkit) library to use the anomaly-detection model.

In [None]:
# pip needs the "git+" prefix to treat a repository URL as a VCS source;
# %pip installs into the environment of the running kernel.
%pip install git+https://github.com/TakedaLab/neural-anomaly-detection-toolkit.git

### Model and Training settings

In [4]:
# Path setting: everything produced by training is placed under exp/
exp_dir = "exp"
checkpoint_dir = os.path.join(exp_dir, "checkpoint")
log_dir = os.path.join(exp_dir, "lightning_logs")
os.makedirs(exp_dir, exist_ok=True)

# Keep the path to "last.ckpt" only when that file already exists; otherwise use None.
checkpoint_file = os.path.join(checkpoint_dir, "last.ckpt")
if not os.path.isfile(checkpoint_file):
    checkpoint_file = None

# Model setting: algorithm name plus the hyperparameters handed to it
algorithm = "AutoEncoder"
algorithm_params = dict(
    num_features=dim,  # `dim` comes from the dataset-preparation cell
    sequence_length=128,
    num_hidden_layers=4,
    num_hidden_units=128,
    num_latent_units=8,
    dropout=0.1,
)
# Training setting: optimizer name, its arguments, and training-loop parameters
optimizer = "Adam"
optimizer_params = dict(lr=0.001)
training_params = dict(
    num_epochs=10,
    batch_size=512,
    saving_period=1,
)

### Dataset settings

In [5]:
from nadt.datasets.dataset import DataSet
from torch.utils.data import DataLoader

# Dataset built from the feats.scp index, using the model's sequence_length;
# annotation=False is passed.
algorithm_dataset = DataSet(
    annotation=False,
    sequence_length=algorithm_params["sequence_length"],
    feats_scp=feats_scp,
)

# Shuffled loader with 4 worker processes and the configured batch size.
algorithm_dataloader = DataLoader(
    dataset=algorithm_dataset,
    batch_size=training_params["batch_size"],
    num_workers=4,
    shuffle=True,
)

### Set and train the model

In [6]:
from nadt.algorithms.wrapper import Algorithm
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.logging import TensorBoardLogger

# Set model: pass the algorithm choice and the optimizer configuration to the wrapper
algorithm_model = Algorithm(
    optimizer=optimizer,
    optimizer_params=optimizer_params,
    algorithm=algorithm,
    algorithm_params=algorithm_params,
)

# Set trainer
# Checkpoint callback: "loss" is the monitored quantity and save_last=True is requested
# (the settings cell looks for "last.ckpt" in this directory when resuming).
checkpoint_callback = ModelCheckpoint(
    filepath="/".join((checkpoint_dir, "{epoch}_{loss:.2f}")),
    monitor="loss",
    period=training_params["saving_period"],
    save_last=True,
)

# TensorBoard logger writing under log_dir with the version label "latest".
logger = TensorBoardLogger(version="latest", save_dir=log_dir)

gpus = None  # no GPU count is given explicitly
algorithm_trainer = pl.Trainer(
    max_epochs=training_params["num_epochs"],
    resume_from_checkpoint=checkpoint_file,  # None unless a previous "last.ckpt" was found
    checkpoint_callback=checkpoint_callback,
    logger=logger,
    gpus=gpus,
    auto_select_gpus=True,
)

# Train the model
algorithm_trainer.fit(algorithm_model, algorithm_dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name      | Type        | Params
------------------------------------------
0 | algorithm | AutoEncoder | 202 K 


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




1