In [1]:
import sys
import os
import toml
import glob
import logging

import torch
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

sys.path.append(os.path.join(sys.path[0], '../..'))

import data.kcost_dataset as EndureData
from model.losses import MSLELoss

In [2]:
config = toml.load('../../endure.toml')
config['model']['dir'] = 'delete'

In [16]:
class KCostModel(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.params = config['model']

        self.embedding = torch.nn.Embedding(
                num_embeddings=config['lsm']['size_ratio']['max'],
                embedding_dim=self.params['embedding_size'],
                max_norm=True)
        embedding_output = (self.params['num_cate_vars'] *
                            self.params['embedding_size'])
        num_feat = self.params['num_cont_vars'] + embedding_output

        modules = []
        for _ in range(self.params['hidden_layers']):
            modules.append(torch.nn.Linear(num_feat, num_feat))
            torch.nn.init.xavier_normal_(modules[-1].weight)
            modules.append(torch.nn.ReLU())
        modules.append(torch.nn.Linear(num_feat, self.params['out_dims']))
        torch.nn.init.xavier_normal_(modules[-1].weight)
        modules.append(torch.nn.ReLU())

        self.cost_layer = torch.nn.Sequential(*modules)

    def forward(self, x):
        print(f'input {x.shape}')
        cate_inputs = x[:, self.params['num_cont_vars']:]
        print(f'cate_inputs {cate_inputs.shape}')
        out = self.embedding(cate_inputs.to(torch.int32))
        print(f'after embedding: {out.shape}')
        out = torch.flatten(out, start_dim=1)
        print(f'after flatten: {out.shape}')
        print(f'to concat with {x[:, :self.params["num_cont_vars"]].shape}')
        out = torch.cat((x[:, :self.params['num_cont_vars']], out), -1)
        out = self.cost_layer(out)

        return out

In [17]:
class Trainer:
    def __init__(self, config, model, optimizer, loss_fn, train_data,
                 test_data):
        self.config = config
        self.log = logging.getLogger(self.config['log']['name'])
        self.model = model
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.train_data = train_data
        self.test_data = test_data
        self.train_len = self.test_len = 0
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.log.info(f'Training on device: {self.device}')
        self.model = self.model.to(self.device)
        self.loss_fn = self.loss_fn.to(self.device)

    def _train_step(self, label, features):
        label = label.to(self.device)
        features = features.to(self.device)
        print(f'{label.device=}')
        pred = self.model(features)
        loss = self.loss_fn(pred, label)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss

    def _train_loop(self):
        self.model.train()
        total_loss = 0
        for batch, (labels, features) in enumerate(self.train_data):
            loss = self._train_step(labels, features)
            total_loss += loss
        if self.train_len == 0:
            self.train_len = batch + 1

        return total_loss.item() / self.train_len

    def _test_step(self, labels, features):
        with torch.no_grad():
            labels = labels.to(self.device)
            features = features.to(self.device)
            pred = self.model(features)
            test_loss = self.loss_fn(pred, labels).item()

        return test_loss

    def _test_loop(self):
        self.model.eval()
        test_loss = 0
        for batch, (labels, features) in enumerate(self.test_data):
            test_loss += self._test_step(labels, features)

        if self.test_len == 0:
            self.test_len = batch + 1  # Last batch will correspond to total
        test_loss /= (batch + 1)

        return test_loss

    def _dumpconfig(self, save_dir):
        with open(os.path.join(save_dir, 'config.toml'), 'w') as fid:
            toml.dump(self.config, fid)

    def _checkpoint(self, save_dir, epoch, loss):
        save_pt = {'epoch': epoch,
                   'model_state_dict': self.model.state_dict(),
                   'optimizer_state_dict': self.optimizer.state_dict(),
                   'loss': loss}
        torch.save(save_pt, os.path.join(save_dir, 'checkpoint.pt'))

    def _save_model(self, save_dir):
        torch.save(self.model.state_dict(),
                   os.path.join(save_dir, 'kcost_min.model'))

    def run(self):
        early_stop_num = self.config['train']['early_stop_num']
        epsilon = self.config['train']['epsilon']
        max_epochs = self.config['train']['max_epochs']
        save_dir = os.path.join(self.config['io']['data_dir'],
                                self.config['model']['dir'])

        os.makedirs(save_dir, exist_ok=True)
        self._dumpconfig(save_dir)

        df = []
        prev_loss = float('inf')
        loss_gradients_up = 0
        loss_min = float('inf')
        for epoch in range(max_epochs):
            train_loss = self._train_loop()
            curr_loss = self._test_loop()
            self._checkpoint(save_dir, epoch, curr_loss)

            if curr_loss < loss_min:
                loss_min = curr_loss
                self._save_model(save_dir)
            df.append({
                'epoch': epoch,
                'train_loss': train_loss,
                'test_loss': curr_loss})
            pd.DataFrame(df).to_csv(
                os.path.join(save_dir, 'losses.csv'),
                index=False)

            if curr_loss - prev_loss > -epsilon:
                loss_gradients_up += 1
            if loss_gradients_up >= early_stop_num:
                break
            prev_loss = curr_loss

        self.log.info('Training finished')

In [18]:
class TrainJob:
    def __init__(self, config):
        self._config = config
        self.log = logging.getLogger(self._config['log']['name'])
        self.log.info('Running Training Job')
        self._dp = EndureData.EndureDataPipeGenerator(self._config)

    def _build_model(self):
        choice = self._config['model']['arch']
        model = KCostModel(self._config)

        return model

    def _build_optimizer(self, model):
        optimizer = torch.optim.Adam(
                model.parameters(),
                lr=self._config['train']['learning_rate'])

        return optimizer

    def _build_train(self):
        train_dir = os.path.join(
            self._config['io']['data_dir'],
            self._config['train']['dir'])
        train_data = EndureData.EndureIterableDataSet(
            config=self._config,
            folder=train_dir,)
        train = DataLoader(
            train_data,
            batch_size=self._config['train']['batch_size'],
            drop_last=self._config['train']['drop_last'],)
            # num_workers=4,
        return train

    def _build_test(self):
        test_dir = os.path.join(
                self._config['io']['data_dir'],
                self._config['test']['dir'])
        test_data = EndureData.EndureIterableDataSet(
            config=self._config,
            folder=test_dir,)
        test = DataLoader(
            test_data,
            batch_size=self._config['test']['batch_size'],
            drop_last=self._config['test']['drop_last'],)
        return test

    def _build_data(self):
        train = self._build_train()
        test = self._build_test()

        return train, test

    def run(self) -> Trainer:
        model = self._build_model()
        optimizer = self._build_optimizer(model)
        train_data = self._build_train()
        test_data = self._build_test()
        loss_fn = MSLELoss()
        
        trainer = Trainer(
            config=self._config,
            model=model,
            optimizer=optimizer,
            loss_fn=loss_fn,
            train_data=train_data,
            test_data=test_data)
        trainer.run()

        return trainer

In [19]:
model = KCostModel(config).to('cuda')
tj = TrainJob(config)
train = tj._build_train()

In [20]:
labels, inputs = next(iter(train))
inputs = inputs.to('cuda')h
model(inputs)

input torch.Size([64, 7])
cate_inputs torch.Size([64, 2])
after embedding: torch.Size([64, 2, 10])
after flatten: torch.Size([64, 20])
to concat with torch.Size([64, 5])


tensor([[0.0000, 0.0000, 0.3113, 0.1093],
        [0.0000, 0.0000, 0.3178, 0.1142],
        [0.0000, 0.0000, 0.3174, 0.1122],
        [0.0000, 0.0000, 0.3136, 0.1167],
        [0.0000, 0.0000, 0.2980, 0.0909],
        [0.0000, 0.0000, 0.3089, 0.1123],
        [0.0000, 0.0000, 0.3080, 0.1053],
        [0.0000, 0.0000, 0.2979, 0.0950],
        [0.0000, 0.0000, 0.3107, 0.1078],
        [0.0000, 0.0000, 0.2988, 0.1029],
        [0.0000, 0.0000, 0.3198, 0.1150],
        [0.0000, 0.0000, 0.3091, 0.1075],
        [0.0000, 0.0000, 0.3091, 0.1089],
        [0.0000, 0.0000, 0.3111, 0.1098],
        [0.0000, 0.0000, 0.3040, 0.1036],
        [0.0000, 0.0000, 0.3073, 0.1054],
        [0.0000, 0.0000, 0.3051, 0.1040],
        [0.0000, 0.0000, 0.2967, 0.0871],
        [0.0000, 0.0000, 0.3102, 0.1051],
        [0.0000, 0.0000, 0.3106, 0.1105],
        [0.0000, 0.0000, 0.3086, 0.1073],
        [0.0000, 0.0000, 0.3162, 0.1104],
        [0.0000, 0.0000, 0.3105, 0.1025],
        [0.0000, 0.0000, 0.3078, 0