# Functions for running training loops

In [1]:
#hide
%load_ext autoreload

In [2]:
#default_exp trainers
#export
import datetime
import logging
import os
import tempfile

import torch
import pandas as pd
import pytorch_lightning as lit
import wandb
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor, GPUStatsMonitor
from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger, WandbLogger

from reappraisalmodel.lightningreapp import LightningReapp
from reappraisalmodel.utils import upload_file

In [3]:
%autoreload


from reappraisalmodel.ldhdata import LDHDataModule
from pathlib import Path
ROOT_DIR = Path().resolve().parent
print(ROOT_DIR)

STRAT='obj'

ldhdata = LDHDataModule(data_dir=ROOT_DIR, strat=STRAT)
ldhdata.load_train_data()

/home/ubuntu/reapp


Loading cached processed dataset at /home/ubuntu/reapp/output/training/obj/cache-b785a2c963e958a2.arrow


Training data loaded from disk.
Encoding Training Data:


In [None]:

#exporti


#export
def test_num_embeddings(ldhdata, strat, s3_bucket=None, **trainer_kwargs) -> None:
    all_metrics = []

    max_epochs = trainer_kwargs.pop('max_epochs', 20)
    gpus = trainer_kwargs.pop('gpus', 1 if torch.cuda.is_available() else None)

    today = datetime.datetime.today().strftime('%Y%m%d_%H%M%S')

    #Create temporary data to store checkpoint files.
    with tempfile.TemporaryDirectory() as tempdir:
        print(f'Created temporary directory: {tempdir}')

        for i in range(1,4):
            config = {
                'lr': 1e-3,
                'num_embedding_layers': i,
                'batch_size': 128
            }
            # Select the dataloaders for the given split.
            save_dir=ROOT_DIR / 'reapp_logs'
            name=f"{i}layer_{strat}_{today}"
            version=i
            prefix="layer"

            # Loggers
            logger = TensorBoardLogger(
                save_dir=save_dir,
                name=name,
                version=version,
                prefix=prefix
            )

            csv_logger = CSVLogger(
                save_dir=save_dir,
                name=name,
                version=version,
                prefix=prefix
            )

            #Checkpoints
            early_stop_checkpoint = EarlyStopping(
                monitor='val_loss',
                mode='min',
                min_delta=0.01,
                patience=3,
                verbose=True
            )

            callback_checkpoint = ModelCheckpoint(
                monitor='val_loss',
                mode='min',
                dirpath=os.path.join(tempdir, name),
                filename= f'{i+1}layer_'+'{epoch:02d}-{val_loss:.02f}',
                verbose=False,
                save_last=False,
                save_top_k=1,
                save_weights_only=True,
            )

            model = LightningReapp(config)
            trainer = lit.Trainer(
                benchmark=True,
                logger = [logger, csv_logger],
                gpus = gpus,
                val_check_interval=0.25,
                gradient_clip_val=0.5,
                max_epochs=max_epochs,
                terminate_on_nan=True,
                weights_summary=None,
                callbacks=[callback_checkpoint, early_stop_checkpoint],
                **trainer_kwargs)
            print(f"Training with {i} layers")
            trainer.fit(model, ldhdata.get_train_dataloader(batch_size=model.batch_size), 
            ldhdata.get_val_dataloader(batch_size=model.batch_size))
            all_metrics.append({
                'metrics': trainer.logged_metrics,
                'num_epochs': trainer.current_epoch
            })

        outputs = []
        for split in all_metrics:
            val_loss = split['metrics']['val_loss'].item()
            train_loss = split['metrics']['loss'].item()
            num_epochs = split['num_epochs']
            r2score = split['metrics']['r2score']
            explained_variance = split['metrics']['explained_var']

#             ckpt_path = split['checkpoint']
#             filename = os.path.split(ckpt_path)[-1]
            
            
#             upload_result = upload_file(ckpt_path, 'ldhdata', f'{strat}/{i}-{str(today)}-{filename}')
#             print(f"Successful {filename} to s3: {upload_result}")

            row = {
                'val_loss': val_loss,
                'train_loss': train_loss,
                'num_epochs': num_epochs,
                'r2score':r2score,
                'explained_var': explained_variance
            }
            print(row)
            outputs.append(row)
        df = pd.DataFrame(outputs)
        df['r2score'] = df['r2score'].apply(lambda x: x.item())
        df['explained_var'] = df['explained_var'].apply(lambda x: x.item())
        
        report_name = f'num_layers: {str(today)}-report.csv'
        report_path = os.path.join(tempdir, f"{strat}-{report_name}" )
        df.to_csv(report_path)
        if s3_bucket is not None:
            upload_report = upload_file(report_path, s3_bucket, f'{strat}/{report_name}')
            print(f"Successful Uploading Report to s3: {upload_report}")
        print(df.describe())
        return df
    
test_num_embeddings(ldhdata, STRAT)

Created temporary directory: /tmp/tmpm5bj50kj


GPU available: True, used: True
TPU available: None, using: 0 TPU cores


Training with 1 layers




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  return torch.tensor(x, **format_kwargs)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

## K-Fold Training and Cross-Validation

In [None]:

#exporti
default_config = {
    'lr': 1e-3,
    'hidden_layer_size': 50
    }

#export
def kfold_train(k: int, ldhdata, strat, s3_bucket=None, **trainer_kwargs) -> None:
    """Fits a LightningReapp instance with k-fold cross-validation.
    Args:
        k (int):
        ldhdata : See `reappraisalmodel.ldhdata.LDHDataModule`
    """
    all_metrics = []

    max_epochs = trainer_kwargs.pop('max_epochs', 20)
    gpus = trainer_kwargs.pop('gpus', 1 if torch.cuda.is_available() else None)

    today = datetime.datetime.today().strftime('%Y%m%d_%H%M%S')

    #Create temporary data to store checkpoint files.
    with tempfile.TemporaryDirectory() as tempdir:
        print(f'Created temporary directory: {tempdir}')

        for i in range(k):
            # Select the dataloaders for the given split.
            split = i
            train_dl = ldhdata.get_train_dataloader(split)
            val_dl = ldhdata.get_val_dataloader(split)

            save_dir='reapp_logs'
            name=f"{i:02d}foldCV_{strat}_{today}"
            version="split"
            prefix=i

            # Loggers
            logger = TensorBoardLogger(
                save_dir=save_dir,
                name=name,
                version=version,
                prefix=prefix
            )

            csv_logger = CSVLogger(
                save_dir=save_dir,
                name=name,
                version=version,
                prefix=prefix
            )

            #Checkpoints
            early_stop_checkpoint = EarlyStopping(
                monitor='val_loss',
                mode='min',
                min_delta=0.001,
                patience=3,
                verbose=False
            )

            callback_checkpoint = ModelCheckpoint(
                monitor='val_loss',
                mode='min',
                dirpath=os.path.join(tempdir, name),
                filename= f'{split}_'+'{epoch:02d}-{val_loss:.02f}',
                verbose=False,
                save_last=False,
                save_top_k=1,
                save_weights_only=False,
            )

            model = LightningReapp(default_config)
            trainer = lit.Trainer(
                benchmark=True,
                logger = [logger, csv_logger],
                gpus = gpus,
                gradient_clip_val=1.0,
                max_epochs=max_epochs,
                terminate_on_nan=True,
                weights_summary=None,
                callbacks=[callback_checkpoint, early_stop_checkpoint],
                **trainer_kwargs)
            print(f"Training on split {i}")
            trainer.fit(model, train_dl, val_dl)
            all_metrics.append({
                'metrics': trainer.logged_metrics,
                'checkpoint': callback_checkpoint.best_model_path,
                'num_epochs': trainer.current_epoch
            })

        outputs = []
        for split in all_metrics:
            val_loss = split['metrics']['val_loss'].item()
            train_loss = split['metrics']['loss'].item()
            num_epochs = split['num_epochs']
            r2score = split['metrics']['r2score']
            explained_variance = split['metrics']['explained_var']

            ckpt_path = split['checkpoint']
            filename = os.path.split(ckpt_path)[-1]

            upload_result = upload_file(ckpt_path, 'ldhdata', f'{strat}/{i}-{str(today)}-{filename}')
            print(f"Successful {filename} to s3: {upload_result}")

            row = {
                'val_loss': val_loss,
                'train_loss': train_loss,
                'num_epochs': num_epochs,
                'r2score': r2score,
                'explained_var': explained_variance
            }
            print(row)
            outputs.append(row)
        df = pd.DataFrame(outputs)
        df['r2score'] = df['r2score'].apply(lambda x: x.item())
        df['explained_var'] = df['explained_var'].apply(lambda x: x.item())
        
        report_name = f'{str(today)}-report.csv'
        report_path = os.path.join(tempdir, f"{strat}-{report_name}" )
        df.to_csv(report_path)
        if s3_bucket is not None:
            upload_report = upload_file(report_path, s3_bucket, f'{strat}/{report_name}')
            print(f"Successful Uploading Report to s3: {upload_report}")
        print(df.describe())
        return df

## Hyperparameter Tuning

Sources:
- [Scaling Up PyTorch Lightning Hyperparameter Tuning w/ Ray](https://medium.com/distributed-computing-with-ray/scaling-up-pytorch-lightning-hyperparameter-tuning-with-ray-tune-4bd9e1ff9929
)

In [None]:
from reappraisalmodel.utils import download_file


download_file('ldhdata', 'Master_Final_TrainingData.csv',f"{ROOT_DIR}/data/training/Master_Final_TrainingData.csv")

In [4]:
import ray 
ray.shutdown()

In [None]:
%autoreload
import torch
import pytorch_lightning as lit
from ray import tune
from ray.tune import JupyterNotebookReporter, CLIReporter
from ray.tune.integration.pytorch_lightning import TuneReportCallback
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.suggest import ConcurrencyLimiter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.stopper import TrialPlateauStopper



from reappraisalmodel.lightningreapp import LightningReapp

default_tune_config = {
    "lr": tune.loguniform(1e-4, 1e-1), # loguniform samples by magnitude
    "num_embedding_layers": tune.choice([1,2,3])
}

callback_tuner_val = TuneReportCallback(
    {
        "val_loss": "val_loss",
        "explained_var": "explained_var"
    },
    on="validation_end",
)
callback_tuner_train = TuneReportCallback(
{
    "train_loss": "loss",
}, on="train_end")


def train_tune(config, ldhdata, num_gpus=None, num_epochs=10):
    model = LightningReapp(config)
    trainer = lit.Trainer(
        max_epochs=1,
        limit_train_batches=1,
        limit_test_batches=1,
        gpus=num_gpus,
        progress_bar_refresh_rate=0,
        weights_summary=None,
        stochastic_weight_avg=True,
        callbacks=[callback_tuner_val],
    )
    trainer.fit(model, ldhdata.get_train_dataloader(batch_size=model.batch_size), 
            ldhdata.get_val_dataloader(batch_size=model.batch_size))

In [6]:
hp_search = HyperOptSearch(metric='val_loss', mode='min')

scheduler = ASHAScheduler(
    time_attr='training_iteration', 
    grace_period=2, 
    max_t=15)

reporter = JupyterNotebookReporter(
    overwrite=True,
    parameter_columns=["lr", 'num_embedding_layers'],
    metric_columns=["val_loss", "training_iteration", "explained_var"],
    print_intermediate_tables=True
)

analysis = tune.run(
    tune.with_parameters(train_tune,
        ldhdata=ldhdata),
    local_dir=f"{ROOT_DIR}/reapp_logs/tune",
    config=default_tune_config, 
    resources_per_trial={
        "cpu": 1,
        
    },
    metric="val_loss",
    mode='min',
    progress_reporter=reporter,
    scheduler=scheduler,
    search_alg=hp_search,
    num_samples=1,
    fail_fast=True
)
print("Best hyperparameters found were: ", analysis.best_config)

Trial name,status,loc,lr,num_embedding_layers
_inner_526eabea,RUNNING,,0.00731174,3


[2m[36m(pid=20393)[0m GPU available: False, used: False
[2m[36m(pid=20393)[0m TPU available: None, using: 0 TPU cores
[2m[36m(pid=20393)[0m   return torch.tensor(x, **format_kwargs)


KeyboardInterrupt: 

In [None]:
#hide
from nbdev.export import notebook2script

notebook2script("Trainers.ipynb")

In [None]:
!nbdev_update_lib


In [None]:
config = {
        'lr': 1e-4,
        'num_embedding_layers':2,
        'batch_size': 128 #trial.suggest_int("batch_size", 32, 512, log=True),
#        "pretrained_model": trial.suggest_categorical("pretrainedmodel", )
#         'num_hidden_layers': trial.suggest_int("num_hidden_layers", 1,10),
#         'hidden_layer_size': trial.suggest_int()
    }

model = LightningReapp(config)

trainer = lit.Trainer(
    gpus = 1 if torch.cuda.is_available() else None,
    max_epochs=10,
    terminate_on_nan=True)

trainer.fit(model, ldhdata.get_train_dataloader(split=0, batch_size=model.batch_size), 
            ldhdata.get_val_dataloader(split=0, batch_size=model.batch_size))



In [11]:
%autoreload 2
import optuna
from optuna.integration import PyTorchLightningPruningCallback




def objective(trial):
    config = {
        'lr': trial.suggest_loguniform('lr', 1e-4, 100),
        'num_embedding_layers': trial.suggest_int("num_embedding_layers", 1,3),
        'batch_size': trial.suggest_int("batch_size", 32, 128, log=True),
#        "pretrained_model": trial.suggest_categorical("pretrainedmodel", )
#         'num_hidden_layers': trial.suggest_int("num_hidden_layers", 1,10),
#         'hidden_layer_size': trial.suggest_int()
    }

    
    model = LightningReapp(config)
    trainer = lit.Trainer(
        max_epochs=1,
        gpus=1,
        weights_summary=None,
        stochastic_weight_avg=True,
        num_sanity_val_steps=0,
        limit_train_batches=1,
        limit_val_batches=1,
        callbacks=[PyTorchLightningPruningCallback(trial, monitor="val_loss")],
    )
    trainer.fit(model, ldhdata.get_train_dataloader(batch_size=model.batch_size), 
            ldhdata.get_val_dataloader(batch_size=model.batch_size))
    
    metrics = trainer.logged_metrics
    return metrics['train_loss_epoch'], metrics['val_loss']




In [13]:
optuna.logging.set_verbosity(optuna.logging.DEBUG)

study = optuna.create_study(
    study_name='hparamtune',
    directions=['minimize','minimize'],
    pruner= optuna.pruners.ThresholdPruner(lower=None, upper=2, n_warmup_steps=5, interval_steps=1))


study.optimize(
    objective, n_trials=1, timeout=60, n_jobs=1)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    
    print("    {}: {}".format(key, value))

[32m[I 2021-03-04 17:03:34,430][0m A new study created in memory with name: hparamtune[0m
GPU available: True, used: True
TPU available: None, using: 0 TPU cores


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




[33m[W 2021-03-04 17:03:38,180][0m Trial 0 failed because of the following error: NotImplementedError('Trial.report is not supported for multi-objective optimization.')
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/reapp/lib/python3.8/site-packages/optuna/_optimize.py", line 211, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-11-8f8f82b48d3f>", line 30, in objective
    trainer.fit(model, ldhdata.get_train_dataloader(batch_size=model.batch_size),
  File "/home/ubuntu/anaconda3/envs/reapp/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 513, in fit
    self.dispatch()
  File "/home/ubuntu/anaconda3/envs/reapp/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 553, in dispatch
    self.accelerator.start_training(self)
  File "/home/ubuntu/anaconda3/envs/reapp/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 74, in start_training
    self.training_type_plugin.star

NotImplementedError: Trial.report is not supported for multi-objective optimization.

In [7]:
torch.cuda.empty_cache()
torch.cuda.memory_summary()



In [None]:
trial