# Reappraisal Training on PyTorch Lightning

## Setup
- When running on Google Colab, mount Google Drive so the project scripts are accessible.
- `cd` into the project root and install the dependencies before running the cells below.

In [None]:
%load_ext tensorboard
import os



In [None]:
%load_ext autoreload

# import nltk
# nltk.download('punkt')

In [None]:
%autoreload
import torch

# Define constants
# Strategy key; passed as `strat` to LDHDataModule in the data-loading cell.
STRAT = 'obj'
# Mini-batch size. NOTE(review): presumably meant to match the 'batch_size'
# entry of the training config -- confirm where this value is consumed.
BATCH_SIZE = 128

## Load LDH Data

Contains the following:

- LDHI
- LDHII

In [None]:
import pathlib
from reappraisalmodel.ldhdata import LDHDataModule

# Project root, expressed relative to the working directory. The parent of the
# relative path '.' is '.' itself, so this is the same value as Path().parent.
ROOT_DIR = pathlib.Path('.')

# Build the data module for the configured strategy, then load both splits.
ldhdata = LDHDataModule(strat=STRAT, data_dir=ROOT_DIR)
ldhdata.load_train_data()
ldhdata.load_eval_data()

Loading cached processed dataset at output/training/obj/cache-50ed3c54936a704e.arrow


Training data loaded from disk.
Encoding Training Data:
Evaluation data loaded from disk.
Encoding Test Data


HBox(children=(FloatProgress(value=0.0, max=32109.0), HTML(value='')))




## Run K-Fold Training


In [None]:
%autoreload
import datetime
import logging
import os
import tempfile
import torch
import pandas as pd
import pytorch_lightning as lit
import wandb
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger, WandbLogger

from reappraisalmodel.lightningreapp import LightningReapp
from reappraisalmodel.utils import upload_file

import datetime

# Label the run with the strategy the data module was actually built with.
# A separate hard-coded label ('far') mislabelled runs trained on STRAT data.
strat = STRAT
config = {
    'lr': 1e-3,
    'num_embedding_layers': 2,
    # Reuse the notebook-level constant instead of repeating the literal.
    'batch_size': BATCH_SIZE,
}
save_dir = ROOT_DIR / 'reapp_logs'
today = datetime.datetime.today().strftime('%Y%m%d_%H%M%S')
# Run name shared by both loggers: "_<strategy>_<timestamp>".
name = f"_{strat}_{today}"

# Loggers: TensorBoard event files and CSV metrics, written under save_dir/name.
logger = TensorBoardLogger(
    save_dir=save_dir,
    name=name,
)

csv_logger = CSVLogger(
    save_dir=save_dir,
    name=name,
)

# Callbacks.
# Stop early once val_loss has failed to improve by at least 0.01 for
# 2 validation checks in a row.
early_stop_checkpoint = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.01,
    patience=2,
    verbose=True
)

# Keep only the single best full checkpoint (lowest val_loss) in save_dir.
callback_checkpoint = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    dirpath=save_dir,
    filename='{epoch:02d}-{val_loss:.02f}',
    verbose=True,
    save_last=False,
    save_top_k=1,
    save_weights_only=False,
)

model = LightningReapp(config)
trainer = lit.Trainer(
    benchmark=True,
    logger=[logger, csv_logger],
    # Use one GPU when available and fall back to CPU otherwise, matching the
    # guard used by the dev-run cell later in this notebook.
    gpus=1 if torch.cuda.is_available() else None,
    val_check_interval=0.25,  # validate four times per training epoch
    gradient_clip_val=0.5,
    max_epochs=10,
    terminate_on_nan=True,
    weights_summary=None,
    callbacks=[callback_checkpoint, early_stop_checkpoint])

# NOTE(review): relies on LightningReapp exposing `batch_size` -- confirm.
trainer.fit(
    model,
    ldhdata.get_train_dataloader(batch_size=model.batch_size),
    ldhdata.get_val_dataloader(batch_size=model.batch_size))

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  return torch.tensor(x, **format_kwargs)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

In [None]:
# Display the metrics currently recorded on the trainer object.
trainer.logged_metrics

In [None]:
import pandas as pd

# NOTE(review): `results` is not defined in any visible cell of this notebook;
# it must already exist in the kernel for this cell to run.
df = pd.DataFrame(results)

# Both score columns hold objects exposing .item() (presumably 0-dim tensors);
# unwrap them to plain Python numbers so describe() can summarise them.
for score_column in ('r2score', 'explained_var'):
    df[score_column] = df[score_column].apply(lambda x: x.item())

df.describe()

## Tuning Hyperparameters


In [None]:
%autoreload
import pytorch_lightning as lit

from reappraisalmodel.lightningreapp import LightningReapp

# Hyperparameters for a quick wiring check of the model.
tuning_config = dict(lr=1e-3, hidden_layer_size=50)
model = LightningReapp(tuning_config)

# fast_dev_run=1 limits the run to a single batch, so this only verifies that
# the training loop executes end to end.
trainer = lit.Trainer(fast_dev_run=1)
trainer.fit(model, ldhdata)


## Extra!

In [None]:
# `s3` is not created anywhere earlier in the notebook, so build the client
# here to keep the cell runnable on a fresh kernel (Restart & Run All).
import boto3

s3 = boto3.client('s3')

# One listing request against the 'ldhdata' bucket; the response dict is kept
# in `resp` for later inspection.
resp = s3.list_objects(Bucket='ldhdata')

In [None]:
# Source object. Its key contains the repr of a whole result dict, which makes
# it unwieldy to reference.
key = "obj/{'metrics': {'epoch': tensor(10.), 'val_loss': tensor(1.2882), 'r2score': tensor(0.5832), 'explained_var': tensor(0.6109), 'train_loss': tensor(1.1194, device='cuda:0')}, 'checkpoint': '/tmp/tmpvsrtlvfl/reappmodel_obj_20210227_203013/2_epoch=07-val_loss=1.12.ckpt', 'num_epochs': 10}-20210227_203013-2_epoch=07-val_loss=1.12.ckpt"
copy_source = {'Bucket': 'ldhdata', 'Key': key}

# Copy it within the same bucket under a short key.
s3.copy(
    copy_source,
    Bucket='ldhdata',
    Key='obj/20210227_203013-2_epoch=07-val_loss=1.12.ckpt',
)


In [None]:
import torch
# Hard-coded metric records. The tensors are created on the CPU: the former
# device='cuda:0' arguments made this cell fail on machines without a GPU,
# and the values extracted below with .item() do not depend on the device.
metrics = [
    {'epoch': torch.tensor(7.), 'val_loss': 1.1842, 'r2score': torch.tensor(0.6130), 'explained_var': torch.tensor(0.6388), 'train_loss': torch.tensor(1.1642)},
    {'epoch': torch.tensor(8.), 'val_loss': 1.1819, 'r2score': torch.tensor(0.6087), 'explained_var': torch.tensor(0.6377), 'train_loss': torch.tensor(1.0931)},
    {'epoch': torch.tensor(9.), 'val_loss': 1.2094, 'r2score': torch.tensor(0.5926), 'explained_var': torch.tensor(0.6366), 'train_loss': torch.tensor(1.1363)},
    {'epoch': torch.tensor(10.), 'val_loss': 1.2712, 'r2score': torch.tensor(0.5906), 'explained_var': torch.tensor(0.6339), 'train_loss': torch.tensor(1.0842)},
    {'epoch': torch.tensor(10.), 'val_loss': 1.2882, 'r2score': torch.tensor(0.5832), 'explained_var': torch.tensor(0.6109), 'train_loss': torch.tensor(1.1194)},
]

df = pd.DataFrame(metrics)

# 'val_loss' already holds plain floats; every other column holds 0-dim
# tensors that must be unwrapped. The loop variable is deliberately not called
# `key`, so it does not overwrite the S3 key string defined in an earlier cell.
tensor_columns = ['r2score', 'epoch', 'explained_var', 'train_loss']
for column in tensor_columns:
    df[column] = df[column].apply(lambda value: value.item())

In [None]:
# Send the local report file to the 'ldhdata' bucket under the run's key.
# NOTE(review): no visible cell writes 'this.csv'; confirm it exists before
# running. Judging by the message below, upload_file's return value reports
# the outcome of the upload -- verify against reappraisalmodel.utils.
report_key = 'obj/20210227_203013-report.csv'
upload_report = upload_file('this.csv', 'ldhdata', report_key)
print(f"Successful Uploading Report to s3: {upload_report}")

In [None]:
# Show summary statistics for the numeric columns of `df`.
df.describe()

In [None]:
# Print the key of every object in the listing. The 'Contents' entry is absent
# from the response when the listing is empty, so default to an empty list
# instead of raising KeyError.
for obj in resp.get('Contents', []):
    print(obj['Key'])

### Tokenization

In [None]:
# hide
# NOTE(review): `tokenizer` is not defined in any visible cell of this notebook.
sample_sentences = [
    "This is the first test sentence!",
    "This is the second, better test sentence.",
]

# Returns a BatchEncoding of the text, each sentence padded to 150 tokens.
tokenized = tokenizer(text=sample_sentences, padding='max_length', max_length=150)

# For each sentence, show the tokens behind its ids and then its attention mask.
for position, token_ids in enumerate(tokenized.input_ids):
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    print(f"Sentence            {position}: {tokens}")
    attention = tokenized[position].attention_mask
    print(f"Tokenized Attention {position}: {attention}")



In [None]:
%autoreload
import torch
import pytorch_lightning as lit
from reappraisalmodel.lightningreapp import LightningReapp

# Hyperparameters for the smoke-test run.
default_config = {
    'lr': 1e-3,
    # Was spelled 'hidden_lay'; aligned with the 'hidden_layer_size' key used
    # by the "Tuning Hyperparameters" cell.
    # NOTE(review): confirm this is the key LightningReapp actually reads.
    'hidden_layer_size': 50,
}

trainer = lit.Trainer(
    gpus=1 if torch.cuda.is_available() else None,  # fall back to CPU without CUDA
    gradient_clip_val=1.0,
    progress_bar_refresh_rate=30,
    max_epochs=10,
    fast_dev_run=2,  # debug mode: only a couple of batches are run
    terminate_on_nan=True)

# Build the model once. It used to be constructed twice, with the first
# instance discarded unused.
model = LightningReapp(default_config)

trainer.fit(model, ldhdata.train_dataloader(), ldhdata.val_dataloader())



In [None]:
import os

# Checkpoint to restore. The default is the path originally hard-coded here,
# which only exists on one machine; set the REAPP_CHECKPOINT environment
# variable to point at a checkpoint elsewhere.
checkpoint_path = os.environ.get(
    'REAPP_CHECKPOINT',
    '/Users/danielpham/Google Drive/ldh/lightning_logs_obj_0223/version_2/checkpoints/epoch=1-step=337.ckpt',
)

# map_location='cpu' loads the weights onto the CPU whatever device saved them.
model = LightningReapp.load_from_checkpoint(checkpoint_path, map_location='cpu')

# Switch to evaluation mode, then display the module structure.
model.eval()
model

In [None]:
%autoreload 
import pandas as pd
from pytorch_lightning import Trainer 

from reappraisalmodel.lightningreapp import LightningReapp

# Hyperparameters for a freshly initialised model.
config = dict(lr=1e-3, num_embedding_layers=2)
model = LightningReapp(config)

# Trainer settings collected in one place, then unpacked into the constructor.
trainer_options = dict(
    gradient_clip_val=1.0,
    progress_bar_refresh_rate=30,
    terminate_on_nan=True,
)
trainer = Trainer(**trainer_options)

# NOTE(review): this binds the whole data module, not a DataLoader, despite the
# variable's name -- confirm that is what downstream code expects.
test_dataloader = ldhdata

In [None]:
# Empty accumulator; nothing is appended to it in this cell.
dfs = []

# NOTE(review): `results` is not defined in any visible cell of this notebook.
# Print the number of predictions held by each result entry.
for entry in results:
    n_predictions = len(entry['predict'])
    print(n_predictions)

In [None]:
import boto3
import pickle

from sagemaker import get_execution_role
import codecs
import csv

# Resolve the SageMaker execution role. The value is not used again in the
# visible cells.
role = get_execution_role()

bucket = 'ldhdata'
file = 'Master_Final_TrainingData.csv'

s3client = boto3.client('s3')

# Fetch the training CSV object; response["Body"] is a byte stream.
response = s3client.get_object(Bucket=bucket, Key=file)

# Decode the stream as UTF-8 and parse it lazily: the DictReader yields one
# mapping per CSV row, keyed by the header row.
utf8_stream_reader = codecs.getreader("utf-8")
train = csv.DictReader(utf8_stream_reader(response["Body"]))


In [None]:
# Display the torch.cuda module object; its repr shows where torch is installed.
torch.cuda

<module 'torch.cuda' from '/home/ubuntu/anaconda3/envs/reapp/lib/python3.8/site-packages/torch/cuda/__init__.py'>