# Reappraisal Training on PyTorch Lightning

## Setup
- When running on Google Colab, mount Google Drive to access scripts.
- `cd` into the project root and install dependencies before running the cells below.

In [None]:
# Load IPython's autoreload extension; later cells invoke %autoreload before
# importing from the reappraisalmodel package.
%load_ext autoreload

In [None]:
import torch

# Define constants
# STRAT is passed as `strat` to LDHDataModule and to kfold_train.
STRAT = 'obj'
# BATCH_SIZE is the batch size of the scoring DataLoaders built further down.
BATCH_SIZE = 128
# Number of cross-validation folds (presumably meant for kfold_train — confirm).
NUM_FOLDS=5

## Load LDH Data

The LDH data contains the following:

- LDHI
- LDHII

In [None]:
from reappraisalmodel.ldhdata import LDHDataModule
# Build the data module for the configured strategy, using the current
# directory as data_dir.
ldhdata = LDHDataModule(data_dir='.', strat=STRAT)
# Load the training data; loading of the evaluation data is left disabled.
ldhdata.load_train_data()
#ldhdata.load_eval_data()

Loading cached processed dataset at output/training/obj/cache-50ed3c54936a704e.arrow


Training data loaded from disk.
Encoding Training Data:


## Run K-Fold Training


In [None]:
%autoreload 2
from reappraisalmodel.trainers import kfold_train


# Run k-fold training. The fold count comes from NUM_FOLDS (constants cell)
# instead of a duplicated literal, so it only has to be changed in one place.
results = kfold_train(
    NUM_FOLDS,
    ldhdata,
    strat=STRAT,
    max_epochs=15,
    # Uncomment to smoke-test on a handful of batches:
    # limit_train_batches=2,
    # limit_val_batches=1,
)


Created temporary directory: /tmp/tmpgsdhoddc


GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Using native 16bit precision.


Training on split 0




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  return torch.tensor(x, **format_kwargs)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…






IndexError: list index out of range

In [None]:
import pandas as pd

# Tabulate the k-fold results. The metric cells hold objects exposing .item(),
# so unwrap them into plain Python numbers before summarising.
df = pd.DataFrame(results)
for metric_name in ('r2score', 'explained_var'):
    df[metric_name] = df[metric_name].apply(lambda metric: metric.item())
df.describe()

## Tuning Hyperparameters


In [None]:
# export
%autoreload
import torch
import pytorch_lightning as lit
from pytorch_lightning.loggers import TensorBoardLogger
from ray.tune.integration.pytorch_lightning import TuneReportCallback
from ray import tune

from reappraisalmodel.lightningreapp import LightningReapp

# Search space handed to tune.run as `config`; train_tune passes each sampled
# dict straight to LightningReapp.
default_tune_config = {
    "lr": tune.loguniform(1e-4, 1e-1), # loguniform samples by magnitude
    # NOTE(review): the range starts at 0, so a zero-width hidden layer can be
    # drawn, and (if the upper bound is exclusive, as Ray's docs describe) the
    # value 50 used elsewhere in this notebook can never be sampled — confirm
    # this is the intended range.
    "hidden_layer_size": tune.randint(0,50),
}

# Report to Ray Tune at the end of every validation run. Per the
# TuneReportCallback API, the dict key ("loss") is the name Tune sees and the
# value ("val_loss") is the Lightning metric it is read from.
callback_tuner = TuneReportCallback({"loss": "val_loss"}, on="validation_end")

### TUNING HYPERPARAMETERS
def train_tune(config, ldhdata, num_gpus=None, num_epochs=10):
    """Train one LightningReapp trial on behalf of Ray Tune.

    Args:
        config: Hyperparameter dict sampled by Tune; passed to LightningReapp.
        ldhdata: Data module handed to ``trainer.fit``.
        num_gpus: Forwarded to the Trainer's ``gpus`` argument.
        num_epochs: Maximum number of training epochs for this trial.
    """
    model = LightningReapp(config)
    print("Running tune")
    trainer = lit.Trainer(
        # num_epochs used to be accepted but ignored, so the Trainer fell back
        # to its own default epoch limit whatever the caller requested.
        max_epochs=num_epochs,
        # Trials are kept deliberately small: one training batch and one
        # validation batch.
        limit_train_batches=1,
        limit_val_batches=1,
        gpus=num_gpus,
        # Module-level callback that reports val_loss back to Tune.
        callbacks=[callback_tuner],
    )
    trainer.fit(model, ldhdata)

# Launch the hyperparameter search. `metric` names the value that
# callback_tuner reports ("loss") and `mode` tells Tune to minimise it; without
# them `analysis.best_config` has no criterion for ranking trials and raises
# instead of returning a configuration.
analysis = tune.run(
    tune.with_parameters(train_tune,
        ldhdata=ldhdata,
        num_epochs=1),
    config=default_tune_config,
    metric="loss",
    mode="min",
    num_samples=2)
print("Best hyperparameters found were: ", analysis.best_config)


In [None]:
%autoreload
import pytorch_lightning as lit

from reappraisalmodel.lightningreapp import LightningReapp

# Smoke test: build a model with fixed hyperparameters and run a quick
# fast_dev_run fit against the data module.
model = LightningReapp(dict(lr=1e-3, hidden_layer_size=50))
trainer = lit.Trainer(fast_dev_run=1)
trainer.fit(model, ldhdata)


## Extra!

In [None]:
from reappraisalmodel.lightningreapp import LightningReapp
# Restore the obj model from its checkpoint and build a loader over the
# training data for scoring.
objmodel = LightningReapp.load_from_checkpoint("s3://ldhdata/backup/obj-epoch=1-step=337.ckpt")
# shuffle=False: downstream cells pair the collected predictions with the first
# len(newouts) dataset rows by position, which is only valid when batches
# arrive in dataset order.
objdl = torch.utils.data.DataLoader(ldhdata.train_data, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)
# Filled by the scoring loop in the next cell.
objouts = []

In [None]:
from tqdm import tqdm
objmodel.cuda()
objmodel.eval()
# Inference only: no_grad() stops autograd from recording a graph for every
# batch, and moving each result to the CPU keeps the accumulated outputs from
# pinning GPU memory.
with torch.no_grad():
    for batch_idx, batch in enumerate(tqdm(objdl)):
        # Only the first 10 batches are scored.
        if batch_idx >= 10:
            break
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        out = objmodel(input_ids, attention_mask)
        # Sum along dim 1 and stash the per-batch result.
        objouts.append(out.sum(dim=1).cpu())

 10%|▉         | 10/103 [00:13<02:05,  1.35s/it]


In [None]:
# Turn every per-batch tensor into a plain list, then concatenate the batches
# into one flat list; the cell displays how many values were collected.
cpuouts = [batch_out.detach().cpu().tolist() for batch_out in objouts]
newouts = [value for batch_values in cpuouts for value in batch_values]
len(newouts)

In [None]:
# far_df = ldhdata.train_data.to_dict()

# far_df['observed'] = cpuouts


In [None]:
# NOTE(review): obj_df is never assigned in the visible cells (the commented
# lines below that mention it themselves read obj_df), so this cell raises
# NameError on a fresh kernel unless obj_df is created elsewhere.
# obj_df = obj_df[:len(newouts)]

# obj_df['observed'] = newouts

# Write the frame to a CSV file in the working directory.
obj_df.to_csv("study1subset_scored_obj.csv")

In [None]:
# Restore the far model from its checkpoint.
farmodel = LightningReapp.load_from_checkpoint("s3://ldhdata/backup/far-0224-epoch=2-step=2021.ckpt")
farldhdata = LDHDataModule(data_dir='.', strat='far')
farldhdata.load_train_data()
# NOTE(review): farldhdata is loaded above, yet the loader below is built from
# ldhdata (created with STRAT) — confirm which dataset the far model is meant
# to score.
ldhdata.train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'score'])
# shuffle=False: the predictions are later written next to the first
# len(newouts) dataset rows, so batches must arrive in dataset order.
fardl = torch.utils.data.DataLoader(ldhdata.train_data, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)
farouts = []
farmodel.cuda()
print("Sent model to GPU")
# Trailing semicolon keeps the very long module repr out of the cell output.
farmodel.eval();

Loading cached processed dataset at output/training/far/cache-ea913b003125d90f.arrow


Training data loaded from disk.
Encoding Training Data:
Sent model to GPU


LightningReapp(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [None]:
# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    for idx, batch in enumerate(tqdm(fardl)):
        # Only the first 10 batches are scored.
        if idx >= 10:
            break
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        farout = farmodel(input_ids, attention_mask)
        # Collect THIS model's output. The loop used to append `out`, a stale
        # tensor left over from the obj scoring loop, so every entry held obj
        # predictions rather than far predictions.
        farouts.append(farout.sum(dim=1).to('cpu'))

 10%|▉         | 10/103 [00:13<02:03,  1.33s/it]


In [None]:
# Flatten the per-batch far predictions into one list of values.
cpuouts = [ten.detach().cpu().tolist() for ten in farouts]

newouts = []
for batch in cpuouts:
    newouts.extend(batch)

traindata = ldhdata.train_data

# Undo the set_format(type='torch', ...) applied before scoring
# (presumably so that slicing exposes the text columns again — confirm).
traindata.reset_format()

import pandas as pd
# Take as many rows as there are predictions; 'observed' is filled in below.
far_df = pd.DataFrame(ldhdata.train_data[:len(newouts)], columns=['response', 'score', 'observed'])
# Plain single-column assignment, one prediction per row. The previous
# `far_df[['observed']] = newouts` addressed a one-column *frame* with a flat
# list, which pandas may reject with "Columns must be same length as key".
far_df['observed'] = newouts

In [None]:
# Persist the scored subset first, then end the cell on the frame so it is
# actually displayed (a bare `far_df` ahead of another statement shows nothing).
far_df.to_csv('study1subset_scored_far.csv')
far_df.head()

### Tokenization

In [None]:
# hide
# Returns a BatchEncoding of the text.
# NOTE(review): `tokenizer` is not created in any visible cell — it must
# already exist in the kernel for this demo to run.
# Both sentences are padded to max_length=150.
tokenized = tokenizer(text = ["This is the first test sentence!", "This is the second, better test sentence."], 
    padding='max_length', max_length=150)

# Print each sentence's tokens next to its attention mask.
for idx, sent in enumerate(tokenized.input_ids):
    print(f"Sentence            {idx}: {tokenizer.convert_ids_to_tokens(sent)}")
    print(f"Tokenized Attention {idx}: {tokenized[idx].attention_mask}")



In [None]:
%autoreload
import torch
import pytorch_lightning as lit
from reappraisalmodel.lightningreapp import LightningReapp

# Fixed hyperparameters for a quick end-to-end check of the training loop.
# (The original wrote `default_config = default_config = {...}`; the doubled
# target was redundant.)
default_config = {
    'lr': 1e-3,
    'hidden_layer_size': 50
}

# NOTE(review): fast_dev_run=2 presumably caps the run at two batches and so
# overrides max_epochs=10 — confirm against the installed Lightning version.
trainer = lit.Trainer(
    gpus = 1 if torch.cuda.is_available() else None,
    gradient_clip_val=1.0,
    progress_bar_refresh_rate=30,
    max_epochs=10,
    fast_dev_run=2,
    terminate_on_nan=True)

# Build the model once; the cell used to construct it twice and throw the
# first instance away.
model = LightningReapp(default_config)

trainer.fit(model, ldhdata.train_dataloader(), ldhdata.val_dataloader())



In [None]:
# Restore a checkpoint onto the CPU.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that Google Drive folder exists.
model = LightningReapp.load_from_checkpoint(
    '/Users/danielpham/Google Drive/ldh/lightning_logs_obj_0223/version_2/checkpoints/epoch=1-step=337.ckpt',
    map_location='cpu',
)

# eval() hands back the module itself, so ending the cell on it both switches
# to inference mode and displays the model.
model.eval()

In [None]:
import pandas as pd
from pytorch_lightning import Trainer 

# Put the restored model in inference mode.
model.eval()

# Trainer with gradient clipping and NaN termination enabled; the progress bar
# refreshes every 30 steps.
trainer = Trainer(terminate_on_nan=True, progress_bar_refresh_rate=30, gradient_clip_val=1.0)

# NOTE: despite the name, this is the LDHDataModule itself, not a DataLoader.
test_dataloader = ldhdata

In [None]:
# Print how many predictions each entry of `results` holds.
# `dfs` is initialised here but not filled by this cell.
dfs = []
for fold_result in results:
    n_predictions = len(fold_result['predict'])
    print(n_predictions)

In [None]:
import pickle
# Read-only binary mode is all unpickling needs; the previous 'rb+' also
# requested write access and therefore failed on read-only files.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('./output_reapp.pkl', 'rb') as f:
    results = pickle.load(f)

In [None]:
import boto3
import pickle

from sagemaker import get_execution_role
# NOTE(review): `role` is not used by any visible cell.
role = get_execution_role()

# S3 bucket and object key of the training CSV.
bucket = 'ldhdata'
file = 'Master_Final_TrainingData.csv'

s3client = boto3.client('s3')

# Fetch the object; its "Body" is consumed as a byte stream below.
response = s3client.get_object(Bucket=bucket, Key=file)

import codecs 
import csv

# Decode the body as UTF-8 and parse it as CSV.
train = csv.DictReader(codecs.getreader("utf-8")(response["Body"])) # a reader that yields one dict per CSV row as it is iterated (not a single ordered dict)
