# Reappraisal Training on PyTorch Lightning

## Setup
- When running on Google Colab, mount Google Drive to access scripts.
- `cd` into the project root and install the dependencies before running the cells below.

In [None]:
%load_ext autoreload

In [None]:
import torch

# Notebook-wide settings
STRAT = 'far'      # strategy label passed as `strat=` to LDHDataModule and kfold_train
BATCH_SIZE = 128   # batch size for the torch DataLoaders built in later cells
NUM_FOLDS = 5      # number of folds intended for the K-fold training run

## Load LDH Data

Contains the following:

- LDHI
- LDHII

In [None]:
from reappraisalmodel.ldhdata import LDHDataModule

# Build the data module for the selected strategy and load its training split.
ldhdata = LDHDataModule(data_dir='.', strat=STRAT)
ldhdata.load_train_data()
# ldhdata.load_eval_data()

# Loader over the training split. shuffle=True means batches are NOT yielded
# in dataset row order.
objdl = torch.utils.data.DataLoader(
    ldhdata.train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    pin_memory=True,
)



Loading cached processed dataset at output/training/far/cache-ea913b003125d90f.arrow


Training data loaded from disk.
Encoding Training Data:


## Run K-Fold Training


In [None]:
%autoreload 2
from reappraisalmodel.trainers import kfold_train


# Take the fold count from NUM_FOLDS (constants cell) instead of a hard-coded
# literal, so there is a single place to change it.
# The original cell kept limit_train_batches=2 / limit_val_batches=1 as
# commented-out options for a quick run.
results = kfold_train(
    NUM_FOLDS,
    ldhdata,
    strat=STRAT,
    max_epochs=15,
)


In [None]:
import pandas as pd

# Tabulate the training results. The two metric columns hold objects exposing
# .item(), which are unwrapped into plain Python numbers before summarising.
df = pd.DataFrame(results)
for metric_col in ('r2score', 'explained_var'):
    df[metric_col] = df[metric_col].apply(lambda cell: cell.item())
df.describe()

## Tuning Hyperparameters


In [None]:
# export
%autoreload
import torch
import pytorch_lightning as lit
from pytorch_lightning.loggers import TensorBoardLogger
from ray.tune.integration.pytorch_lightning import TuneReportCallback
from ray import tune

from reappraisalmodel.lightningreapp import LightningReapp

# Search space sampled by Ray Tune for each trial.
default_tune_config = {
    "lr": tune.loguniform(1e-4, 1e-1),  # log-uniform: samples evenly across orders of magnitude
    # tune.randint draws integers from [lower, upper); start at 1 so a trial
    # can never be handed a zero-width hidden layer.
    "hidden_layer_size": tune.randint(1, 50),
}

# Report the logged "val_loss" metric to Tune under the name "loss" each time
# validation finishes.
callback_tuner = TuneReportCallback(
    {
        "loss": "val_loss",
    },
    on="validation_end",
)

### TUNING HYPERPARAMETERS
def train_tune(config, ldhdata, num_gpus=None, num_epochs=10):
    """Train one LightningReapp trial for Ray Tune.

    Args:
        config: Hyperparameter dict for this trial; passed straight to
            ``LightningReapp``.
        ldhdata: Data module handed to ``Trainer.fit``.
        num_gpus: Forwarded to the Trainer's ``gpus`` argument.
        num_epochs: Maximum number of epochs the trial may train for.
    """
    model = LightningReapp(config)
    print("Running tune")
    trainer = lit.Trainer(
        # num_epochs used to be accepted but ignored, so the caller's epoch
        # budget never reached the Trainer.
        max_epochs=num_epochs,
        # Each epoch is limited to one training and one validation batch.
        limit_train_batches=1,
        limit_val_batches=1,
        gpus=num_gpus,
        callbacks=[callback_tuner],
    )
    trainer.fit(model, ldhdata)

analysis = tune.run(
    tune.with_parameters(train_tune, ldhdata=ldhdata, num_epochs=1),
    # best_config can only be resolved when Tune knows which reported metric
    # to rank trials by and in which direction; "loss" is the name reported
    # by callback_tuner.
    metric="loss",
    mode="min",
    config=default_tune_config,
    num_samples=2,
)
print("Best hyperparameters found were: ", analysis.best_config)


In [None]:
%autoreload
import pytorch_lightning as lit

from reappraisalmodel.lightningreapp import LightningReapp

# Quick wiring check: fit with fast_dev_run=1 using fixed hyperparameters.
smoke_config = dict(lr=1e-3, hidden_layer_size=50)
model = LightningReapp(smoke_config)

trainer = lit.Trainer(fast_dev_run=1)
trainer.fit(model, ldhdata)


## Extra!

In [None]:
%autoreload
from reappraisalmodel.lightningreapp import LightningReapp
# Restore the model from the checkpoint stored at this S3 URI.
far_checkpoint_uri = "s3://ldhdata/backup/far-0224-epoch=2-step=2021.ckpt"
objmodel = LightningReapp.load_from_checkpoint(far_checkpoint_uri)


In [None]:
from tqdm import tqdm
# Score the training split in dataset row order. `objdl` shuffles, so scores
# produced from it cannot be matched back to dataset rows afterwards; use a
# dedicated, unshuffled loader for inference instead.
score_dl = torch.utils.data.DataLoader(
    ldhdata.train_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
    pin_memory=True,
)

objmodel.cuda()
objmodel.eval()
objouts = []
with torch.no_grad():  # inference only: skip autograd bookkeeping
    for batch in tqdm(score_dl):
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        out = objmodel(input_ids, attention_mask)
        # Keep CPU copies so per-batch results do not pile up on the GPU.
        objouts.append(out.sum(dim=1).cpu())

# Summarise rather than dumping every tensor into the cell output.
print(f"Scored {len(objouts)} batches")

100%|██████████| 103/103 [02:09<00:00,  1.26s/it]


[tensor([4.9768, 4.7120, 5.3450, 3.1752, 5.8367, 5.6425, 3.0887, 2.7584, 4.6572,
        4.6415, 6.3578, 5.9967, 3.9584, 5.3127, 5.6118, 5.3018, 2.7939, 4.1334,
        4.3670, 4.3868, 5.8492, 5.2478, 5.7022, 4.6689, 3.4617, 3.3892, 3.7305,
        3.5305, 4.3831, 4.9282, 4.3812, 5.1925, 2.9679, 5.2369, 5.5824, 4.9345,
        5.7207, 5.0278, 4.6729, 5.9130, 4.0081, 3.9656, 3.4183, 3.7172, 4.0550,
        5.4847, 3.9615, 6.4529, 6.6842, 5.4715, 5.0014, 4.6121, 3.8554, 2.9429,
        4.7425, 3.7650, 2.6500, 6.0192, 2.0828, 5.5164, 4.9339, 5.5056, 3.5440,
        4.4746, 4.3192, 5.9902, 3.3773, 4.1931, 5.6179, 2.9066, 6.5778, 4.8426,
        5.1134, 5.9744, 3.7991, 3.4627, 2.7221, 5.3629, 3.3773, 4.0681, 6.2160,
        4.1665, 4.9335, 4.2355, 5.6654, 3.6064, 4.1600, 6.2243, 3.7744, 2.8945,
        6.6466, 3.8747, 5.3905, 6.1073, 4.2364, 5.0721, 3.6362, 2.8469, 5.5601,
        3.2527, 4.5304, 4.3323, 6.5144, 5.5979, 5.4404, 4.0319, 4.9598, 3.1683,
        7.5310, 6.4202, 5.4275, 4.8484,

In [None]:
import pytorch_lightning as lit
trainer = lit.Trainer(
    limit_train_batches=1,
    gpus=1,
)
# Trainer.fit does not return per-batch predictions, so its result must not be
# bound to `objouts`: doing so would overwrite the scores collected by the
# inference loop, which the following cells still read.
trainer.fit(objmodel, objdl)

In [None]:
import pandas as pd

# Flatten the per-batch score tensors into one flat list of floats.
cpuouts = [ten.detach().cpu().tolist() for ten in objouts]
newouts = [score for batch in cpuouts for score in batch]

objdata = ldhdata.train_data
# In-place call on the shared dataset object.
# NOTE(review): presumably this lifts the torch column restriction so that
# 'response' is readable again — confirm against the dataset type.
objdata.reset_format()

objdf = pd.DataFrame(objdata, columns=['response', 'score'])
# Fail loudly if the number of scores does not match the number of rows.
assert len(newouts) == len(objdf), (
    f"{len(newouts)} scores for {len(objdf)} rows"
)
# Single-label assignment: the list-keyed form `objdf[['observed']] = ...`
# treats the value as one entry per listed column and is rejected by recent
# pandas when given one value per row.
objdf['observed'] = newouts

In [None]:
# Write the scored table to disk as CSV.
objdf.to_csv(path_or_buf="./study1full_scored_far.csv")

In [None]:
farmodel = LightningReapp.load_from_checkpoint("s3://ldhdata/backup/far-0224-epoch=2-step=2021.ckpt")
farldhdata = LDHDataModule(data_dir='.', strat='far')
farldhdata.load_train_data()
# NOTE(review): farldhdata is loaded above, yet the loader below is still
# built from ldhdata.train_data. Left unchanged because the cell that builds
# far_df also indexes ldhdata.train_data; decide which module should be the
# single source and use it in both places.
ldhdata.train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'score'])
# shuffle=False: scores are later paired with the leading rows of the dataset,
# which is only valid when batches arrive in dataset row order.
fardl = torch.utils.data.DataLoader(
    ldhdata.train_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
    pin_memory=True,
)
farouts = []
farmodel.cuda()
print("Sent model to GPU")
farmodel.eval()

In [None]:
# Score only the first 10 batches of fardl.
with torch.no_grad():  # inference only: skip autograd bookkeeping
    for idx, batch in enumerate(tqdm(fardl)):
        if idx >= 10:
            break
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        farout = farmodel(input_ids, attention_mask)
        # Store THIS batch's output; the previous code appended the stale
        # global `out` left over from an earlier cell.
        farouts.append(farout.sum(dim=1).to('cpu'))

In [None]:
import pandas as pd

# Flatten the per-batch score tensors into one flat list of floats.
cpuouts = [ten.detach().cpu().tolist() for ten in farouts]
newouts = [score for batch in cpuouts for score in batch]

traindata = ldhdata.train_data
traindata.reset_format()  # in-place call on the shared dataset object

# Pair the scores with the same number of leading dataset rows. This is only
# meaningful if the scored batches were produced in dataset row order.
far_df = pd.DataFrame(traindata[:len(newouts)], columns=['response', 'score'])
# Single-label assignment: the list-keyed form `far_df[['observed']] = ...`
# treats the value as one entry per listed column and is rejected by recent
# pandas when given one value per row.
far_df['observed'] = newouts

In [None]:
far_df.to_csv('study1subset_scored_far.csv')
# A bare expression is only rendered when it is the last statement of the
# cell, so the preview goes after the write; head() keeps the output short.
far_df.head()

### Tokenization

In [None]:
# hide
# NOTE(review): `tokenizer` is not created in any cell visible in this
# notebook; it must already exist in the kernel for this cell to run.
sample_sentences = [
    "This is the first test sentence!",
    "This is the second, better test sentence.",
]
# Returns a BatchEncoding of the text, padded to max_length=150.
tokenized = tokenizer(text=sample_sentences, padding='max_length', max_length=150)

for idx, sent in enumerate(tokenized.input_ids):
    tokens = tokenizer.convert_ids_to_tokens(sent)
    print(f"Sentence            {idx}: {tokens}")
    mask = tokenized[idx].attention_mask
    print(f"Tokenized Attention {idx}: {mask}")



In [None]:
%autoreload
import torch
import pytorch_lightning as lit
from reappraisalmodel.lightningreapp import LightningReapp

# Fixed hyperparameters for this run.
default_config = {
    'lr': 1e-3,
    'hidden_layer_size': 50
}

trainer = lit.Trainer(
    gpus = 1 if torch.cuda.is_available() else None,
    gradient_clip_val=1.0,
    progress_bar_refresh_rate=30,
    max_epochs=10,
    fast_dev_run=2,
    terminate_on_nan=True)

# Build the model once; it used to be constructed twice, with the first
# instance discarded unused.
model = LightningReapp(default_config)

trainer.fit(model, ldhdata.train_dataloader(), ldhdata.val_dataloader())



In [None]:
# Load the checkpoint with map_location='cpu' and switch to evaluation mode.
# NOTE(review): this is an absolute path on one user's machine; the cell only
# runs where that exact file exists.
checkpoint_path = '/Users/danielpham/Google Drive/ldh/lightning_logs_obj_0223/version_2/checkpoints/epoch=1-step=337.ckpt'
model = LightningReapp.load_from_checkpoint(checkpoint_path, map_location='cpu')

model.eval()
model

In [None]:
import pandas as pd
from pytorch_lightning import Trainer 

model.eval()

# Trainer options gathered in one place, then unpacked into the constructor.
trainer_options = dict(
    gradient_clip_val=1.0,
    progress_bar_refresh_rate=30,
    terminate_on_nan=True,
)
trainer = Trainer(**trainer_options)

# The data module itself (not a DataLoader) is bound to this name.
test_dataloader = ldhdata

In [None]:
# Print how many predictions each entry of `results` carries.
dfs = []
for res in results:
    n_predictions = len(res['predict'])
    print(n_predictions)

In [None]:
import pickle
# Unpickling only reads, so open read-only ('rb'); 'rb+' also requested write
# access and fails on read-only files.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('./output_reapp.pkl', 'rb') as f:
    results = pickle.load(f)

In [None]:
import codecs
import csv
import pickle

import boto3
from sagemaker import get_execution_role

role = get_execution_role()

# Location of the training CSV in S3.
bucket = 'ldhdata'
file = 'Master_Final_TrainingData.csv'

s3client = boto3.client('s3')
response = s3client.get_object(Bucket=bucket, Key=file)

# Wrap the response body in a UTF-8 decoding reader and parse it as CSV;
# DictReader yields one mapping per row, keyed by the header line.
utf8_reader = codecs.getreader("utf-8")
train = csv.DictReader(utf8_reader(response["Body"]))
