# Reappraisal Training on PyTorch Lightning

## Setup
- `cd` into the project root and install dependencies with one of:
  - Conda: _TODO: add the conda install command_
  - Pip: _TODO: add the pip install command_

## GPU Usage:

## Loading and Encoding Data

In [None]:
# Enable the autoreload extension so edits to the local package can be picked up.
%load_ext autoreload
import os
import torch

# Define project root directory.
ROOT_DIR = os.path.abspath(".")
# Strategy key: passed to LDHDataModule and kfold_train, and used in log version names.
STRAT = 'obj'
# Batch size shared by every DataLoader built in this notebook.
BATCH_SIZE = 64
# Number of folds handed to kfold_train.
NUM_FOLDS=5
DEV_FLAG = 1 # Flag for fast runs when debugging.

# Load the DataModule, then its training and evaluation data.
from reappraisalmodel.ldhdata import LDHDataModule
ldhdata = LDHDataModule(data_dir=ROOT_DIR, strat=STRAT)
ldhdata.load_train_data()
ldhdata.load_eval_data()

Loading cached processed dataset at /Users/danielpham/Google Drive/ldh/output/training/obj/cache-5d39f0ae666bea55.arrow


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Training data loaded from disk.
Encoding Training Data:
Evaluation data loaded from disk.
Encoding Test Data


HBox(children=(FloatProgress(value=0.0, max=32109.0), HTML(value='')))




## Run K-Fold Training
- See `./nbs/Trainers.ipynb`


In [None]:
%autoreload 2
from reappraisalmodel.trainers import kfold_train
# Learns a model NUM_FOLDS times and records the distribution of metrics across the CV.
results = kfold_train(
    NUM_FOLDS, 
    ldhdata, 
    strat=STRAT, 
)


In [None]:
import pandas as pd

# One row per entry in `results`.
df = pd.DataFrame(results)

# Unwrap each metric with `.item()` so describe() sees plain numbers.
for metric in ('r2score', 'explained_var'):
    df[metric] = df[metric].apply(lambda x: x.item())

df.describe()

## Testing on LDH II Data

# Training Process

## Test Models (Manual)

In [None]:
%autoreload
from reappraisalmodel.lightningreapp import LightningReapp
objmodel = LightningReapp.load_from_checkpoint("s3://ldhdata/backup/far-0224-epoch=2-step=2021.ckpt")

In [None]:
import torch
from tqdm import tqdm

# NOTE(review): `objdl` is not defined in any cell visible in this notebook;
# it must be a DataLoader yielding dicts with 'input_ids' and 'attention_mask'.

# Use the GPU when one is present, otherwise fall back to CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
objmodel.to(device)
objmodel.eval()

objouts = []
# Inference only: disable autograd so the stored outputs do not keep the
# computation graph (and its activations) alive.
with torch.no_grad():
    for batch_idx, batch in enumerate(tqdm(objdl)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        out = objmodel(input_ids, attention_mask)
        # Sum over dim 1 and move the result to host memory before storing it.
        objouts.append(out.sum(dim=1).cpu())
print(objouts)

## Single Training Session

In [None]:
%autoreload
import pickle
from datetime import datetime

import torch
import pytorch_lightning as lit
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from reappraisalmodel.lightningreapp import LightningReapp

model = LightningReapp()

# Model saves the 3 checkpoints with the lowest validation loss throughout training
modelcheckpoint = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=3,
    verbose=True
)
# Model tracks the loss_distance; shows when training and validation loss begin to diverge 
modelcheckpoint_loss_dist = ModelCheckpoint(
    monitor='loss_distance',
    mode='min',
    save_top_k=3,
    verbose=True
)

# Split train and validation data.
ldhdata.train_data.set_format(type='torch', columns=['score', 'input_ids', 'attention_mask'])
split_data = ldhdata.train_data.train_test_split(test_size=0.2)
train_data = split_data['train']
val_data = split_data['test']
eval_data = ldhdata.eval_data

# Create dataloaders
train_dl = DataLoader(train_data, batch_size=BATCH_SIZE,num_workers=4)
val_dl = DataLoader(val_data, batch_size=BATCH_SIZE, num_workers=4)
eval_dl = DataLoader(eval_data, batch_size=BATCH_SIZE, num_workers=4)

# Mark the start time of the training session. 
today = datetime.today().strftime('%Y%m%d_%H%M%S')
session_version = "_".join([STRAT,today])
tb_logger = TensorBoardLogger("lightning_logs", name="reapp_model", version=session_version)
trainer = lit.Trainer(
    logger = tb_logger,
    precision=16 if torch.cuda.is_available() else 32, # We use 16-bit precision to reduce computational complexity
    val_check_interval=0.25, # Check validation loss 4 times an epoch
    callbacks=[modelcheckpoint, modelcheckpoint_loss_dist], # Register callbacks with trainer.
    gpus=1 if torch.cuda.is_available() else None,
    fast_dev_run=2 if DEV_FLAG else None
)
results = trainer.fit(model, train_dl, val_dl)

# ldhdata.eval_data.set_format(type='torch', columns=['input_ids', 'attention_mask'])
# results = trainer.test(model, eval_dl)

# import pickle
# with open("results_obj", "wb+") as f:
#     pickle.dump(results, f)

GPU available: False, used: False
TPU available: None, using: 0 TPU cores
Running in fast_dev_run mode: will run a full train, val and test loop using 2 batch(es).

  | Name          | Type              | Params
----------------------------------------------------
0 | bert          | DistilBertModel   | 66.4 M
1 | classifier    | Sequential        | 38.8 K
2 | train_loss    | MeanSquaredError  | 0     
3 | val_loss      | MeanSquaredError  | 0     
4 | r2score       | R2Score           | 0     
5 | explained_var | ExplainedVariance | 0     
----------------------------------------------------
38.8 K    Trainable params
66.4 M    Non-trainable params
66.4 M    Total params
265.607   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




In [None]:
# Expose only the tensor columns the model consumes. The previous version called
# reset_format() right after set_format(), which undid the formatting; the
# DataLoader then tried to collate the raw columns, which is the likely cause of
# the recorded "TypeError: must be real number, not NoneType".
ldhdata.eval_data.set_format(type='torch', columns=['input_ids', 'attention_mask'])
eval_dl = DataLoader(ldhdata.eval_data, batch_size=BATCH_SIZE)

model.eval()

# Peek at the first batch only. The DataLoader yields one batch per iteration,
# so the index has to come from enumerate() rather than tuple-unpacking the batch.
for batch_idx, batch in enumerate(eval_dl):
    print(batch)
    break

TypeError: must be real number, not NoneType

In [None]:
# Display whatever is currently bound to `results` (set by an earlier cell).
results

[{}]

In [None]:
import datetime
import pickle

import torch
import pytorch_lightning as lit
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from reappraisalmodel.lightningreapp import LightningReapp
from reappraisalmodel.ldhdata import LDHDataModule

model = LightningReapp()

# Keep the 3 checkpoints with the lowest validation loss.
modelcheckpoint = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=3,
    verbose=True
)

# Split the training set 80/20 into train and validation subsets.
ldhdata.train_data.set_format(type='torch', columns=['score', 'input_ids', 'attention_mask'])
data = ldhdata.train_data.train_test_split(test_size=0.2)
train_data = data['train']
val_data = data['test']
train_dl = DataLoader(train_data, batch_size=BATCH_SIZE, num_workers=4)
val_dl = DataLoader(val_data, batch_size=BATCH_SIZE, num_workers=4)

# Timestamp the session so each run gets its own TensorBoard version.
today = datetime.datetime.today().strftime('%Y%m%d_%H%M%S')
use_gpu = torch.cuda.is_available()
trainer = lit.Trainer(
    logger=TensorBoardLogger("lightning_logs", name="reapp_model", version="_".join([STRAT, today])),
    # Fall back to full precision on CPU, matching the single-session cell above.
    precision=16 if use_gpu else 32,
    max_epochs=30,
    val_check_interval=0.25,  # Validate 4 times per epoch.
    # The checkpoint callback was previously created but never handed to the
    # Trainer, so its monitor/save_top_k settings had no effect.
    callbacks=[modelcheckpoint],
    gpus=1 if use_gpu else None)

# Fit model on training data.
results = trainer.fit(model, train_dl, val_dl)

# Persist whatever fit() returned for later inspection.
with open("results_obj", "wb+") as f:
    pickle.dump(results, f)

In [None]:
# Run the trained model over the evaluation set, restricted to the two tensor
# columns passed to set_format.
ldhdata.eval_data.set_format(type='torch', columns=['input_ids', 'attention_mask'])
eval_data = ldhdata.eval_data
eval_dl = DataLoader(eval_data, batch_size=BATCH_SIZE, num_workers=4)

results = trainer.test(model, eval_dl)

# Flatten the per-batch outputs into one Python list.
# NOTE(review): this assumes every element of `results` is a tensor; confirm
# what trainer.test() actually returns for LightningReapp (an earlier cell in
# this notebook shows `results` as `[{}]`).
cpuouts = [ten.detach().cpu().tolist() for ten in results]
newouts = []
for batch in cpuouts:
    newouts += batch
len(newouts)

import pandas as pd
# NOTE(review): the predictions above come from the evaluation DataLoader, but
# the frame below is built from `ldhdata.train_data` -- verify that the rows
# and predictions are meant to line up.
objdata = ldhdata.train_data
objdata.reset_format()

objdf = pd.DataFrame(objdata, columns=['response', 'score'])
objdf[['observed']] = newouts

In [None]:
# Load the 'far' checkpoint and the 'far' data module.
farmodel = LightningReapp.load_from_checkpoint("s3://ldhdata/backup/far-0224-epoch=2-step=2021.ckpt")
farldhdata = LDHDataModule(data_dir='.', strat='far')
farldhdata.load_train_data()

# NOTE(review): `farldhdata` is loaded above, but the DataLoader below is built
# from `ldhdata.train_data` -- confirm which dataset is meant to be scored.
ldhdata.train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'score'])

# Keep the dataset order (shuffle=False): the predictions are later joined with
# the first rows of `ldhdata.train_data` by position, so shuffling here would
# pair each prediction with the wrong response.
use_gpu = torch.cuda.is_available()
fardl = torch.utils.data.DataLoader(
    ldhdata.train_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
    pin_memory=use_gpu,  # Pinned host memory is only useful for GPU transfers.
)
farouts = []

# Move the model to the GPU only when one is available.
if use_gpu:
    farmodel.cuda()
    print("Sent model to GPU")
farmodel.eval()

In [None]:
from tqdm import tqdm

# Send inputs to whichever device the model's parameters already live on.
far_device = next(farmodel.parameters()).device

# Inference only: no autograd bookkeeping needed.
with torch.no_grad():
    for idx, batch in enumerate(tqdm(fardl)):
        # Score only the first 10 batches.
        if idx >= 10:
            break
        input_ids = batch['input_ids'].to(far_device)
        attention_mask = batch['attention_mask'].to(far_device)
        farout = farmodel(input_ids, attention_mask)
        # Store this batch's output (`farout`); the previous version appended
        # `out`, a stale variable left over from the obj-model cell.
        farouts.append(farout.sum(dim=1).to('cpu'))

In [None]:
import pandas as pd

# Convert each stored batch tensor to a plain list, then flatten the batches
# into a single list of per-example values.
cpuouts = [ten.detach().cpu().tolist() for ten in farouts]
newouts = [value for batch in cpuouts for value in batch]
n_scored = len(newouts)

# Drop the torch formatting so rows come back as plain Python values.
traindata = ldhdata.train_data
traindata.reset_format()

# Pair the first `n_scored` training rows with the collected outputs.
far_df = pd.DataFrame(
    ldhdata.train_data[:n_scored],
    columns=['response', 'score', 'observed'],
)
far_df[['observed']] = newouts

In [None]:
# The bare `far_df` below is not the cell's last expression, so the notebook
# does not render it; the cell's visible effect is the CSV written afterwards.
far_df
# Write the scored subset to a CSV file in the current working directory.
far_df.to_csv('study1subset_scored_far.csv')