# Reappraisal Training on PyTorch Lightning

## Setup
- When running on Google Colab, mount Google Drive to access scripts.
- `cd` into the project root and install dependencies:

In [2]:
%load_ext autoreload
# from google.colab import drive
# drive.mount('/content/drive')
# ROOT_DIR = "/content/drive/MyDrive/ldh"

# %pip install transformers datasets pytorch-lightning "ray[tune]" wandb
# %pip uninstall dataclasses # Apparently we need to remove dataclasses or it messes with the tuner
# import nltk
# nltk.download('punkt')


import os

ROOT_DIR = "/Users/danielpham/Google Drive/ldh"
%cd {ROOT_DIR}


/Users/danielpham/Google Drive/ldh


In [4]:
import torch

# Experiment-wide constants shared by the cells below.
STRAT = 'obj'         # forwarded as `strat=` to LDHDataModule and kfold_train
NUM_FOLDS = 5         # forwarded as `kfolds=` to LDHDataModule and as the first argument of kfold_train
TEST_BATCH_SIZE = 64  # not referenced by any cell visible in this notebook section

# Both branches currently yield 16; the CPU/GPU split is preserved as-is.
if torch.cuda.is_available():
    BATCH_SIZE = 16
else:
    BATCH_SIZE = 16

## Load LDH Data

The LDH data contains the following:

- LDHI
- LDHII
- LDHIII

In [44]:
from reappraisalmodel.ldhdata import LDHDataModule

# Build the data module from the constants defined in the cells above.
ldhdata = LDHDataModule(
    data_dir=ROOT_DIR,
    batch_size=BATCH_SIZE,
    strat=STRAT,
    kfolds=NUM_FOLDS,
)

Loading cached processed dataset at /Users/danielpham/Google Drive/ldh/output/training/obj/cache-f115175015bf3091.arrow
100%|██████████| 32109/32109 [00:12<00:00, 2549.92ex/s]


Training data loaded from disk.
Encoding Train Data:
Evaluation data loaded from disk.
Encoding Test Data:


## Run K-Fold Training


In [None]:
# Reload project modules so edits to reappraisalmodel are picked up before training.
%autoreload
from reappraisalmodel.trainers import kfold_train

# Run kfold_train on the LDH data module with max_epochs=20 and keep the return
# value in `results` for inspection (presumably one entry per fold -- verify
# against kfold_train).
results = kfold_train(NUM_FOLDS, ldhdata, strat=STRAT, max_epochs=20)

In [None]:

# Display the current value of `results`.
results

## Tuning Hyperparameters


In [None]:
# Reload project modules so edits to reappraisalmodel are picked up before tuning.
%autoreload
from reappraisalmodel.trainers import run_tune
# Start the hyperparameter search on the LDH data module; the return value (if any) is not kept.
run_tune(ldhdata)

# Exploring

## Shape of Model

In [None]:
from transformers import AutoModel, AutoTokenizer

# Checkpoint name shared by the tokenizer and the model loaded below.
pretrained_model = "distilbert-base-uncased-finetuned-sst-2-english"

# The two loads are independent of each other.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
model = AutoModel.from_pretrained(pretrained_model)

### Tokenization

In [None]:
# hide
# Tokenize two example sentences, padding each one to 150 tokens.
# The call returns a BatchEncoding of the text.
tokenized = tokenizer(
    text=["This is the first test sentence!", "This is the second, better test sentence."],
    padding='max_length', max_length=150)

# Read the mask from the batch-level `attention_mask` field instead of
# `tokenized[idx]`: integer indexing on a BatchEncoding is only available
# with fast (Rust-backed) tokenizers, while the field access works with both.
for idx, (ids, mask) in enumerate(zip(tokenized.input_ids, tokenized.attention_mask)):
    print(f"Sentence            {idx}: {tokenizer.convert_ids_to_tokens(ids)}")
    print(f"Tokenized Attention {idx}: {mask}")



### Training

In [None]:
%autoreload
import torch
from reappraisalmodel.lightningreapp import LightningReapp

default_config = default_config = {
    'lr': 1e-3,
    'hidden_layer_size': 50
}

# Convert attention mask to a true/false tensor
# use masked_select to select only the non-zero parts of the encoding
# Average over feature dimension using AvgPool1d, where stride_size = kernel_size = encode_length_no_zeros




In [None]:
# Build the checkpoint path from ROOT_DIR so it follows the project root used
# by the other cells instead of repeating the absolute path.
checkpoint_path = os.path.join(
    ROOT_DIR, 'lightning_logs_obj_0223', 'version_2', 'checkpoints',
    'epoch=1-step=337.ckpt')

# Restore the trained LightningReapp onto the CPU. This rebinds `model`, which
# previously held the AutoModel loaded in the "Shape of Model" section.
model = LightningReapp.load_from_checkpoint(checkpoint_path, map_location='cpu')

# Switch to evaluation mode, then display the module.
model.eval()
model

In [37]:
# NOTE(review): `pd` is not used anywhere in this cell.
import pandas as pd
from pytorch_lightning import Trainer 

# Put the restored model in evaluation mode before building the trainer.
model.eval()
trainer = Trainer(
    gradient_clip_val=1.0,
    progress_bar_refresh_rate=30,
    terminate_on_nan=True)

# NOTE(review): this only aliases the data module. No trainer.test(...) call is
# visible in this cell even though the saved output shows a completed test
# run -- confirm whether the call was removed after execution.
test_dataloader = ldhdata

GPU available: False, used: False
TPU available: None, using: 0 TPU cores


Testing: 100%|██████████| 502/502 [1:17:17<00:00,  9.24s/it]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'predict':     addcode  daycode condition  \
0   ZCQ3041        5  LookNeut   
1   ZCQ3041        5  LookNeut   
2   ZCQ3041        5  LookNeut   
3   ZCQ3041        5  LookNeut   
4   ZCQ3041        5  LookNeut   
5   ZCQ3041        5  LookNeut   
6   ZCQ3041        5  LookNeut   
7   ZCQ3041        5  LookNeut   
8   ZCQ3041        5  LookNeut   
9   ZCQ3041        5  LookNeut   
10  ZCQ3041        5  LookNeut   
11  ZCQ3041        5  LookNeut   
12  XAS1043        5  LookNeut   
13  XAS1043        5  LookNeut   
14  XAS1043        5  LookNeut   
15  XAS1043        5  LookNeut   
16  XAS1043        5  LookNeut   
17  XAS1043        5  LookNeut   
18  XAS1043        5  LookNeut   
19  XAS1043        5  LookNeut   
20  XAS1043        5  LookNeut   
21  XAS1043        5  LookNeut   
22  XAS1043        5  LookNeut   
23  

In [43]:
# NOTE(review): `dfs` is initialised but never filled in this cell -- confirm
# whether a later cell appends to it.
dfs = []
# Print the number of entries under the 'predict' key of each result.
for result in results:
    print(len(result['predict']))

45


In [45]:
import pickle

# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
# Open read-only ('rb'): loading never writes, and 'rb+' would additionally
# require write permission on the file. This rebinds `results`.
with open('./output_reapp.pkl', 'rb') as f:
    results = pickle.load(f)

In [46]:
# Display the object loaded from ./output_reapp.pkl.
results

[(0,
  [3.238124132156372,
   3.2816250324249268,
   2.7720842361450195,
   2.6859583854675293,
   3.020054340362549,
   2.9753594398498535,
   2.997465133666992,
   3.200166702270508,
   2.7771201133728027,
   2.407811164855957,
   2.606590747833252,
   3.1598992347717285,
   3.0359230041503906,
   2.8698744773864746,
   3.0974855422973633,
   2.9610798358917236,
   3.0385332107543945,
   2.34713077545166,
   2.4361188411712646,
   2.711233615875244,
   2.7180004119873047,
   2.6673500537872314,
   3.18027400970459,
   2.390026330947876,
   2.3737940788269043,
   3.362354040145874,
   2.7100276947021484,
   2.434675693511963,
   3.1180777549743652,
   5.595335960388184,
   2.964660167694092,
   3.20009708404541,
   5.224921226501465,
   3.440326690673828,
   4.354846477508545,
   3.460278034210205,
   4.78737211227417,
   2.5443875789642334,
   5.492396354675293,
   3.142167568206787,
   5.802211284637451,
   2.729754686355591,
   2.9242591857910156,
   5.210406303405762,
   4.8059048