# Reappraisal Training on PyTorch Lightning

## Setup
- When running on Google Colab, mount Google Drive to access scripts.
- `cd` into the project root and install dependencies:

In [None]:
# Enable IPython's autoreload extension for this kernel.
%load_ext autoreload
# Colab only: uncomment the lines below to mount Google Drive and change
# into the project root.
# from google.colab import drive
# drive.mount('/content/drive')
# %cd {root_dir}

# Install the notebook's dependencies into the kernel's environment.
%pip install pytorch-lightning "ray[tune]" wandb transformers datasets nltk 
%pip install nbdev 


import nltk
# Download NLTK's 'punkt' resource.
nltk.download('punkt')

# Project root directory.
# NOTE(review): hard-coded absolute path — adjust when running on another machine.
ROOT_DIR = '/home/ec2-user/SageMaker/reappraisal-model/'

In [None]:
import torch

# Shared configuration constants for this notebook.
STRAT = 'obj'
# The same batch size is used on CPU and GPU. Make this conditional on
# torch.cuda.is_available() only if the two should ever differ.
BATCH_SIZE = 16
TEST_BATCH_SIZE = 64
NUM_FOLDS = 5

## Load LDH Data

The LDH data contains the following:

- LDHI
- LDHII
- LDHIII

In [None]:
%autoreload
from reappraisalmodel.ldhdata import LDHDataModule
ldhdata = LDHDataModule(data_dir=ROOT_DIR, batch_size=BATCH_SIZE, strat=STRAT, kfolds=NUM_FOLDS)

[autoreload of reappraisalmodel.ldhdata failed: Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 410, in superreload
    update_generic(old_obj, new_obj)
  File "/home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 347, in update_generic
    update(a, b)
  File "/home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 302, in update_class
    if update_generic(old_obj, new_obj): continue
  File "/home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 347, in update_generic
    update(a, b)
  File "/home/ec2-user/an

Training data loaded from disk.
Encoding Train Data:
Evaluation data loaded from disk.
Encoding Test Data:


HBox(children=(FloatProgress(value=0.0, max=32109.0), HTML(value='')))




## Run K-Fold Training


In [None]:
%autoreload
from reappraisalmodel.trainers import kfold_train

results = kfold_train(NUM_FOLDS, ldhdata, strat=STRAT, max_epochs=20, fast_dev_run=1)

SyntaxError: invalid syntax (trainers.py, line 56)

## Tuning Hyperparameters


In [None]:
# Reload edited modules before importing from the project package.
%autoreload
from reappraisalmodel.trainers import run_tune
# Run the project's tuning entry point on the LDH data module.
run_tune(ldhdata)

# Exploring

## Shape of Model

In [None]:
from transformers import AutoModel, AutoTokenizer

# Checkpoint name shared by the tokenizer and the model loaded below.
pretrained_model = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
model = AutoModel.from_pretrained(pretrained_model)

### Tokenization

In [None]:
# hide
# Tokenize two sample sentences, padding each to 150 tokens.
# The tokenizer returns a BatchEncoding.
tokenized = tokenizer(text = ["This is the first test sentence!", "This is the second, better test sentence."], 
    padding='max_length', max_length=150)

# Read the masks from the `attention_mask` field instead of integer-indexing
# the BatchEncoding: integer indexing is only available with fast tokenizers,
# while the field lookup works for both fast and slow tokenizers.
for idx, (sent, mask) in enumerate(zip(tokenized.input_ids, tokenized.attention_mask)):
    print(f"Sentence            {idx}: {tokenizer.convert_ids_to_tokens(sent)}")
    print(f"Tokenized Attention {idx}: {mask}")



### Training

In [None]:
%autoreload
import torch
import pytorch_lightning as lit
from reappraisalmodel.lightningreapp import LightningReapp

default_config = default_config = {
    'lr': 1e-3,
    'hidden_layer_size': 50
}

model = LightningReapp(default_config)

trainer = lit.Trainer(
    gpus = 1 if torch.cuda.is_available() else None,
    gradient_clip_val=1.0,
    progress_bar_refresh_rate=30,
    max_epochs=10,
    fast_dev_run=2,
    terminate_on_nan=True)

model = LightningReapp(default_config)

trainer.fit(model, ldhdata.train_dataloader(), ldhdata.val_dataloader())



In [None]:
# NOTE(review): hard-coded path on a local machine — confirm it exists where
# this notebook runs.
checkpoint_path = '/Users/danielpham/Google Drive/ldh/lightning_logs_obj_0223/version_2/checkpoints/epoch=1-step=337.ckpt'

# Restore the module from the checkpoint, mapping its tensors onto the CPU.
model = LightningReapp.load_from_checkpoint(checkpoint_path, map_location='cpu')

# Switch to evaluation mode, then show the module as the cell output.
model.eval()
model

In [None]:
import pandas as pd
from pytorch_lightning import Trainer 

# Put the model in evaluation mode.
model.eval()
# Trainer configured with gradient clipping at 1.0 and NaN termination.
trainer = Trainer(
    gradient_clip_val=1.0,
    progress_bar_refresh_rate=30,
    terminate_on_nan=True)

# NOTE(review): this binds the data module itself, not a dataloader — confirm
# whether `ldhdata.test_dataloader()` was intended.
test_dataloader = ldhdata

In [None]:
# NOTE(review): `dfs` is initialised here but not used in this cell.
dfs = []
# Print how many entries each element of `results` holds under 'predict'.
for result in results:
    print(len(result['predict']))

In [None]:
import pickle

# Load previously saved results from disk. Read-only binary mode is all that
# unpickling needs; 'rb+' would also demand write permission on the file.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('./output_reapp.pkl', 'rb') as f:
    results = pickle.load(f)

In [None]:
import codecs
import csv
import pickle

import boto3
from sagemaker import get_execution_role

# Fetch the execution role through the SageMaker SDK.
role = get_execution_role()

# Bucket and object key of the training CSV.
bucket = 'ldhdata'
file = 'Master_Final_TrainingData.csv'

s3client = boto3.client('s3')
response = s3client.get_object(Bucket=bucket, Key=file)

# Wrap the response body in a UTF-8 decoding reader and parse it as CSV;
# each row is yielded as a mapping keyed by column name.
utf8_reader = codecs.getreader("utf-8")
train = csv.DictReader(utf8_reader(response["Body"]))


In [None]:
import torch
from pytorch_lightning.metrics.functional import r2score, explained_variance

# Two random vectors of 16 values each, used as stand-in data.
expected, observed = torch.rand(16), torch.rand(16)

# The score for the random pair is the cell's displayed output.
r2score(expected, observed)

In [None]:
%nbdev_build_lib --fname 'Trainers.ipynb'

UsageError: Line magic function `%nbdev_build_lib` not found.
