# Reappraisal Training on PyTorch Lightning

## Setup
- When running on Google Colab, uncomment the setup lines in the cell below to mount Google Drive and access the project scripts.
- Then `cd` into the project root and install the dependencies:

In [None]:
%load_ext autoreload
# from google.colab import drive
# drive.mount('/content/drive')
# %cd {root_dir}

# %pip install pytorch-lightning "ray[tune]" wandb transformers datasets nltk nbdev jupyterlab_github
# ! nbdev_install_git_hooks

# import nltk
# nltk.download('punkt')

# ROOT_DIR = '/root/reappraisal-model'


In [None]:
# Notebook-wide constants.
STRAT = "obj"        # passed as `strat=` to LDHDataModule and kfold_train
BATCH_SIZE = 64      # passed as `batch_size=` to the first LDHDataModule
NUM_FOLDS = 1        # NOTE(review): no visible cell reads this; fold counts are hard-coded

## Load LDH Data

The LDH training data consists of the following datasets:

- LDHI
- LDHII
- LDHIII

In [None]:
%autoreload
from reappraisalmodel.ldhdata import LDHDataModule
ldhdata = LDHDataModule(data_dir=ROOT_DIR, batch_size=BATCH_SIZE, strat=STRAT, kfolds=5)
ldhdata.load_train_data()


Training data loaded from disk.


[autoreload of reappraisalmodel.ldhdata failed: Traceback (most recent call last):
  File "/Users/danielpham/.pyenv/versions/3.8.7/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/Users/danielpham/.pyenv/versions/3.8.7/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 410, in superreload
    update_generic(old_obj, new_obj)
  File "/Users/danielpham/.pyenv/versions/3.8.7/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 347, in update_generic
    update(a, b)
  File "/Users/danielpham/.pyenv/versions/3.8.7/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 302, in update_class
    if update_generic(old_obj, new_obj): continue
  File "/Users/danielpham/.pyenv/versions/3.8.7/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 347, in update_generic
    update(a, b)
  File "/Users/danielpham/.pyenv/versions/3.8.7/lib/python3.8/site-pac

## Run K-Fold Training


In [None]:
%autoreload 2
from reappraisalmodel.trainers import kfold_train


#TODO: ADD CSVLOGGER!!!!
results = kfold_train(5, ldhdata, strat=STRAT, 
                       max_epochs=15, 
#                        limit_train_batches=2,
#                        limit_val_batches=1
                    )


In [None]:
import pandas as pd

# Tabulate the k-fold results and unwrap the tensor-valued metric columns into
# plain Python numbers so that describe() can summarise them.
df = pd.DataFrame(results).assign(
    r2score=lambda frame: frame['r2score'].apply(lambda t: t.item()),
    explained_var=lambda frame: frame['explained_var'].apply(lambda t: t.item()),
)
df.describe()

In [None]:
import math

# NOTE(review): 0.5976 is a hard-coded value — presumably an R² copied from a
# results table (its square root would then be the correlation r); confirm.
math.sqrt(0.5976)

## Tuning Hyperparameters


In [None]:
# Rebind `ldhdata` to a data module with kfolds=1 for tuning. Unlike the
# earlier construction, batch_size is not passed here. LDHDataModule must have
# been imported by an earlier cell.
ldhdata = LDHDataModule(data_dir=ROOT_DIR, strat=STRAT, kfolds=1)

In [None]:
# export
from functools import partial
from argparse import ArgumentParser

import torch
import pytorch_lightning as lit
from ray.tune.integration.pytorch_lightning import TuneReportCallback
from ray import tune

from reappraisalmodel.lightningreapp import LightningReapp

# Search space sampled by Ray Tune for each trial.
default_tune_config = dict(
    lr=tune.loguniform(1e-4, 1e-1),  # loguniform samples by magnitude
    hidden_layer_size=tune.quniform(10, 50, 1),
)

# Scheduler left disabled; `num_epochs` is not defined in this cell.
# scheduler = ASHAScheduler(
#     max_t=num_epochs,
#     grace_period=1,
#     reduction_factor=2)

# Maps the name reported to Tune ("loss") to the Lightning metric "val_loss";
# the report is sent at the end of each validation run.
callback_tuner = TuneReportCallback(
    metrics={
        "loss": "val_loss",
        # "mean_accuracy": "val_accuracy"
    },
    on="validation_end",
)



### TUNING HYPERPARAMETERS
def train_tune(config, num_gpus=None, **tuner_kwargs):
    """Fit one LightningReapp model on ``ldhdata`` for a single tuning trial.

    :param config: hyperparameter dict handed to ``LightningReapp``.
    :param num_gpus: forwarded to ``lit.Trainer(gpus=...)``.
    :param tuner_kwargs: accepted but not used by this function.
    :return: None. Metrics are reported through ``callback_tuner``.
    """
    reapp = LightningReapp(config)
    lit.Trainer(gpus=num_gpus, callbacks=[callback_tuner]).fit(reapp, ldhdata)


# One GPU per trial when CUDA is available, otherwise CPU-only trials.
gpus_per_trial = 1 if torch.cuda.is_available() else 0

analysis = tune.run(
    tune.with_parameters(train_tune, num_gpus=gpus_per_trial),
    config=default_tune_config,
    num_samples=2,
    # Ray expects a mapping with the keys 'cpu' and 'gpu'.
    resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
    # "loss" is the name callback_tuner reports val_loss under. metric/mode
    # are needed so that analysis.best_config can pick a winner.
    metric="loss",
    mode="min",
    # No scheduler is passed: the ASHAScheduler definition in this cell is
    # commented out, so `scheduler` would be an undefined name. Re-add
    # scheduler=... once it is defined.
)
print("Best hyperparameters found were: ", analysis.best_config)


TypeError: __init__() got an unexpected keyword argument 'verbose'

## Extra!

In [None]:
import boto3

# `s3` was never created in this notebook and boto3 was only imported in a
# later cell, so a fresh kernel failed here. list_objects and copy are S3
# client methods, hence a low-level client.
s3 = boto3.client('s3')
resp = s3.list_objects(Bucket='ldhdata')

In [None]:
# The existing object key embeds a stringified result dict; copy that object
# to a shorter key in the same bucket.
key = "obj/{'metrics': {'epoch': tensor(10.), 'val_loss': tensor(1.2882), 'r2score': tensor(0.5832), 'explained_var': tensor(0.6109), 'train_loss': tensor(1.1194, device='cuda:0')}, 'checkpoint': '/tmp/tmpvsrtlvfl/reappmodel_obj_20210227_203013/2_epoch=07-val_loss=1.12.ckpt', 'num_epochs': 10}-20210227_203013-2_epoch=07-val_loss=1.12.ckpt"
s3.copy(
    CopySource={'Bucket': 'ldhdata', 'Key': key},
    Bucket='ldhdata',
    Key='obj/20210227_203013-2_epoch=07-val_loss=1.12.ckpt',
)


In [None]:
import torch

# Metric records pasted from a training log. The logged train_loss tensors
# lived on 'cuda:0'; they are rebuilt on the CPU here so the cell also runs on
# machines without a GPU. Only the scalar values are used below, so the
# resulting numbers are unchanged.
metrics = [
    {'epoch': torch.tensor(7.), 'val_loss': 1.1842, 'r2score': torch.tensor(0.6130), 'explained_var': torch.tensor(0.6388), 'train_loss': torch.tensor(1.1642)},
    {'epoch': torch.tensor(8.), 'val_loss': 1.1819, 'r2score': torch.tensor(0.6087), 'explained_var': torch.tensor(0.6377), 'train_loss': torch.tensor(1.0931)},
    {'epoch': torch.tensor(9.), 'val_loss': 1.2094, 'r2score': torch.tensor(0.5926), 'explained_var': torch.tensor(0.6366), 'train_loss': torch.tensor(1.1363)},
    {'epoch': torch.tensor(10.), 'val_loss': 1.2712, 'r2score': torch.tensor(0.5906), 'explained_var': torch.tensor(0.6339), 'train_loss': torch.tensor(1.0842)},
    {'epoch': torch.tensor(10.), 'val_loss': 1.2882, 'r2score': torch.tensor(0.5832), 'explained_var': torch.tensor(0.6109), 'train_loss': torch.tensor(1.1194)},
]

df = pd.DataFrame(metrics)
# Unwrap the 0-d tensors into plain Python floats (val_loss already is one).
for key in ['r2score', 'epoch', 'explained_var', 'train_loss']:
    df[key] = df[key].apply(lambda x: x.item())

In [None]:
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False (a ClientError is logged)
    """
    # Imported locally so the function works on a fresh kernel: no earlier
    # cell brings boto3, ClientError or logging into scope.
    import logging

    import boto3
    from botocore.exceptions import ClientError

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file; the call's return value is not needed.
    s3_client = boto3.client('s3')
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True


# Push the local report CSV to the bucket and show whether it succeeded.
upload_report = upload_file(
    file_name='this.csv',
    bucket='ldhdata',
    object_name='obj/20210227_203013-report.csv',
)
print(f"Successful Uploading Report to s3: {upload_report}")

In [None]:
# Show summary statistics for whatever `df` currently holds.
df.describe()

In [None]:
# A list_objects response omits 'Contents' when no keys match, so fall back
# to an empty list instead of raising KeyError.
for obj in resp.get('Contents', []):
    print(obj['Key'])

# Exploring

## Shape of Model

In [None]:
from transformers import AutoModel, AutoTokenizer

# Checkpoint name shared by the model and its tokenizer.
pretrained_model = "distilbert-base-uncased-finetuned-sst-2-english"

# Evaluated left to right: the model is loaded first, then the tokenizer.
model, tokenizer = (
    AutoModel.from_pretrained(pretrained_model),
    AutoTokenizer.from_pretrained(pretrained_model),
)

### Tokenization

In [None]:
# hide
# Tokenize two sample sentences, padding each to max_length=150.
tokenized = tokenizer(
    text=[
        "This is the first test sentence!",
        "This is the second, better test sentence.",
    ],
    padding='max_length',
    max_length=150,
)

# Print each sentence's tokens next to its attention mask.
for idx in range(len(tokenized.input_ids)):
    sent = tokenized.input_ids[idx]
    print(f"Sentence            {idx}: {tokenizer.convert_ids_to_tokens(sent)}")
    print(f"Tokenized Attention {idx}: {tokenized[idx].attention_mask}")



### Training

In [None]:
%autoreload
import torch
import pytorch_lightning as lit
from reappraisalmodel.lightningreapp import LightningReapp

default_config = default_config = {
    'lr': 1e-3,
    'hidden_layer_size': 50
}

model = LightningReapp(default_config)

trainer = lit.Trainer(
    gpus = 1 if torch.cuda.is_available() else None,
    gradient_clip_val=1.0,
    progress_bar_refresh_rate=30,
    max_epochs=10,
    fast_dev_run=2,
    terminate_on_nan=True)

model = LightningReapp(default_config)

trainer.fit(model, ldhdata.train_dataloader(), ldhdata.val_dataloader())



In [None]:
# Restore a trained model onto the CPU.
# NOTE(review): the checkpoint path is absolute and machine-specific, so this
# cell only runs where that file exists.
model = LightningReapp.load_from_checkpoint(
    '/Users/danielpham/Google Drive/ldh/lightning_logs_obj_0223/version_2/checkpoints/epoch=1-step=337.ckpt', map_location='cpu')

# Switch to evaluation mode, then display the module as the cell output.
model.eval()
model

In [None]:
import pandas as pd
from pytorch_lightning import Trainer 

model.eval()
# Fresh Trainer; no gpus or max_epochs arguments are passed here.
trainer = Trainer(
    gradient_clip_val=1.0,
    progress_bar_refresh_rate=30,
    terminate_on_nan=True)

# NOTE(review): this binds the data module itself, not a dataloader — confirm
# whether a dataloader obtained from ldhdata was intended.
test_dataloader = ldhdata

In [None]:
# NOTE(review): `dfs` is not used by any visible cell.
dfs = []
# Print the length of each result's 'predict' entry.
# NOTE(review): `results` may come from kfold_train or from the pickle loaded
# in the next cell — confirm the intended execution order.
for result in results:
    print(len(result['predict']))

In [None]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles produced by this project.
# Read-only binary mode is enough for loading; 'rb+' also required write
# permission on the file.
with open('./output_reapp.pkl', 'rb') as f:
    results = pickle.load(f)

In [None]:
import codecs
import csv
import pickle

import boto3
from sagemaker import get_execution_role

role = get_execution_role()

bucket = 'ldhdata'
file = 'Master_Final_TrainingData.csv'

# Fetch the training CSV object from S3.
s3client = boto3.client('s3')
response = s3client.get_object(Bucket=bucket, Key=file)

# Wrap the response body in a UTF-8 reader and parse it as CSV; each row is
# yielded as a dict keyed by the header.
train = csv.DictReader(
    codecs.getreader("utf-8")(response["Body"])
)


In [None]:
import torch
from pytorch_lightning.metrics.functional import r2score, explained_variance

# Two random 16-element vectors as stand-in data; no seed is set, so the
# score differs on every run.
expected = torch.rand(16)
observed = torch.rand(16)

# NOTE(review): explained_variance is imported but not used in this cell.
# Also confirm that (expected, observed) matches the argument order r2score
# expects.
r2score(expected, observed)