# Reappraisal Training on PyTorch Lightning

## Setup
- When running on Google Colab, mount Google Drive to access scripts.
- `cd` into the project root and install dependencies:

In [None]:
# Load IPython's autoreload extension; later cells call %autoreload to
# re-import project modules after they are edited.
%load_ext autoreload
# Google Colab only: uncomment to mount Drive and point ROOT_DIR at the project.
# from google.colab import drive
# drive.mount('/content/drive')
# ROOT_DIR = "/content/drive/MyDrive/ldh"

# Optional one-time dependency setup (uncomment as needed).
# %pip install transformers datasets pytorch-lightning "ray[tune]" wandb
# %pip uninstall dataclasses # Apparently we need to remove dataclasses or it messes with the tuner
# import nltk
# nltk.download('punkt')


import os

# Local project root; it is also passed to the data module as `data_dir`.
ROOT_DIR = "/Users/danielpham/Google Drive/ldh"
# Change the working directory to the project root (IPython expands
# {ROOT_DIR} from the Python variable above).
%cd {ROOT_DIR}


In [None]:
import torch

# Define constants
# Strategy identifier, passed as `strat` to the data module and the k-fold trainer.
STRAT = 'obj'
# Training batch size. The previous expression selected 16 on both the CPU and
# the GPU branch, so the device check was a no-op; reintroduce a
# `torch.cuda.is_available()` conditional here if the sizes should ever differ.
BATCH_SIZE = 16
TEST_BATCH_SIZE = 64
# Number of cross-validation folds, passed as `kfolds` to the data module and
# as the first argument of `kfold_train`.
NUM_FOLDS = 5

## Load LDH Data

The LDH data contains the following:

- LDHI
- LDHII
- LDHIII

In [None]:
from reappraisalmodel.ldhdata import LDHDataModule

# Build the data module from the project root and the constants defined in the
# constants cell; one keyword per line keeps the configuration easy to scan.
ldhdata = LDHDataModule(
    data_dir=ROOT_DIR,
    batch_size=BATCH_SIZE,
    strat=STRAT,
    kfolds=NUM_FOLDS,
)

## Run K-Fold Training


In [None]:
%autoreload
from reappraisalmodel.trainers import kfold_train

results = kfold_train(NUM_FOLDS, ldhdata, strat=STRAT, max_epochs=20)

In [None]:

# Echo `results` so the notebook renders it as this cell's output.
results

## Tuning Hyperparameters


In [None]:
# Re-import edited project modules before using them.
%autoreload
from reappraisalmodel.trainers import run_tune
# Start hyperparameter tuning on the LDH data module.
# NOTE(review): the search space and tuning backend live inside run_tune — see reappraisalmodel.trainers.
run_tune(ldhdata)

# Exploring

## Shape of Model

In [None]:
from transformers import AutoModel, AutoTokenizer

# Name of the pretrained checkpoint used for both the model and the tokenizer.
pretrained_model = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the model first, then the matching tokenizer, from the same checkpoint name.
model = AutoModel.from_pretrained(pretrained_model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

### Tokenization

In [None]:
# hide
# Tokenize two example sentences, padding each to max_length=150, and print
# every sentence's tokens next to its attention mask.
# `tokenizer(...)` returns a BatchEncoding of the text.
tokenized = tokenizer(
    text=[
        "This is the first test sentence!",
        "This is the second, better test sentence.",
    ],
    padding='max_length',
    max_length=150,
)

# Read the masks from the `attention_mask` field rather than via integer
# indexing (`tokenized[idx]`): integer indexing is only available when the
# tokenizer is a "fast" tokenizer, while the field access works for both.
for idx, (sent, mask) in enumerate(zip(tokenized.input_ids, tokenized.attention_mask)):
    print(f"Sentence            {idx}: {tokenizer.convert_ids_to_tokens(sent)}")
    print(f"Tokenized Attention {idx}: {mask}")



### Training

In [None]:
# Re-import edited project modules before using them.
%autoreload
import torch
# Project model class; a later cell restores it from a checkpoint.
from reappraisalmodel.lightningreapp import LightningReapp

# Default hyperparameter values.
# NOTE(review): presumably consumed by LightningReapp / the tuner — confirm
# against reappraisalmodel; nothing in this notebook reads it directly.
default_config = {
    'lr': 1e-3,
    'hidden_layer_size': 50,
}

# Convert the attention mask to a true/false tensor.
# Use masked_select to select only the non-zero parts of the encoding.
# Average over the feature dimension using AvgPool1d, where stride_size = kernel_size = encode_length_no_zeros.




In [None]:
import os

# Build the checkpoint path from ROOT_DIR instead of repeating the absolute
# project path, so the cell keeps working when ROOT_DIR points elsewhere
# (e.g. the Colab path in the setup cell). With the current ROOT_DIR this
# resolves to the same file as before.
checkpoint_path = os.path.join(
    ROOT_DIR,
    'lightning_logs_obj_0223',
    'version_2',
    'checkpoints',
    'epoch=1-step=337.ckpt',
)

# Restore the trained model from the checkpoint, mapping its tensors to the CPU.
model = LightningReapp.load_from_checkpoint(checkpoint_path, map_location='cpu')

# Switch the model to evaluation mode, then echo it as the cell output.
model.eval()
model

In [None]:
import pandas as pd
from pytorch_lightning import Trainer

# Make sure the restored model is in evaluation mode before building the trainer.
model.eval()

# Gather the trainer settings in one mapping so they are easy to scan and tweak.
trainer_options = {
    'gradient_clip_val': 1.0,
    'progress_bar_refresh_rate': 30,
    'terminate_on_nan': True,
}
trainer = Trainer(**trainer_options)

# NOTE(review): this binds the data module object itself under the name
# `test_dataloader` — confirm that is what downstream code expects.
test_dataloader = ldhdata

In [None]:
# Accumulator that this cell creates but does not fill.
dfs = list()

# Report how many entries each result holds under its 'predict' key.
for result in results:
    n_predictions = len(result['predict'])
    print(n_predictions)

In [None]:
import pickle

# Reload previously saved results from disk.
# The file is only read, so open it read-only ('rb'); the former 'rb+' mode
# also requested write access and fails on read-only files.
# NOTE(security): pickle.load can execute arbitrary code — only load files
# produced by a trusted run of this project.
with open('./output_reapp.pkl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# Echo `results` so the notebook renders it as this cell's output.
results

In [None]:
import pickle

import boto3
from sagemaker import get_execution_role

# S3 location of the training data: bucket name and object key.
bucket = 'ldhdata'
file = 'Master_Final_TrainingData.csv'

# Look up the SageMaker execution role and create the S3 client used to
# fetch the object in the next cell.
role = get_execution_role()
s3client = boto3.client('s3')

In [None]:
# Request the training CSV object from S3; the next cell reads response["Body"].
response = s3client.get_object(Bucket=bucket, Key=file)

In [None]:
import codecs
import csv

# Wrap the S3 response body in a UTF-8 decoding stream reader and parse it as
# CSV. No fieldnames are given, so the first row supplies the column names.
train = csv.DictReader(codecs.getreader("utf-8")(response["Body"])) # a DictReader iterator yielding one dict per row (not a single dict)


encodings.utf_8.StreamReader