# Reappraisal Training For Linguistic Distancing and Emotion Regulation


## Setup
```bash
> pipenv shell  #Generates a new virtual environment based on Pipfile
> pipenv install # Installs the packages in Pipfile.lock (Use --dev) to also install dev packages
```
## Included Datasets
- LDHII 
- 
- Emobank
**Sources**
-  [Sentiment Analysis Text Classification Tutorial](https://www.youtube.com/watch?v=8N-nM3QW7O0)
- [Using Catalyst for Training Organization](https://github.com/catalyst-team/catalyst)



In [1]:
# %%capture
# !pip install wandb -qqq
# import wandb
# !wandb login

## Sample code for tracking model training runs in wandb 
# see: https://colab.research.google.com/github/wandb/examples/blob/master/colabs/intro/Intro_to_Weights_%26_Biases.ipynb#scrollTo=-VE3MabfZAcx
# import math
# import random

# # 1️⃣ Start a new run, tracking config metadata
# wandb.init(project="test-drive", config={
#     "learning_rate": 0.02,
#     "dropout": 0.2,
#     "architecture": "CNN",
#     "dataset": "CIFAR-100",
# })
# config = wandb.config

# # Simulating a training or evaluation loop
# for x in range(50):
#     acc = math.log(1 + x + random.random() * config.learning_rate) + random.random()
#     loss = 10 - math.log(1 + x + random.random() + config.learning_rate * x) + random.random()
#     # 2️⃣ Log metrics from your script to W&B
#     wandb.log({"acc":acc, "loss":loss})

# wandb.finish()

In [2]:
# TODO: Add Open in Colab Button
# TODO: Write scripts for running as CLI in pipfile
# TODO: hyperparameter search

In [3]:
import os
import numpy as np
import pandas as pd
import torch
# !pipenv install pandas

In [4]:
from datasets import ReadInstruction

# Enable GPU usage, if we can.
if torch.cuda.is_available():
    print("Enabling GPU usage")
    device = torch.device("cuda:0")
    IS_GPU = True
else:
    print("No GPU available, running on CPU")
    device = torch.device("cpu") # Note: macOS incompatible with NVIDIA GPUs
    IS_GPU = False
    
# Constants and environment setup
# TODO: Set up env files for dev and "prod"
#Casing can matter for sentiment analysis ("bad" vs. "BAD")
PRETRAINED_MODEL_NAME = 'distilbert-base-cased'

No GPU available, running on CPU


### LDH Dataset Imports

In [5]:
from src.LDHData import LDHData

data = LDHData()
ldh_train, ldh_eval = data.get_spatiotemp_data().values()
# Split LDH Data into a training dataset and a validation dataset.
train_ds = ldh_train.train_test_split(test_size=0.15) # shuffle

### IMDB Dataset Import

### EmoBank Dataset Import

In [6]:
from torch import nn, optim
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from datasets import Features, Sequence, Value
# Tokenize the datasets.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")
# Add score to the model inputs so we can calculate loss.
tokenizer.model_input_names.append("score")
encoded_ds= train_ds.map(
    lambda batch: tokenizer(
        batch['response'],
        add_special_tokens=True,
        padding="max_length",
        truncation=True), 
    batched=True, batch_size=16, features=Features({
        'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
        'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
        'response': Value(dtype='string', id=None),
        'score': Value(dtype='float32', id=None)
    }))


# Reformat the dataset to PyTorch tensors.
encoded_ds.set_format(type='torch', columns=['attention_mask', 'input_ids', 'score'])

# {'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
#  'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
#  'response': Value(dtype='string', id=None),
#  'score': Value(dtype='float64', id=None)}

encoded_ds['train'].features

HBox(children=(FloatProgress(value=0.0, max=716.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=127.0), HTML(value='')))




{'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'response': Value(dtype='string', id=None),
 'score': Value(dtype='float32', id=None)}

## Generate NN Model

In [7]:
from transformers import TrainingArguments, Trainer, DistilBertModel
from torch.utils.data import DataLoader

from src.ReappModel import ReappModel

# Create the training model.
# TODO: Suppress initialization errors.
model = ReappModel(DistilBertModel, PRETRAINED_MODEL_NAME)

num_train_epochs = 3 if IS_GPU else 1

# Define the parameters under which the model will be trained.
# By default, uses an AdamW optimizer w/ linear warmup.

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)
encoded_train = encoded_ds['train']
encoded_test  = encoded_ds['test']

print(encoded_train[0].keys())

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,                  
    train_dataset=encoded_train,      
    eval_dataset=encoded_test         
)



dict_keys(['attention_mask', 'input_ids', 'score'])


  return torch.tensor(x, **format_kwargs)


In [10]:
encoded_train_step = encoded_train.select(range(16))
input_ids = encoded_train_step['input_ids']
attention_mask = encoded_train_step['attention_mask']
score = encoded_train_step['score'],

trainer.training_step(model, {
    "input_ids": input_ids,
    "attention_mask": attention_mask,
    "score": score
    }
)

trainer.evaluate(model, {
    "input_ids": input_ids,
    "attention_mask": attention_mask,
    "score": score
    }
)

torch.Size([16]) torch.Size([16])


ValueError: eval_dataset must implement __len__

In [9]:
# Model Evaluation: Parse the TrainOutput Object 