# Reappraisal Training For Linguistic Distancing and Emotion Regulation


## Setup
```bash
> pipenv shell  # Activates the project's virtual environment (creating it from the Pipfile if needed)
> pipenv install # Installs the packages in Pipfile.lock (use --dev to also install dev packages)
```
## Included Datasets
- LDHII
- IMDB
- EmoBank

**Sources**
-  [Sentiment Analysis Text Classification Tutorial](https://www.youtube.com/watch?v=8N-nM3QW7O0)
- [Using Catalyst for Training Organization](https://github.com/catalyst-team/catalyst)



In [1]:

# Uncomment to add the src code to the working dir when on colab
# ! git init
# ! git config core.sparseCheckout true
# ! git remote add -f origin https://github.com/danielcpham/reappraisal-model.git
# ! echo "src/" > .git/info/sparse-checkout
# ! echo "poetry.lock" >> .git/info/sparse-checkout
# ! echo "pyproject.toml" >> .git/info/sparse-checkout 
# ! git checkout dev

# ! pip install transformers datasets 


In [2]:
import os
import numpy as np
import pandas as pd
import torch
# !pipenv install pandas

In [3]:
from datasets import ReadInstruction
from transformers import DistilBertTokenizer

# Select the compute device: the first CUDA GPU when one is available,
# otherwise the CPU. IS_GPU is reused later to pick the number of epochs.
IS_GPU = torch.cuda.is_available()
if IS_GPU:
    print("Enabling GPU usage")
    device = torch.device("cuda:0")
else:
    print("No GPU available, running on CPU")
    device = torch.device("cpu") # Note: macOS incompatible with NVIDIA GPUs

# Constants and environment setup
# TODO: Set up env files for dev and "prod"
# Single source of truth for the checkpoint name, so the tokenizer and the
# model built later from PRETRAINED_MODEL_NAME cannot drift apart.
# Casing can matter for sentiment analysis ("bad" vs. "BAD"), hence the
# cased checkpoint.
PRETRAINED_MODEL_NAME = 'distilbert-base-cased'
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)


def tokenize_func(batch):
    """Tokenize the 'response' entry of a batch with the shared tokenizer.

    Args:
        batch: Mapping with a 'response' key holding the text(s) to encode.

    Returns:
        Whatever the tokenizer call returns for that text, requested with
        special tokens added, padding="max_length" and truncation enabled.
    """
    return tokenizer(
        batch['response'],
        add_special_tokens=True,
        padding="max_length",
        truncation=True)

No GPU available, running on CPU


### LDH Dataset Imports

In [4]:
from src.LDHData import LDHData

# Instantiate the project's LDHData helper (imported from src.LDHData); a later
# cell calls data.encode_datasets(...) on it to obtain the tokenized datasets.
data = LDHData()

### IMDB Dataset Import

### EmoBank Dataset Import

In [5]:
from torch import nn, optim
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from datasets import Features, Sequence, Value, DatasetDict
# Tokenize the datasets.
# Add score to the model inputs so we can calculate loss.

# NOTE(review): the commented-out calls below are never executed; consider
# deleting them once data.encode_datasets is confirmed as the supported path.
# encoded_ds = data.encode(tokenizer, train_ds)

# encoded_ds['train'].features
# data.set_tokenizer(tokenizer)
# train_ds = data.get_train_far_data()

# Encode through the LDHData helper, passing the shared tokenizer plus
# batched=True and batch_size=16 (presumably forwarded to a `datasets` map
# call -- TODO confirm in src.LDHData).
# The recorded output shows a dict with 'train' and 'eval' entries, each a
# DatasetDict with 'far' and 'obj' splits; only the 'train' splits list a
# 'score' feature.
encoded_ds = data.encode_datasets(tokenizer, batched=True, batch_size=16)
encoded_ds

HBox(children=(FloatProgress(value=0.0, max=842.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=842.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1436.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1436.0), HTML(value='')))




{'train': DatasetDict({
     far: Dataset({
         features: ['attention_mask', 'input_ids', 'response', 'score'],
         num_rows: 13470
     })
     obj: Dataset({
         features: ['attention_mask', 'input_ids', 'response', 'score'],
         num_rows: 13470
     })
 }),
 'eval': DatasetDict({
     far: Dataset({
         features: ['__index_level_0__', 'addcode', 'attention_mask', 'input_ids', 'level_0', 'response'],
         num_rows: 22972
     })
     obj: Dataset({
         features: ['__index_level_0__', 'addcode', 'attention_mask', 'input_ids', 'level_0', 'response'],
         num_rows: 22972
     })
 })}

## Generate NN Model

In [6]:
from transformers import TrainingArguments, Trainer, DistilBertModel
from torch.utils.data import DataLoader

from src.ReappModel import ReappModel

# Instantiate the project's ReappModel around the DistilBertModel class and
# the shared pretrained checkpoint name.
# TODO: Suppress initialization errors.
model = ReappModel(DistilBertModel, PRETRAINED_MODEL_NAME)

# Three epochs when a GPU is available; a single epoch otherwise.
if IS_GPU:
    num_train_epochs = 3
else:
    num_train_epochs = 1

# Both the training and the evaluation set come from the 'far' split.
encoded_train = encoded_ds['train']['far']
encoded_eval = encoded_ds['eval']['far']

# Training hyper-parameters. Anything not listed here keeps the Trainer
# default (AdamW optimizer with a linear learning-rate schedule, per the
# transformers documentation).
training_args = TrainingArguments(
    output_dir='./results',            # where results are written
    logging_dir='./logs',              # where logs are written
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=16,    # training batch size per device
    per_device_eval_batch_size=64,     # evaluation batch size per device
    weight_decay=0.01,                 # weight-decay strength
)

# Report how many evaluation examples there are.
print(len(encoded_eval))

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=encoded_train,
    eval_dataset=encoded_eval,
)

22972


In [7]:
# Smoke-test a single training step on the first 16 training examples
# (the same size as per_device_train_batch_size in the TrainingArguments).
encoded_train_step = encoded_train.select(range(16))
input_ids = encoded_train_step['input_ids']
attention_mask = encoded_train_step['attention_mask']
# No trailing comma here: `score = value,` would bind a 1-tuple wrapping the
# score column instead of the column itself, and that wrapped value would be
# what reaches the model in this cell and in the later prediction-step cell.
score = encoded_train_step['score']

# The cell's last expression is the loss returned by the training step.
trainer.training_step(model, {
    "input_ids": input_ids,
    "attention_mask": attention_mask,
    "score": score
    }
)



tensor(1.2437e+09)

In [8]:
# Model Evaluation: Parse the TrainOutput Object 

In [9]:
# Run one prediction step on the batch columns prepared for the training-step
# check (input_ids, attention_mask, score), asking for more than just the loss.
# TODO: fix the result of the forward method so it properly returns the result of a prediction step
trainer.prediction_step(
    model,
    dict(input_ids=input_ids, attention_mask=attention_mask, score=score),
    prediction_loss_only=False,
)


(None,
 (tensor(1.2421e+09),
  tensor([[0.3158, 0.1279, 0.0000,  ..., 0.0000, 0.1537, 0.0000],
          [0.0800, 0.0000, 0.4360,  ..., 0.0000, 0.2885, 0.0000],
          [0.1454, 0.0000, 0.0957,  ..., 0.0000, 0.2639, 0.1835],
          ...,
          [0.3524, 0.0000, 0.0000,  ..., 0.0000, 0.3172, 0.0000],
          [0.3425, 0.0000, 0.0000,  ..., 0.0401, 0.3588, 0.0000],
          [0.4287, 0.1626, 0.0000,  ..., 0.0000, 0.3991, 0.0000]]),
  tensor([[0.2471, 0.0000, 0.0000,  ..., 0.0000, 0.1829, 0.0379],
          [0.1583, 0.0000, 0.0000,  ..., 0.1005, 0.4503, 0.1924],
          [0.0000, 0.0000, 0.2992,  ..., 0.0000, 0.0188, 0.0000],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.0624, 0.1866, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0529, 0.2066, 0.0000],
          [0.0388, 0.0000, 0.0000,  ..., 0.0855, 0.1923, 0.0000]]),
  tensor([[0.0262, 0.0000, 0.0235,  ..., 0.0297, 0.2659, 0.0000],
          [0.0000, 0.0000, 0.2048,  ..., 0.0905, 0.3916, 0.0000],
          [0.