# Reappraisal Training For Linguistic Distancing and Emotion Regulation


## Setup
```bash
> pipenv shell    # Activates the project's virtual environment (creating it from the Pipfile if needed)
> pipenv install  # Installs the packages pinned in Pipfile.lock (add --dev to also install dev packages)
```
## Included Datasets
- LDHII
- IMDB
- EmoBank

**Sources**
-  [Sentiment Analysis Text Classification Tutorial](https://www.youtube.com/watch?v=8N-nM3QW7O0)
- [Using Catalyst for Training Organization](https://github.com/catalyst-team/catalyst)



In [1]:
# %%capture
# !pip install wandb -qqq
# import wandb
# !wandb login

## Sample code for tracking model training runs in wandb 
# see: https://colab.research.google.com/github/wandb/examples/blob/master/colabs/intro/Intro_to_Weights_%26_Biases.ipynb#scrollTo=-VE3MabfZAcx
# import math
# import random

# # 1️⃣ Start a new run, tracking config metadata
# wandb.init(project="test-drive", config={
#     "learning_rate": 0.02,
#     "dropout": 0.2,
#     "architecture": "CNN",
#     "dataset": "CIFAR-100",
# })
# config = wandb.config

# # Simulating a training or evaluation loop
# for x in range(50):
#     acc = math.log(1 + x + random.random() * config.learning_rate) + random.random()
#     loss = 10 - math.log(1 + x + random.random() + config.learning_rate * x) + random.random()
#     # 2️⃣ Log metrics from your script to W&B
#     wandb.log({"acc":acc, "loss":loss})

# wandb.finish()

In [51]:
# TODO: Add Open in Colab Button
# TODO: Write scripts for running as CLI in pipfile
# TODO: hyperparameter search

In [1]:
import os
import numpy as np
import pandas as pd
import torch

In [2]:
from datasets import ReadInstruction

# Decide once whether a CUDA GPU is visible; every later cell keys off IS_GPU.
# (Note: macOS is incompatible with NVIDIA GPUs, so it takes the CPU path.)
IS_GPU = torch.cuda.is_available()
print("Enabling GPU usage" if IS_GPU else "No GPU available, running on CPU")
device = torch.device("cuda:0" if IS_GPU else "cpu")

# Constants and environment setup
# TODO: Set up env files for dev and "prod"
# A cased checkpoint is used because casing can matter for sentiment
# analysis ("bad" vs. "BAD").
PRETRAINED_MODEL_NAME = 'distilbert-base-cased'

No GPU available, running on CPU


### LDH Dataset Imports

In [6]:
from LDHData import LDHData

# Build the LDH wrapper once and pull out its training / evaluation data.
data = LDHData()
ldh_train, ldh_eval = data.train_data, data.eval_data

# Display both as the cell result (the recorded output shows each as a dict
# keyed by 'spatiotemp' and 'obj').
(ldh_train, ldh_eval)

({'spatiotemp': Dataset({
      features: ['response', 'spatiotemp'],
      num_rows: 13472
  }),
  'obj': Dataset({
      features: ['response', 'obj'],
      num_rows: 13472
  })},
 {'spatiotemp': Dataset({
      features: ['addcode', 'subjID', 'condition', 'response'],
      num_rows: 1638
  }),
  'obj': Dataset({
      features: ['addcode', 'subjID', 'condition', 'response'],
      num_rows: 1638
  })})

In [51]:
from collections import Counter, defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize

# Pair each response's length (in NLTK word tokens) with its two ratings.
# NOTE(review): `ldh` is not defined in any visible cell -- presumably a pandas
# DataFrame with `response`, `spatiotemp` and `obj` columns; confirm, and load
# it above this cell so that Restart & Run All works.
resp_lengths = []
length_scores_spatiotemp = []
length_scores_obj = []
for row in ldh.itertuples():
    # The row with index 0 is always skipped.
    # NOTE(review): presumably a header/description row -- confirm.
    if row.Index == 0:
        continue
    response = row.response
    # Skip rows where either rating cannot be converted to a number. Only the
    # errors float() raises for bad input are caught, so unrelated failures
    # (a renamed column, a keyboard interrupt, ...) are no longer swallowed.
    try:
        score_spat = float(row.spatiotemp)
        score_obj = float(row.obj)
    except (TypeError, ValueError):
        continue
    len_response = len(word_tokenize(response))
    resp_lengths.append(len_response)
    length_scores_spatiotemp.append((len_response, score_spat))
    length_scores_obj.append((len_response, score_obj))

[(27, 2.6875),
 (15, 2.5),
 (18, 5.0),
 (21, 4.0),
 (18, 1.0),
 (15, 5.0),
 (10, 2.75),
 (23, 4.0),
 (19, 6.0),
 (12, 2.0),
 (24, 5.0),
 (21, 1.5),
 (21, 2.0),
 (14, 3.0),
 (8, 2.3333333333333335),
 (22, 3.785714285714286),
 (7, 1.6666666666666667),
 (13, 1.0),
 (10, 2.5),
 (10, 4.5),
 (17, 4.0),
 (14, 7.0),
 (10, 2.5),
 (16, 2.5),
 (7, 1.0),
 (21, 2.0),
 (17, 2.0),
 (6, 3.0),
 (13, 1.5),
 (40, 3.0),
 (6, 2.0),
 (28, 5.0),
 (17, 1.0),
 (10, 1.0),
 (19, 2.947368421052631),
 (18, 1.5),
 (9, 4.5),
 (14, 3.5),
 (8, 2.0),
 (16, 3.0),
 (15, 2.5),
 (16, 3.0),
 (14, 2.5),
 (10, 2.5),
 (14, 3.0),
 (17, 1.0),
 (25, 3.5),
 (10, 3.0),
 (18, 3.384615384615385),
 (60, 3.5),
 (46, 2.5),
 (23, 3.5),
 (28, 2.837837837837838),
 (17, 6.0),
 (14, 2.0),
 (8, 2.0),
 (8, 2.0),
 (16, 6.5),
 (6, 3.5),
 (10, 2.0),
 (13, 3.0),
 (11, 1.891891891891892),
 (16, 3.0),
 (24, 3.0),
 (13, 1.0),
 (6, 2.0),
 (8, 1.5),
 (42, 3.0),
 (20, 6.0),
 (19, 2.0),
 (21, 2.0),
 (26, 4.5),
 (5, 2.5),
 (5, 2.6666666666666665),
 (8, 1.

In [60]:
# `Dataset` is imported in this cell because, in top-to-bottom order, the only
# other import of it sits in a later cell; without this line a fresh
# Restart & Run All raises NameError here.
from datasets import Dataset

# Split LDH Data into a training dataset and a validation dataset
# (test_size=0.15), then wrap each pandas frame as a `datasets.Dataset`.
# NOTE(review): `train_test_split` and `ldh` are not imported/defined in any
# visible cell -- presumably sklearn.model_selection.train_test_split and a
# pandas DataFrame; confirm and bring both into scope above this cell.
train_ldh, val_ldh = train_test_split(ldh, test_size=0.15) # shuffle
train_ldh_ds = Dataset.from_pandas(train_ldh)
val_ldh_ds = Dataset.from_pandas(val_ldh)
# TODO: Convert to DatasetDict

### IMDB Dataset Import

In [4]:
from datasets import Dataset, load_dataset
# With a GPU the full IMDB splits are read; on a CPU only the first rows of
# each split (256 train / 64 test) are read so the notebook stays quick to test.
cpu_row_limits = {'train': 256, 'test': 64}
splits = [
    ReadInstruction(split_name) if IS_GPU
    else ReadInstruction(split_name, to=row_limit, unit="abs")
    for split_name, row_limit in cpu_row_limits.items()
]

train_ds, eval_ds = load_dataset('imdb', split=splits)

# Carve a validation hold-out (test_size=0.15) out of the training rows; the
# result is keyed 'train' / 'test'.
train_val_ds = train_ds.train_test_split(test_size=0.15)

Reusing dataset imdb (/Users/danielpham/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


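### EmoBank Dataset Import

*Note: no EmoBank loading code appears in this section yet; the cell below tokenizes the IMDB train/validation splits created above.*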

In [5]:
from torch import nn, optim
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load the tokenizer that matches the pretrained checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)


def _tokenize_batch(batch):
    """Tokenize the 'text' column of one batch with the shared tokenizer."""
    return tokenizer(
        batch['text'],
        add_special_tokens=True,
        padding=True,
        truncation=True)


# Tokenize both splits in batches of 16 and drop the raw 'text' column.
encoded_ds = train_val_ds.map(
    _tokenize_batch,
    batched=True,
    batch_size=16,
    remove_columns=['text'])

# Have the dataset hand back PyTorch tensors, then show columns and shapes
# as the cell result.
encoded_ds.set_format(type='torch')
encoded_ds.column_names, encoded_ds.shape

Loading cached processed dataset at /Users/danielpham/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-30641a4e352f1b55.arrow
Loading cached processed dataset at /Users/danielpham/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-e144e067c2fcff59.arrow


({'train': ['attention_mask', 'input_ids', 'label'],
  'test': ['attention_mask', 'input_ids', 'label']},
 {'train': (217, 3), 'test': (39, 3)})

## Generate NN Model

In [8]:
from transformers import TrainingArguments, Trainer, DistilBertModel

from ReappModel import ReappModel

# Create the training model.
# TODO: Suppress initialization errors.
model = ReappModel(PRETRAINED_MODEL_NAME)

# CPU-only runs get a single epoch; GPU runs get three.
if IS_GPU:
    num_train_epochs = 3
else:
    num_train_epochs = 1

# Define the parameters under which the model will be trained.
# By default, uses an AdamW optimizer w/ linear warmup.
training_args = TrainingArguments(
    output_dir='./results',            # output directory
    logging_dir='./logs',              # directory for storing logs
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=16,    # batch size per device during training
    per_device_eval_batch_size=64,     # batch size for evaluation
    weight_decay=0.01,                 # strength of weight decay
)

# The 'test' key here is the hold-out produced by train_test_split on the
# training rows, i.e. it is used as the validation set.
encoded_train, encoded_test = encoded_ds['train'], encoded_ds['test']

# HyperParameter search depending on the model.

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=encoded_train,
    eval_dataset=encoded_test,
)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier

In [9]:
# Run the training loop with the Trainer built in the previous cell; the
# returned TrainOutput (global step, training loss, metrics) is shown as the
# cell result.
trainer.train()

  return torch.tensor(x, **format_kwargs)


Step,Training Loss


TrainOutput(global_step=14, training_loss=0.2399845634187971, metrics={'train_runtime': 442.7938, 'train_samples_per_second': 0.032, 'total_flos': 0, 'epoch': 1.0})

In [None]:
# Model Evaluation: Parse the TrainOutput Object 