# Reappraisal Training For Linguistic Distancing and Emotion Regulation


## Setup
```bash
> pipenv shell  #Generates a new virtual environment based on Pipfile
> pipenv install # Installs the packages in Pipfile.lock (Use --dev) to also install dev packages
```
## Included Datasets
- LDHII 
- 
- Emobank
**Sources**
-  [Sentiment Analysis Text Classification Tutorial](https://www.youtube.com/watch?v=8N-nM3QW7O0)
- [Using Catalyst for Training Organization](https://github.com/catalyst-team/catalyst)



In [1]:
# pip install transformers datasets nltk


In [2]:
import os
import numpy as np
import pandas as pd
import torch
from datasets import ReadInstruction

# Enable GPU usage, if we can.
if torch.cuda.is_available():
    print("Enabling GPU usage")
    device = torch.device("cuda:0")
    print(device)
    IS_GPU = True
else:
    print("No GPU available, running on CPU")
    device = torch.device("cpu") # Note: macOS incompatible with NVIDIA GPUs
    IS_GPU = False
    

PRETRAINED_MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'

No GPU available, running on CPU


### LDH Dataset Imports

In [3]:
from src.LDHData import LDHData

data = LDHData()
data.load_training_data()
data.load_eval_data()

Training data loaded from disk.
Evaluation data loaded from disk.


In [4]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained(PRETRAINED_MODEL_NAME)  
# Wrap tokenizer.
def tokenize(x):
    tokenized = tokenizer(x, add_special_tokens=True, padding="max_length", max_length=150)
    return tokenized
    
encoded_train = data.train_dataset['far'].map(
    lambda ds: tokenize(ds['response']), batched=True, batch_size=16
)
encoded_train.set_format(type='torch', output_all_columns=True)

Loading cached processed dataset at /Users/danielpham/Documents/code/reapp/src/training/far/cache-c09023214f11a34f.arrow


In [5]:
# encoded_train = encoded_train.select(range(100))

In [17]:
from transformers import DistilBertModel, TrainingArguments, Trainer
from src.ReappModel import ReappModel
# Define the parameters under which the model will be trained.
# By default, uses an AdamW optimizer w/ linear warmup.
model = ReappModel(DistilBertModel.from_pretrained(PRETRAINED_MODEL_NAME))

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)




In [18]:
trained = []
for i in range(5):
    encoded_train_split, encoded_val_split = encoded_train.shuffle().train_test_split(test_size=0.1).values()

    trainer = Trainer(
        model=model,
        args=training_args,                  
        train_dataset=encoded_train_split,
        eval_dataset=encoded_val_split         
    )
    output = trainer.train()
    trained.append(output)

Loading cached shuffled indices for dataset at /Users/danielpham/Documents/code/reapp/src/training/far/cache-b00d733959a806b5.arrow
Loading cached split indices for dataset at /Users/danielpham/Documents/code/reapp/src/training/far/cache-727b26d67b867b76.arrow and /Users/danielpham/Documents/code/reapp/src/training/far/cache-22a98a69e1066163.arrow
100%|██████████| 1/1 [00:04<00:00,  4.85s/it]
Loading cached shuffled indices for dataset at /Users/danielpham/Documents/code/reapp/src/training/far/cache-b00d733959a806b5.arrow
Loading cached split indices for dataset at /Users/danielpham/Documents/code/reapp/src/training/far/cache-727b26d67b867b76.arrow and /Users/danielpham/Documents/code/reapp/src/training/far/cache-22a98a69e1066163.arrow
  0%|          | 0/1 [00:00<?, ?it/s]{'train_runtime': 4.8515, 'train_samples_per_second': 0.206, 'epoch': 1.0}
100%|██████████| 1/1 [00:04<00:00,  4.96s/it]
Loading cached shuffled indices for dataset at /Users/danielpham/Documents/code/reapp/src/traini

In [19]:
trained

[(<transformers.trainer.Trainer at 0x143f42e20>,
  TrainOutput(global_step=1, training_loss=19.225055694580078, metrics={'train_runtime': 4.8515, 'train_samples_per_second': 0.206, 'epoch': 1.0})),
 (<transformers.trainer.Trainer at 0x143f4d8e0>,
  TrainOutput(global_step=1, training_loss=18.495969772338867, metrics={'train_runtime': 4.9803, 'train_samples_per_second': 0.201, 'epoch': 1.0})),
 (<transformers.trainer.Trainer at 0x143f79550>,
  TrainOutput(global_step=1, training_loss=17.777496337890625, metrics={'train_runtime': 4.8246, 'train_samples_per_second': 0.207, 'epoch': 1.0})),
 (<transformers.trainer.Trainer at 0x13ff65580>,
  TrainOutput(global_step=1, training_loss=17.065771102905273, metrics={'train_runtime': 4.8103, 'train_samples_per_second': 0.208, 'epoch': 1.0})),
 (<transformers.trainer.Trainer at 0x143eb6490>,
  TrainOutput(global_step=1, training_loss=16.521961212158203, metrics={'train_runtime': 4.7821, 'train_samples_per_second': 0.209, 'epoch': 1.0}))]