# Reappraisal Training For Linguistic Distancing and Emotion Regulation

## Setup
1. Create a virtual environment and install the required packages (use pipenv).

**Notes**
- Attention: which words are important for the decoder to focus on at a specific timestep?
    - Q = Query
    - K = Key
    - V = Value
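- Self-attention: attention in which Q, K, and V are all derived from the same sentence, so each word attends to the other words of that sentence.
- Multi-head self-attention: several self-attention "heads" computed independently and in parallel, which lets the transformer attend to information from different representation subspaces at different positions.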

**Sources**
- [Sentiment Analysis Text Classification Tutorial](https://www.youtube.com/watch?v=8N-nM3QW7O0)
- [Using Catalyst for Training Organization](https://github.com/catalyst-team/catalyst)



In [None]:
# TODO: Add an "Open in Colab" button
# TODO: Write scripts in the Pipfile for running this as a CLI

In [None]:
# Imports
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer
import torch

import runutils
import reappDataLoader
import SentimentClassifier


In [None]:
# Constants and environment setup shared by the cells below.
# TODO: Set up env files for dev and "prod"

# Checkpoint name for the tokenizer. The cased variant is chosen because
# casing can matter for sentiment analysis ("bad" vs. "BAD").
PRETRAINED_MODEL_NAME = 'distilbert-base-cased'

# Hyper-parameters.
RANDOM_SEED = 42
MAX_LEN = 160
BATCH_SIZE = 16
EPOCHS = 3

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
# (macOS is incompatible with NVIDIA GPUs, so it always takes the CPU branch).
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
from datasets import load_dataset
from transformers import DistilBertTokenizerFast

# Load, preprocess, and encode data
dataset = load_dataset('imdb')
# Small 30-example subset of the training split, for quick iteration.
train = dataset['train'].select(range(30))
tokenizer = DistilBertTokenizerFast.from_pretrained(PRETRAINED_MODEL_NAME)


def _tokenize_batch(batch):
    """Tokenize a batch of reviews to a fixed length of MAX_LEN tokens.

    Padding/truncating to a fixed length (rather than ``padding=True``, which
    pads only to the longest sequence inside each ``map`` batch) keeps every
    encoded example the same length, so the examples can be stacked into
    tensors regardless of how ``map`` happened to chunk the dataset.
    """
    return tokenizer(
        batch['text'],
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
    )


encoded_train_dataset = train.map(_tokenize_batch, batched=True)
# Return torch tensors where possible while keeping the remaining columns.
encoded_train_dataset.set_format(type='torch', output_all_columns=True)
encoded_train_dataset

In [None]:
# Training setup: model, optimizer, data loader, learning-rate schedule, loss.
from transformers import DistilBertModel, AdamW, get_linear_schedule_with_warmup
from reappDataLoader import SentimentClassifier

from torch import nn, optim
from torch.utils.data import DataLoader

# NOTE(review): the argument 5 is presumably the number of output classes;
# the IMDB labels are binary -- confirm against SentimentClassifier.
# NOTE(review): the model is not moved to `device` here; confirm that
# run_epoch (or SentimentClassifier itself) takes care of that.
model = SentimentClassifier(5)
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# The encoded dataset is named `encoded_train_dataset` in the preprocessing cell.
train_dataloader = DataLoader(encoded_train_dataset, batch_size=BATCH_SIZE)

# The schedule is expressed in optimizer steps, i.e. batches, not examples:
# one step per batch per epoch (assumes the scheduler is stepped once per
# batch -- confirm in run_epoch).
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
loss_fn = nn.CrossEntropyLoss()



In [None]:
%%time
from collections import defaultdict as ddict

# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'.
history = ddict(list)
# Best validation accuracy seen so far; used to decide when to checkpoint.
best_accuracy = 0

# NOTE(review): run_epoch, eval_model, val_data_loader and df_val are not
# defined in the visible cells -- presumably they come from runutils and a
# validation-split cell; confirm before running.
for epoch in range(EPOCHS):
    print(f'epoch {epoch+1} / {EPOCHS}')
    print('-' * 10)
    train_acc, train_loss = run_epoch(
        model,
        train_dataloader,  # the loader built in the training-setup cell
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(train)
    )

    print(f'Loss: {train_loss}, Accuracy: {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
    print(f'Val: Loss: {val_loss}, Accuracy: {val_acc}')
    print()

    # Record metrics and checkpoint once per epoch (inside the loop), so the
    # history covers every epoch and the saved weights are those of the best
    # epoch rather than only being considered after the final one.
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc
    
