# Reappraisal Training For Linguistic Distancing and Emotion Regulation




## Mount Google Drive and Install Dependencies


### Setup
```bash
> pipenv shell  #Generates a new virtual environment based on Pipfile
> pipenv install # Installs the packages in Pipfile.lock (Use --dev) to also install dev packages
```

**Sources**
-  [Sentiment Analysis Text Classification Tutorial](https://www.youtube.com/watch?v=8N-nM3QW7O0)
- [Using Catalyst for Training Organization](https://github.com/catalyst-team/catalyst)


In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
import os
os.chdir( '/content/drive/MyDrive/ldh')

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/ldh'

In [2]:
# ! pip install transformers datasets wandb 

### Select GPU usage if one exists on the machine.

In [3]:
import numpy as np
import pandas as pd
import torch
from datasets import ReadInstruction

#import wandb
# wandb.login()

# Enable GPU usage, if we can.
if torch.cuda.is_available():
    print("Enabling GPU usage")
    device = torch.device("cuda:0")
    print(device)
    IS_GPU = True
else:
    print("No GPU available, running on CPU")
    device = torch.device("cpu") # Note: macOS incompatible with NVIDIA GPUs
    IS_GPU = False

Enabling GPU usage
cuda:0


## Import and Encode LDH Data
- Preprocessing of LDH Data is done in `src/LDHData.py`.

Datasets:
- Train Dataset: 
- Eval Dataset: 

In [4]:
from src.LDHData import LDHData
from transformers import DistilBertTokenizer

data = LDHData()
data.load_training_data()
data.load_eval_data()

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')  

# Wrap tokenizer with keyword arguments.
# Setting max_length to 150 because that seems to be the longest vector that exists already.
# If the dataset was previously encoded, it will be pulled from the cache.
def tokenize(x):
    tokenized = tokenizer(x, add_special_tokens=True, padding="max_length", max_length=150)
    return tokenized
    
encoded_train = data.train_dataset['obj'].map(
    lambda ds: tokenize(ds['response']), batched=True, batch_size=16
)
encoded_train.set_format(type='torch', output_all_columns=True)

encoded_eval = data.eval_dataset['obj'].map(
    lambda ds: tokenize(ds['response'])
)
encoded_eval.set_format(type='torch', output_all_columns=True)

Training data loaded from disk.
Evaluation data loaded from disk.


Loading cached processed dataset at /content/drive/MyDrive/ldh/src/training/obj/cache-9273660f280cce50.arrow
Loading cached processed dataset at /content/drive/MyDrive/ldh/src/eval/obj/cache-d4c25504bf31afff.arrow


## Set Training Arguments and Train Model



In [5]:
from transformers import TrainingArguments, Trainer
from src.ReappModel import ReappModel
# Define the parameters under which the model will be trained.
# By default, uses an AdamW optimizer w/ linear warmup.
model = ReappModel()

training_args = TrainingArguments(
    output_dir='./results/new',          # output directory
    num_train_epochs=5,
    evaluation_strategy="steps",
    logging_steps=100,
    save_steps=100,
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs/new',            # directory for storing logs
    prediction_loss_only=True,
    label_names=['score'] 
  )


In [7]:
from torch.nn import functional as F

# Training dataset: used to fine-tune the model
# Validation dataset: Evaluate performance of model (metrics go here.)
encoded_train_split, encoded_val_split = encoded_train.shuffle().train_test_split(test_size=0.15).values()

# Override the loss function to be used for the trainer.
class ReappTrainer(Trainer):
  def compute_loss(self, model, inputs):
    if "score" in inputs.keys(): # if we see a score
      score = inputs.pop('score')
      output = model(**inputs)
      print(output)
      return F.mse_loss(output.sum(dim=1), score)

trainer = ReappTrainer(
  model=model.to(device),
  args=training_args,                  
  train_dataset=encoded_train_split,
  eval_dataset=encoded_val_split
)

trainer.train()


Loading cached shuffled indices for dataset at /content/drive/MyDrive/ldh/src/training/obj/cache-759256467d39cc2b.arrow
Loading cached split indices for dataset at /content/drive/MyDrive/ldh/src/training/obj/cache-2807b64bd5c7926f.arrow and /content/drive/MyDrive/ldh/src/training/obj/cache-d2ded62b03f3ec12.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


AttributeError: ignored

# Torch Lightning

In [8]:
! pip install pytorch-lightning

Collecting pytorch-lightning
[?25l  Downloading https://files.pythonhosted.org/packages/c2/dd/0c326da04e021a9849a1e75dd639d8c2e22d3abb296a9fc39bed518d2879/pytorch_lightning-1.1.7-py3-none-any.whl (695kB)
[K     |████████████████████████████████| 696kB 12.9MB/s 
Collecting PyYAML!=5.4.*,>=5.1
[?25l  Downloading https://files.pythonhosted.org/packages/64/c2/b80047c7ac2478f9501676c988a5411ed5572f35d1beff9cae07d321512c/PyYAML-5.3.1.tar.gz (269kB)
[K     |████████████████████████████████| 276kB 30.5MB/s 
[?25hCollecting fsspec[http]>=0.8.1
[?25l  Downloading https://files.pythonhosted.org/packages/ec/80/72ac0982cc833945fada4b76c52f0f65435ba4d53bc9317d1c70b5f7e7d5/fsspec-0.8.5-py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 9.0MB/s 
Collecting future>=0.17.1
[?25l  Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)
[K     |████████████████████████████████| 829kB 2

In [15]:
import pytorch_lightning as pl
import torch

In [19]:
# from 


from torch.utils.data import DataLoader

train_data = DataLoader(data.train_dataset)

In [20]:
from pytorch_lightning import Trainer
model = ReappraisalTuner()
trainer = Trainer()
trainer.fit(model, train_data)

GPU available: True, used: False
TPU available: None, using: 0 TPU cores

  | Name       | Type            | Params
-----------------------------------------------
0 | model      | DistilBertModel | 65.2 M
1 | tokenizer  | DistilBertModel | 65.2 M
2 | classifier | Sequential      | 39.0 K
-----------------------------------------------
130 M     Trainable params
0         Non-trainable params
130 M     Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




KeyError: ignored

In [22]:
!git checkout -b lightning


M	main.ipynb
M	poetry.lock
M	pyproject.toml
M	src/ReappModel.py
Switched to a new branch 'lightning'


In [23]:
! git status

On branch lightning
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mmodified:   main.ipynb[m
	[31mmodified:   poetry.lock[m
	[31mmodified:   pyproject.toml[m
	[31mmodified:   src/ReappModel.py[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mlightning_logs/[m
	[31mlogs/[m
	[31mresults/[m
	[31msrc/NRCVADDataset.py[m
	[31msrc/read_torch_model.ipynb[m
	[31mwandb/[m

no changes added to commit (use "git add" and/or "git commit -a")


In [24]:
! git add . 
! git checkout -m "testing"
! git push

^C
error: pathspec 'testing' did not match any file(s) known to git.
fatal: The current branch lightning has no upstream branch.
To push the current branch and set the remote as upstream, use

    git push --set-upstream origin lightning

