# This notebook prefix-tunes the `flan-t5-large` model, which was reported in the results but _not_ used for the downstream experiments

In [162]:
import os
os.environ['LD_LIBRARY_PATH'] = '/usr/lib/x86_64-linux-gnu/'

import pandas as pd
import datasets
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments
import torch.nn as nn
import torch
from datasets import Dataset, DatasetDict
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, precision_recall_fscore_support
import wandb
import os
import evaluate
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModelForSeq2SeqLM
from sklearn.dummy import DummyClassifier
from collections import Counter
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PromptTuningInit, PrefixTuningConfig, PromptTuningConfig, TaskType, PeftType
import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from tqdm.auto import tqdm, trange
from peft import PeftModel, PeftConfig

# Set the Weights & Biases "disabled" flag so this run does not log to wandb.
os.environ["WANDB_DISABLED"] = "true"
# Ask CUDA to order devices by PCI bus ID.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
#os.environ["CUDA_VISIBLE_DEVICES"]="1"
# Make GPU index 1 the current CUDA device; change this index (and the `device`
# string in the configuration cell) to run on a different GPU.
torch.cuda.set_device(1)

In [163]:
def seed_everything(seed: int):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    # Only affects subprocesses started after this point: the hash seed of the
    # already-running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one. This is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, so it
    # must be off for reproducible results.
    torch.backends.cudnn.benchmark = False
    
# Seed for this run. The commented-out values are alternative seeds kept for
# re-running the notebook (presumably the other runs averaged in the paper --
# TODO confirm). The chosen seed is also embedded in the checkpoint output path.
#seed = 1234
#seed = 20230501
#seed = 120398412
#seed = 987654321
seed = 42

# Apply the seed to all RNGs before any data loading or model construction.
seed_everything(seed)

In [164]:
# Disable parallelism in the Hugging Face tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# GPU that the model and every batch are moved to.
device = "cuda:1"
# Base checkpoint; also used to build the checkpoint output directory.
model_name_or_path = "google/flan-t5-large"
# NOTE(review): not referenced in the visible cells -- the tokenizer is loaded
# from `model_name_or_path` instead.
tokenizer_name_or_path = "google/flan-t5-large"

# DataFrame columns holding the prompt and the yes/no target.
text_column = "text"
label_column = "answer"
# Prompts are padded/truncated to this many tokens.
max_length = 192
# Initial AdamW learning rate (decayed linearly over training).
lr = 1e-2
num_epochs = 30
# Batch size shared by the train, dev and test dataloaders.
batch_size = 96

In [165]:
# Directory holding the cleaned, annotated train/dev/test CSV splits.
datadir = '/shared/2/projects/contextual-appropriateness/data/final-annotated-data-cleaned/'

# Read the three splits in train, dev, test order.
train_df, dev_df, test_df = (
    pd.read_csv(datadir + split_file)
    for split_file in ('train.csv', 'dev.csv', 'test.csv')
)

In [166]:
# Number of distinct quotes in the test split.
len(set(test_df.quote))

96

In [167]:
len(train_df)

9107

In [168]:
train_df.head()

Unnamed: 0,id,relationship,is_appropriate,label,quote,origin
0,594,ex_dating,Yes,0,She thought it was funny... but the animals sh...,Adjudication
1,594,colleague,No,1,She thought it was funny... but the animals sh...,Adjudication
2,594,engaged,Yes,0,She thought it was funny... but the animals sh...,Adjudication
3,594,best_friend,Yes,0,She thought it was funny... but the animals sh...,Adjudication
4,594,neighbor,No,1,She thought it was funny... but the animals sh...,Adjudication


In [169]:
#train_df = train_df.tail(4000)

In [170]:
# Maps a relationship label (the value of a row's `relationship` column) to the
# natural-language "speaker to listener" phrase that `generate_text` inserts
# into the prompt after "setting: from".
#
# Several relationships appear under more than one spelling (e.g. "best friend"
# and "best_friend") so that either form of the label resolves to a phrase.
# These strings are part of the model's input: editing any of them changes the
# prompts the model is trained and evaluated on.
relationship_to_verbalization = {
    "friend": "a person to their friend",
    "parent":"a parent to their child",
    "child":"a child to their parent",
    "sibling":"a person to their sibling",
    "best friend":"a person to their best friend",
    "best_friend":"a person to their best friend",
    "neighbor":"a person to their neighbor",
    "coworker":"a person to their coworker",
    "boss":"a boss to their employee",
    "direct report (employee)":"a person to their boss",
    "direct_report":"a person to their boss",    
    "student":"a student to their teacher",
    "teacher":"a teacher to their student",
    "cousins":"a person to their cousin",
    # NOTE(review): "granparent" looks like a misspelling; presumably it mirrors
    # a label that occurs in the data -- confirm before removing.
    "granparent":"a person to their grandchild",
    "grandparent":"a person to their grandchild",
    "grandchild":"a person to their grandparent",    
    "uncle/aunt":"an uncle/aunt to their niece or nephew",
    "uncle_aunt":"an uncle or aunt to their niece or nephew",    
    # NOTE(review): "neice_nephew" looks like a misspelling of "niece_nephew";
    # presumably it matches the label used in the data -- confirm before changing.
    "neice_nephew":"a niece or nephew to their uncle or aunt",        
    "employee_in_large_company":"an employee in large company to another",
    "married":"a person to their spouse",
    "dating":"a person to the person they are dating",
    "engaged":"a person to their fiancee",
    "friends_with_benefits":"a person to their friend with benefits",
    "divorcee":"a person to their ex-spouse",
    "ex-girlfriend/ex-boyfriend":"a person to their ex-girlfriend or ex-boyfriend",
    "ex_dating":"a person to their ex-girlfriend or ex-boyfriend",    
    "step-sibling":"a person to their step-sibling",
    "step_sibling":"a person to their step-sibling",    
    "fan":"a fan to their hero",
    "hero":"a hero to their fan",
    "enemy":"a person to their enemy",
    "rival":"a person to their rival",
    "competitor":"a person to their competitor",
    "complete_stranger":"a person to a complete stranger",
    "acquaintance":"a person to an acquaintance",
    "person_with_authority":"a person with authority to a subordinate",
    "law_enforcement":"a member of law enforcement to a community member",
    "classmate":"a person to their classmate",
    "sports_teammate":"a person to their sports teammate",
    "club_member":"a person to a member of their club",
    "adopted_child":"an adopted child to their parent",
    "adopted child":"an adopted child to their parent",
    "domestic_partner":"a person to their domestic partner",
    "person_having_an_affair":"a person having an affair to their partner",
    "mentor":"a mentor to their mentee",
    "mentee":"a mentee to their mentor",
    "landlord":"a landlord to their tenant",
    "colleague":"a person to their colleague",
    "childhood_friend":"a person to their childhood friend",
    "old_friend":"a person to an old friend",
    "client":"a client to their lawyer",
    "lawyer":"a lawyer to their client",
    "patient":"a patient to their doctor",
    "doctor":"a doctor to their patient",   
}

In [171]:
def generate_text(row, verbalizations=None):
    """Build the yes/no prompt for one (relationship, quote) row.

    Args:
        row: Mapping or pandas row with a 'relationship' label and a 'quote'.
        verbalizations: Optional mapping from relationship label to its
            "speaker to listener" phrase. Defaults to the notebook-level
            `relationship_to_verbalization` table.

    Returns:
        The prompt string asking whether the quote is inappropriate in the
        given social setting.

    Raises:
        KeyError: If the relationship is in the table neither as written nor
            with its spaces replaced by underscores.
    """
    if verbalizations is None:
        verbalizations = relationship_to_verbalization

    rel = row['relationship']
    desc = verbalizations.get(rel)
    if desc is None:
        # Labels are sometimes written with spaces instead of underscores.
        desc = verbalizations.get(rel.replace(' ', '_'))
    if desc is None:
        # Report the label as it appears in the data, not the mangled key.
        raise KeyError('No verbalization defined for relationship %r' % (rel,))

    text = 'Rate whether it is inappropriate for this message to be said in the following social setting?\nsetting: from %s\nmessage: %s\nanswer (yes or no):' % (desc, row['quote'])
    return text

In [172]:
#set(train_df.relationship)

In [173]:
# Attach the prompt built by `generate_text` to every row of each split.
for split_df in (train_df, dev_df, test_df):
    split_df['text'] = split_df.apply(generate_text, axis=1)

train_df.head()

Unnamed: 0,id,relationship,is_appropriate,label,quote,origin,text
0,594,ex_dating,Yes,0,She thought it was funny... but the animals sh...,Adjudication,Rate whether it is inappropriate for this mess...
1,594,colleague,No,1,She thought it was funny... but the animals sh...,Adjudication,Rate whether it is inappropriate for this mess...
2,594,engaged,Yes,0,She thought it was funny... but the animals sh...,Adjudication,Rate whether it is inappropriate for this mess...
3,594,best_friend,Yes,0,She thought it was funny... but the animals sh...,Adjudication,Rate whether it is inappropriate for this mess...
4,594,neighbor,No,1,She thought it was funny... but the animals sh...,Adjudication,Rate whether it is inappropriate for this mess...


In [174]:
def _label_to_answer(label):
    """Turn the numeric label into the target word: 1 -> 'yes', otherwise 'no'."""
    return 'yes' if label == 1 else 'no'


# The generation target for each row is the label spelled out as yes/no.
for split_df in (train_df, dev_df, test_df):
    split_df['answer'] = split_df.label.apply(_label_to_answer)

In [175]:
print(test_df.text.iloc[0])

Rate whether it is inappropriate for this message to be said in the following social setting?
setting: from a person to their ex-girlfriend or ex-boyfriend
message: He needs to get high school diploma too
answer (yes or no):


In [176]:
# Convert each pandas split into a Hugging Face Dataset.
tds, vds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, dev_df, test_df)
)

# Bundle the splits under the train/validation/test names used below.
ds = DatasetDict({
    'train': tds,
    'validation': vds,
    'test': test_ds,
})

dataset = ds
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'relationship', 'is_appropriate', 'label', 'quote', 'origin', 'text', 'answer'],
        num_rows: 9107
    })
    validation: Dataset({
        features: ['id', 'relationship', 'is_appropriate', 'label', 'quote', 'origin', 'text', 'answer'],
        num_rows: 1100
    })
    test: Dataset({
        features: ['id', 'relationship', 'is_appropriate', 'label', 'quote', 'origin', 'text', 'answer'],
        num_rows: 2029
    })
})

In [177]:
torch.cuda.empty_cache()

In [178]:
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)


def preprocess_function(examples):
    """Tokenize a batch of prompts and their yes/no targets.

    Args:
        examples: Batch mapping that contains the `text_column` prompts and
            the `label_column` target strings.

    Returns:
        The tokenized prompts (padded/truncated to `max_length`) with an added
        "labels" entry holding the target token ids, padded/truncated to two
        tokens, where padding positions are replaced by -100.
    """
    encoded = tokenizer(
        examples[text_column],
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    target_ids = tokenizer(
        examples[label_column],
        max_length=2,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]
    # Replace pad ids with -100 (presumably so the loss ignores padded
    # positions -- the usual Hugging Face convention).
    target_ids[target_ids == tokenizer.pad_token_id] = -100
    encoded["labels"] = target_ids
    return encoded

In [179]:
# Tokenize every split with `preprocess_function`. The original columns are
# dropped, so each processed example keeps only what the tokenizer returned
# plus "labels". Caching is disabled so the tokenization is always recomputed.
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/9107 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/1100 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/2029 [00:00<?, ? examples/s]

In [180]:
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]
# Untokenized dev frame; its `label` column is the gold standard for the
# per-epoch precision/recall/F1 computed in the training loop.
eval_df = dev_df
test_dataset = processed_datasets["test"]


# Training batches are shuffled each epoch.
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, 
    batch_size=batch_size, #pin_memory=True
)
# No `shuffle` argument here: dev predictions are compared positionally with
# `eval_df['label']`, so the dev loader must keep the dataset order.
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, 
                             batch_size=batch_size, #pin_memory=True
)

In [181]:
# Prefix tuning for a seq2seq model, configured for training
# (inference_mode=False) with 20 virtual tokens.
peft_config = PrefixTuningConfig(task_type=TaskType.SEQ_2_SEQ_LM, 
                                 inference_mode=False, num_virtual_tokens=20)

# Load the base checkpoint and wrap it with the PEFT configuration.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
# Report how many parameters are trainable versus the total.
model.print_trainable_parameters()

trainable params: 983040 || all params: 784133120 || trainable%: 0.1253664683874085


In [182]:
# AdamW over the model's parameters at the configured learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Linear schedule with no warmup; the total step count is one step per
# training batch per epoch, matching the per-batch `lr_scheduler.step()` call
# in the training loop.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [183]:
model.device

device(type='cuda')

In [184]:
# Train for `num_epochs` epochs. After each epoch, score the dev set and save a
# checkpoint; the checkpoint with the highest dev F1 is also saved under 'best/'.
model = model.to(device)
outdir = '/shared/2/projects/contextual-appropriateness/models/peft/%s/seed-%d/' % (model_name_or_path, seed)

best_f1 = 0
# Epoch index of the checkpoint currently stored under 'best/'.
best_model = 0

for epoch in trange(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        # Use the configured device rather than a hard-coded GPU string so the
        # train and eval passes always agree.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # ---- dev-set pass ----
    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Predictions are the per-position argmax of this single forward pass
        # (the batch includes `labels`), not the output of `generate`.
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), 
                                   skip_special_tokens=True)
        )

    # Strip whitespace before matching, as the test-set accuracy cell does, so
    # a decoded ' yes' is not counted as a negative prediction.
    eval_bin_preds = [1 if p.strip() == 'yes' else 0 for p in eval_preds]
    # Relies on the dev dataloader preserving the row order of `eval_df`.
    p,r,f1,s = precision_recall_fscore_support(eval_df['label'], eval_bin_preds, average='binary')


    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
    print('Precision: %.3f, Recall: %.3f, F1: %.3f' % (p,r,f1))
            
    # Keep a copy of the best checkpoint by dev F1, plus one per epoch.
    if best_f1 < f1:
        best_f1 = f1
        best_model = epoch
        print('saving best model (currently: epoch %d)' % epoch)
        model.save_pretrained(outdir + 'best/')
    model.save_pretrained(outdir + 'epoch-%03d/' % epoch)

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=0: train_ppl=tensor(1.4254, device='cuda:1') train_epoch_loss=tensor(0.3545, device='cuda:1') eval_ppl=tensor(1.4106, device='cuda:1') eval_epoch_loss=tensor(0.3440, device='cuda:1')
Precision: 0.388, Recall: 0.053, F1: 0.094
saving best model (currently: epoch 0)


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=1: train_ppl=tensor(1.3913, device='cuda:1') train_epoch_loss=tensor(0.3302, device='cuda:1') eval_ppl=tensor(1.4040, device='cuda:1') eval_epoch_loss=tensor(0.3393, device='cuda:1')
Precision: 0.367, Recall: 0.105, F1: 0.163
saving best model (currently: epoch 1)


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=2: train_ppl=tensor(1.3633, device='cuda:1') train_epoch_loss=tensor(0.3099, device='cuda:1') eval_ppl=tensor(1.3745, device='cuda:1') eval_epoch_loss=tensor(0.3181, device='cuda:1')
Precision: 0.595, Recall: 0.432, F1: 0.501
saving best model (currently: epoch 2)


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=3: train_ppl=tensor(1.3472, device='cuda:1') train_epoch_loss=tensor(0.2980, device='cuda:1') eval_ppl=tensor(1.3547, device='cuda:1') eval_epoch_loss=tensor(0.3036, device='cuda:1')
Precision: 0.658, Recall: 0.471, F1: 0.549
saving best model (currently: epoch 3)


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=4: train_ppl=tensor(1.3221, device='cuda:1') train_epoch_loss=tensor(0.2792, device='cuda:1') eval_ppl=tensor(1.3413, device='cuda:1') eval_epoch_loss=tensor(0.2936, device='cuda:1')
Precision: 0.688, Recall: 0.475, F1: 0.562
saving best model (currently: epoch 4)


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=5: train_ppl=tensor(1.3077, device='cuda:1') train_epoch_loss=tensor(0.2682, device='cuda:1') eval_ppl=tensor(1.3278, device='cuda:1') eval_epoch_loss=tensor(0.2835, device='cuda:1')
Precision: 0.656, Recall: 0.700, F1: 0.677
saving best model (currently: epoch 5)


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=6: train_ppl=tensor(1.2931, device='cuda:1') train_epoch_loss=tensor(0.2570, device='cuda:1') eval_ppl=tensor(1.3397, device='cuda:1') eval_epoch_loss=tensor(0.2924, device='cuda:1')
Precision: 0.792, Recall: 0.430, F1: 0.557


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=7: train_ppl=tensor(1.2818, device='cuda:1') train_epoch_loss=tensor(0.2483, device='cuda:1') eval_ppl=tensor(1.3116, device='cuda:1') eval_epoch_loss=tensor(0.2713, device='cuda:1')
Precision: 0.729, Recall: 0.570, F1: 0.640


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=8: train_ppl=tensor(1.2739, device='cuda:1') train_epoch_loss=tensor(0.2421, device='cuda:1') eval_ppl=tensor(1.3352, device='cuda:1') eval_epoch_loss=tensor(0.2891, device='cuda:1')
Precision: 0.663, Recall: 0.724, F1: 0.692
saving best model (currently: epoch 8)


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=9: train_ppl=tensor(1.2609, device='cuda:1') train_epoch_loss=tensor(0.2318, device='cuda:1') eval_ppl=tensor(1.3229, device='cuda:1') eval_epoch_loss=tensor(0.2799, device='cuda:1')
Precision: 0.688, Recall: 0.656, F1: 0.672


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=10: train_ppl=tensor(1.2556, device='cuda:1') train_epoch_loss=tensor(0.2276, device='cuda:1') eval_ppl=tensor(1.3210, device='cuda:1') eval_epoch_loss=tensor(0.2784, device='cuda:1')
Precision: 0.695, Recall: 0.615, F1: 0.653


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=11: train_ppl=tensor(1.2498, device='cuda:1') train_epoch_loss=tensor(0.2230, device='cuda:1') eval_ppl=tensor(1.3296, device='cuda:1') eval_epoch_loss=tensor(0.2849, device='cuda:1')
Precision: 0.697, Recall: 0.623, F1: 0.658


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=12: train_ppl=tensor(1.2460, device='cuda:1') train_epoch_loss=tensor(0.2199, device='cuda:1') eval_ppl=tensor(1.3413, device='cuda:1') eval_epoch_loss=tensor(0.2936, device='cuda:1')
Precision: 0.679, Recall: 0.648, F1: 0.663


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=13: train_ppl=tensor(1.2354, device='cuda:1') train_epoch_loss=tensor(0.2114, device='cuda:1') eval_ppl=tensor(1.3630, device='cuda:1') eval_epoch_loss=tensor(0.3097, device='cuda:1')
Precision: 0.734, Recall: 0.527, F1: 0.613


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=14: train_ppl=tensor(1.2391, device='cuda:1') train_epoch_loss=tensor(0.2144, device='cuda:1') eval_ppl=tensor(1.3501, device='cuda:1') eval_epoch_loss=tensor(0.3002, device='cuda:1')
Precision: 0.722, Recall: 0.549, F1: 0.624


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=15: train_ppl=tensor(1.2286, device='cuda:1') train_epoch_loss=tensor(0.2059, device='cuda:1') eval_ppl=tensor(1.3606, device='cuda:1') eval_epoch_loss=tensor(0.3079, device='cuda:1')
Precision: 0.708, Recall: 0.584, F1: 0.640


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=16: train_ppl=tensor(1.2222, device='cuda:1') train_epoch_loss=tensor(0.2006, device='cuda:1') eval_ppl=tensor(1.3461, device='cuda:1') eval_epoch_loss=tensor(0.2972, device='cuda:1')
Precision: 0.714, Recall: 0.584, F1: 0.643


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=17: train_ppl=tensor(1.2188, device='cuda:1') train_epoch_loss=tensor(0.1978, device='cuda:1') eval_ppl=tensor(1.3475, device='cuda:1') eval_epoch_loss=tensor(0.2983, device='cuda:1')
Precision: 0.699, Recall: 0.611, F1: 0.652


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=18: train_ppl=tensor(1.2327, device='cuda:1') train_epoch_loss=tensor(0.2092, device='cuda:1') eval_ppl=tensor(1.3681, device='cuda:1') eval_epoch_loss=tensor(0.3134, device='cuda:1')
Precision: 0.712, Recall: 0.553, F1: 0.623


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=19: train_ppl=tensor(1.2170, device='cuda:1') train_epoch_loss=tensor(0.1964, device='cuda:1') eval_ppl=tensor(1.3590, device='cuda:1') eval_epoch_loss=tensor(0.3068, device='cuda:1')
Precision: 0.729, Recall: 0.566, F1: 0.637


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=20: train_ppl=tensor(1.2087, device='cuda:1') train_epoch_loss=tensor(0.1895, device='cuda:1') eval_ppl=tensor(1.3689, device='cuda:1') eval_epoch_loss=tensor(0.3140, device='cuda:1')
Precision: 0.727, Recall: 0.580, F1: 0.645


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=21: train_ppl=tensor(1.2087, device='cuda:1') train_epoch_loss=tensor(0.1896, device='cuda:1') eval_ppl=tensor(1.3584, device='cuda:1') eval_epoch_loss=tensor(0.3063, device='cuda:1')
Precision: 0.706, Recall: 0.607, F1: 0.653


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=22: train_ppl=tensor(1.2097, device='cuda:1') train_epoch_loss=tensor(0.1904, device='cuda:1') eval_ppl=tensor(1.3620, device='cuda:1') eval_epoch_loss=tensor(0.3090, device='cuda:1')
Precision: 0.728, Recall: 0.582, F1: 0.647


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=23: train_ppl=tensor(1.2054, device='cuda:1') train_epoch_loss=tensor(0.1868, device='cuda:1') eval_ppl=tensor(1.3576, device='cuda:1') eval_epoch_loss=tensor(0.3057, device='cuda:1')
Precision: 0.692, Recall: 0.623, F1: 0.656


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=24: train_ppl=tensor(1.2043, device='cuda:1') train_epoch_loss=tensor(0.1859, device='cuda:1') eval_ppl=tensor(1.3614, device='cuda:1') eval_epoch_loss=tensor(0.3085, device='cuda:1')
Precision: 0.709, Recall: 0.591, F1: 0.644


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=25: train_ppl=tensor(1.2031, device='cuda:1') train_epoch_loss=tensor(0.1849, device='cuda:1') eval_ppl=tensor(1.3616, device='cuda:1') eval_epoch_loss=tensor(0.3087, device='cuda:1')
Precision: 0.697, Recall: 0.619, F1: 0.656


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=26: train_ppl=tensor(1.1988, device='cuda:1') train_epoch_loss=tensor(0.1813, device='cuda:1') eval_ppl=tensor(1.3740, device='cuda:1') eval_epoch_loss=tensor(0.3177, device='cuda:1')
Precision: 0.721, Recall: 0.578, F1: 0.642


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=27: train_ppl=tensor(1.1991, device='cuda:1') train_epoch_loss=tensor(0.1816, device='cuda:1') eval_ppl=tensor(1.3699, device='cuda:1') eval_epoch_loss=tensor(0.3147, device='cuda:1')
Precision: 0.711, Recall: 0.593, F1: 0.646


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=28: train_ppl=tensor(1.1959, device='cuda:1') train_epoch_loss=tensor(0.1789, device='cuda:1') eval_ppl=tensor(1.3681, device='cuda:1') eval_epoch_loss=tensor(0.3134, device='cuda:1')
Precision: 0.711, Recall: 0.597, F1: 0.649


  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

epoch=29: train_ppl=tensor(1.1962, device='cuda:1') train_epoch_loss=tensor(0.1791, device='cuda:1') eval_ppl=tensor(1.3713, device='cuda:1') eval_epoch_loss=tensor(0.3158, device='cuda:1')
Precision: 0.719, Recall: 0.578, F1: 0.641


# Load in best model

In [185]:
# Directory where the training loop saved the checkpoint with the best dev F1.
best_model_dir = '/shared/2/projects/contextual-appropriateness/models/peft/%s/seed-%d/best' \
    % (model_name_or_path, seed)

# Rebuild the model from disk: read the PEFT config, load the base model it
# names, then attach the saved PEFT weights.
config = PeftConfig.from_pretrained(best_model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, best_model_dir)
model.to(device)

# No `shuffle` argument: test predictions are later matched positionally with
# the rows of the test split.
test_dataloader = DataLoader(test_dataset, collate_fn=default_data_collator, 
                             batch_size=batch_size, pin_memory=True)

In [186]:
# Run the reloaded best checkpoint over the test split and decode predictions.
model.eval()
eval_preds = []
with torch.no_grad():
    for step, batch in enumerate(tqdm(test_dataloader)):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        # Per-position argmax of one forward pass (the batch includes
        # `labels`), not the output of `generate`.
        predicted_ids = outputs.logits.argmax(dim=-1)
        eval_preds += tokenizer.batch_decode(
            predicted_ids.detach().cpu().numpy(), skip_special_tokens=True
        )

# 1 when the decoded prediction is exactly 'yes', otherwise 0.
eval_bin_preds = [int(p == 'yes') for p in eval_preds]

  0%|          | 0/22 [00:00<?, ?it/s]

In [187]:
# Exact-match accuracy between decoded predictions and gold answers, after
# stripping whitespace. NOTE: the message below says "evaluation dataset",
# but the gold answers come from the *test* split.
answer_pairs = list(zip(eval_preds, dataset["test"]["answer"]))
correct = sum(pred.strip() == true.strip() for pred, true in answer_pairs)
total = len(answer_pairs)
accuracy = correct / total * 100
print(f"{accuracy=} % on the evaluation dataset")
print(f"{eval_preds[:10]=}")
print(f"{dataset['test']['answer'][:10]=}")

accuracy=67.12666338097584 % on the evaluation dataset
eval_preds[:10]=['yes', 'yes', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes']
dataset['test']['answer'][:10]=['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no']


In [188]:
# Binarise the decoded test predictions: 1 for 'yes', 0 for anything else.
# Whitespace is stripped first so this agrees with the string-match accuracy
# computed from the same predictions.
test_preds = [1 if p.strip() == 'yes' else 0 for p in eval_preds]
# Work on a copy so the original test frame is left untouched.
test_df2 = test_df.copy()
test_df2['predicted'] = test_preds

In [189]:
def score(sdf):
    """Compute binary precision, recall and F1 for one group of predictions.

    Args:
        sdf: DataFrame with gold `label` and model `predicted` columns.

    Returns:
        A Series with 'precision', 'recall', 'f1' and 'training examples'.
        Despite its name, 'training examples' is simply the number of rows in
        `sdf`.
    """
    precision, recall, f1, _support = precision_recall_fscore_support(
        sdf.label, sdf.predicted, average='binary'
    )
    metrics = {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'training examples': len(sdf),
    }
    return pd.Series(metrics)


# Score each relationship separately on the test predictions, best F1 first.
# Because the groups come from the test frame, the 'training examples' column
# holds per-relationship *test* row counts.
per_relationship = test_df2.groupby('relationship').apply(score)
res = per_relationship.sort_values(by='f1', ascending=False).reset_index()
res.head()

Unnamed: 0,relationship,precision,recall,f1,training examples
0,client,0.833333,1.0,0.909091,8.0
1,patient,0.8,1.0,0.888889,6.0
2,student,0.769231,1.0,0.869565,13.0
3,boss,0.797101,0.948276,0.866142,74.0
4,direct_report,0.793651,0.943396,0.862069,68.0


In [190]:
res.tail(10)

Unnamed: 0,relationship,precision,recall,f1,training examples
39,dating,0.285714,0.25,0.266667,58.0
40,married,0.333333,0.2,0.25,58.0
41,enemy,0.25,0.25,0.25,10.0
42,step_sibling,0.2,0.25,0.222222,56.0
43,person_having_an_affair,0.285714,0.166667,0.210526,50.0
44,childhood_friend,0.2,0.111111,0.142857,64.0
45,friend,0.142857,0.142857,0.142857,72.0
46,old_friend,0.142857,0.095238,0.114286,65.0
47,cousins,0.083333,0.142857,0.105263,56.0
48,best_friend,0.0,0.0,0.0,80.0


In [191]:
res[res.columns[1:]].corr()

Unnamed: 0,precision,recall,f1,training examples
precision,1.0,0.81709,0.958415,-0.419826
recall,0.81709,1.0,0.941595,-0.465744
f1,0.958415,0.941595,1.0,-0.434024
training examples,-0.419826,-0.465744,-0.434024,1.0


# Note that this is *one seed's* scores; the paper reports the mean across all 5 seeds

In [192]:
precision_recall_fscore_support(test_df2.label, test_df2.predicted, average='binary')

(0.6059113300492611, 0.6972789115646258, 0.6483921982076963, None)

## 