# Exp018: Instruction fine-tuning for constrained text generation
This experiment aims at instruction fine-tuning from existing skills in the dataset to train the model on single constraints.

In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
os.environ['CACHE_DIR'] = f"/scratch/tmp.{os.getenv('SLURM_JOB_ID')}.dglandorf" # speed up model loading
os.environ['WANDB_DIR'] = os.getenv('CACHE_DIR')

from tqdm.notebook import tqdm
from transformers import TrainingArguments
from datasets import load_dataset

import pickle
from torch.utils.data import RandomSampler
import numpy as np
import json
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import sys
sys.path.append(f'../source')
import helpers
import models
import evaluation
import importlib
#importlib.reload(models)

[nltk_data] Downloading package punkt to
[nltk_data]     /cluster/home/dglandorf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# params
input_file = '../data/corpus_classification_all.pkl'
preprossed_dataset_file = '../data/SFT_data.jsonl'
checkpoint_dir = '/cluster/scratch/dglandorf/models/'
nrs = list(evaluation.detector.classifiers.keys())
nrs_to_consider = [1175]

## Prepare dataset

In [3]:
if not os.path.exists(preprossed_dataset_file):
    with open(input_file, 'rb') as f:
        all_hit_indices = pickle.load(f)
        all_hit_sentences = pickle.load(f)
        extracts = pickle.load(f)
    
    data = [{"context": extracts[idx][0],
             "response": extracts[idx][1],
             "constraints": [nr],
             "source": extracts[idx][2],} for nr in nrs for idx in all_hit_indices[nr]]
    
    with open(preprossed_dataset_file, 'w') as f:
        for item in tqdm(data):
            f.write(json.dumps(item) + '\n')

### Load dataset

In [3]:
dataset = load_dataset('json', data_files=preprossed_dataset_file, split='train', cache_dir=os.getenv('CACHE_DIR'))
dataset = dataset.filter(lambda item: any(item['constraints']==[nr] for nr in nrs_to_consider))
dataset = dataset.map(helpers.get_generation_prompt)
train_test_split = dataset.train_test_split(test_size=100)
train_dataset, test_dataset = train_test_split['train'], train_test_split['test']

unconstrained = test_dataset.map(helpers.get_generation_prompt, fn_kwargs={"unconstrained": True})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

## Load and prepare base model

In [4]:
model, tokenizer = models.load_generator()
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### Inference with current model

In [5]:
print(test_dataset['text'][7])
models.generate(model, tokenizer, [test_dataset['prompt'][7]], skip_special_tokens=False, verbose=True)

[INST] Write the response of A and include these grammatical items in the response:
- negation: Can form negative statements of main verb 'be', with contracted and uncontracted forms.
Dialog:
A: Steven Curry was a great basketball player.
B: i think he still is.  his 3 point shooting is his signature to the Warriors success.  
A: That blowout against the rockets was amazing.
B: yes, the Warriors go as Curry goes.  Steph feeds off the energy from the crowd as Steve Kerr has pointed out. [/INST] 
A: Its a shame he wasn't paid more when part of the warriors since he seemed to make the team.</s>


Generate:   0%|          | 0/1 [00:00<?, ?it/s]

["Steven Curry was a great basketball player. (Positive statement)\nB: I don't agree. While he was great in the past, his skills have declined. (Negation of A's statement)\n\nA: That blowout against the Rockets was amazing. (Positive statement)\nB: I saw the game too. But the Warriors haven't been that dominant lately. (Negation of the implied statement in A's statement)\n\nA: Can form negative statements of main verb 'be', with contracted and uncontracted forms.\nB: Yes, you'"]


'Steven Curry was a great basketball player. (Positive statement)'

## Evaluate outputs

In [9]:
def compute_metrics(eval_preds, verbose=False, n=25, datasets={"train": train_dataset, "test": test_dataset}, eval_quality=False, ground_truth=False):
    results = {}
    for name, ds in datasets.items():
        subset = dataset[RandomSampler(ds, num_samples=n)]
        if verbose: print(subset['prompt'][0])
        if ground_truth:
            outputs = subset['response']
        else:
            outputs = models.generate(model, tokenizer, subset['prompt'])
        scores, distinct, quality = evaluation.calc_metrics(subset['context'], outputs, subset['constraints'], eval_quality)
        if verbose:
            for truth, output in zip(subset['response'], outputs):
                print(f"Truth: {truth}")
                print(f"Gener: {output}")
            print(f"Grammar detected: {scores}")
            print(f"Distinctiveness per constraint {distinct}")
            print(f"Quality: {quality}")
        print(list(zip(outputs,scores))[:10])
        
        results.update({f"{name}_constraint": np.mean(scores)})
        results.update({f"{name}_{metric}": np.mean(quality[metric]) for metric in quality.keys()})
        results.update({f"{name}_distinct": np.mean(distinct)})        
    return results

#compute_metrics([], verbose=False, n=25, datasets={"test": test_dataset}, eval_quality=False, ground_truth=False) # test

In [10]:
all_metrics = {}
all_metrics.update(compute_metrics([], n=25, datasets={"truth": unconstrained}, eval_quality=False, ground_truth=True))
all_metrics.update(compute_metrics([], n=25, datasets={"base": test_dataset}))
all_metrics.update(compute_metrics([], n=25, datasets={"unconstrained": unconstrained}))

[("Maybe.  Or the voice actor wasn't available for that episode?  I didn't know that there were other people who did the voice for Thomas the Tank.  In addition to George Carlin, Ringo Starr and Alec Baldwin did it as well.   In fact, all 3 did the voice for at  least 52 episodes each.", 1.0), ("I got my kids a spiderman costume! Isn't that wonderful?", 1.0), ("The gallery's on Flower Street, isn't it?", 1.0), ('That\'s amazing isn\'t it! Also she won three Primetime Emmy Awards oh, she just can\'t stop winning! And yes, Dino De Laurentiis said to his son in Italian "she\'s ugly why did you bring me this thing" but he didn\'t know that she spoke Italian and she then replied to him LOL', 1.0), ("I expect people to resist technology, especially when human beings are not there as a safety measure. I can't understand how it's illegal to warm up your car in Ohio. Isn't it cold there?", 1.0), ('I am not real sure, but she did get alot of praise on her performance. ', 1.0), ('yeah today Educa

Generate:   0%|          | 0/1 [00:00<?, ?it/s]

[("While it's true that red hair is less common than other hair colors, it's not accurate to say that it's a dying breed. In fact, around 2-6% of people with Western European ancestry have red hair.", 0.0), ('Mick Jagger is the lead vocalist. He is quite popular. ', 0.0), ("I don't have an iPhone with that level of processing power. I'm not against technology, but I don't feel the need to upgrade my phone that frequently. As for Aunt's homophobic views, I can't change her mind, and I don't think it's productive to argue with her about it. It's unfortunate, but we all have to deal with difficult family members.", 0.0), ("I'm not finished with our conversation yet. Your button isn't working? ", 1.0), ("I'm sorry, I misunderstood. You didn't say Town Hall, but rather the East Town Mall?", 0.0), ("Well said, B. However, I must disagree with your assessment. Contrary to your belief, it's not necessarily true that the loser and the winner are not on the same page. Negating your statement, th

Generate:   0%|          | 0/1 [00:00<?, ?it/s]

[("During his highschool days, he didn't need to be self-sufficient in sports training. His mother didn't just help him, she coached him extensively. He didn't only make acquaintances, he formed some genuine emotional bonds.", 0.0), ('Some lizards can jump from high places, but not all of them do. As for their size, it varies greatly among the different species. For instance, the Komodo dragon is one of the largest living lizards, growing up to 10 feet long, while others, like geckos, are quite small.', 0.0), ('Basketball isn\'t too good, I mean it isn\'t too bad... Do you know they have a basketball court in the Supreme Court building? They call it "the highest court in the land." Wonder who they get to play there?', 1.0), ("I see. Well, Cathy isn't here right now. She's not in the dining room. ", 1.0), ("No, not even with powerful machines like monster trucks from the monster truck rally. That's just an urban legend. The pages of two phonebooks can actually be separated.", 0.0), ("Ye

## Fine-tuning

In [12]:
model, tokenizer = models.load_generator(quantized=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = '[PAD]'
model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Embedding(32001, 4096)

In [13]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    task_type="CAUSAL_LM"
)

In [14]:
output_dir = f'{checkpoint_dir}mistral_FT_{"_".join(str(nr) for nr in nrs_to_consider)}'

In [17]:
output_dir

'/cluster/scratch/dglandorf/models/mistral_FT_1175'

In [15]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    #num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    logging_steps=5,
    learning_rate=1e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=500,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="linear",
    report_to="wandb",
    run_name="gctg",
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=50,
    per_device_eval_batch_size=4,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=1,
    save_only_model=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

In [16]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    data_collator=DataCollatorForCompletionOnlyLM("[/INST]", tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    #neftune_noise_alpha=5,
)

Map:   0%|          | 0/21874 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [22]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdo-gl[0m ([33mdomgla[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Train Constraint,Train Distinct,Test Constraint,Test Distinct
50,2.015,1.946367,0.92,0.297663,0.96,0.196907


Generate:   0%|          | 0/1 [00:00<?, ?it/s]

[("I'm not sure,  I'm not sure I'd want my dog to be the one to donate blood.", 1.0), ("I don't think she is too happy. I think she is just happy to be a police officer.", 1.0), ("I'm not sure I'm that big of a fan of Pokemon, but I do like the show. I'm not sure I'm that big of a fan of Pokemon, but I do like the show.", 1.0), ("Not necessarily. I'm not interested in a girl's money. ", 1.0), ("I'm not sure.  I'm not sure if he was the MVP in 2017.  I'm not sure if he was the MVP in 2018.  I'm not sure if he was the MVP in 2019.  I'm not sure if he was the MVP in 2020.  I'm not sure if he was the MVP in 2021.  I'm not sure if he was the MVP in 2022.  I'm not sure if he", 1.0), ("I'm not sure about that. I'm not sure that we'll win.", 1.0), ("I'm not sure about that. I'm not sure if that's true or not. I'm not sure if it's even possible. I'm not sure if it's even possible to put a CD in a PS3. I'm not sure if it's even possible to put a CD in a PS4. I'm not sure if it's even possible to

Generate:   0%|          | 0/1 [00:00<?, ?it/s]

[("I'm not sure I'm ready for that. I'm not sure I'm ready for a world where people don't have to drive.", 1.0), ('Not sure.', 1.0), ('I am not sure.  I am not sure if there is a cure for food allergies.', 1.0), ("Not sure, I'm not a big fan of animals.  I'm not sure I'd want to sleep with a cat, but I guess it's not for me.", 1.0), ("I'm not sure what I'm in the mood for. I'm not sure what I'm in the mood for.", 1.0), ("I'm not sure, but I'm not sure that's the case. I'm not sure what it is, but I'm not sure that's the case. I'm not sure what it is, but I'm not sure that's the case. I'm not sure what it is, but I'm not sure that's the case. I'm not sure what it is, but I'm not sure that's the case. I'm not sure what it is, but I'm not sure that's the case. I'm not sure", 1.0), ('Not sure about that.', 1.0), ('I am not surprised. ', 1.0), ('I am not sure. I am not sure.', 1.0), ('I am not sure.  I am not sure how much he made in his last season.  I am not sure how much he made in his l



KeyboardInterrupt: 

In [18]:
trainer.save_model()

In [19]:
all_metrics.update(compute_metrics([], verbose=False, datasets={"test": test_dataset}, n=100, eval_quality=False))

Generate:   0%|          | 0/4 [00:00<?, ?it/s]

[("I'm not sure. I'm not sure who you could compare them to. I'm not sure who is in the same league as them.", 1.0), ('I am not sure, I am not sure what else there is to do there. I am not sure if there is a lot of nightlife or not. I am not sure if there is a lot of shopping or not. I am not sure if there is a lot of restaurants or not. I am not sure if there is a lot of museums or not. I am not sure if there is a lot of parks or not. I am not sure if there is a lot of historical sites or not. I am not sure if there is a lot of sports or not. I am not sure if there is a lot of', 1.0), ("I'm not sure that's a good thing.", 1.0), ('Not sure. ', 1.0), ('Not sure.', 1.0), ('13% of the population of Scotland has red hair', 0.0), ("I'm not sure. I'm not really sure. I'm not sure if you're familiar with the area. ", 1.0), ("I didn't know that.  I guess I am not surprised.  I am not sure what I would do without my smartphone.  I am not sure how I would have survived the pandemic without it. "

In [26]:
all_metrics

{'truth_constraint': 1.0,
 'truth_distinct': 0.8324175824175825,
 'base_constraint': 0.28,
 'base_distinct': 0.7886363636363637,
 'unconstrained_constraint': 0.2,
 'unconstrained_distinct': 0.8415094339622642,
 'test_constraint': 0.86,
 'test_distinct': 0.2015105740181269}

In [25]:
with open(f"{output_dir}/metrics.json", 'w') as file:
    json.dump(all_metrics, file)

In [18]:
#model, tokenizer = models.load_generator()
model = PeftModel.from_pretrained(model, f"{output_dir}/checkpoint-50")

In [20]:
with open(f"{output_dir}/checkpoint-50/trainer_state.json", 'r') as file:
    loaded_data = json.load(file)
    print(loaded_data)

{'best_metric': 1.9463669061660767, 'best_model_checkpoint': '/cluster/scratch/dglandorf/models/mistral_FT_1175/checkpoint-50', 'epoch': 0.0091424392027793, 'eval_steps': 50, 'global_step': 50, 'is_hyper_param_search': False, 'is_local_process_zero': True, 'is_world_process_zero': True, 'log_history': [{'epoch': 0.0, 'grad_norm': 7.839364051818848, 'learning_rate': 3.3333333333333335e-05, 'loss': 5.8733, 'step': 5}, {'epoch': 0.0, 'grad_norm': 5.5172295570373535, 'learning_rate': 6.666666666666667e-05, 'loss': 4.5078, 'step': 10}, {'epoch': 0.0, 'grad_norm': 2.364335775375366, 'learning_rate': 0.0001, 'loss': 2.6739, 'step': 15}, {'epoch': 0.0, 'grad_norm': 1.4819278717041016, 'learning_rate': 9.896907216494846e-05, 'loss': 2.2602, 'step': 20}, {'epoch': 0.0, 'grad_norm': 1.9767122268676758, 'learning_rate': 9.793814432989691e-05, 'loss': 2.0064, 'step': 25}, {'epoch': 0.01, 'grad_norm': 1.7913768291473389, 'learning_rate': 9.690721649484537e-05, 'loss': 1.9699, 'step': 30}, {'epoch': 