In [1]:
# Download
!git clone https://github.com/nightingal3/Fig-QA
%cd Fig-QA/

Cloning into 'Fig-QA'...
remote: Enumerating objects: 639, done.[K
remote: Counting objects: 100% (208/208), done.[K
remote: Compressing objects: 100% (119/119), done.[K
remote: Total 639 (delta 130), reused 139 (delta 88), pack-reused 431[K
Receiving objects: 100% (639/639), 2.81 MiB | 10.48 MiB/s, done.
Resolving deltas: 100% (353/353), done.
/content/Fig-QA


In [6]:
# install
!pip install transformers[torch]
!pip install accelerate -U
!pip install deepspeed
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2.14.

# Try Running Their Script
python3 src/models/train_lm_models.py {gpt2,gpt-neo-sm,gpt-neo-lg} \
[--dont_train] \
[--dont_eval] \
[--train_path=TRAIN_PATH] \
[--eval_path=EVAL_PATH] \
[--seed=SEED] \
[--cuda] \
[--num_epochs=NUM_EPOCHS] \
[--learning_rate=LR] \
[--middle_phrase=SUFFIX_PROMPT] \
[--prefix=N] \
[--contrastive] \
[--contrast_lambd=a] \
[--log_history] \
[--deepspeed] \
[----out_path=PATH] \
[----early_stopping]

In [None]:
!python3 src/models/train_lm_models.py gpt2 --cuda

/bin/bash: line 1: cd: Fig-QA: No such file or directory


In [None]:
!python3 src/models/train_lm_models.py gpt-neo-sm --cuda

/bin/bash: line 1: cd: Fig-QA: No such file or directory


# Modifying code

### Changes:
- modify model_init to use correct loader for model
- add model string to main function

# Setup Base Model: flan-t5-base

In [4]:
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel, GPTNeoForCausalLM, AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_id = "google/flan-t5-base"

def model_init(model_string, cuda, output_attentions=False, fast=False):
    if model_string.startswith("gpt2"):
        if fast:
            tokenizer = AutoTokenizer.from_pretrained(model_string)
            model = GPT2LMHeadModel.from_pretrained(model_string)
        else:
            tokenizer = GPT2Tokenizer.from_pretrained(model_string)
            model = GPT2LMHeadModel.from_pretrained(model_string)
    elif model_string.startswith("EleutherAI/gpt-neo"):
        tokenizer = GPT2Tokenizer.from_pretrained(model_string, output_attentions=output_attentions)
        model = GPTNeoForCausalLM.from_pretrained(model_string, output_attentions=output_attentions)
    elif "t5" in model_string:
      tokenizer = AutoTokenizer.from_pretrained(model_string)
      model = AutoModelForSeq2SeqLM.from_pretrained(model_string)
    else:
        tokenizer = OpenAIGPTTokenizer.from_pretrained(model_string)
        model = OpenAIGPTLMHeadModel.from_pretrained(model_string)
    model.eval()
    if cuda:
        model.to('cuda')
    return model, tokenizer

model, tokenizer = model_init(model_id, cuda = True, fast=True)
model.to('cuda')


Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# Setup Dataset

In [45]:
dataset = load_dataset("nightingal3/fig-qa")
dataset['validation']

subset_test_dataset = dataset['validation'].select(range(500))

# Evaluate base model

In [47]:
import numpy as np
from scipy.special import softmax
import pdb
import pandas as pd
import math
from typing import List
import random
import argparse
import torch


def sent_scoring(model_tokenizer, text, cuda, score_type="loss", output_attentions=False, length_normalize=False):
    model = model_tokenizer[0]
    tokenizer = model_tokenizer[1]
    assert model is not None
    assert tokenizer is not None
    encoded_text = tokenizer.encode(text)
    input_ids = torch.tensor(encoded_text).unsqueeze(0)
    if cuda:
        input_ids = input_ids.to('cuda')
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids, output_attentions=output_attentions)
    loss, logits = outputs[:2]

    sentence_prob = loss.item()
    if score_type == "prob":
        if length_normalize:
            mult = 2
        else:
            mult = len(encoded_text)

        sentence_prob = math.exp(-1.0 * loss * (mult - 1))

    if output_attentions:
        attn = outputs["attentions"]
        return sentence_prob, attn, input_ids

    return sentence_prob

def confusion_matrix(P_forward_1, P_forward_2, P_backward_1, P_backward_2):
    correct_forward = len(np.where(np.array(P_forward_1) >= 0.5)[0]) + len(np.where(np.array(P_forward_2) >=0.5)[0])
    wrong_forward = len(P_forward_1) + len(P_forward_2) - correct_forward

    correct_backward = len(np.where(np.array(P_backward_1) >= 0.5)[0]) + len(np.where(np.array(P_backward_2) >=0.5)[0])
    wrong_backward = len(P_backward_1) + len(P_backward_2) - correct_backward

    print("correct forward", correct_forward, "wrong forward", wrong_forward, "correct backward", correct_backward, "wrong_backward", wrong_backward)

def evaluate_model(model, tokenizer, test_set, middle_phrase="", use_prefix=0, verbose=True, score_type="prob", use_cuda=True, return_acc=False) -> tuple:
    preds = []
    labels = []
    x_1 = []
    x_2 = []
    y_1 = []
    y_2 = []
    P_x_1 = []
    P_x_2 = []
    P_y_1 = []
    P_y_2 = []
    P_x_1_y_1 = []
    P_x_1_y_2 = []
    P_x_2_y_1 = []
    P_x_2_y_2 = []
    P_x_1_correct = []
    P_x_2_correct = []
    P_y_1_correct = []
    P_y_2_correct = []
    correct = 0

    for i, metaphor_data in enumerate(test_set):
        ctx, p1, p2 = metaphor_data["startphrase"], metaphor_data["ending1"], metaphor_data["ending2"]
        labels.append(int(metaphor_data["labels"]))
        if use_prefix > 0:
            prefix_prompt = select_prefix_prompts(prompt_file, use_prefix) if use_prefix else ""
        else:
            prefix_prompt = ""

        sent1 = prefix_prompt + ctx + ". " + middle_phrase + p1 + "."
        sent2 = prefix_prompt + ctx + ". " + middle_phrase + p2 + "."

        score1 = sent_scoring((model, tokenizer), sent1, use_cuda, score_type=score_type)
        score2 = sent_scoring((model, tokenizer), sent2, use_cuda, score_type=score_type)

        if score_type == "loss":
            pred = 0 if score1 < score2 else 1
        else:
            pred = 1 if score1 < score2 else 0

        pred_sent = sent1 if pred == 0 else sent2

        if i % 2 == 0:
            x_1.append(ctx)
            x_1_score = sent_scoring((model, tokenizer), ctx + ".", use_cuda, score_type=score_type)
            P_x_1.append(x_1_score)
            y_1.append(p1)
            y_2.append(p2)
            y1_score = sent_scoring((model, tokenizer), p1 + ".", use_cuda, score_type=score_type)
            y2_score = sent_scoring((model, tokenizer), p2 + ".", use_cuda, score_type=score_type)
            P_y_1.append(y1_score)
            P_y_2.append(y2_score)

            P_x_1_y_1.append(score1)
            P_x_1_y_2.append(score2)
            P_x_1_correct.append(score1/(score1 + score2))

        else:
            x_2.append(ctx)
            x_2_score = sent_scoring((model, tokenizer), ctx + ".", use_cuda, score_type=score_type)
            P_x_2.append(x_2_score)
            P_x_2_y_1.append(score1)
            P_x_2_y_2.append(score2)
            P_x_2_correct.append(score2/(score1 + score2))

            P_y_1_correct.append(P_x_1_y_1[-1]/(P_x_1_y_1[-1] + score1))
            P_y_2_correct.append(score2/(P_x_1_y_2[-1] + score2))

        if verbose:
            print(f"Q: {ctx}: 1. {p1} 2. {p2}")
            print(f"model says '{pred_sent}' is more likely")
            print("\n")
        if pred == metaphor_data["labels"]:
            correct += 1
        preds.append(pred)

    cols = {"x_1": x_1, "x_2": x_2, "y_1": y_1, "y_2": y_2, "P(x_1)": P_x_1, "P(x_2)": P_x_2, "P(y_1)": P_y_1, "P(y_2)": P_y_2,
        "P(x_1, y_1)": P_x_1_y_1, "P(x_1, y_2)": P_x_1_y_2, "P(x_2, y_1)": P_x_2_y_1, "P(x_2, y_2)": P_x_2_y_2,
        "P(y_1|x_1)": P_x_1_correct, "P(y_2|x_2)": P_x_2_correct, "P(x_1|y_1)": P_y_1_correct, "P(x_2|y_2)": P_y_2_correct}
    out_df = pd.DataFrame(cols)

    if return_acc:
        return correct/len(preds), out_df, preds, labels

    return out_df, preds, labels

def compute_stats(total_df: pd.DataFrame, all_preds: List, all_labels: List) -> None:
    print("overall accuracy: ")
    print(len(np.where(np.array(all_preds) == np.array(all_labels))[0])/len(all_labels))
    print("confusion matrix: ")
    confusion_matrix(list(total_df["P(y_1|x_1)"]), list(total_df["P(y_2|x_2)"]), list(total_df["P(x_1|y_1)"]), list(total_df["P(x_2|y_2)"]))

In [None]:
out_df, preds, labels = evaluate_model(model, tokenizer, subset_test_dataset)
compute_stats(out_df, preds, labels)

In [12]:
compute_stats(out_df, preds, labels)


overall accuracy: 
0.538
confusion matrix: 
correct forward 269 wrong forward 231 correct backward 275 wrong_backward 225


# Training

- checkpoint 2000 can be downloaded here: https://drive.google.com/drive/folders/1-AUFbfZLoVCG03EkQikcV7iVpfwI3HRF?usp=drive_link
- file path can be checked in output below

In [19]:
import argparse
import logging
from typing import Optional
from glob import glob
from pathlib import Path
import os, sys
import torch
import numpy as np
import pandas as pd
import pickle

import transformers
from transformers import (
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    LineByLineWithRefDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
    GPT2LMHeadModel,
    GPTNeoForCausalLM,
    EarlyStoppingCallback
)
from torch.utils.data import ConcatDataset
import pdb

# Add path for those local py modules
sys.path.append('src/models/')
from gpt_score import evaluate_model

logger = logging.getLogger(__name__)

def main(model, tokenizer,model_id: str, prompt: str, train_path: str, eval_path: str, contrastive_train: bool, contrastive_train_lambd: float, num_epochs: int, seed: int, lr: int, use_cuda: bool, dont_train: bool, dont_eval: bool, out_path: str, cache_dir: str = "./lm_train_cache/", prefix_prompt: int = 0, batch_size: int = 8, log_history: bool = False, deepspeed: bool = False, early_stopping: bool = False) -> None:
    # Set up models, random seed, and logging
    model_name = model_id.split("/")[1]

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    transformers.utils.logging.set_verbosity_info()
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", {"model": model_name, "train path": train_path, "num epochs": num_epochs, "seed": seed, "cuda": use_cuda, "cache dir": cache_dir, "deepspeed": deepspeed, "early stopping": early_stopping})


    if deepspeed and not use_cuda:
        logger.info("You must have GPUs to use deepspeed. Turning cuda flag on...")
        use_cuda = True

    tokenizer.pad_token = tokenizer.eos_token
    #model.resize_token_embeddings(len(tokenizer))
    set_seed(seed)

    # load datasets and initialize trainer
    train_dataset = (
        get_dataset(train_path, tokenizer=tokenizer, cache_dir=cache_dir)
    )
    eval_dataset = (
        get_dataset(eval_path, tokenizer=tokenizer, cache_dir=cache_dir)
    )

    eval_df = pd.read_csv("./data/filtered/dev.csv")
    eval_df["label"] = eval_df["labels"]
    test_df = pd.read_csv("./data/filtered/dev.csv")
    test_df["label"] = test_df["labels"]

    data_collator = DataCollatorForLanguageModeling(
                tokenizer=tokenizer, mlm=False
            )
    no_cuda = not use_cuda

    default_arguments = {
        "output_dir": f"./lm_train_outputs/{model_name}_{seed}/",
        "do_train": True,
        "prediction_loss_only": False,
        "num_train_epochs": num_epochs,
        "seed": seed,
        "learning_rate": lr,
        "per_device_train_batch_size": batch_size,
        "per_device_eval_batch_size": batch_size,
        "no_cuda": no_cuda
    }

    if deepspeed:
        default_arguments["deepspeed"] = "deepspeed_config.json"
    if not contrastive_train:
        default_arguments["per_device_train_batch_size"] = batch_size
        default_arguments["per_device_eval_batch_size"] = batch_size

    else:
        default_arguments["per_device_train_batch_size"] = 2

    if log_history:
        default_arguments["evaluation_strategy"] = "steps"
        default_arguments["eval_steps"] = 100
    if early_stopping:
        default_arguments["evaluation_strategy"] = "epoch"
        default_arguments["load_best_model_at_end"] = True
        default_arguments["metric_for_best_model"] = "eval_loss"
        default_arguments["save_strategy"] = "epoch"

    training_args = transformers.TrainingArguments(**default_arguments)

    if early_stopping:
        trainer = Trainer(
            args=training_args,
            model=model,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
        )
    elif not contrastive_train:
        #tokenizer.pad_token = tokenizer.eos_token
        #dummy_init = make_dummy(model_id)
        trainer = Trainer(
            args=training_args,
            model=model,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            #model_init=dummy_init,
            compute_metrics=compute_metrics
        )
    else:
        trainer = ContrastiveTrainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset
        )
        trainer.set_lambd(contrastive_train_lambd)

    # Train the model
    if not dont_train:
        logger.info("=== Training the model ===")
        trainer.train()
        trainer.save_model("./lm_train_cache/")
        if log_history:
            log_file = f"{model_name}_epochs_{num_epochs}_eval_loss.p"
            with open(log_file, "wb") as f:
                pickle.dump(trainer.state.log_history, f)

    # Evaluate the model
    results = {}
    if not dont_eval: #Note: for hyperparameter tuning we do it by loss on
        model.eval()
        logger.info("=== Evaluating the model ===")
        eval_output = trainer.evaluate()
        eval_loss = eval_output["eval_loss"]
        results["eval_loss"] = eval_loss

        acc_test, out_df_test, preds_test, labels_test = evaluate_model(model, tokenizer, test_df.to_dict(orient="records"), use_cuda=use_cuda, return_acc=True, middle_phrase=prompt, use_prefix=prefix_prompt)
        acc_dev, out_df_dev, preds_dev, labels_dev = evaluate_model(model, tokenizer, eval_df.to_dict(orient="records"), use_cuda=use_cuda, return_acc=True, middle_phrase=prompt, use_prefix=prefix_prompt)
        results["accuracy (test)"] = acc_test
        results["accuracy (dev)"] = acc_dev
        results["preds"] = preds_test
        results["labels"] = labels_test


    if out_path is not None:
        Path(out_path).mkdir(parents=True, exist_ok=True)
        with open(f"{out_path}/results_{model_name}.txt", "w") as writer:
            logger.info("=== Outputting results ===")
            for key in sorted(results.keys()):
                logger.info("  %s = %s", key, str(results[key]))
                writer.write("%s = %s\n" % (key, str(results[key])))

        out_df_test.to_csv(f"{out_path}/prob_{model_name}_{seed}.csv", index=False)

    return results

def training_setup(model, tokenizer, model_name, seed, lr, num_epochs, train_path, eval_path, contrastive_train=False, contrast_lambd=1, is_hyperparam_opt=False, cuda=True, deepspeed=False, batch_size=8) -> Trainer:
    # load datasets and initialize trainer
    train_dataset = (
        get_dataset(train_path, tokenizer=tokenizer)
    )
    eval_dataset = (
        get_dataset(eval_path, tokenizer=tokenizer)
    )

    data_collator = DataCollatorForLanguageModeling(
                tokenizer=tokenizer, mlm=False
            )
    set_seed(seed)

    default_train_args = {
        "output_dir": f"./lm_train_outputs/{model_name}_{seed}/",
        "do_train": True,
        "do_eval": False,
        "prediction_loss_only": True,
        "seed": seed,
        "num_train_epochs": num_epochs,
        "learning_rate": lr,
        "no_cuda": not cuda,
        "per_device_train_batch_size": batch_size,
        "per_device_eval_batch_size": batch_size
    }

    if contrastive_train:
        default_train_args["per_device_train_batch_size"] = 2
        training_args = transformers.TrainingArguments(output_dir=f"./lm_train_outputs/{model_name}_{seed}/", do_train=True, do_eval=False,
        prediction_loss_only=True, num_train_epochs=num_epochs, seed=seed,learning_rate=lr, per_device_train_batch_size=2)
    elif is_hyperparam_opt:
        default_train_args["evaluation_strategy"] = "steps"
        default_train_args["eval_steps"] = 500
        default_train_args["disable_tqdm"] = True
    if deepspeed == True:
        default_train_args["deepspeed"] = "./deepspeed_config.json"

    training_args = transformers.TrainingArguments(**default_train_args)


    if is_hyperparam_opt:
        tokenizer.pad_token = tokenizer.eos_token
        dummy_init = make_dummy(model_name)
        trainer = Trainer(
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            model_init=dummy_init,
            compute_metrics=compute_metrics
        )
    elif contrastive_train:
        trainer = ContrastiveTrainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset
        )
        trainer.set_lambd(contrast_lambd)
    else:
        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset
        )
    return trainer

# This is adapted from the huggingface LM training example here: https://github.com/huggingface/transformers/blob/master/examples/legacy/run_language_modeling.py
def get_dataset(
    train_data_file: str,
    tokenizer: PreTrainedTokenizer,
    line_by_line: bool = True,
    evaluate: bool = False,
    eval_data_file: str = None,
    cache_dir: Optional[str] = None,
):
    def _dataset(file_path, ref_path=None):
        if line_by_line:
            if ref_path is not None:
                if not args.whole_word_mask or not args.mlm:
                    raise ValueError("You need to set world whole masking and mlm to True for Chinese Whole Word Mask")
                return LineByLineWithRefDataset(
                    tokenizer=tokenizer,
                    file_path=file_path,
                    block_size=tokenizer.model_max_length,
                    ref_path=ref_path,
                )

            return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=tokenizer.model_max_length)

    if evaluate:
        return _dataset(eval_data_file)
    else:
        return _dataset(train_data_file)

def make_dummy(model_id):
    def dummy_init():
        if model_id == "gpt2":
            return GPT2LMHeadModel.from_pretrained("gpt2", return_dict=True)
        elif "gpt-neo" in model_id:
            return GPTNeoForCausalLM.from_pretrained(model_id, return_dict=True)
    return dummy_init

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    acc = len(np.where(predictions == labels)[0])/len(labels)
    return {"acc": acc}

class ContrastiveTrainer(Trainer):
    def set_lambd(self, lambd):
        self.lambd = lambd

    def compute_loss(self, model, inputs, return_outputs=False):
        # Assumes batch size of 2!
        if inputs["labels"].shape[0] % 2 != 0:
            raise ValueError("Batch size must be a multiple of 2")

        correct_inputs = {"input_ids": torch.stack([row for i, row in enumerate(inputs["input_ids"]) if i % 2 == 0]),
        "attention_mask": torch.stack([row for i, row in enumerate(inputs["attention_mask"]) if i % 2 == 0]),
        "labels":  torch.stack([row for i, row in enumerate(inputs["labels"]) if i % 2 == 0])}
        wrong_inputs = {"input_ids": torch.stack([row for i, row in enumerate(inputs["input_ids"]) if i % 2 == 1]),
        "attention_mask": torch.stack([row for i, row in enumerate(inputs["attention_mask"]) if i % 2 == 1]),
        "labels":  torch.stack([row for i, row in enumerate(inputs["labels"]) if i % 2 == 1])}

        outputs = model(**inputs)

        correct_outputs = model(**correct_inputs)
        correct_loss = correct_outputs.get('loss')

        wrong_outputs = model(**wrong_inputs)
        wrong_loss = wrong_outputs.get("loss")

        # Good = when the loss for the correct item is much lower than loss for wrong item
        # loss should be negative (good) when wrong loss > correct loss
        #lambd = self.lambd if self.lambd else 1
        lambd = 0.2
        relative_score = correct_loss - lambd * (wrong_loss + correct_loss)
        loss = -relative_score

        return (loss, outputs) if return_outputs else loss


In [20]:
main(model, tokenizer, "google/flan-t5-base", "","./data/lm_train_data/train.txt", "./data/lm_train_data/dev.txt", contrastive_train=True, contrastive_train_lambd=1, num_epochs=3, seed=42, lr=5e-5, use_cuda=True, dont_train=False, dont_eval=False, out_path=None, prefix_prompt=0, log_history=True, deepspeed=False, early_stopping=False)


[INFO|language_modeling.py:130] 2023-10-11 18:01:34,947 >> Creating features from dataset file at ./data/lm_train_data/train.txt
[INFO|language_modeling.py:130] 2023-10-11 18:01:35,032 >> Creating features from dataset file at ./data/lm_train_data/dev.txt
[INFO|training_args.py:1345] 2023-10-11 18:01:35,096 >> Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
[INFO|training_args.py:1798] 2023-10-11 18:01:35,097 >> PyTorch: setting up devices
[INFO|training_args.py:1519] 2023-10-11 18:01:35,101 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear 

Step,Training Loss,Validation Loss
100,No log,-15.406539
200,No log,-78.104118
300,No log,-94.428482
400,No log,-102.37397
500,-61.238500,-110.643326
600,-61.238500,-115.399178
700,-61.238500,-119.467453
800,-61.238500,-122.845894
900,-61.238500,-125.702354
1000,-110.325500,-128.001404


[INFO|trainer.py:3213] 2023-10-11 18:02:17,636 >> ***** Running Evaluation *****
[INFO|trainer.py:3215] 2023-10-11 18:02:17,641 >>   Num examples = 1094
[INFO|trainer.py:3218] 2023-10-11 18:02:17,642 >>   Batch size = 8
[INFO|trainer.py:3213] 2023-10-11 18:03:20,055 >> ***** Running Evaluation *****
[INFO|trainer.py:3215] 2023-10-11 18:03:20,058 >>   Num examples = 1094
[INFO|trainer.py:3218] 2023-10-11 18:03:20,061 >>   Batch size = 8
[INFO|trainer.py:3213] 2023-10-11 18:04:16,585 >> ***** Running Evaluation *****
[INFO|trainer.py:3215] 2023-10-11 18:04:16,588 >>   Num examples = 1094
[INFO|trainer.py:3218] 2023-10-11 18:04:16,590 >>   Batch size = 8
[INFO|trainer.py:3213] 2023-10-11 18:05:09,715 >> ***** Running Evaluation *****
[INFO|trainer.py:3215] 2023-10-11 18:05:09,719 >>   Num examples = 1094
[INFO|trainer.py:3218] 2023-10-11 18:05:09,721 >>   Batch size = 8
[INFO|trainer.py:3213] 2023-10-11 18:06:00,228 >> ***** Running Evaluation *****
[INFO|trainer.py:3215] 2023-10-11 18:06

ZeroDivisionError: ignored

In [49]:
out_df, preds, labels = evaluate_model(model, tokenizer, subset_test_dataset, score_type="loss")
compute_stats(out_df, preds, labels)


Q: The girl had the flightiness of a sparrow: 1. The girl was very fickle. 2. The girl was very stable.
model says 'The girl had the flightiness of a sparrow. The girl was very stable..' is more likely


Q: The girl had the flightiness of a rock: 1. The girl was very fickle. 2. The girl was very stable.
model says 'The girl had the flightiness of a rock. The girl was very stable..' is more likely


Q: It was as peaceful as a church.: 1. It was very peaceful. 2. It was full of conflict and danger, not peace.
model says 'It was as peaceful as a church.. It was very peaceful..' is more likely


Q: It was as peaceful as a battlefield.: 1. It was very peaceful. 2. It was full of conflict and danger, not peace.
model says 'It was as peaceful as a battlefield.. It was very peaceful..' is more likely


Q: The leaves were as green as grass: 1. The leaves were very green 2. The leaves were brown and not green at all.
model says 'The leaves were as green as grass. The leaves were very green.' is 

In [50]:
out_df, preds, labels = evaluate_model(model, tokenizer, subset_test_dataset)
compute_stats(out_df, preds, labels)


ZeroDivisionError: ignored