In [None]:
# Load IPython's autoreload extension; mode 2 reloads all imported modules before
# each cell is executed, so edits to local source files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys

# Add the parent directory to the import search path (presumably so that the
# project package imported later in this notebook resolves from this folder).
sys.path.append("../")

In [None]:
import os
import torch

# Configure CUDA device enumeration/visibility through environment variables:
# order devices by PCI bus id and restrict visibility to device index 1.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "1",
    }
)

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
# resp_set = ["bla bla", "daj wiersz", "zaszczekaj glosno i dlugo"]

from rlvsil.diversity.diversity_metrics import (
    CosineSimilarity2Diversity,
    AveragedCosineSimilarity,
    AveragedDistinctNgrams,
    DistinctNgrams,
)


def print_metric(metric, resp_set):
    """Print a metric's class name and its score for ``resp_set``.

    Args:
        metric: A callable object; it is invoked as ``metric(resp_set)`` and its
            class name is used as the printed label.
        resp_set: The responses passed unchanged to ``metric``.

    The score is formatted with three decimal places, e.g. ``Name: 0.123``.
    """
    label = type(metric).__name__
    score = metric(resp_set)
    print(f"{label}: {score:0.3f}")


# Smoke test: run each metric class on a tiny response set that contains an
# exact duplicate, printing one score per metric.
resp_set = ["i am going", "i am going", "lets go i i"]

# Settings for the fixed-n metrics and for the metrics averaged over a range of n.
config = {"n": 3}
avg_config = {"n_min": 1, "n_max": 5}

for metric_cls, metric_cfg in (
    (CosineSimilarity2Diversity, config),
    (DistinctNgrams, config),
    (AveragedCosineSimilarity, avg_config),
    (AveragedDistinctNgrams, avg_config),
):
    print_metric(metric_cls(metric_cfg), resp_set)

We refer to expectation-adjusted distinct N-grams, Sentence-BERT average cosine similarity, and NLI diversity as EAD, Sent BERT, and NLI, respectively.

We can view them as measuring syntactic, semantic, and logical diversity, respectively.

In [None]:
from rlvsil.diversity import DEFAULT_CONFIGS, calculate_diversity_metrics


# Names of the diversity metrics to evaluate.
metrics = [
    "ead_averaged_distinct_ngrams",
    "nli_sample_from_sim",
    "sent_bert_from_sim",
]

# Three groups of three sentences: within a group only the fruit changes,
# across groups only the verb phrase changes.
outputss = [
    [f"I {verb_phrase} {fruit}." for fruit in ("apples", "bananas", "oranges")]
    for verb_phrase in ("like to eat", "love to eat", "love muching on")
]

from pprint import pprint

# Keep only the default configurations whose key is one of the selected metrics
# (iteration order follows DEFAULT_CONFIGS).
config = {
    metric_name: metric_config
    for metric_name, metric_config in DEFAULT_CONFIGS.items()
    if metric_name in metrics
}

# Extra top-level options; their exact meaning is defined by the diversity
# package (NOTE(review): confirm against calculate_diversity_metrics).
config.update(sample_overall=True, no_overall_input=True)

pprint(config)

In [None]:
# Evaluate the configured diversity metrics on the nested example outputs.
results = calculate_diversity_metrics(outputss, metric_configs=config)

In [None]:
from pprint import pprint

# Round every metric value to three decimals before pretty-printing.
rounded_results = {}
for metric_name, metric_value in results.items():
    rounded_results[metric_name] = round(metric_value, 3)
pprint(rounded_results)

In [None]:
# !pip install transformers==4.36.2

In [None]:
from datasets import load_dataset

# Load the dataset by its name via `datasets.load_dataset`.
dataset = load_dataset("clarin-knext/summarization-chat-annotated")
# Bare expression: shows the loaded object's repr as the cell output.
dataset

In [None]:
# Inspect the record at index 0 of the "train" split.
dataset["train"][0]

In [None]:
# Source document to summarise: the "doc_text" field of the first training record.
doc_text = dataset["train"][0]["doc_text"]

# Polish summarisation instruction, followed by the document and a cue for the
# summary. Adjacent literals are concatenated into a single string.
prompt = (
    "Twoim zadaniem jest przeczytanie podanego tekstu i napisanie streszczenia w języku polskim. "
    "Streszczenie powinno zawierać najważniejsze informacje i wydarzenia opisane w tekście, "
    "być zwięzłe i dobrze zorganizowane. "
    "Unikaj wprowadzania nowych informacji oraz osobistych opinii."
    "\n\n###\n\n"
    f"Tekst: {doc_text}\n\nStreszczenie:"
)
print(prompt)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


# Checkpoint to load. Alternative checkpoint: "speakleash/Bielik-7B-v0.1"
model_name = "speakleash/Bielik-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Loading options forwarded to `from_pretrained`.
model_load_kwargs = {
    "torch_dtype": torch.float16,
    "load_in_8bit": True,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

In [None]:
# !pip install bitsandbytes

In [None]:
from tqdm import tqdm

# Sample several completions for the same prompt. Every decoded text is printed
# and also collected in `outputs` so that later cells can use the samples.
outputs = []

# Tokenise the prompt once and move it to the target device; the same tensor is
# reused for every sample.
prompt_input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

# Length budget passed to `generate` as `max_length`: the prompt length in
# tokens scaled by (1 + ratio).
summarization_ratio_lenth = 0.15
max_length = int(len(prompt_input_ids[0]) * (1 + summarization_ratio_lenth))
print(max_length)

for _ in tqdm(range(16)):
    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        output = model.generate(
            prompt_input_ids,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            max_length=max_length,
            temperature=1.0,
            top_k=0,
            top_p=1,
        )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    # Keep the sample; previously `outputs` stayed empty and the text was lost
    # after printing.
    outputs.append(generated_text)
    print(generated_text)
    print("==============================")

In [None]:
# read json
import json

# Read the JSON file explicitly as UTF-8 so that decoding does not depend on the
# platform's default encoding.
with open("../Bielik-7B-v0.1_summarization-chat-annotated.json", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Display the entry at index 0 of the loaded JSON data.
data[0]