In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Make the repository root importable (the notebook lives one level below it).
# Guarded so that re-running this cell does not keep appending duplicate entries.
if "../" not in sys.path:
    sys.path.append("../")

In [3]:
import os
import torch

# Set the CUDA enumeration order and visible-device list through the
# environment before the first CUDA query below.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
# resp_set = ["bla bla", "daj wiersz", "zaszczekaj glosno i dlugo"]

from rlvsil.diversity.diversity_metrics import CosineSimilarity2Diversity, AveragedCosineSimilarity, AveragedDistinctNgrams, DistinctNgrams


def print_metric(metric, resp_set):
    """Print ``<metric class name>: <score>`` with the score shown to three decimals.

    Args:
        metric: Callable object; it is invoked as ``metric(resp_set)`` and its
            class name is used as the label.
        resp_set: The responses handed to ``metric`` unchanged.
    """
    label = type(metric).__name__
    score = metric(resp_set)
    print(f"{label}: {score:0.3f}")

# Smoke test: score one tiny response set (two identical responses plus one
# different one) with each metric and print the results.
resp_set = ["i am going", "i am going", "lets go i i"]

# Fixed n-gram order (n=3).
config = dict(n=3)
for metric_cls in (CosineSimilarity2Diversity, DistinctNgrams):
    print_metric(metric_cls(config), resp_set)

# Variants configured with a range of n (n_min..n_max).
avg_config = dict(n_min=1, n_max=5)
for metric_cls in (AveragedCosineSimilarity, AveragedDistinctNgrams):
    print_metric(metric_cls(avg_config), resp_set)


CosineSimilarity2Diversity: 0.667
DistinctNgrams: 0.750
AveragedCosineSimilarity: 0.737
AveragedDistinctNgrams: 0.593


We refer to expectation-adjusted distinct N-grams, sentence-BERT average cosine similarity and NLI diversity as EAD, Sent BERT and NLI respectively. 

These metrics can be viewed as measuring syntactic, semantic, and logical diversity, respectively.

In [6]:
from rlvsil.diversity import DEFAULT_CONFIGS, calculate_diversity_metrics


# Names of the diversity metrics to run; used below to pick the matching
# entries out of DEFAULT_CONFIGS.
metrics = ['ead_averaged_distinct_ngrams' , 'nli_sample_from_sim' , 'sent_bert_from_sim']

# Toy data: a list of groups, each group holding three output strings.
# Presumably each inner list is the set of outputs for one input (the metric
# call logs "per-input diversities") -- confirm against calculate_diversity_metrics.
outputss = [
    [
        "I like to eat apples.",
        "I like to eat bananas.",
        "I like to eat oranges.",
    ],
    [
        "I love to eat apples.",
        "I love to eat bananas.",
        "I love to eat oranges.",
    ],
    # NOTE(review): "muching" looks like a typo for "munching". It is left
    # unchanged because the recorded results in this notebook were computed
    # with this spelling; fixing it requires re-running the cells below.
    [
        "I love muching on apples.",
        "I love muching on bananas.",
        "I love muching on oranges.",
    ],
]

# Keep only the default configs of the selected metrics (in DEFAULT_CONFIGS
# order), then add the two run-level flags.
config = {
    name: metric_config
    for name, metric_config in DEFAULT_CONFIGS.items()
    if name in metrics
}
config.update(sample_overall=True, no_overall_input=True)

from pprint import pprint
pprint(config)


{'ead_averaged_distinct_ngrams': {'n_max': 5, 'n_min': 1, 'vocab_size': 50257},
 'nli_sample_from_sim': {'model_name': 'roberta-large-mnli',
                         'n': 5,
                         'top_k': 1},
 'no_overall_input': True,
 'sample_overall': True,
 'sent_bert_from_sim': {}}


In [7]:
# Score the toy outputs with the selected metrics; the call prints its own
# progress and summary lines, and `results` is displayed in the next cell.
results = calculate_diversity_metrics(outputss, metric_configs=config)

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating per-input diversities
Average per-input diversities:
{'mean_per_input_ead_averaged_distinct_ngrams': 0.6378211785884894, 'mean_per_input_sent_bert_from_sim': 0.40668171644210815, 'mean_per_input_nli_sample_from_sim': 0.9585304107930925}
Std per-input diversities:
{'std_per_input_ead_averaged_distinct_ngrams': 0.0, 'std_per_input_sent_bert_from_sim': 0.0597790032252103, 'std_per_input_nli_sample_from_sim': 0.045166222806895744}
calculating overall diversities
Average overall diversities:
{'overall_ead_averaged_distinct_ngrams': 0.538994964957481, 'overall_sent_bert_from_sim': 0.3989078998565674, 'overall_nli_sample_from_sim': 0.9326768695645862}
calculating overall single-input diversities
Average overall single-input diversities:
{'overall_single_output_ead_averaged_distinct_ngrams': 0.8345037971855376, 'overall_single_output_sent_bert_from_sim': 0.09744042158126831, 'overall_single_output_nli_sample_from_sim': 0.9376957925160726}


In [8]:
from pprint import pprint

# Show every reported value rounded to three decimal places.
pprint(
    {
        metric_name: round(value, 3)
        for metric_name, value in results.items()
    }
)


{'mean_per_input_ead_averaged_distinct_ngrams': 0.638,
 'mean_per_input_nli_sample_from_sim': 0.959,
 'mean_per_input_sent_bert_from_sim': 0.407,
 'overall_ead_averaged_distinct_ngrams': 0.539,
 'overall_nli_sample_from_sim': 0.933,
 'overall_sent_bert_from_sim': 0.399,
 'overall_single_output_ead_averaged_distinct_ngrams': 0.835,
 'overall_single_output_nli_sample_from_sim': 0.938,
 'overall_single_output_sent_bert_from_sim': 0.097,
 'std_per_input_ead_averaged_distinct_ngrams': 0.0,
 'std_per_input_nli_sample_from_sim': 0.045,
 'std_per_input_sent_bert_from_sim': 0.06}


In [8]:
# !pip install transformers==4.36.2

In [9]:
from datasets import load_dataset

# Fetch the `clarin-knext/summarization-chat-annotated` dataset via
# `datasets.load_dataset`.
dataset = load_dataset('clarin-knext/summarization-chat-annotated')

Downloading readme:   0%|          | 0.00/257 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
# Display the loaded DatasetDict (splits, feature names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'doc_text', 'summary'],
        num_rows: 32480
    })
})

In [11]:
# Peek at the first training record to see what the fields contain.
dataset['train'][0]

{'id': 'complexqa_806054',
 'doc_text': 'Zamach w Sarajewie Zamach w Sarajewie – zamach na następcę austro-węgierskiego tronu, arcyksięcia Franciszka Ferdynanda i jego żonę Zofię, księżnę Hohenberg, dokonany 28 czerwca 1914 roku przez bośniackiego Serba Gavrila Principa, członka serbskiej nacjonalistycznej organizacji "Młoda Bośnia". Princip wraz z pięcioma innymi uczestnikami zamachu powiązany był z serbską tajną organizacją "Zjednoczenie lub śmierć", popularnie nazywaną "Czarna Ręka", którą kierował Dragutin Dimitrijević pseudonim "Apis", szef serbskiego wywiadu wojskowego. Morderstwo następcy tronu austro-węgierskiego stworzyło napięcie pomiędzy Wiedniem a Belgradem. Austro-Węgry wysunęły ultimatum domagając się usunięcia wrogiej propagandy, a także udziału reprezentantów rządu cesarsko-królewskiego w śledztwie na terenie Serbii. Odrzucenie części żądań przez Serbię doprowadziło do tzw. kryzysu lipcowego, a w konsekwencji do wybuchu I wojny światowej. Przygotowania Przygotowania do 

In [14]:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


# Checkpoint identifier passed to both `from_pretrained` calls below.
model_name = "speakleash/Bielik-7B-v0.1"

# Load the matching tokenizer, then the causal LM with bfloat16 weights, and
# move the model to the device selected earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(device)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
from tqdm import tqdm

# Sampling configuration.
N = 16                 # number of samples drawn per input document
temperature = 1.0
NUM_EXAMPLES = 1000    # how many training documents to generate for
MAX_NEW_TOKENS = 50    # length budget of each generated continuation
SEED = 42

# Sampling is stochastic; seed the generator so a re-run draws the same samples.
torch.manual_seed(SEED)

# outputs[i] holds the N sampled continuations for the i-th document.
outputs = []

for example in tqdm(dataset["train"].select(range(NUM_EXAMPLES))):
    # Tokenize once per document and move to the device once, not once per sample.
    input_ids = tokenizer.encode(example["doc_text"], return_tensors="pt").to(device)
    prompt_length = input_ids.shape[1]

    samples = []
    for _ in range(N):
        output = model.generate(
            input_ids,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            # `max_length` would count the prompt tokens as well, leaving no room
            # to generate after a long document; `max_new_tokens` budgets only
            # the continuation.
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=temperature,
            # top_k=0 and top_p=1.0 disable both filters: sample from the full distribution.
            top_k=0,
            top_p=1.0,
        )

        # Decode only the newly generated tokens. Keeping the prompt would make
        # every sample share the same long prefix and distort diversity scores.
        generated_text = tokenizer.decode(output[0, prompt_length:], skip_special_tokens=True)
        samples.append(generated_text)
    outputs.append(samples)

  0%|          | 2/1000 [00:39<5:29:53, 19.83s/it]


KeyboardInterrupt: 

In [None]:
import gc

# Checkpoint identifier passed to both `from_pretrained` calls below.
model_name = "speakleash/Bielik-Instruct-7B-v0.1"

# The right-hand side of `model = ...` is evaluated before the name is rebound,
# so without this the previously loaded model would stay referenced (and on the
# device) while the new one is loaded. Drop that reference first so its memory
# can be reclaimed. Assigning None also works when no model was loaded yet.
model = None
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load the matching tokenizer, then the causal LM with bfloat16 weights, and
# move the model to the device selected earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(device)