In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "ai-forever/ruGPT-3.5-13B"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards: 100%|██████████| 6/6 [00:40<00:00,  6.77s/it]


In [22]:
import pandas as pd

labeled_dataset_path = "./datasets/filtered_datasets/oasst/ru/oasst_ru_v1_labeled - oasst_ru_v1.csv"

labeled_dataset = pd.read_csv(labeled_dataset_path)
labeled_dataset['ppl'] = 0
labeled_dataset.head(5)

Unnamed: 0,conversation,conversation_ids,ok/trash,ppl
0,Напиши информацию о игре Hytale\n---\nHytale -...,e480f611-0d31-433a-93d2-0e2bc675aa30 d5766ce8-...,ok,0
1,Напиши информацию о игре Hytale\n---\nСогласно...,e480f611-0d31-433a-93d2-0e2bc675aa30 e22f8f08-...,ok,0
2,Напиши информацию о игре Hytale\n---\nСогласно...,e480f611-0d31-433a-93d2-0e2bc675aa30 e22f8f08-...,ok,0
3,Напиши информацию о игре Hytale\n---\nСогласно...,e480f611-0d31-433a-93d2-0e2bc675aa30 4db2c738-...,trash,0
4,"Докончи поговорку, без труда не выловишь и\n--...",fa8c92d6-6daa-42a3-a2f5-e2e0eb610c41 6b90c8c0-...,trash,0


In [30]:
def extract_first(conversation_ids):
    conversation_id = conversation_ids.split()[0]
    return conversation_id

labeled_dataset['conversation_id_first'] = labeled_dataset['conversation_ids'].apply(extract_first) 

unique_first_ids = {ids: [] for ids in labeled_dataset['conversation_id_first'].unique()}

In [31]:
from tqdm import tqdm

for i in tqdm(range(len(labeled_dataset))):
    dialog = labeled_dataset['conversation'].iloc[i]
    label = labeled_dataset['ok/trash'].iloc[i]
    unique_first_id = labeled_dataset['conversation_id_first'].iloc[i]
    # print(dialog)
    # break
    input_ids = tokenizer(
        dialog,
        return_tensors="pt",
        max_length=2048,
        truncation=True,
    ).to("cuda")
    input_ids = input_ids.input_ids
    with torch.no_grad():
        loss = model(input_ids, labels=input_ids)
        loss = loss.loss
        ppl = torch.exp(loss).item()
        # labeled_dataset['ok/trash'].iloc[i] = loss
        # print(f"label={label} loss={loss}")
        unique_first_ids[unique_first_id].append({
            "label": label,
            "ppl": ppl
        })

100%|██████████| 3140/3140 [03:37<00:00, 14.41it/s]


In [32]:
unique_first_ids

{'e480f611-0d31-433a-93d2-0e2bc675aa30': [{'label': 'ok', 'ppl': 9.5234375},
  {'label': 'ok', 'ppl': 11.625},
  {'label': 'ok', 'ppl': 12.1328125},
  {'label': 'trash', 'ppl': 21.796875}],
 'fa8c92d6-6daa-42a3-a2f5-e2e0eb610c41': [{'label': 'trash',
   'ppl': 14.6640625},
  {'label': 'ok', 'ppl': 16.109375},
  {'label': 'ok', 'ppl': 13.828125},
  {'label': 'ok', 'ppl': 17.109375},
  {'label': 'ok', 'ppl': 21.09375}],
 'ef69d986-aa2d-4c3e-a122-1910d9625c79': [{'label': 'ok', 'ppl': 11.7578125},
  {'label': 'trash', 'ppl': 13.640625},
  {'label': 'trash', 'ppl': 15.671875}],
 'e321f8f6-01f8-4975-afe6-73db989f538e': [{'label': 'ok', 'ppl': 6.484375},
  {'label': 'trash', 'ppl': 11.921875},
  {'label': 'trash', 'ppl': 11.1796875},
  {'label': 'trash', 'ppl': 13.484375}],
 '68489e5c-978f-4ad7-a849-39a741fb5ae7': [{'label': 'ok', 'ppl': 4.52734375},
  {'label': 'ok', 'ppl': 3.927734375},
  {'label': 'trash', 'ppl': 5.5234375}],
 '5083658c-e251-4259-b180-77afaf907d66': [{'label': 'ok', 'ppl'

In [34]:
import pickle
# with open('unique_first_ids.pickle', 'wb') as handle:
#     pickle.dump(unique_first_ids, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('unique_first_ids.pickle', 'rb') as handle:
    unique_first_ids = pickle.load(handle)

unique_first_ids

{'e480f611-0d31-433a-93d2-0e2bc675aa30': [{'label': 'ok', 'ppl': 9.5234375},
  {'label': 'ok', 'ppl': 11.625},
  {'label': 'ok', 'ppl': 12.1328125},
  {'label': 'trash', 'ppl': 21.796875}],
 'fa8c92d6-6daa-42a3-a2f5-e2e0eb610c41': [{'label': 'trash',
   'ppl': 14.6640625},
  {'label': 'ok', 'ppl': 16.109375},
  {'label': 'ok', 'ppl': 13.828125},
  {'label': 'ok', 'ppl': 17.109375},
  {'label': 'ok', 'ppl': 21.09375}],
 'ef69d986-aa2d-4c3e-a122-1910d9625c79': [{'label': 'ok', 'ppl': 11.7578125},
  {'label': 'trash', 'ppl': 13.640625},
  {'label': 'trash', 'ppl': 15.671875}],
 'e321f8f6-01f8-4975-afe6-73db989f538e': [{'label': 'ok', 'ppl': 6.484375},
  {'label': 'trash', 'ppl': 11.921875},
  {'label': 'trash', 'ppl': 11.1796875},
  {'label': 'trash', 'ppl': 13.484375}],
 '68489e5c-978f-4ad7-a849-39a741fb5ae7': [{'label': 'ok', 'ppl': 4.52734375},
  {'label': 'ok', 'ppl': 3.927734375},
  {'label': 'trash', 'ppl': 5.5234375}],
 '5083658c-e251-4259-b180-77afaf907d66': [{'label': 'ok', 'ppl'

In [35]:
test_texts = [
    '---------------',
    "AAAAAAAAAAAAAAAAAAAA",
    "abbcfcad23324604a1a458870bac45ac399c6a830f98bfc73473c3f8602d02b8",
    "edfcaac579024f574adbcaa3c13e4fd2b7f1797826afe679f2144af2cb5c062d"   
]

for text in tqdm(test_texts):
    # print(dialog)
    # break
    input_ids = tokenizer(
        text,
        return_tensors="pt",
        max_length=2048,
        truncation=True,
    ).to("cuda")
    input_ids = input_ids.input_ids
    with torch.no_grad():
        loss = model(input_ids, labels=input_ids)
        loss = loss.loss
        ppl = torch.exp(loss).item()
        print(text)
        print(ppl)

100%|██████████| 4/4 [00:00<00:00, 28.18it/s]

---------------
2.658203125
AAAAAAAAAAAAAAAAAAAA
1.2841796875
abbcfcad23324604a1a458870bac45ac399c6a830f98bfc73473c3f8602d02b8
69.0
edfcaac579024f574adbcaa3c13e4fd2b7f1797826afe679f2144af2cb5c062d
60.1875



