In [1]:
import os
import torch

from collections import defaultdict
from tqdm import tqdm
from modeling.modeling_llada import LLaDAModelLM
from transformers import AutoTokenizer
from datasets import Dataset, load_dataset

from get_log_likelihood import forward_process, get_log_likelihood
from generate import generate

from jinyu_utils.jinyu_tokenizer import Tokenizer_
from jinyu_utils.jinyu_preprocess_wiki import parse_lines_with_index, merge_subdocs, PATTEN_REG_WIKI, simple_calculate_sim

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
id_model = 'GSAI-ML/LLaDA-8B-Base'
path_cache_base = os.environ['HF_HUB_CACHE']
folder_model = '--'.join(['models'] + id_model.split('/'))
path_cache_model = os.path.join(path_cache_base, folder_model)
path_snapshot_model = os.path.join(path_cache_model, 'snapshots')
folder_snapshot_model_1 = [entity for entity in os.listdir(path_snapshot_model) if entity[0] != '.'][0]
path_snapshot_model_1 = os.path.join(path_snapshot_model, folder_snapshot_model_1)
print(path_snapshot_model_1)


/home/exx/hf_hub_cache/models--GSAI-ML--LLaDA-8B-Base/snapshots/0f2787f2d87eac5eed8a087d5ecd24277e6255b2


In [3]:
'''load tokenizer'''
tokenizer = AutoTokenizer.from_pretrained(
    path_snapshot_model_1,
    local_files_only=True,
    trust_remote_code=True
)

if tokenizer.padding_side != 'left':
    tokenizer.padding_side = 'left'
# end

assert tokenizer.pad_token_id != 126336

In [4]:
'''load model'''
model_kwargs = {}
model = LLaDAModelLM.from_pretrained(
    path_snapshot_model_1,
    local_files_only=True,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    **model_kwargs
)

model = model.eval()
device_for_input = model.get_input_embeddings().weight.device

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 6/6 [00:03<00:00,  1.59it/s]


In [5]:
'''load dataset'''
names_dataset = [('Idavidrein/gpqa', 'gpqa_main'), ('Salesforce/wikitext', 'wikitext-2-raw-v1')]
ds = load_dataset(*names_dataset[1], split='train')['text']


In [6]:
'''preprocess dataset'''
docs, _ = parse_lines_with_index(PATTEN_REG_WIKI, ds)
docs = docs['subdocs']

In [7]:
samples = []
for doc in docs:
    lines_1 = doc['texts']
    paragraph_1 = ' '.join(lines_1)
    lines_remain, titles = merge_subdocs(doc['subdocs'])
    paragraph_remain = ' '.join(lines_remain)
    prefix = paragraph_1
    target = paragraph_remain
    samples.append({'prefix': prefix, 'target': target})
# end


In [8]:
samples = samples[:100]
len(samples)

100

In [None]:
prompts = [sample['prefix'] for sample in samples]
outputs = []
with torch.no_grad():
    for prompt in tqdm(prompts, desc='starting to get outputs...'):
        encoded_inputs = tokenizer(
                prompt,
                add_special_tokens=False,
                padding=True,
                return_tensors="pt"
        )

        input_ids = encoded_inputs['input_ids'].to(device_for_input)
        attention_mask = encoded_inputs['attention_mask'].to(device_for_input)

        out = generate(model, input_ids, attention_mask, steps=128, gen_length=128, block_length=32, temperature=0., cfg_scale=0., remasking='low_confidence')
        output = tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0]
        outputs.append(output)
    # end for
# end with


starting to get outputs...:  50%|█████     | 50/100 [05:17<05:45,  6.91s/it]

In [None]:
sims_target = []
for sample, predict in zip(samples, outputs):
    sims_target.append(simple_calculate_sim(sample['target'], predict[0]))
# end
sims_target = [(idx, sim) for idx, sim in enumerate(sims_target)]
sims_target_sorted = sorted(sims_target, key=lambda copus: -copus[1])
sims_target_sorted[:10]

print(samples[91]['target'])
print(outputs[91][0])

= = Background and development = = The Dangerously In Love Tour was the debut solo concert tour by American recording artist Beyoncé . The tour was intended to showcase songs from Beyoncé ' debut solo album , Dangerously in Love released in 2003 . However , the set list also contained a special segment of her show dedicated to her girl group Destiny 's Child and songs from Beyoncé ' 2003 film The Fighting Temptations ( " Fever " and " Summertime " ) . The stage was simple and featured a large LED screen in the back that moved up and down throughout the entire show and displayed video images of Beyoncé and her dancers , as well as some images from her music videos and some prerecorded images with special effects . The show also featured a small staircase and platforms on both side of the stairs for her band . Beyoncé later toured alongside Missy Elliott and Alicia Keys as ensemble for the Verizon Ladies First Tour ( 2004 ) in North America . = = Synopsis and reception = = Dave Simpson o

In [None]:
sims_prefix = []
for sample, predict in zip(samples, outputs):
    sims_prefix.append(simple_calculate_sim(sample['prefix'], predict[0]))
# end
sims_prefix = [(idx, sim) for idx, sim in enumerate(sims_prefix)]
sims_prefix_sorted = sorted(sims_prefix, key=lambda copus: -copus[1])
sims_prefix_sorted[:10]
print(samples[43]['prefix'])
print(outputs[43][0])

= Soviet cruiser Krasnyi Kavkaz = Krasnyi Kavkaz ( from Russian : " Красный Кавказ " - " Red Caucasus " ) was a cruiser of the Soviet Navy that began construction during World War I , but was still incomplete during the Russian Revolution . Her design was heavily modified by the Soviets and she was completed in 1932 . During World War II she supported Soviet troops during the Siege of Odessa , Siege of Sevastopol , and the Kerch @-@ Feodosiya Operation in the winter of 1941 — 42 . She was awarded the Guards title on 3 April 1942 . She was reclassified as a training ship in May 1947 before being used as a target in 1952 .
 = Soviet cruiser Krasnyi Kavkaz = Krasnyi Kavkaz ( from Russian : " Красный Кавказ " - " Red Caucasus " ) was a cruiser of the Soviet Navy that began construction during World War I , but was still incomplete during the Russian Revolution . Her design was heavily modified by the Soviets and she was completed in 1932 . During World War II she supported Soviet troops du

In [None]:
'''one-by-one testing'''
idx = 43
prompts = [samples[idx]['prefix']]
with torch.no_grad():
    for prompt in tqdm(prompts, desc='starting to get outputs...'):
        encoded_inputs = tokenizer(
                prompt,
                add_special_tokens=False,
                padding=True,
                return_tensors="pt"
        )

        input_ids = encoded_inputs['input_ids'].to(device_for_input)
        attention_mask = encoded_inputs['attention_mask'].to(device_for_input)

        out = generate(model, input_ids, attention_mask, steps=32, gen_length=128, block_length=32, temperature=0., cfg_scale=0., remasking='low_confidence')
        output = tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)
        print(simple_calculate_sim(samples[idx]['prefix'], output[0]))
        print(samples[idx]['prefix'])
        print(output[0])
    # end for
# end with


starting to get outputs...: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]

0.0
= Soviet cruiser Krasnyi Kavkaz = Krasnyi Kavkaz ( from Russian : " Красный Кавказ " - " Red Caucasus " ) was a cruiser of the Soviet Navy that began construction during World War I , but was still incomplete during the Russian Revolution . Her design was heavily modified by the Soviets and she was completed in 1932 . During World War II she supported Soviet troops during the Siege of Odessa , Siege of Sevastopol , and the Kerch @-@ Feodosiya Operation in the winter of 1941 — 42 . She was awarded the Guards title on 3 April 1942 . She was reclassified as a training ship in May 1947 before being used as a target in 1952 .





In [None]:
'''get log likelihood parts'''

# samples = []
# for doc in docs:
#     lines_1 = doc['texts']
#     paragraph_1 = ' '.join(lines_1)
#     lines_remain, titles = merge_subdocs(doc)
#     paragraph_remain = ' '.join(lines_remain)
#     prefix = 'I will give you a general description of a person. I will also give you some subtitles and you need to give me the detail of them respectively . '
#     prefix += paragraph_1
#     prefix += " Titles are : "
#     prefix += ' , '.join(titles)

#     target = paragraph_remain
#     samples.append({'prefix': prefix, 'target': target})
# # end

# class Tokenizer_test(Tokenizer_):
#     def _tokenize(self, e):
#         prefix, target = self._encode_pair(e['prefix'], e['target'])
#         return {
#             'prefix_text': e['prefix'],
#             'target_text': e['target'],
#             'prefix': prefix,
#             'target': target
#         }
#     # end
# # end


# ds =  Dataset.from_list(samples)
# ds = ds.map(Tokenizer_test(tokenizer))
# ds = ds.with_format("torch")
# ds = ds.filter(lambda x: len(x["prefix"]) + len(x['target']) <= 4096)

# out = []
# with torch.no_grad():
#     for elem in tqdm(ds, desc="Computing likelihood..."):
#         prefix = elem["prefix"]
#         target = elem["target"]

#         ll = get_log_likelihood(model, prefix, target)
#         out.append(ll)
#     # end
# # end


'get log likelihood parts'