In [17]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
import torch

text = ['the boy is outside or so i thought', 'i hate the weather', 'the girl']
inputs = tokenizer(
    text,
    max_length=5,
    padding=True,
    truncation=True,
    return_overflowing_tokens=True,
    stride=2,
    return_tensors='pt',
)

In [18]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(text, 
                   padding=True, 
                   return_tensors='pt')


{'input_ids': tensor([[ 1169,  2933,   318,  2354,   393,   523,  1312,  1807],
        [   72,  5465,   262,  6193, 50256, 50256, 50256, 50256],
        [ 1169,  2576, 50256, 50256, 50256, 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0]])}

In [26]:
attn_mask = inputs.attention_mask
ignore_mask = (attn_mask - 1)*100
target_ids = (inputs.input_ids.clone() * attn_mask) + ignore_mask
print(target_ids)
print(attn_mask)

tensor([[1169, 2933,  318, 2354,  393,  523, 1312, 1807],
        [  72, 5465,  262, 6193, -100, -100, -100, -100],
        [1169, 2576, -100, -100, -100, -100, -100, -100]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0]])


In [None]:
A = torch.tensor([[1, 2, 3], [4, 5, 6]])


In [2]:
from transformers import AutoModelForCausalLM, GPT2TokenizerFast

device = "mps"
model_id = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
from datasets import load_dataset

test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors


In [29]:
encodings['input_ids'] = encodings.input_ids[:,:1024]
encodings['attention_mask'] = encodings.attention_mask[:,:1024]

In [11]:
tokenizer.pad_token = tokenizer.eos_token

In [52]:
encodings = tokenizer(['the boy is outside.'
                       , 'I hate it']
                      , return_tensors='pt', padding=True).to(device)

In [56]:
@torch.no_grad()
def get_output(texts, stride=None):
    if stride is None:
        stride = 2

    max_length = 4
    #MAX_LENGTH = self.tokenizer.model_max_length

    inputs_dict = tokenizer(texts, 
                                 padding=True,
                                 return_tensors='pt').to(device)
    print(inputs_dict)
    print('-'*80)
    seq_length = inputs_dict.input_ids.size(1)
    logits = None
    prev_end_loc = 0

    for begin_loc in range(0, seq_length, stride):
        # Find the end location, it's either the whole sequence 
        # if the sequence length is less than the max length or 
        # up until the max length
        end_loc = min(begin_loc + max_length, seq_length)
        # If the sequence ends early, it may be different than stride on last loop
        target_length = end_loc - prev_end_loc 
        input_ids = inputs_dict.input_ids[:, begin_loc:end_loc].to(device)
        attn_mask = inputs_dict.attention_mask[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Set repeated targets target to -100 
        target_ids[:, :-target_length] = -100
        print(input_ids)
        print(attn_mask)
        print(target_ids)
        print()
        outputs = model(**{'input_ids': input_ids, 
                         'attention_mask': attn_mask, 
                         'labels': target_ids})

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

        
get_output(['the boy is outside.', 'i hate it'])

{'input_ids': tensor([[ 1169,  2933,   318,  2354,    13],
        [   72,  5465,   340, 50256, 50256]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0]], device='mps:0')}
--------------------------------------------------------------------------------
tensor([[ 1169,  2933,   318,  2354],
        [   72,  5465,   340, 50256]], device='mps:0')
tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]], device='mps:0')
tensor([[ 1169,  2933,   318,  2354],
        [   72,  5465,   340, 50256]], device='mps:0')

tensor([[  318,  2354,    13],
        [  340, 50256, 50256]], device='mps:0')
tensor([[1, 1, 1],
        [1, 0, 0]], device='mps:0')
tensor([[ -100,  -100,    13],
        [ -100,  -100, 50256]], device='mps:0')



In [31]:
import torch
from tqdm import tqdm

max_length = model.config.n_positions
#max_length = 5
stride = 512
seq_len = encodings.input_ids.size(1)
nlls = []
logits = None
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    print(input_ids.shape)
    print(target_ids)
    print()


    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break
print(nlls)
ppl = torch.exp(torch.stack(nlls).mean())

  0%|                                                                                                                                           | 0/2 [00:00<?, ?it/s]

torch.Size([1, 1024])
tensor([[  628,   796,  5199,  ...,   290, 43823, 24265]], device='mps:0')

[tensor(2.8187, device='mps:0')]





In [22]:
ppl

tensor(351.5309, device='mps:0')

In [48]:
model(**encodings, labels=encodings['input_ids']).loss

tensor(5.9670, device='mps:0', grad_fn=<NllLossBackward0>)

In [63]:
import torch
from tqdm import tqdm

max_length = 4
stride = 4
seq_len = encodings.input_ids.size(1)

print(encodings)
print('-'*80)

nlls = []
prev_end_loc = 0
for begin_loc in range(0, seq_len, stride):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    print(input_ids)
    print(target_ids)

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())
print(ppl)

{'input_ids': tensor([[ 1169,  2933,   318,  2354,    13],
        [   40,  5465,   340, 50256, 50256]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0]], device='mps:0')}
--------------------------------------------------------------------------------
tensor([[ 1169,  2933,   318,  2354],
        [   40,  5465,   340, 50256]], device='mps:0')
tensor([[ 1169,  2933,   318,  2354],
        [   40,  5465,   340, 50256]], device='mps:0')
tensor([[   13],
        [50256]], device='mps:0')
tensor([[   13],
        [50256]], device='mps:0')


RuntimeError: [srcBuf length] > 0 INTERNAL ASSERT FAILED at "/Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/native/mps/OperationUtils.mm":387, please report a bug to PyTorch. Placeholder tensor is empty!

In [64]:
from datasets import load_dataset

test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

In [72]:
blah = encodings.copy()

In [92]:
encodings = blah.copy()
encodings['input_ids'] = encodings.input_ids[:, :20]

In [115]:
encodings = tokenizer(['the boy is outside and i hate that he is so much'], padding=True, return_tensors='pt')
encodings.input_ids.shape

torch.Size([1, 12])

In [117]:
import torch
from tqdm import tqdm

max_length = model.config.n_positions
stride = 512
max_length = 12
stride = 5
seq_len = encodingsssts.input_ids.size(1)

nlls = []
all_logits = None
prev_end_loc = 0
for begin_loc in range(0, seq_len, stride):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100
    print(begin_loc, end_loc, trg_len, input_ids, target_ids, 'subsetted', target_ids[:, target_ids[0,:] != -100])

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss
        logits = outputs.logits[:, target_ids[0,:] != -100, :]
        if all_logits is None:
            all_logits = logits
        else:
            print(all_logits.shape, logits.shape)
            all_logits = torch.cat((all_logits, logits), 1)


    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())
print(ppl)
print(all_logits.shape)

0 12 12 tensor([[1169, 2933,  318, 2354,  290, 1312, 5465,  326,  339,  318,  523,  881]],
       device='mps:0') tensor([[1169, 2933,  318, 2354,  290, 1312, 5465,  326,  339,  318,  523,  881]],
       device='mps:0') subsetted tensor([[1169, 2933,  318, 2354,  290, 1312, 5465,  326,  339,  318,  523,  881]],
       device='mps:0')
tensor(122.6580, device='mps:0')
torch.Size([1, 12, 50257])
