In [1]:
from transformers import AutoTokenizer

In [46]:
# model_name = 'meta-llama/Llama-2-7b'
model_name = 'meta-llama/Llama-3.1-8B'
# model_name = 'meta-llama/Meta-Llama-3-8B'
# model_name = 'gpt2'

In [47]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [48]:
tokenizer.special_tokens_map

{'bos_token': '<|begin_of_text|>', 'eos_token': '<|end_of_text|>'}

In [49]:
tokenizer.padding_side

'right'

### `tokenizer.pad_token = tokenizer.eos_token`

- training
    - sent1: "Hello, how are you?"
        - `["Hello,", " how", " are", " you?", <eos>]`
    - sent2: "I am fine."
        - `["I", " am", "fine.", <eos>]`
- padding for batch processing
    - sent1: `["Hello,", " how", " are", " you?", <eos>]`
    - sent2: `["I", " am", "fine.", <eos>, <pad>]`
- pad_token = eos_token
    - sent1: `["Hello,", " how", " are", " you?", <eos>]`
    - sent2: `["I", " am", "fine.", <eos>, <eos>]`

In [50]:
sent1 = "Hello, how are you?"
sent2 = "I am fine."

In [51]:
tokenizer.encode(sent1)

[128000, 9906, 11, 1268, 527, 499, 30]

In [53]:
tokenizer.tokenize(sent1)

['Hello', ',', 'Ġhow', 'Ġare', 'Ġyou', '?']

In [52]:
tokenizer.decode(128000), tokenizer.decode(30)

('<|begin_of_text|>', '?')

### gpt2 generate

- The issue being GPT2 model adds position embeddings to every token in the input sequence including pad_tokens.

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained('gpt2')

# run this only for gpt-2 as we do not have a pad token in gpt2
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'

model = AutoModelForCausalLM.from_pretrained('gpt2', pad_token_id = tokenizer.eos_token_id)
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [17]:
tokenizer.eos_token, tokenizer.eos_token_id

('<|endoftext|>', 50256)

In [2]:
sentence = "I went to the"

results = tokenizer(
	[sentence],
	add_special_tokens=True,
	truncation=True,
	padding=True,
	return_tensors='pt',
)

In [3]:
results

{'input_ids': tensor([[  40, 1816,  284,  262]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [4]:
print("========= With No Padding ==========")

print("Tokenizing the input sentence \"{0}\" leads to ".format(sentence) )
print(tokenizer.convert_ids_to_tokens( results['input_ids'][0] ))


Tokenizing the input sentence "I went to the" leads to 
['I', 'Ġwent', 'Ġto', 'Ġthe']


In [5]:
with torch.no_grad():
	logits = model(results['input_ids'].to(device), 
					attention_mask=results['attention_mask'].to(device),
					).logits[:, -1, :]
	index = torch.argmax(logits).item()
	print( sentence + " " +  tokenizer.convert_ids_to_tokens(index) )

I went to the Ġhospital


In [6]:
max_length= 30
print("========= Using Padding of size {0} ==========".format(max_length))

results = tokenizer(
    [sentence],
    add_special_tokens=True,
    max_length=max_length,
    truncation=False,
    padding='max_length',
    return_tensors='pt',
)




In [7]:
print("Tokenizing the padded input sentence \"{0}\" leads to ".format(sentence) )
print(tokenizer.convert_ids_to_tokens( results['input_ids'][0] ))


Tokenizing the padded input sentence "I went to the" leads to 
['<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', 'I', 'Ġwent', 'Ġto', 'Ġthe']


In [8]:
with torch.no_grad():
    logits = model(results['input_ids'].to(device), 
                    attention_mask=results['attention_mask'].to(device),
                    ).logits[:, -1, :]
    index = torch.argmax(logits).item()
    print( sentence + " " +  tokenizer.convert_ids_to_tokens(index) )

print("\n" * 2)


I went to the Ġthe





In [9]:
sentence = "I went to the"

results = tokenizer(
	[sentence],
	add_special_tokens=True,
	truncation=True,
	padding=True,
	return_tensors='pt',
)

position_ids = torch.zeros(results['attention_mask'].size(), dtype=torch.int32)
starting_index = 0
for index in range(results['attention_mask'][0].size(0)):
    if results['attention_mask'][0][index] == 1:
        position_ids[0][index] = starting_index
        starting_index += 1

print("========= With No Padding ==========")

print("Tokenizing the input sentence \"{0}\" leads to ".format(sentence) )
print(tokenizer.convert_ids_to_tokens( results['input_ids'][0] ))

Tokenizing the input sentence "I went to the" leads to 
['I', 'Ġwent', 'Ġto', 'Ġthe']


In [10]:
with torch.no_grad():
	logits = model(results['input_ids'].to(device), 
					attention_mask=results['attention_mask'].to(device),
                    position_ids=position_ids.to(device),
					).logits[:, -1, :]
	index = torch.argmax(logits).item()
	print( sentence + " " +  tokenizer.convert_ids_to_tokens(index) )

print("\n" * 2)


I went to the Ġhospital





In [18]:
max_length= 30
print("========= Using Padding of size {0} ==========".format(max_length))

results = tokenizer(
    [sentence],
    add_special_tokens=True,
    max_length=max_length,
    truncation=False,
    padding='max_length',
    return_tensors='pt',
)

print("Tokenizing the padded input sentence \"{0}\" leads to ".format(sentence) )
print(tokenizer.convert_ids_to_tokens( results['input_ids'][0] ))

Tokenizing the padded input sentence "I went to the" leads to 
['<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', 'I', 'Ġwent', 'Ġto', 'Ġthe']


In [19]:
results['input_ids'][0]

tensor([50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256,    40,  1816,   284,   262])

In [12]:
position_ids = torch.zeros(results['attention_mask'].size(), dtype=torch.int32)
starting_index = 0
for index in range(results['attention_mask'][0].size(0)):
    if results['attention_mask'][0][index] == 1:
        position_ids[0][index] = starting_index
        starting_index += 1

with torch.no_grad():
    logits = model(results['input_ids'].to(device), 
                    attention_mask=results['attention_mask'].to(device),
                    position_ids=position_ids.to(device),
                    ).logits[:, -1, :]
    index = torch.argmax(logits).item()
    print( sentence + " " +  tokenizer.convert_ids_to_tokens(index) )

print("\n" * 2)

I went to the Ġhospital





In [15]:
results['attention_mask'], position_ids

(tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 1, 1, 1, 1]]),
 tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 1, 2, 3]], dtype=torch.int32))

### batch generate

In [25]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'

model = AutoModelForCausalLM.from_pretrained('gpt2', pad_token_id = tokenizer.eos_token_id)
prompt_text = ['I went to the',
               'we are trying to',
               'The purpose of this workshop is to check whether we can']
encodings_dict = tokenizer.batch_encode_plus(prompt_text, max_length=10, pad_to_max_length=True, return_tensors= "pt")
encodings_dict

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': tensor([[50256, 50256, 50256, 50256, 50256, 50256,    40,  1816,   284,   262],
        [50256, 50256, 50256, 50256, 50256, 50256,   732,   389,  2111,   284],
        [  464,  4007,   286,   428, 20243,   318,   284,  2198,  1771,   356]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [26]:
input_ids = torch.tensor(encodings_dict['input_ids'])
attn_mask = torch.tensor(encodings_dict['attention_mask'])
tokenizer.batch_decode(model.generate(input_ids, attention_mask=attn_mask, max_length=15))

  input_ids = torch.tensor(encodings_dict['input_ids'])
  attn_mask = torch.tensor(encodings_dict['attention_mask'])
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>I went to the hospital and was told that',
 '<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>we are trying to get the best out of',
 'The purpose of this workshop is to check whether we can make a difference in']