In [76]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# TODO: switch MODEL_NAME to 'openai-community/gpt2-large'.
MODEL_NAME = 'openai-community/gpt2'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Reuse EOS as the padding token so batched tokenization with padding=True works.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
# generate() reads its defaults from model.generation_config rather than model.config;
# without this line every generate() call still logs
# "Setting `pad_token_id` to `eos_token_id`" even though model.config was updated above.
model.generation_config.pad_token_id = tokenizer.eos_token_id

In [132]:
from torch.nn.utils.rnn import pad_sequence
# Twelve ragged 1-D sequences laid out as three groups of four; the sequences in a
# group share their first element (0, 1 or 2).
arr = [
    torch.tensor(seq)
    for seq in (
        [0, 0, 2], [0, 1, 2], [0, 2, 5, 6, 7], [0, 3, 5, 6, 7],
        [1, 0], [1, 1], [1, 2, 2], [1, 3, 2],
        [2, 0, 5, 6, 7], [2, 1, 5, 6, 7], [2, 2], [2, 3],
    )
]

# Right-pad every sequence with -1 up to the longest length, giving a (12, 5) matrix.
result = pad_sequence(arr, padding_value=-1, batch_first=True)
print(result)

# Regroup the 12 rows into 3 groups of 4; the last dimension is inferred.
result = result.view(3, 4, -1)
result

tensor([[ 0,  0,  2, -1, -1],
        [ 0,  1,  2, -1, -1],
        [ 0,  2,  5,  6,  7],
        [ 0,  3,  5,  6,  7],
        [ 1,  0, -1, -1, -1],
        [ 1,  1, -1, -1, -1],
        [ 1,  2,  2, -1, -1],
        [ 1,  3,  2, -1, -1],
        [ 2,  0,  5,  6,  7],
        [ 2,  1,  5,  6,  7],
        [ 2,  2, -1, -1, -1],
        [ 2,  3, -1, -1, -1]])


tensor([[[ 0,  0,  2, -1, -1],
         [ 0,  1,  2, -1, -1],
         [ 0,  2,  5,  6,  7],
         [ 0,  3,  5,  6,  7]],

        [[ 1,  0, -1, -1, -1],
         [ 1,  1, -1, -1, -1],
         [ 1,  2,  2, -1, -1],
         [ 1,  3,  2, -1, -1]],

        [[ 2,  0,  5,  6,  7],
         [ 2,  1,  5,  6,  7],
         [ 2,  2, -1, -1, -1],
         [ 2,  3, -1, -1, -1]]])

In [135]:
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Transformers are a powerful tool for natural language processing.",
]

# Tokenize both sentences as one padded batch; the shorter sentence receives a pad
# token whose attention-mask entry is 0 (visible in the printed mask).
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
print(f"Input IDs:\n {inputs['input_ids']}")
print(f"Attention Mask:\n {inputs['attention_mask']}")

# Request the hidden states and keep the last entry; its printed shape is
# (batch, seq_len, hidden_size).
outputs = model(output_hidden_states=True, **inputs)
last_hidden_state = outputs.hidden_states[-1]
print(last_hidden_state.shape)

Input IDs:
 tensor([[  464,  2068,  7586, 21831, 18045,   625,   262, 16931,  3290,    13,
         50256],
        [41762,   364,   389,   257,  3665,  2891,   329,  3288,  3303,  7587,
            13]])
Attention Mask:
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
torch.Size([2, 11, 768])


In [137]:
# `*` between two same-shaped tensors multiplies element by element (it is not a dot
# product): the result is the vector of squares.
torch.tensor([1, 2, 3, 4, 5]) * torch.tensor([1, 2, 3, 4, 5])

tensor([ 1,  4,  9, 16, 25])

In [73]:
# Tokenize a single prompt as a batch of one and show the raw encoding.
encoded = tokenizer(['who are you'], return_tensors="pt")
print(encoded)

# Generate a continuation with default settings. Note that `encoded` is rebound here:
# it goes from the tokenizer output to the tensor of generated token ids.
encoded = model.generate(
    input_ids=encoded['input_ids'],
    attention_mask=encoded['attention_mask'],
)
tokenizer.decode(encoded[0], skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'input_ids': tensor([[8727,  389,  345]]), 'attention_mask': tensor([[1, 1, 1]])}


'who are you?"\n\n"I\'m not sure. I\'m not sure if I\'m going to be able'

In [95]:
texts = [
    "hello folds!",
    "I am eryaw",
    "who am I",
]

inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Experiment: glue the padded rows end to end into ONE sequence of shape
# (1, rows * seq_len). The pad tokens therefore stay in the middle of the sequence,
# with their attention-mask entries still 0.
input_ids = torch.cat(inputs['input_ids'].unbind(0), dim=0).unsqueeze(0)
attention_mask = torch.cat(inputs['attention_mask'].unbind(0), dim=0).unsqueeze(0)

print(input_ids)

output_encoded = model.generate(attention_mask=attention_mask, input_ids=input_ids)

# Special tokens are kept in the decoded text so the embedded padding stays visible.
tokenizer.decode(output_encoded[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[31373, 38744,     0, 50256, 50256,    40,   716,   220,  1924,   707,
          8727,   716,   314, 50256, 50256]])


'hello folds!<|endoftext|><|endoftext|>I am eryawwho am I<|endoftext|><|endoftext|>\n\nI am eryawwho am I\n\nI am eryawwho am I'

In [103]:
def _combine_query_sample(tokenizer_pad, query: torch.Tensor, sample: torch.Tensor) -> torch.Tensor:
    """
        Args:
            query: torch.tensor, shape (batch_size, seq_len)
            sample: torch.tensor, shape (batch_size, seq_len)
        Returns:
            combined: torch.tensor, shape (batch_size, seq_len)
        
        Combine query and samples prompt, remove pad between query and samples
        Query format is [Q1, Q2, ..., PAD, PAD]
        Sample format is [S1, S2, ..., PAD, PAD]
        Output format is [Q1, Q2, ..., S1, S2, ..., PAD, PAD]
    """
    assert query.dim() == 1 and sample.dim() == 1, 'Query and Sample must be 1D tensor'
    combined_query_sample = torch.full((query.size(0) + sample.size(0),), tokenizer_pad)
    # remove pad tokens on query
    new_query = query[query != tokenizer_pad] 
    combined_query_sample[:new_query.size(0)] = new_query
    
    combined_query_sample[new_query.size(0):new_query.size(0)+sample.size(0)] = sample
    return combined_query_sample
    
# Smoke test with 0 as the pad id: the query's padding should end up behind the sample.
x = torch.tensor([1, 2, 3, 0, 0])
y = torch.tensor([4, 5, 0, 0, 0])

_combine_query_sample(tokenizer_pad=0, query=x, sample=y)

tensor([1, 2, 3, 4, 5, 0, 0, 0, 0, 0])

In [100]:
# Probe how trailing whitespace is tokenized: the printed encoding shows one extra
# token (id 220) per trailing space, each with attention mask 1.
text = ['hello   ']
input_tokens = tokenizer(text, return_tensors='pt')
print(input_tokens)
# Unfinished idea, left disabled (presumably: mask those tokens before calling the model).
# torch.masked_fill(input_tokens)
# model(**input_tokens, attention_mask=)

{'input_ids': tensor([[31373,   220,   220,   220]]), 'attention_mask': tensor([[1, 1, 1, 1]])}


In [102]:
# Demonstrates that the model rejects a 1-D input_ids tensor; the tokenizer outputs used
# elsewhere in this notebook are 2-D (batch, seq_len). The error is caught and printed
# so the failure is shown without aborting a Restart & Run All.
tsr = torch.tensor([31373])
try:
    model(input_ids=tsr)
except IndexError as err:
    print(f"IndexError: {err}")

IndexError: too many indices for tensor of dimension 1

# Dataset

In [45]:
import sys
import os

# Make the parent directory importable (presumably for the `lm_human_preferences`
# package imported below). The guard keeps re-running this cell from appending
# duplicate entries.
if '../' not in sys.path:
    sys.path.append('../')

In [None]:
from lm_human_preferences import train_reward

REWARD_JSON = '../data/descriptiveness_offline_5k.json'


def _load_reward_subset(num_rows):
    """Load the reward labels file as a dataset and keep its first `num_rows` rows."""
    reward_data = train_reward.RewardData.from_openai(REWARD_JSON)
    return reward_data.to_dataset().select(range(num_rows))


# Two separately loaded copies of the same 10-row subset.
reward_data1 = _load_reward_subset(10)
reward_data2 = _load_reward_subset(10)

In [29]:
import torch

def map_reward_to_tensor(batch):
    """Convert every column of `batch` into a float32 tensor.

    Args:
        batch: mapping from column name to a (nested) list of numbers.

    Returns:
        dict with the same keys, each value a ``torch.float32`` tensor.

    Raises:
        KeyError: if the batch has no ``'query'`` column (the sanity check reads it).

    NOTE(review): integer columns such as token ids also become float32 here; confirm
    that downstream code casts them back before using them as indices.
    """
    tensor_batch = {}
    for column in batch.keys():
        tensor_batch[column] = torch.tensor(batch[column], dtype=torch.float32)

    # Sanity check that the conversion produced a tensor for the query column.
    assert isinstance(tensor_batch['query'], torch.Tensor)
    return tensor_batch

# set_transform is a datasets.Dataset method, so confirm the object's type first.
print(f"Type of reward_data1: {type(reward_data1)}")

# Register map_reward_to_tensor as the dataset's transform. The dataset repr printed
# below is unchanged; the conversion shows up when rows or columns are read.
reward_data1.set_transform(map_reward_to_tensor)
print(f"Mapped reward data: {reward_data1}")

# Read the query column through the transform and show its first row.
print(f"First query tensor: {reward_data1['query'][0]}")

Type of reward_data1: <class 'datasets.arrow_dataset.Dataset'>
Mapped reward data: Dataset({
    features: ['query', 'sample0', 'sample1', 'sample2', 'sample3', 'best'],
    num_rows: 6260
})
First query tensor: tensor([6.5420e+03, 3.1140e+03, 8.6600e+02, 1.1000e+01, 2.9000e+02, 2.4970e+03,
        6.0600e+02, 3.3930e+03, 1.3000e+01, 6.7900e+02, 6.3500e+02, 2.4970e+03,
        4.8400e+02, 5.4700e+02, 2.0450e+03, 3.2640e+03, 7.3600e+02, 3.7900e+02,
        6.8300e+02, 1.3000e+01, 1.3180e+03, 3.7300e+02, 2.5700e+02, 5.8200e+02,
        3.5100e+02, 2.5700e+02, 1.3360e+03, 2.1213e+04, 1.1000e+01, 1.1940e+03,
        5.8200e+02, 2.1804e+04, 3.1900e+02, 2.5700e+02, 3.3009e+04, 1.1000e+01,
        2.9000e+02, 2.5700e+02, 6.2830e+03, 1.2000e+01, 1.1534e+04, 4.2700e+02,
        3.6300e+02, 1.3600e+03, 2.3300e+03, 1.2000e+01, 3.9200e+02, 1.2000e+01,
        3.3282e+04, 3.2900e+03, 3.2600e+02, 3.7300e+02, 4.9880e+03, 1.3110e+03,
        5.0600e+02, 5.1600e+02, 2.8700e+02, 2.5460e+03, 1.3000e+01, 