In [1]:
# Reference: https://huggingface.co/docs/transformers/v4.17.0/en/tasks/language_modeling
import torch
import time
import math
import datasets
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

eli5 = datasets.load_dataset("/home/mcwave/data/textbooks/eqs_withcoords", split="train[:100]")
eli5 = eli5.train_test_split(test_size=0.1)

eli5 = eli5.flatten()
eli5

Using pad_token, but it is not set yet.


DatasetDict({
    train: Dataset({
        features: ['a'],
        num_rows: 90
    })
    test: Dataset({
        features: ['a'],
        num_rows: 10
    })
})

In [2]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["a"]], padding=True, truncation=True)

tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=eli5["train"].column_names,
)
# for i in tokenized_eli5["train"]["input_ids"]:
#     print(len(i))

block_size = 257
# print("_____")
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    } 
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = tokenized_eli5.map(group_texts, batched=True, batch_size=1, num_proc=1)

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [3]:
# def add_padding(val):
#     pad_tokens = [1] * (256 - len(val["input_ids"]))
#     val["input_ids"] = pad_tokens + [2] + val["input_ids"]
#     val["attention_mask"] = pad_tokens + [1] + val["attention_mask"]
#     return val
# lm_dataset["train"] = lm_dataset["train"].map(add_padding)
# lm_dataset["test"] = lm_dataset["test"].map(add_padding)
# for i in lm_dataset["test"]:
#     if len(i["input_ids"]) != 257:
#         print("ids", len(i["input_ids"]))
#     if len(i["attention_mask"]) != 257:
#         print("mask", len(i["attention_mask"]))

In [4]:
# import numpy as np

# SEQ_LEN = len(lm_dataset["test"]["input_ids"][0])
# print(SEQ_LEN)
# RANGE_START = -1
# RANGE_END = 1
# EMBED_SIZE = 768
# LOWER_BOUND = -10
# UPPER_BOUND = 10

# # Input:
# #    formula: A formula containing "x", which will be replaced to numbers between range_start and range_end
# # Output:
# #    a sequence of embeddings, each has embed_size dimensions, and each dimension is between LOWER_BOUND and UPPER_BOUND
# def generate_seq_embed(func,
#                        seq_len = SEQ_LEN,
#                        range_start = RANGE_START,
#                        range_end = RANGE_END,
#                        embed_size = EMBED_SIZE):
#     start = time.time()
#     seq = torch.zeros(seq_len, embed_size)
#     step = (range_end - range_start) / (seq_len*embed_size - 1)
#     for i in range(seq_len*embed_size):
#         x = range_start + i * step
#         #print(formula.replace('x', str(x)))
#         #y = eval(formula.replace('x', str(x)))
#         y = func(x)
#         y = max(LOWER_BOUND, min(UPPER_BOUND,y))
#         seq[i // embed_size][i % embed_size] = y
#     end = time.time()
#     return seq

# def find_var(eq):
#     replace = ["math", "log", "exp", "sin", "cos", "tan", "sec", "arc", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "(", ")"]
#     for i in replace:
#         eq = eq.replace(i, "")
#     eq = eq.replace("+", "`").replace("-", "`").replace("*", "`").replace("/", "`").split("`")
#     for i in eq:
#         if len(i) > 0:
#             return i
#     return ""

# train_embeds = []
# start_time = time.time()
# i = 0
# for inputs in lm_dataset["train"]["input_ids"]:
#     try:
#         splitted = tokenizer.decode(inputs).replace(" ", "").replace("[PAD]", "")
#         splitted = splitted.replace(find_var(splitted), "x")
#         eqs_embeds = generate_seq_embed(eval("lambda x: " + splitted))
#         padded_ids = np.array(torch.tensor([eqs_embeds]))
#         train_embeds.append(padded_ids)
#     except:
#         train_embeds.append(np.array(torch.ones([1, 257, 768])))
#     i += 1
#     if i % 100 == 0:
#         start_time = time.time()
# np.save("/home/mcwave/data/textbooks/eqs_embeds_small", np.asarray(train_embeds))
# test_embeds = []
# start_time = time.time()
# i = 0
# for inputs in lm_dataset["test"]["input_ids"]:
#     try:
#         splitted = tokenizer.decode(inputs).replace(" ", "").replace("[PAD]", "")
#         splitted = splitted.replace(find_var(splitted), "x")
#         eqs_embeds = generate_seq_embed(eval("lambda x: " + splitted))
#         padded_ids = np.array(torch.tensor([eqs_embeds]))
#         test_embeds.append(padded_ids)
#     except:
#         test_embeds.append(np.array(torch.ones([1, 257, 768])))
#     i += 1
#     if i % 100 == 0:
#         start_time = time.time()
# np.save("/home/mcwave/data/textbooks/eqs_embeds_small_test", np.asarray(test_embeds))
# data = np.load("/home/mcwave/data/textbooks/eqs_embeds_full.npy")
# data_list = []
# from datasets import Dataset
# # len(lm_dataset["train"]["input_ids"])
# for i in range(len(lm_dataset["train"])):
#     # try:
#     flattened = torch.flatten(torch.Tensor(data[i]))
#     if flattened.shape[0] != 197376:
#         print("good2")
#         data_list.append(torch.Tensor.tolist(torch.ones([197376])))
#     else:
#         data_list.append(torch.Tensor.tolist(flattened))
#         continue
#     # except:
#     #     print("bad")
#     #     data_list.append(torch.Tensor.tolist(torch.ones([98304])))

# data_test = np.load("/home/mcwave/data/textbooks/eqs_embeds_full_test.npy")
# data_list_test = []
# for i in range(len(lm_dataset["test"])):
#     try:
#         # flattened = torch.flatten(torch.Tensor(data_test[i]))
#         flattened = []
#         # print(torch.Tensor(data[i]).shape)
#         # print(flattened.shape)
#         if flattened.shape[0] != 197376:
#             data_list_test.append(torch.Tensor.tolist(torch.ones([197376])))
#         else:
#             data_list_test.append(torch.Tensor.tolist(flattened))
#             continue
#     except:
#         data_list_test.append(torch.Tensor.tolist(torch.ones([98304])))
# print("Loaded data")

# lm_dataset['train'] = lm_dataset['train'].add_column("inputs_embeds", data_list)
# lm_dataset['test'] = lm_dataset['test'].add_column("inputs_embeds", data_list_test)
# lm_dataset["train"].save_to_disk("train_dataset_full.hf")
# lm_dataset["test"].save_to_disk("test_dataset_full.hf")

In [5]:
# lm_dataset["test"] = datasets.load_from_disk("test_dataset_2.hf")
lm_dataset["test"] = datasets.load_from_disk("train_dataset_small.hf")

In [6]:
len(lm_dataset["test"])

90

In [7]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [8]:
import os
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    CausalLMOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from transformers.models.gpt_neo.configuration_gpt_neo import GPTNeoConfig
from transformers.models.gpt_neo.modeling_gpt_neo import *

device = 'cuda' if torch.cuda.is_available() else 'cpu'

class MyGPTNeoForCausalLM(GPTNeoPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPTNeoModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        token_type_ids = kwargs.get("token_type_ids", None)
        # only last token for inputs_ids if past is defined in kwargs
        if past_key_values:
            input_ids = input_ids[:, -1].unsqueeze(-1)
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "position_ids": position_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
            }
        )

        return model_inputs

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        print(inputs_embeds.shape)
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if len(inputs_embeds.shape) != 3:
            batch_size = inputs_embeds.shape[0]
            inputs_embeds = torch.reshape(inputs_embeds, (batch_size, 257, 768))
            # print("inputs_embeds", inputs_embeds.shape)
            self.transformer.wte = nn.Embedding(50258, 768)
            eq_embeds = self.transformer.wte(input_ids.to("cuda"))[:, :129]
            # print("ids", input_ids.shape)
            # print(eq_embeds)
            # print(eq_embeds.shape)
            # print(inputs_embeds[:, :128].shape)
            # print("eq_embeds", eq_embeds.shape)
            inputs_embeds = torch.cat((inputs_embeds[:, :128].to("cuda"), eq_embeds), 1)
            # print(inputs_embeds.shape)
        transformer_outputs = self.transformer(
            None, #input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds.to("cuda"),
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # print("outputs", transformer_outputs)
        hidden_states = transformer_outputs[0]
        # print("hidden", hidden_states)
        lm_logits = self.lm_head(hidden_states)
        # print("logits", lm_logits)
        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(lm_logits.device)
            # Compute loss in fp32 to match with mesh-tf version
            # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
            lm_logits = lm_logits.to(torch.float32)

            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

            lm_logits = lm_logits.to(hidden_states.dtype)
            loss = loss.to(hidden_states.dtype)
        # print("loss", loss)
        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output
        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
        [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )

model = MyGPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125m")
model.resize_token_embeddings(len(tokenizer))

[2023-12-09 20:21:33,674] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 50258. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(50258, 768)

In [9]:
# from transformers import Trainer, TrainingArguments
# from datasets import load_dataset

# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     weight_decay=0.01,
#     per_device_train_batch_size=10,
#     per_device_eval_batch_size=10, 
#     num_train_epochs = 100, 
#     save_steps = 100
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=lm_dataset["test"],
#     eval_dataset=lm_dataset["test"],
#     data_collator=data_collator,
# )

# trainer.train()

In [10]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Pipeline,
    PreTrainedModel,
    PreTrainedTokenizer,
)
import torch
from training.generate import generate_response

model_path = "/home/mcwave/code/word_problem_magnifier/results/checkpoint-200"
# tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left")
model = MyGPTNeoForCausalLM.from_pretrained(
    model_path
)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m")

Some weights of the model checkpoint at /home/mcwave/code/word_problem_magnifier/results/checkpoint-200 were not used when initializing MyGPTNeoForCausalLM: ['transformer.h.7.attn.attention.q_proj.weight', 'transformer.h.6.attn.attention.q_proj.weight', 'transformer.h.9.attn.attention.k_proj.weight', 'transformer.h.11.attn.attention.q_proj.weight', 'transformer.h.5.mlp.c_fc.weight', 'transformer.h.0.attn.attention.v_proj.weight', 'transformer.h.5.mlp.c_proj.weight', 'transformer.h.1.attn.attention.out_proj.weight', 'transformer.h.0.mlp.c_fc.weight', 'transformer.h.10.attn.attention.q_proj.weight', 'transformer.h.3.attn.attention.q_proj.weight', 'transformer.h.11.attn.attention.k_proj.weight', 'transformer.h.8.attn.attention.q_proj.weight', 'transformer.h.4.mlp.c_fc.weight', 'transformer.h.2.attn.attention.v_proj.weight', 'transformer.h.5.attn.attention.out_proj.weight', 'transformer.h.3.mlp.c_fc.weight', 'transformer.h.3.attn.attention.v_proj.weight', 'transformer.h.6.mlp.c_proj.weight

In [11]:
embeds = torch.ones([1, 257, 768]).to("cuda")
print(model.device)
print(embeds.device)
gen_tokens = model.generate(
    inputs_embeds = embeds,
    do_sample=True,
    temperature=0.9,
    max_length=100,
)
gen_text = tokenizer.batch_decode(gen_tokens)[0]#functional 2518, linear 96

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


cuda:0
cuda:0
torch.Size([1, 257, 768])
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0


AttributeError: 'NoneType' object has no attribute 'shape'