In [1]:
import time
import torch
import math

def truncate(number, decimals=0):
    """
    Returns a value truncated to a specific number of decimal places.
    """
    if not isinstance(decimals, int):
        raise TypeError("decimal places must be an integer.")
    elif decimals < 0:
        raise ValueError("decimal places has to be 0 or more.")
    elif decimals == 0:
        return math.trunc(number)

    factor = 10.0 ** decimals
    return math.trunc(number * factor) / factor

def generate_seq_embed(func,                       
                       seq_len = 128,
                       range_start = -2,
                       range_end = 2,
                       lower_bound = -2000,
                       upper_bound = 2000, 
                       embed_size = 768):
    # start = time.time()
    seq = torch.zeros(seq_len, embed_size)
    step = (range_end - range_start) / (seq_len*embed_size - 1)
    for i in range(seq_len*embed_size):
        x = range_start + i * step
        #print(formula.replace('x', str(x)))
        #y = eval(formula.replace('x', str(x)))
        y = func(x)
        y = max(lower_bound, min(upper_bound,y))
        trunc_digits = max(0, 2 - math.floor(math.log(abs(y), 10)))
        seq[i // embed_size][i % embed_size] = truncate(y, trunc_digits)
    # end = time.time()
    # print(end - start, "seconds")
    return seq
generate_seq_embed(eval("lambda x: x**2"))

tensor([[4.0000, 3.9900, 3.9900,  ..., 3.8700, 3.8700, 3.8700],
        [3.8700, 3.8700, 3.8700,  ..., 3.7500, 3.7500, 3.7500],
        [3.7500, 3.7500, 3.7500,  ..., 3.6300, 3.6300, 3.6300],
        ...,
        [3.6300, 3.6300, 3.6300,  ..., 3.7500, 3.7500, 3.7500],
        [3.7500, 3.7500, 3.7500,  ..., 3.8700, 3.8700, 3.8700],
        [3.8700, 3.8700, 3.8700,  ..., 3.9900, 3.9900, 4.0000]])

In [2]:
# Reference: https://huggingface.co/docs/transformers/v4.17.0/en/tasks/language_modeling

import datasets

eli5 = datasets.load_dataset("/home/mcwave/data/textbooks/eqs_withcoords_test")
# eli5 = eli5.train_test_split(test_size=0.2)

#
eli5 = eli5.flatten()
eli5["train"] = datasets.load_dataset("/home/mcwave/data/textbooks/eqs_withcoords")["train"]
eli5["test"] = datasets.load_dataset("/home/mcwave/data/textbooks/eqs_withcoords_test")["test"]
eli5

DatasetDict({
    test: Dataset({
        features: ['a'],
        num_rows: 118
    })
    train: Dataset({
        features: ['a'],
        num_rows: 1220
    })
})

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m")
tokenizer.add_special_tokens({'pad_token': '[PAD]', "bos_token": "[BOS]"})
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.add_special_tokens({'bos_token': '[BOS]'})

def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["a"]], padding=True, truncation=True)

tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

block_size = 128

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
def add_labels(examples):
    # print(len(examples["input_ids"]))
    examples["input_ids"] += [50257] * (128 - len(examples["input_ids"]))
    examples["labels"] = examples["input_ids"].copy()
    # while len(examples["labels"]) < 128:
    #     examples["labels"].append(50257)
    return examples

lm_dataset = tokenized_eli5.map(add_labels, batched=False, num_proc=4)
# lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)
# lm_dataset["train"]["labels"][0]
# for i in range(len(lm_dataset["train"]["attention_mask"])):
#     for j in range(len(lm_dataset["train"]["attention_mask"][i])):
#         lm_dataset["train"]["attention_mask"][i][j] = 1

In [4]:
# import numpy as np
# import json

# def find_var(eq):
#     replace = ["math", "log", "exp", "sin", "cos", "tan", "sec", "arc", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "(", ")"]
#     for i in replace:
#         eq = eq.replace(i, "")
#     eq = eq.replace("+", "`").replace("-", "`").replace("*", "`").replace("/", "`").split("`")
#     for i in eq:
#         if len(i) > 0:
#             return i
#     return ""

# embeds = []
# start_time = time.time()
# i = 0
# for inputs in lm_dataset["train"]["input_ids"]:
#     try:
#         splitted = tokenizer.decode(inputs).replace(" ", "").replace("[PAD]", "")
#         splitted = splitted.replace(find_var(splitted), "x")
#         eqs_embeds = generate_seq_embed(eval("lambda x: " + splitted))
#         padded_ids = np.array([eqs_embeds])
#         embeds.append(padded_ids)
#     except:
#         embeds.append(np.ones([1, 128, 768]))
#     i += 1
#     if i % 100 == 0:
#         print(i, time.time()-start_time)
#         start_time = time.time()
# np.save("/home/mcwave/data/textbooks/eqs_embeds_2", np.asarray(embeds))
# embeds = []
# start_time = time.time()
# i = 0
# for inputs in lm_dataset["test"]["input_ids"]:
#     try:
#         splitted = tokenizer.decode(inputs).replace(" ", "").replace("[PAD]", "")
#         splitted = splitted.replace(find_var(splitted), "x")
#         eqs_embeds = generate_seq_embed(eval("lambda x: " + splitted))
#         padded_ids = np.array([eqs_embeds])
#         embeds.append(padded_ids)
#     except:
#         embeds.append(np.ones([1, 128, 768]))
#     i += 1
#     if i % 100 == 0:
#         print(i, time.time()-start_time)
#         start_time = time.time()
# np.save("/home/mcwave/data/textbooks/eqs_embeds_2_test", np.asarray(embeds))

In [5]:
# import numpy as np
# import torch

# data = np.load("/home/mcwave/data/textbooks/eqs_embeds.npy")
# data_list = []
# from datasets import Dataset
# # len(lm_dataset["train"]["input_ids"])
# for i in range(len(lm_dataset["train"])):
#     try:
#         # print("good")
#         flattened = torch.flatten(torch.Tensor(data[i]))
#         # print(torch.Tensor(data[i]).shape)
#         # print(flattened.shape)
#         if flattened.shape[0] != 98304:
#             # print("good2")
#             data_list.append(torch.Tensor.tolist(torch.ones([98304])))
#         else:
#             data_list.append(torch.Tensor.tolist(flattened))
#             continue
#     except:
#         print("bad")
#         data_list.append(torch.Tensor.tolist(torch.ones([98304])))

# data_test = np.load("/home/mcwave/data/textbooks/eqs_embeds_test.npy")
# data_list_test = []
# for i in range(len(lm_dataset["test"])):
#     try:
#         flattened = torch.flatten(torch.Tensor(data_test[i]))
#         # print(torch.Tensor(data[i]).shape)
#         # print(flattened.shape)
#         if flattened.shape[0] != 98304:
#             data_list_test.append(torch.Tensor.tolist(torch.ones([98304])))
#         else:
#             data_list_test.append(torch.Tensor.tolist(flattened))
#             continue
#     except:
#         data_list_test.append(torch.Tensor.tolist(torch.ones([98304])))
# print("Loaded data")
# # dataset_2 = Dataset.from_dict({"q": data_list})
# # dataset_2

In [6]:
# lm_dataset['train'] = lm_dataset['train'].add_column("inputs_embeds", data_list)
# lm_dataset['test'] = lm_dataset['test'].add_column("inputs_embeds", data_list_test)
# lm_dataset["train"].save_to_disk("train_dataset_2.hf")
# lm_dataset["test"].save_to_disk("test_dataset_2.hf")

In [None]:
import datasets
#pad 128 tokens in front and one BOS token

def add_padding(val):
    pad_and_bos_token = torch.cat((torch.full((1, 128-len(val["input_ids"])), 50257)[0], torch.tensor([50258])))
    
    val["input_ids"] = torch.Tensor.tolist(torch.cat((pad_and_bos_token, torch.Tensor(val["input_ids"]))))
    val["attention_mask"] = torch.Tensor.tolist(torch.cat((torch.full((1, 129), 0)[0], torch.Tensor(val["attention_mask"]))))
    return val

selected = []
for i in range(110):
    selected.append(i)
lm_dataset["train"] = datasets.load_from_disk("train_dataset_2.hf").map(add_padding)
lm_dataset["test"] = datasets.load_from_disk("test_dataset_2.hf").select(selected).map(add_padding)
for i in lm_dataset["train"]["input_ids"]:
    print(len(i))

In [8]:
from transformers import DataCollatorForLanguageModeling
import numpy as np

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [18]:
import os
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    CausalLMOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from transformers.models.gpt_neo.configuration_gpt_neo import GPTNeoConfig
from transformers.models.gpt_neo.modeling_gpt_neo import *

class MyGPTNeoForCausalLM(GPTNeoPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPTNeoModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        token_type_ids = kwargs.get("token_type_ids", None)
        # only last token for inputs_ids if past is defined in kwargs
        if past_key_values:
            input_ids = input_ids[:, -1].unsqueeze(-1)
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "position_ids": position_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
            }
        )

        return model_inputs

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # inputs_embeds = torch.ones([1,128,768]) #<-set inference inputs here
        # print("forward")
        # print(inputs_embeds)
        # print(input_ids)
        if self.training:
            if len(inputs_embeds.shape) != 3:
                # print(inputs_embeds.shape)
                inputs_embeds = torch.reshape(inputs_embeds, (1, 128, 768))
        eq_embeds = self.transformer.wte(input_ids)[:,128:]
        # print(eq_embeds.shape)
        inputs_embeds = torch.cat((inputs_embeds, eq_embeds), 1)
        print(attention_mask.shape)
        transformer_outputs = self.transformer(
            None,  
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(lm_logits.device)
            # Compute loss in fp32 to match with mesh-tf version
            # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
            lm_logits = lm_logits.to(torch.float32)

            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

            lm_logits = lm_logits.to(hidden_states.dtype)
            loss = loss.to(hidden_states.dtype)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
        [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )

model = MyGPTNeoForCausalLM.from_pretrained("/home/mcwave/code/word_problem_magnifier/results/2/checkpoint-1000")
model.resize_token_embeddings(len(tokenizer))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 50259. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(50259, 768)

In [21]:
from transformers import Trainer, TrainingArguments
from datasets import load_dataset # pass in tokenized string, decode and turn into embeds in forward
#debug lines: 1842, 2718

training_args = TrainingArguments(
    output_dir = "./results/2/3",
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    weight_decay = 0.01,
    num_train_epochs = 50, 
    per_device_train_batch_size = 1, 
    per_device_eval_batch_size = 1,
    logging_steps = 1000,
    eval_steps = 1000,
    save_steps = 1000
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = lm_dataset["train"],
    eval_dataset = lm_dataset["test"],
    data_collator = data_collator,
)

trainer.train()
trainer.save_model("/home/mcwave/code/word_problem_magnifier/results/model")

RuntimeError: The size of tensor a (129) must match the size of tensor b (213) at non-singleton dimension 3

In [None]:
# from transformers import (
#     AutoModelForCausalLM,
#     AutoTokenizer,
#     Pipeline,
#     PreTrainedModel,
#     PreTrainedTokenizer,
# )
# import torch
# from training.generate import generate_response

# model_path = "/home/mcwave/code/word_problem_magnifier/results/2/checkpoint-1000"
# # tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left")
# model = MyGPTNeoForCausalLM.from_pretrained(
#     model_path
# )
# tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m")

In [None]:
# gen_tokens = model.generate(
#     torch.Tensor([add_padding(tokenizer("x"))["input_ids"]]).type(torch.int64),
#     do_sample=True,
#     temperature=0.9,
#     max_length=100,
# )
# gen_text = tokenizer.batch_decode(gen_tokens)[0]#utils 2760, 2771

In [None]:
# prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
#          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
#          "researchers was the fact that the unicorns spoke perfect English."

# input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
# gen_text = tokenizer.batch_decode(gen_tokens)[0]
# gen_text