In [1]:
import gzip
import json
from typing import Callable, List, Tuple, Iterable, Dict, Type, Any
from functools import reduce, lru_cache
from collections import OrderedDict
import inspect

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams["figure.facecolor"] = "white"
from tqdm import tqdm

import torch as th
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.nn import Embedding
from torch.utils.data import DataLoader, random_split
# from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, PackedSequence
# from torchtext.vocab import vocab, Vocab, GloVe, build_vocab_from_iterator
# from torchtext.data.utils import get_tokenizer

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from torchmetrics import MeanSquaredError

# import optuna
# from optuna.visualization import plot_parallel_coordinate, plot_contour
# from optuna.importance import get_param_importances

import wandb

from transformers import (
    AutoTokenizer, 
    DataCollatorForLanguageModeling,
    AutoModelForCausalLM,
    AdamW,
    get_linear_schedule_with_warmup,
    # GPT2LMHeadModel,
    AutoConfig,
    pipeline
)
from datasets import load_dataset, DatasetDict, Dataset

# One-time: extract the Hugging Face model from the PyTorch Lightning checkpoint

In [2]:
class LitCausalLMModel(pl.LightningModule):
    """
    PyTorch Lightning wrapper around a Hugging Face causal language model.

    Loads the pre-trained weights for ``hf_model_name`` and optimizes them with
    AdamW plus a linear warmup / linear decay schedule that is stepped after
    every optimizer step.

    Args:
        hf_model_name: Name or path given to ``AutoModelForCausalLM.from_pretrained``.
        total_steps: Total number of training steps used by the LR schedule.
        lr: Learning rate passed to AdamW.
        weight_decay: Weight decay for every parameter whose name does not
            contain one of the no-decay markers (see ``configure_optimizers``).
        adam_epsilon: Epsilon passed to AdamW.
        warmup_steps: Number of warmup steps of the LR schedule.
    """

    def __init__(
        self,
        hf_model_name: str,
        total_steps: int,
        lr: float = 5e-5,
        weight_decay: float = 0.01,
        adam_epsilon: float = 1e-6,
        warmup_steps: int = 1000,
    ) -> None:
        super().__init__()

        # choose this if want blank slate
        # self.config = AutoConfig.from_pretrained(
        #     "gpt2",
        #     vocab_size=len(tokenizer),
        #     n_ctx=context_length,
        #     bos_token_id=tokenizer.bos_token_id,
        #     eos_token_id=tokenizer.eos_token_id,
        # )
        # self.hf_model = GPT2LMHeadModel(self.config)
        # self.hf_model(**self.hf_model.dummy_inputs)  # Builds the model

        # choose this if want pre-trained weights
        self.hf_model = AutoModelForCausalLM.from_pretrained(hf_model_name)

        # Tokenizer used only for the sample generation at the end of validation.
        # Loaded lazily so that merely constructing the module (e.g. to load a
        # checkpoint) does not require tokenizer files.
        self._tokenizer = None

        self.save_hyperparameters()

        # self.wandb_table = wandb.Table(columns=["step", "text"])
        # self.logger.log_table({"generated_text": self.wandb_table})

    def _get_tokenizer(self):
        """Return the tokenizer matching ``hf_model_name``, loading it on first use."""
        if self._tokenizer is None:
            self._tokenizer = AutoTokenizer.from_pretrained(
                self.hparams.hf_model_name
            )
        return self._tokenizer

    def _pipeline_device(self):
        """
        Translate the module's current device into a ``pipeline(device=...)`` value.

        The pipeline moves the model to the device it is given, so it must be the
        device the module already lives on: CUDA index for GPUs, -1 for CPU.
        """
        current = self.device
        if current.type == "cuda":
            return current.index if current.index is not None else 0
        if current.type == "cpu":
            return -1
        # Other backends: hand the torch.device through unchanged.
        return current

    def forward(self, **inputs):
        """Forward the keyword inputs to the wrapped Hugging Face model."""
        return self.hf_model(**inputs)

    def training_step(self, batch: Dict[str, th.Tensor], batch_idx: int):
        """Run one training batch and log its language-modeling loss."""
        loss = self(**batch).loss
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch: Dict[str, th.Tensor], batch_idx: int):
        """Run one validation batch and log its language-modeling loss."""
        loss = self(**batch).loss
        self.log("val_loss", loss)
        return loss

    def validation_epoch_end(self, outputs):
        """Print a short sample generated from a fixed prompt as a sanity check."""
        # visualize the output
        pipe = pipeline(
            "text-generation",
            model=self.hf_model,
            tokenizer=self._get_tokenizer(),
            device=self._pipeline_device(),
        )
        txt = "We develop a method to"
        gen_text = pipe(txt, num_return_sequences=1)[0]["generated_text"]
        # self.wandb_table.add_data(self.global_step, gen_text)
        # wandb.log({"generated_text": self.wandb_table})
        # self.logger.log_table({"generated_text": self.wandb_table})
        print(gen_text)

    def configure_optimizers(self):
        """
        Build AdamW with two parameter groups and a per-step linear schedule.

        Parameters whose name contains one of the ``no_decay`` markers get a
        weight decay of 0.0; all others use ``self.hparams.weight_decay``.
        """
        # NOTE(review): GPT-2 style models appear to name their LayerNorm modules
        # `ln_1` / `ln_2`, so the "LayerNorm.weight" marker may never match and
        # those weights would still be decayed — confirm before relying on it.
        no_decay = ("bias", "LayerNorm.weight")

        decay_params, no_decay_params = [], []
        for name, param in self.hf_model.named_parameters():
            if any(marker in name for marker in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        optimizer_grouped_parameters = [
            {"params": decay_params, "weight_decay": self.hparams.weight_decay},
            {"params": no_decay_params, "weight_decay": 0.0},
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.hparams.lr,
            eps=self.hparams.adam_epsilon,
        )

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.hparams.total_steps,
        )
        # "interval": "step" makes Lightning advance the schedule every optimizer step.
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]


In [207]:
# Instantiate the Lightning wrapper so the checkpoint weights have a module to
# be loaded into. `total_steps` and `lr` are only read by configure_optimizers(),
# so the values given here presumably do not matter for this export flow.
model = LitCausalLMModel("distilgpt2", total_steps=1, lr=1e-4)
# optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Generic checkpoint-restore recipe kept for reference (not executed):
# checkpoint = torch.load(PATH)
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# epoch = checkpoint['epoch']
# loss = checkpoint['loss']

# model.eval()
# # - or -
# model.train()

In [208]:
checkpoint = th.load("models/model.ckpt", map_location=th.device("cpu"))

In [209]:
model.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

In [210]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

In [211]:
pt_model = model

In [212]:
model = pt_model.hf_model

In [225]:
model.save_pretrained("./models/")

In [17]:
tokenizer.save_pretrained("./models/")

('./models/tokenizer_config.json',
 './models/special_tokens_map.json',
 './models/vocab.json',
 './models/merges.txt',
 './models/added_tokens.json',
 './models/tokenizer.json')

In [227]:
del model

# Start here

In [7]:
# Load the fine-tuned model and its tokenizer from the same local export
# directory (both were written there with save_pretrained), so this section
# works offline and cannot drift from the tokenizer the model was saved with.
model = AutoModelForCausalLM.from_pretrained("./models/")
tokenizer = AutoTokenizer.from_pretrained("./models/")

In [3]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [4]:
# txt = (
#     "Increasingly many"
# )
# gen_text = pipe(
#     txt, 
#     num_return_sequences=1, 
#     temperature=1.0,
#     top_p=1.0,
# )[0]["generated_text"]
# gen_text

In [5]:
# tokenizer(gen_text, return_length=True)

In [6]:
# model.config.task_specific_params["text-generation"]["max_length"] = 100
# pipe = pipeline("text-generation", model=model.hf_model, tokenizer=tokenizer)


In [7]:
# create function: take last k tokens (or less) from user input, 
# apply model, then output last N-k tokens

# init_text = "Using a CNN, we propose a method to"
# max_input_tokens = 3
# tokenizer
# pipeline
# temperature = 1.0
# top_p = 1.0

def generate_new_text(
    init_text: str, 
    pipeline: Callable, 
    max_last_input_tokens: int, 
    temperature=1.0, 
    top_p=1.0,
):
    """
    Continue ``init_text`` and return only the newly generated text.

    Only the last ``max_last_input_tokens`` tokens of ``init_text`` (or all of
    them if it is shorter) are fed to the text-generation pipeline; those
    prompt tokens are then stripped from the pipeline output.

    Args:
        init_text: Text typed so far.
        pipeline: Text-generation pipeline; must be callable as
            ``pipeline(text, temperature=..., top_p=...)`` and expose a
            ``tokenizer`` attribute.
        max_last_input_tokens: Maximum number of trailing tokens of
            ``init_text`` used as the prompt. Must be at least 1.
        temperature: Sampling temperature forwarded to the pipeline.
        top_p: Nucleus-sampling threshold forwarded to the pipeline.

    Returns:
        The decoded text that follows the prompt in the pipeline output.

    Raises:
        ValueError: If ``max_last_input_tokens`` is smaller than 1.
    """
    if max_last_input_tokens < 1:
        raise ValueError("max_last_input_tokens must be at least 1")

    tokenizer = pipeline.tokenizer

    init_ids = tokenizer(init_text)["input_ids"]
    num_input_tk = min(len(init_ids), max_last_input_tokens)

    # Slice from an explicit start index: `ids[-0:]` would return the whole list.
    prompt_ids = init_ids[len(init_ids) - num_input_tk:]
    input_text = tokenizer.decode(prompt_ids)

    # Use the pipeline that was passed in, not a notebook-global one.
    pipeline_output = pipeline(input_text, temperature=temperature, top_p=top_p)
    output_text = pipeline_output[0]["generated_text"]

    # The output starts with the prompt; drop its tokens to keep only new text.
    output_ids = tokenizer(output_text)["input_ids"]
    new_text = tokenizer.decode(output_ids[num_input_tk:])

    return new_text

In [8]:
# generate_new_text(
#     "Using a CNN, we propose a method to newly",
#     pipe,
#     max_last_input_tokens=5,
# )

In [9]:
# Let the text-generation task produce up to 100 tokens, then wrap the model
# in a pipeline and generate a continuation for a sample prompt.
model.config.task_specific_params["text-generation"].update(max_length=100)
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
generate_new_text(
    init_text="Using a CNN, we propose a method to newly characterize",
    pipeline=pipe,
    max_last_input_tokens=50,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' the degree of change in data by predicting two different types of transformations and how the transform affects the change in the data. The transformation can be categorized into four categories to identify the transformation and its magnitude. We have also established theoretical guarantees for a wide range of transformations, including transformations that increase the efficiency of the model. We further empirically evaluate the effects of such transformations on the performance of various deep learning architectures. As an example, we find that'

# package to onnx

In [2]:
from typing import OrderedDict
from transformers.models.gpt2 import GPT2OnnxConfig
from pathlib import Path
from transformers.onnx import export, validate_model_outputs
import onnx

In [3]:
from transformers.models.gpt2 import GPT2OnnxConfig

In [14]:
onnx_config = GPT2OnnxConfig(model.config, task="causal-lm")

In [15]:
onnx_config.default_onnx_opset

13

In [16]:
onnx_config.outputs

OrderedDict([('logits', {0: 'batch', 1: 'sequence'})])

In [17]:
# The export below writes straight to this path; create the parent directory
# first so the export does not fail on a fresh checkout without an "onnx/" folder.
onnx_path = Path("onnx/model.onnx")
onnx_path.parent.mkdir(parents=True, exist_ok=True)

In [18]:
# Export the Hugging Face model to ONNX at `onnx_path`, using the opset that the
# GPT-2 ONNX config declares as its default. The call returns the names of the
# exported graph's inputs and outputs (inspected in the next cells).
onnx_inputs, onnx_outputs = export(
    tokenizer,
    model,
    onnx_config,
    onnx_config.default_onnx_opset,
    onnx_path
)

  if batch_size <= 0:
  attn_weights = attn_weights / (float(value.size(-1)) ** 0.5)


In [19]:
onnx_inputs

['input_ids', 'attention_mask']

In [20]:
onnx_outputs

['logits']

In [21]:
onnx_model = onnx.load("onnx/model.onnx")

In [22]:
onnx.checker.check_model(onnx_model)

In [23]:
validate_model_outputs(
    onnx_config, tokenizer, model, onnx_path, onnx_outputs, onnx_config.atol_for_validation
)

# package to onnx (quantized!)

In [11]:
from typing import OrderedDict
from transformers.models.gpt2 import GPT2OnnxConfig
from pathlib import Path
from transformers.onnx import export, validate_model_outputs
from onnxruntime.quantization import QuantizationMode, quantize
import onnx

In [27]:
onnx_model = onnx.load(Path("onnx/model.onnx").as_posix())

In [29]:
# `quantize` is deprecated: onnxruntime itself asks for quantize_dynamic /
# quantize_static instead. Dynamic quantization with signed 8-bit weights is the
# counterpart of the former IntegerOps mode with symmetric weights.
from onnxruntime.quantization import QuantType, quantize_dynamic

quantize_dynamic(
    model_input=Path("onnx/model.onnx").as_posix(),
    model_output=Path("onnx/model_quantized.onnx").as_posix(),
    weight_type=QuantType.QInt8,
)

# quantize_dynamic writes the result to disk; load it back so that
# `quantized_model` is still available to the cells that follow.
quantized_model = onnx.load(Path("onnx/model_quantized.onnx").as_posix())

         Please use quantize_static for static quantization, quantize_dynamic for dynamic quantization.


In [31]:
onnx.save_model(quantized_model, Path("onnx/model_quantized.onnx").as_posix())

# Test the onnx model

In [8]:
from onnxruntime import InferenceSession

In [9]:
session = InferenceSession("onnx/model_quantized.onnx")

In [10]:
%%time
inputs = tokenizer("hello hello my name is", return_tensors="np")

CPU times: user 1.78 ms, sys: 1.36 ms, total: 3.15 ms
Wall time: 1.08 ms


In [11]:
inputs

{'input_ids': array([[31373, 23748,   616,  1438,   318]]), 'attention_mask': array([[1, 1, 1, 1, 1]])}

In [12]:
%%time 
outputs = session.run(output_names=['logits'], input_feed=dict(inputs))

CPU times: user 70.4 ms, sys: 2.7 ms, total: 73.1 ms
Wall time: 13.4 ms


In [13]:
outputs

[array([[[-32.130417, -31.926243, -35.131798, ..., -40.72961 ,
          -41.40339 , -35.54015 ],
         [-13.938381, -16.497381, -19.185692, ..., -26.25697 ,
          -25.579788, -18.103561],
         [-12.386647, -15.976733, -17.351515, ..., -22.23131 ,
          -23.252188, -16.279594],
         [ -9.177689, -12.604435, -15.799781, ..., -22.282354,
          -22.306173, -11.495081],
         [-10.045435, -10.654558, -13.455166, ..., -20.121496,
          -18.981516, -10.902972]]], dtype=float32)]

In [14]:
np.argmax(outputs[0], axis=-1)

array([[  11,   13, 1438,   11,  616]])

In [15]:
tokenizer.convert_ids_to_tokens(np.argmax(outputs[0], axis=-1).flatten())

[',', '.', 'Ġname', ',', 'Ġmy']

In [16]:
tokenizer.decode(np.argmax(outputs[0], axis=-1).flatten())

',. name, my'

In [17]:
# hf model: reference forward pass to compare against the ONNX session output.
# Inference only, so skip autograd bookkeeping, and feed the same attention
# mask the ONNX session received so both runs see identical inputs.
with th.no_grad():
    output_pt = model(
        input_ids=th.tensor(inputs["input_ids"]),
        attention_mask=th.tensor(inputs["attention_mask"]),
    )

In [18]:
output_pt.logits.shape

torch.Size([1, 5, 50257])

In [19]:
np.argmax(output_pt.logits.detach().numpy(), axis=-1).flatten()

array([  11,   13, 1438,   11,  616])

In [20]:
tokenizer.decode(np.argmax(output_pt.logits.detach().numpy(), axis=-1).flatten())

',. name, my'

# Build an autoregressive pipeline lol

In [75]:
import einops

In [76]:
init_text = "My name is Bob and I like"
input_ids = tokenizer(init_text, return_tensors="np")["input_ids"]
tokenizer.decode(input_ids[0, :])

'My name is Bob and I like'

In [77]:
input_ids

array([[3666, 1438,  318, 5811,  290,  314,  588]])

In [78]:
# One autoregressive decoding step on the ONNX session: score the current
# sequence, pick the next token from the logits of the last position, and
# append it. Re-running this cell extends `input_ids` by one more token.
# NOTE: `sample_token_id_from_logits` is defined in a cell further down in this
# notebook; that cell has to be executed before this one.
input_feed = dict(
    input_ids=input_ids,
    attention_mask=np.ones_like(input_ids),  # attend to every token, any batch size
)
output_logits = session.run(output_names=["logits"], input_feed=input_feed)
last_token_logits = output_logits[0][0, -1, :]
sampled_token_id = sample_token_id_from_logits(last_token_logits)
# Concatenate along the sequence axis so the (batch, sequence) shape is kept;
# np.append would flatten the array and require reshaping afterwards.
input_ids = np.concatenate([input_ids, [[sampled_token_id]]], axis=-1)



In [79]:
tokenizer.decode(input_ids[0, :])

'My name is Bob and I like to'

In [70]:
np.array([[1,2],[3,4]]).ndim

2

In [83]:
np.append(input_ids, 1)

array([3666, 1438,  318, 5811,  290,  314,  588,  284,    1])

In [84]:
last_token_logits

array([ -2.8670678,  -2.8882113,  -6.5249047, ..., -10.038966 ,
       -10.068566 ,  -4.1525965], dtype=float32)

In [38]:
def sample_token_id_from_logits(
    logits: np.ndarray,
    temperature: float = 1.0,
    top_p: float = 1.0,
    rng: "np.random.Generator | None" = None,
) -> int:
    """
    Pick a token ID from an array of logits.

    Without ``rng`` the choice is greedy: the ID with the highest logit is
    returned. Temperature scaling and top-p filtering never change which ID
    is the most likely one, so both are irrelevant in that mode.

    With ``rng`` the ID is sampled: logits are divided by ``temperature``,
    turned into probabilities, restricted to the smallest set of most likely
    IDs whose cumulative probability reaches ``top_p``, and one ID is drawn
    from that renormalized set.

    Args:
        logits: Unnormalized scores, one per token ID.
        temperature: Positive scaling factor; values below 1 sharpen the
            distribution, values above 1 flatten it. Only used when sampling.
        top_p: Nucleus threshold in (0, 1]. Only used when sampling.
        rng: NumPy random generator used for sampling; ``None`` selects
            greedy decoding.

    Returns:
        The chosen token ID as a plain Python ``int``.

    Raises:
        ValueError: When sampling with a non 1-D ``logits`` array, a
            non-positive ``temperature`` or a ``top_p`` outside (0, 1].
    """
    logits = np.asarray(logits, dtype=np.float64)

    if rng is None:
        return int(np.argmax(logits))

    if logits.ndim != 1:
        raise ValueError("logits must be a 1-D array when sampling")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if not 0.0 < top_p <= 1.0:
        raise ValueError("top_p must be in the interval (0, 1]")

    # Softmax of the temperature-scaled logits; subtracting the maximum first
    # keeps the exponentials from overflowing.
    scaled = logits / temperature
    scaled = scaled - scaled.max()
    probs = np.exp(scaled)
    probs = probs / probs.sum()

    # Nucleus filtering: walk the IDs from most to least likely and keep each
    # one as long as the probability mass before it is still below top_p.
    order = np.argsort(probs)[::-1]
    sorted_probs = probs[order]
    mass_before = np.cumsum(sorted_probs) - sorted_probs
    keep = mass_before < top_p
    keep[0] = True  # the most likely ID always stays eligible

    candidate_ids = order[keep]
    candidate_probs = sorted_probs[keep]
    candidate_probs = candidate_probs / candidate_probs.sum()

    return int(rng.choice(candidate_ids, p=candidate_probs))

assert sample_token_id_from_logits(np.array([-29.858946 , -29.195038 , -30.607428])) == 1



In [108]:
tokenizer.decode([284])

' to'

# Check whether `pipeline` already supports ONNX

In [None]:
pipeline

# Scratch

In [85]:
tokenizer(init_text, return_tensors="np")

{'input_ids': array([[3666, 1438,  318, 5811,  290,  314,  588]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1]])}

In [85]:
from transformers import AutoModelForMaskedLM

In [86]:
bart = AutoModelForMaskedLM.from_pretrained("lysandre/onnx-bart")

404 Client Error: Entry Not Found for url: https://huggingface.co/lysandre/onnx-bart/resolve/main/config.json


OSError: lysandre/onnx-bart does not appear to have a file named config.json.

In [87]:
from transformers import TopPLogitsWarper, TemperatureLogitsWarper

In [98]:
t = TemperatureLogitsWarper(0.5)
p = TopPLogitsWarper(1.0)

In [97]:
t(input_ids[:,:3], np.array([-21,-22,-23]))

array([-42., -44., -46.])

In [102]:
logits = np.array([-1.5, -2.0, -1.0, -1.2])

In [104]:
np.sort(logits,)

array([-2. , -1.5, -1.2, -1. ])

In [109]:
np.argsort(logits,)[::-1]

array([2, 3, 0, 1])

In [110]:
logits[np.argsort(logits,)[::-1]]

array([-1. , -1.2, -1.5, -2. ])

In [105]:
logits

array([-1.5, -2. , -1. , -1.2])

In [111]:
from scipy.special import softmax

ModuleNotFoundError: No module named 'scipy'

In [112]:
from pathlib import Path

In [113]:
Path(".")

PosixPath('.')

In [120]:
Path(".").absolute().joinpath("hello/hi") 

PosixPath('/Users/dennis/repos/text-generation/model_creation/hello/hi')