In [2]:
import gzip
import json
from typing import Callable, List, Tuple, Iterable, Dict, Type, Any
from functools import reduce, lru_cache
from collections import OrderedDict
import inspect

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams["figure.facecolor"] = "white"
from tqdm import tqdm

import torch as th
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.nn import Embedding
from torch.utils.data import DataLoader, random_split
# from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, PackedSequence
# from torchtext.vocab import vocab, Vocab, GloVe, build_vocab_from_iterator
# from torchtext.data.utils import get_tokenizer

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from torchmetrics import MeanSquaredError

# import optuna
# from optuna.visualization import plot_parallel_coordinate, plot_contour
# from optuna.importance import get_param_importances

import wandb

from transformers import (
    AutoTokenizer, 
    DataCollatorForLanguageModeling,
    AutoModelForCausalLM,
    AdamW,
    get_linear_schedule_with_warmup,
    # GPT2LMHeadModel,
    AutoConfig,
    pipeline
)
from datasets import load_dataset, DatasetDict, Dataset

In [206]:
class LitCausalLMModel(pl.LightningModule):
    """Lightning module that fine-tunes a Hugging Face causal language model.

    The wrapped model is loaded with ``AutoModelForCausalLM.from_pretrained`` and
    is optimized on the ``loss`` attribute of its own output, so each batch must
    carry whatever the model needs to compute that loss (presumably ``labels``
    next to ``input_ids`` -- confirm against the data collator in use).

    Args:
        hf_model_name: Model identifier or path handed to ``from_pretrained``
            for both the model and the tokenizer used for sample generation.
        total_steps: Number of training steps given to the linear LR schedule.
        lr: Learning rate passed to AdamW.
        weight_decay: Weight decay for all parameters except biases and
            LayerNorm parameters.
        adam_epsilon: ``eps`` passed to AdamW.
        warmup_steps: Number of warmup steps given to the linear LR schedule.
    """

    def __init__(
        self,
        hf_model_name: str,
        total_steps: int,
        lr: float = 5e-5,
        weight_decay: float = 0.01,
        adam_epsilon: float = 1e-6,
        warmup_steps: int = 1000,
    ) -> None:
        super().__init__()

        # choose this if want blank slate
        # self.config = AutoConfig.from_pretrained(
        #     "gpt2",
        #     vocab_size=len(tokenizer),
        #     n_ctx=context_length,
        #     bos_token_id=tokenizer.bos_token_id,
        #     eos_token_id=tokenizer.eos_token_id,
        # )
        # self.hf_model = GPT2LMHeadModel(self.config)
        # self.hf_model(**self.hf_model.dummy_inputs)  # Builds the model

        # choose this if want pre-trained weights
        self.hf_model = AutoModelForCausalLM.from_pretrained(hf_model_name)

        # Owned by the module so that validation_epoch_end does not depend on a
        # notebook-level `tokenizer` variable existing at call time.
        self.tokenizer = AutoTokenizer.from_pretrained(hf_model_name)

        # Stores the constructor arguments under self.hparams.
        self.save_hyperparameters()

        # self.wandb_table = wandb.Table(columns=["step", "text"])
        # self.logger.log_table({"generated_text": self.wandb_table})

    def forward(self, **inputs):
        """Run the wrapped Hugging Face model on keyword inputs and return its output."""
        return self.hf_model(**inputs)

    def training_step(self, batch: Dict[str, th.Tensor], batch_idx: int):
        """Log and return the model's loss for one training batch.

        ``batch`` is unpacked as keyword arguments, so it must be a mapping.
        """
        loss = self(**batch).loss
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch: Dict[str, th.Tensor], batch_idx: int):
        """Log and return the model's loss for one validation batch."""
        loss = self(**batch).loss
        self.log("val_loss", loss)
        return loss

    def validation_epoch_end(self, outputs):
        """Print one generated continuation of a fixed prompt as a qualitative check."""
        # Run the pipeline on the device the module currently lives on instead
        # of a hard-coded GPU 0. An integer index is used (-1 meaning CPU)
        # because that form is accepted by every `pipeline` version.
        # NOTE(review): accelerators other than CUDA fall through to -1 here.
        module_device = self.device
        if module_device.type == "cuda":
            device_index = 0 if module_device.index is None else module_device.index
        else:
            device_index = -1

        pipe = pipeline(
            "text-generation",
            model=self.hf_model,
            tokenizer=self.tokenizer,
            device=device_index,
        )
        txt = "We develop a method to"
        gen_text = pipe(txt, num_return_sequences=1)[0]["generated_text"]
        # self.wandb_table.add_data(self.global_step, gen_text)
        # wandb.log({"generated_text": self.wandb_table})
        # self.logger.log_table({"generated_text": self.wandb_table})
        print(gen_text)

    def configure_optimizers(self):
        """Build AdamW with two parameter groups and a per-step linear warmup schedule.

        Biases and LayerNorm parameters are excluded from weight decay; every
        other parameter uses ``self.hparams.weight_decay``.
        """
        no_decay = ("bias", "LayerNorm.weight")

        # Name matching alone misses LayerNorm weights whose names do not
        # contain "LayerNorm" (GPT-2 calls them ln_1 / ln_2 / ln_f), so
        # LayerNorm parameters are also identified by module type.
        norm_param_ids = {
            id(param)
            for module in self.hf_model.modules()
            if isinstance(module, nn.LayerNorm)
            for param in module.parameters(recurse=False)
        }

        decay_params = []
        no_decay_params = []
        for name, param in self.hf_model.named_parameters():
            if id(param) in norm_param_ids or any(nd in name for nd in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        optimizer_grouped_parameters = [
            {"params": decay_params, "weight_decay": self.hparams.weight_decay},
            {"params": no_decay_params, "weight_decay": 0.0},
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.hparams.lr,
            eps=self.hparams.adam_epsilon,
        )

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.hparams.total_steps,
        )
        # "interval": "step" asks Lightning to step the scheduler every
        # optimizer step rather than once per epoch.
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]


In [207]:
# Instantiate the Lightning wrapper around distilgpt2. `total_steps` is only
# read by configure_optimizers, so 1 is just a placeholder when the wrapper is
# created to receive checkpoint weights rather than to train.
model = LitCausalLMModel(
    hf_model_name="distilgpt2",
    total_steps=1,
    lr=1e-4,
)
# optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# checkpoint = torch.load(PATH)
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# epoch = checkpoint['epoch']
# loss = checkpoint['loss']

# model.eval()
# # - or -
# model.train()

In [208]:
# Read the checkpoint file with every tensor mapped onto the CPU.
# NOTE: th.load unpickles the file, so only load checkpoints from a trusted source.
checkpoint = th.load(
    "models/model.ckpt",
    map_location="cpu",
)

In [209]:
# Copy the checkpoint weights into the wrapper; strict matching (the default)
# makes any missing or unexpected key an error instead of a silent skip.
model.load_state_dict(checkpoint["state_dict"], strict=True)

<All keys matched successfully>

In [210]:
# Tokenizer for the same "distilgpt2" base model the wrapper was created from.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

In [211]:
# Keep a handle on the Lightning wrapper: the next cell rebinds `model` to the
# bare Hugging Face model, so re-running this cell afterwards would alias that
# model instead of the wrapper.
pt_model = model

In [212]:
# From here on `model` is the wrapped Hugging Face model, not the Lightning module.
model = pt_model.hf_model

In [225]:
# Export the model to ./models/ in Hugging Face format; a later cell reloads
# it from this directory with from_pretrained.
model.save_pretrained("./models/")

In [227]:
# Unbind `model` so the reload below starts from the files on disk.
# `pt_model` still references the wrapper, so the old weights stay in memory.
del model

In [228]:
# Reload the exported model from ./models/ to check that the saved files are usable.
model = AutoModelForCausalLM.from_pretrained("./models/")

In [229]:
# Display the reloaded module tree as a quick look at the architecture.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [233]:
# Build the generation pipeline in this cell: in notebook order no earlier
# cell defines `pipe`, so a fresh top-to-bottom run would otherwise fail here.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

txt = "Increasingly many"
gen_text = pipe(
    txt,
    num_return_sequences=1,
    temperature=1.0,
    top_p=1.0,
)[0]["generated_text"]
gen_text

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"Increasingly many large, more complex systems, such as robots and computers, fail to perform well. In this paper, we propose three algorithms to enhance the performance of reinforcement learning (RL) over standard RL algorithms. First, a neural network's output model, i.e., a generalized maximum likelihood (MLP) algorithm with explicit parameterization or training data, can be learned using only one or more MLP and also the parameters in these learning algorithms. Second, the learning algorithm can"

In [150]:
# Tokenize the generated text; the "length" entry of the result is its token count.
tokenizer(gen_text, return_length=True)

{'input_ids': [15562, 2313, 306, 867, 318, 991, 257, 2408, 4876, 13, 2312, 5050, 11, 2158, 11, 4031, 284, 2987, 262, 2854, 286, 262, 2656, 2746, 393, 2746, 2346, 13, 554, 428, 3348, 11, 356, 9161, 2842, 284, 2987, 262, 9922, 286, 4981, 287, 262, 4732, 286, 7386, 16711, 13, 775, 18077, 734, 5050, 284, 2987, 262, 2854, 286, 262, 2656, 2746, 618, 691, 257, 1178, 10007, 1656, 287, 262, 1459, 2746, 11, 329, 1672, 11, 1262, 2705, 19232, 11, 393, 257, 1178, 7104, 2940, 709, 4981, 13, 554, 6273, 284, 777, 5050, 11, 356, 1205, 284, 9494, 257, 2746, 338, 9922], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'length': [100]}

In [None]:
# Set the text-generation max_length default to 100 on the model config, then
# build the pipeline from it.
model.config.task_specific_params["text-generation"]["max_length"] = 100
# `model` is already the bare Hugging Face model at this point (it was rebound
# from the Lightning wrapper earlier), so it is passed directly; it has no
# `.hf_model` attribute.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


In [197]:
# create function: take the last k tokens (or fewer) from the user input,
# apply the model, then output the last N-k tokens of the N-token result

# init_text = "Using a CNN, we propose a method to"
# max_input_tokens = 3
# tokenizer
# pipeline
# temperature = 1.0
# top_p = 1.0

def generate_new_text(
    init_text: str, 
    pipeline: Callable, 
    max_last_input_tokens: int, 
    temperature=1.0, 
    top_p=1.0,
):
    """Continue the tail of ``init_text`` and return only the newly generated text.

    At most the last ``max_last_input_tokens`` tokens of ``init_text`` are used
    as the prompt. The prompt is then cut off the front of the generated text
    by token count, which assumes that re-tokenizing the generated text yields
    the prompt's tokens as a prefix.

    Args:
        init_text: Text to continue.
        pipeline: Callable that exposes a ``tokenizer`` attribute and returns a
            list whose first item is a dict with a ``"generated_text"`` entry.
        max_last_input_tokens: Maximum number of trailing prompt tokens to
            feed to ``pipeline``; must be at least 1.
        temperature: Forwarded to ``pipeline``.
        top_p: Forwarded to ``pipeline``.

    Returns:
        The decoded tokens that follow the prompt in the generated text.

    Raises:
        ValueError: If ``max_last_input_tokens`` is smaller than 1.
    """
    if max_last_input_tokens < 1:
        # A count of 0 would make a `[-0:]` slice select the whole prompt, and
        # a negative count would slice from the wrong end.
        raise ValueError(
            f"max_last_input_tokens must be >= 1, got {max_last_input_tokens}"
        )

    tokenizer = pipeline.tokenizer

    init_ids = tokenizer(init_text)["input_ids"]
    num_input_tk = min(len(init_ids), max_last_input_tokens)

    # Slice from an explicit start index so an empty prompt stays empty.
    input_text = tokenizer.decode(init_ids[len(init_ids) - num_input_tk:])

    # Use the pipeline that was passed in, not a notebook-level `pipe`.
    pipeline_output = pipeline(input_text, temperature=temperature, top_p=top_p)
    output_text = pipeline_output[0]["generated_text"]

    output_ids = tokenizer(output_text)["input_ids"]
    new_text = tokenizer.decode(output_ids[num_input_tk:])

    return new_text

In [200]:
# Continue from only the last five tokens of the prompt.
generate_new_text(
    init_text="Using a CNN, we propose a method to newly",
    pipeline=pipe,
    max_last_input_tokens=5,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' discover a robust model of an unsupervised dataset, named the CIFAR10. The method consists of a convolutional neural network (CNN) to train an unsupervised model of an unknown dataset. A CNN is trained in the absence of external labeled examples to avoid the overfitting issue. We report results on the CIFAR100 dataset and the Eureka Challenge datasets, that demonstrates that this is not even possible due to the lack of unlabeled'

In [215]:
# Set the text-generation max_length default to 100, rebuild the pipeline from
# the updated config, and continue a prompt using up to its last 50 tokens.
model.config.task_specific_params["text-generation"].update(max_length=100)
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
generate_new_text(
    init_text="Using a CNN, we propose a method to newly characterize MNIST data. By using an autoencoder, the",
    pipeline=pipe,
    max_last_input_tokens=50,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' CNN can represent the same image, its input, the reconstruction, and the final classification function. The CNN is trained to capture both the information of the source image, the original part, and the reconstruction functions, and then used as input, the reconstruction function, and the reconstruction function. We show that by training CNN on MNIST data with the autoencoder, the CNN'

In [None]:
# Try the model on a code-like prompt.
# `self` does not exist at notebook level, so the already-loaded `model` is
# passed directly. GPU 0 is used when CUDA is available, otherwise the CPU
# (-1), so the cell also runs on a machine without a GPU.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if th.cuda.is_available() else -1,
)
txt = """
import numpy as np

# add two numbers
"""
gen_text = pipe(txt, num_return_sequences=1)[0]["generated_text"]