In [None]:
# Step up one directory before anything else runs; the relative paths and local imports used in
# later cells are presumably resolved from there — TODO confirm the expected working directory.
# NOTE(review): the path is relative, so re-running this cell moves up one more level each time.
%cd ..

c:\Users\anant\Downloads\S22\S22


In [4]:
import time
import sys
from pathlib import Path
from typing import Optional, Literal, Any

import lightning as L
import torch
from lightning.fabric.strategies import FSDPStrategy
from lightning.fabric.plugins import BitsandbytesPrecision


from tsai_gpt.utils import get_default_supported_precision, gptq_quantization, load_checkpoint
from tsai_gpt.model import GPT, Block, Config
from tsai_gpt.tokenizer import Tokenizer

In [5]:
# Fix the random seed once, up front, so that the sampled generations below are repeatable.
L.seed_everything(1234)

INFO: Seed set to 1234
INFO:lightning.fabric.utilities.seed:Seed set to 1234


1234

In [6]:
def multinomial_num_samples_1(probs: torch.Tensor) -> torch.Tensor:
    if torch._dynamo.is_compiling():
        # Faster alternative to `torch.multinomial(probs, num_samples=1)` that is also CUDAGraph friendly
        distribution = torch.empty_like(probs).exponential_(1)
        return torch.argmax(probs / distribution, dim=-1, keepdim=True)
    return torch.multinomial(probs, num_samples=1)


def sample(logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] = None) -> torch.Tensor:
    """Choose the next token index from the logits of the last position of the first batch entry.

    Args:
        logits: Model output; only ``logits[0, -1]`` is looked at.
        temperature: Divides the logits before the softmax. A value that is not greater
            than zero turns sampling off and the argmax is returned instead.
        top_k: If given, only the ``top_k`` largest logits stay eligible (the value is
            clamped to the number of available logits).

    Returns:
        Tensor holding the chosen index, with the last dimension kept at size 1.
    """
    last_step = logits[0, -1]
    if top_k is not None:
        keep = min(top_k, last_step.size(-1))
        top_values, top_indices = torch.topk(last_step, keep)
        # Write the kept values onto a -inf canvas instead of thresholding with `torch.where`
        # (as nanoGPT does): thresholding would keep every value tied with the k-th one.
        last_step = torch.full_like(last_step, float("-inf")).scatter_(-1, top_indices, top_values)
    if not temperature > 0.0:
        # greedy decoding
        return torch.argmax(last_step, dim=-1, keepdim=True)
    probs = torch.nn.functional.softmax(last_step / temperature, dim=-1)
    return multinomial_num_samples_1(probs)


def next_token(model: GPT, input_pos: torch.Tensor, x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
    """Run one forward pass of ``model`` and sample the token that follows ``x``.

    Args:
        model: Called as ``model(x, input_pos)`` to obtain the logits.
        input_pos: Positions handed to the model together with ``x``.
        x: Input token ids.
        **kwargs: Passed through unchanged to ``sample`` (e.g. ``temperature``, ``top_k``).

    Returns:
        The sampled token, converted to the tensor type of ``x``.
    """
    sampled = sample(model(x, input_pos), **kwargs)
    return sampled.type_as(x)


@torch.inference_mode()
def generate(
    model: GPT,
    prompt: torch.Tensor,
    max_returned_tokens: int,
    *,
    temperature: float = 1.0,
    top_k: Optional[int] = None,
    eos_id: Optional[int] = None,
) -> torch.Tensor:
    """Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.

    The implementation of this function is modified from A. Karpathy's nanoGPT.

    Args:
        model: The model to use.
        prompt: Tensor of shape (T) with indices of the prompt sequence.
        max_returned_tokens: The maximum number of tokens to return (given plus generated).
        temperature: Scales the predicted logits by 1 / temperature.
        top_k: If specified, only sample among the tokens with the k highest probabilities.
        eos_id: If specified, stop generating any more token once the <eos> token is triggered. The <eos> token
            itself is included in the returned sequence.

    Returns:
        The prompt followed by the generated tokens, concatenated into a single tensor.

    Raises:
        AssertionError: If ``max_returned_tokens`` does not exceed the prompt length.
        NotImplementedError: If ``model.max_seq_length`` is smaller than ``max_returned_tokens - 1``.
    """
    T = prompt.size(0)
    assert max_returned_tokens > T
    if model.max_seq_length < max_returned_tokens - 1:
        # rolling the kv cache based on the `input_pos` value would be necessary. However, doing so would introduce a
        # data dependency on the `input_pos` tensor and impact model compilation. Since this setting is uncommon, we do
        # not support it to avoid negatively impacting the overall speed
        raise NotImplementedError(f"max_seq_length {model.max_seq_length} needs to be >= {max_returned_tokens - 1}")

    device = prompt.device
    tokens = [prompt]
    input_pos = torch.tensor([T], device=device)
    # First pass: feed the whole prompt (positions 0..T-1) and sample the first new token.
    # NOTE(review): `.clone()` presumably keeps the stored token from aliasing a buffer that later
    # forward passes reuse — confirm before removing.
    token = next_token(
        model, torch.arange(0, T, device=device), prompt.view(1, -1), temperature=temperature, top_k=top_k
    ).clone()
    tokens.append(token)
    for _ in range(2, max_returned_tokens - T + 1):
        # Test the most recent token at the top of the loop so that the token produced by the first
        # pass is checked as well; previously an <eos> sampled right after the prompt was ignored.
        # When no eos_id is given the tensor comparison is skipped altogether.
        if eos_id is not None and token == eos_id:
            break
        # Subsequent passes feed only the latest token at its position.
        token = next_token(model, input_pos, token.view(1, -1), temperature=temperature, top_k=top_k).clone()
        tokens.append(token)
        input_pos = input_pos.add_(1)
    return torch.cat(tokens)

In [11]:
# Settings for loading the model and tokenizer used by the rest of the notebook.
#
# chptk_path (str): checkpoint file holding the trained weights.
# tokenizer_path (str): directory containing the tokenizer files.
# quantize (Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]], optional):
#     quantization method to use. Defaults to None.
#     - "bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq": 4-bit quantization bitsandbytes
#     - "bnb.int8": 8-bit quantization bitsandbytes
#     - "gptq.int4": 4-bit quantization GPTQ
#     for more details see: https://github.com/facebookresearch/bitsandbytes, https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md
# strategy (str, optional): Fabric strategy setting. Defaults to "auto".
# devices (int, optional): number of devices to be used. Defaults to 1.
# precision (Optional[str], optional): fabric precision settings. Defaults to None.

chptk_path: str = "last-iter-015000-ckpt.pth"
tokenizer_path: str = "tokenizer_Llama-2-7b-chat-hf"
quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]] = None
strategy: str = "auto"
devices: int = 1
precision: Optional[str] = None

# Fall back to the helper's default inference precision when none was chosen explicitly.
precision = precision or get_default_supported_precision(training=False)
plugins = None
if quantize is not None:
    if devices > 1:
        # `NotImplemented` is a constant, not an exception class; calling it would raise a TypeError
        # and hide the intended message.
        raise NotImplementedError("Multi-GPU quantization is not supported yet.")
    if quantize.startswith("bnb."):
        if "mixed" in precision:
            raise ValueError("Quantization and mixed precision is not supported.")
        dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision]
        # quantize[4:] drops the "bnb." prefix; the precision is then carried by the plugin instead
        # of the `precision` argument.
        plugins = BitsandbytesPrecision(quantize[4:], dtype)
        precision = None

if strategy == "fsdp":
    strategy = FSDPStrategy(auto_wrap_policy={Block}, cpu_offload=False)

fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, plugins=plugins)
fabric.launch()

# Use the configured path rather than repeating the literal, so the setting above takes effect.
tokenizer = Tokenizer(Path(tokenizer_path))
config = Config.from_name("pythia-160m")

fabric.print(f"Loading model from {chptk_path}", file=sys.stderr)
t0 = time.perf_counter()
# The weights are loaded from the checkpoint further down, hence `empty_init=True` here.
with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"):
    model = GPT(config)
fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)
with fabric.init_tensor():
    # enable the kv cache
    model.set_kv_cache(batch_size=1)

model.eval()
model = fabric.setup_module(model)

t0 = time.perf_counter()
load_checkpoint(fabric, model, chptk_path)
fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)

Loading model from /content/drive/MyDrive/last-iter-015000-ckpt.pth
Time to instantiate model: 0.03 seconds.
Time to load the model weights: 26.28 seconds.


In [12]:
def generate_from_prompt(
    prompt: str = "",
    *,
    num_samples: int = 1,
    max_new_tokens: int = 500,
    top_k: int = 200,
    temperature: float = 0.8,
):
    """Print one or more continuations of ``prompt`` produced by the loaded model.

    Uses the notebook-level ``tokenizer``, ``fabric`` and ``model`` objects. The decoded text is
    printed through ``fabric.print``; timing and memory statistics are sent to ``sys.stderr``.
    Nothing is returned.

    Args:
        prompt (str, optional): Text the generation is conditioned on. Defaults to "".
        num_samples (int, optional): How many continuations to generate. Defaults to 1.
        max_new_tokens (int, optional): Upper bound on the number of generated tokens. Defaults to 500.
        top_k (int, optional): Number of highest-scoring tokens kept as sampling candidates. Defaults to 200.
        temperature (float, optional): Controls the randomness of the sampling process. Defaults to 0.8.
    """
    prompt_ids = tokenizer.encode(prompt, device=fabric.device)
    n_prompt_tokens = prompt_ids.size(0)
    max_returned_tokens = n_prompt_tokens + max_new_tokens
    with fabric.init_tensor():
        # cap the sequence length at what this call needs, to limit the memory usage
        model.max_seq_length = max_returned_tokens

    for sample_idx in range(num_samples):
        started = time.perf_counter()
        output_ids = generate(model, prompt_ids, max_returned_tokens, temperature=temperature, top_k=top_k)
        elapsed = time.perf_counter() - started
        # NOTE(review): a per-block `kv_cache.reset_parameters()` loop over `model.transformer.h` used
        # to sit here and is disabled, so the KV cache is not reset between samples — confirm that is
        # intended when num_samples > 1.
        fabric.print(tokenizer.decode(output_ids))
        n_generated = output_ids.size(0) - n_prompt_tokens
        fabric.print(
            f"Time for inference {sample_idx + 1}: {elapsed:.02f} sec total, {n_generated / elapsed:.02f} tokens/sec",
            file=sys.stderr,
        )
    if fabric.device.type == "cuda":
        fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr)


In [13]:
# Sample 1: science-fiction style prompt, generated with the function's default settings.
generate_from_prompt(
    prompt="In a galaxy far, far away, an intergalactic council convenes to discuss the rising cost of lightsaber batteries. Among them is an unlikely representative: a droid with a penchant for economics...",
)

In a galaxy far, far away, an intergalactic council convenes to discuss the rising cost of lightsaber batteries. Among them is an unlikely representative: a droid with a penchant for economics...
https://avinpeace.com/canon/business-advertime-from-to-top-or-to-frivional-rescue-of-girls.html
Why aren’t they worsened to keep things alive?
Human rights activists are going to provide a healthy, unilateral, universal understanding of the environment and what it means to them. They are very much in the sense of the environment. They need to be able to understand and analyze the environment and there is something in the world that has benefited from the world.
This is one of the least important, most important concepts: it is the world that a healthy, poor and more complicated social order is how the environment is developed. This is not only the most common cause of destruction, but it is the reality that it is the world that has its best. It is the way that we see the end of the world, and 

Time for inference 1: 124.59 sec total, 4.01 tokens/sec


In [14]:
# Sample 2: poem prompt, generated with a lower temperature (0.5) than the default of 0.8.
# NOTE(review): the second line of the prompt starts with "Robert Frost poet", which looks like a
# stray paste inside the poem text — confirm it is intended. The continuation lines of the
# triple-quoted string also carry their source indentation into the prompt.
generate_from_prompt(
    prompt="""Two roads diverged in a yellow wood,
            Robert Frost poetAnd sorry I could not travel both
            And be one traveler, long I stood
            And looked down one as far as I could
            To where it bent in the undergrowth;

            Then took the other, as just as fair,
            And having perhaps the better claim,
            Because it was grassy and wanted wear;
            Though as for that the passing there
            Had worn them really about the same,...""",
    temperature=0.5
)

Two roads diverged in a yellow wood,
            Robert Frost poetAnd sorry I could not travel both
            And be one traveler, long I stood
            And looked down one as far as I could
            To where it bent in the undergrowth;

            Then took the other, as just as fair,
            And having perhaps the better claim,
            Because it was grassy and wanted wear;
            Though as for that the passing there
            Had worn them really about the same,...
            But when the rest of the day was the right place, it was the first time that the rest of the day had been gone,
            And so I said,
            And the last time I had seen the rest of the day, I was not sure how to get the rest of the day
            And was it the best place to go, and to go, and to get the rest of the day, and to get the rest of the day, and to get the rest of the day, and to get the rest of the day, and to get the rest of the day, and to get the rest of the d

Time for inference 1: 129.69 sec total, 3.86 tokens/sec


In [15]:
# Sample 3: film-synopsis style prompt.
generate_from_prompt(
    prompt="After receiving an encrypted message from a sinister organisation, with the help of Madeleine, James Bond sets out to uncover a conspiracy, which reveals an ugly truth about his past..",

    # temperature=0.5  (left disabled: this run uses the default temperature of 0.8)
)

After receiving an encrypted message from a sinister organisation, with the help of Madeleine, James Bond sets out to uncover a conspiracy, which reveals an ugly truth about his past..
For his friends, the love of one whose actions were used in both the speeches and the news media. The truth is that this is the only one where every person of the real person who is in a peaceful movement. Of those who were in fact truly unlocked for the pleasure of him, the truth of his efforts is that it is not a matter of reason for people to accept their freedom. He was a priest of notorious spirituality and injustice for all of his teachings in the community than before he did.
However the same was true. His reputation for freedom and his desire for truth be in the heart of the world.
On behalf of the world, the world has established itself as a symbol of freedom.
Another life has raised a life of free speech, an unprecedented nature in art and culture.
For this purpose, in this case, the world has 

Time for inference 1: 116.33 sec total, 4.30 tokens/sec


In [16]:
# Sample 4: first-person narrative prompt.
generate_from_prompt(
    prompt="Out of no where yesterday night I saw a dream, I was walking on a lonely road alone, listening music throughout and ..",

    # temperature=0.5  (left disabled: this run uses the default temperature of 0.8)
)

Out of no where yesterday night I saw a dream, I was walking on a lonely road alone, listening music throughout and ..
"A spy troll down a little my little guy I'm a freakin' haasan. We have a lot of people here because I'm looking at every single person on the internet and I didn't know anyone who's going to do it for him anymore. We have a lot of people around the world who know it. I'm a fan of the music industry in the industry. I'm very fortunate to have a lot of people here and there."
What do you think about this? I'm not thinking about it. I think it's the biggest thing that I imagined in terms of the marketing aspect. You know, I think it's the largest problem that I've seen after the third and I think people are talking about it. I think they're going to go there and I mean that's what the way it's going to be like in a way that's going to be like in a way that's going to be like "a good thing."
But the best thing I think about this was that I was like "you're not winning the

Time for inference 1: 116.23 sec total, 4.30 tokens/sec


In [17]:
# Sample 5: descriptive prompt about a painted wall.
generate_from_prompt(
    prompt="A wall with beautiful paintings lot of colors, arts, theams and many more, experiecing such thing is a gift in itself ..",

    # temperature=0.5  (left disabled: this run uses the default temperature of 0.8)
)

A wall with beautiful paintings lot of colors, arts, theams and many more, experiecing such thing is a gift in itself ..
The Cooligolo is a proud place in which he is now serving with his wife the Queen. If you’re interested in developing this new adventure, please call us today for more information.
We are pleased to receive updates or comments that keep your home in the future, and that do not mean we are going to stay safe and enjoy. RF1: The Difference Between A Millionaire and the Stousand Yard
In total, Yard’s 25th-century career began in 1990. A total of 59,997 people were included in today’s national food supply of food. But at least 25,000 people received their annual life of life. Mothers’ 1929 retail outreach was the only country that was built in 1991. The following year, she reserves the power of the great deal to serve 8,000 people over the next 25 years. She has long been in the military as well as internationally as international volunteers. She completed her three meet

Time for inference 1: 115.86 sec total, 4.32 tokens/sec
