In [1]:
%cd ..

c:\Users\anant\Downloads\S22\S22


In [2]:
import time
import sys
from pathlib import Path
from typing import Optional, Literal, Any

import lightning as L
import torch
from lightning.fabric.strategies import FSDPStrategy
from lightning.fabric.plugins import BitsandbytesPrecision


from tsai_gpt.utils import get_default_supported_precision, gptq_quantization, load_checkpoint
from tsai_gpt.model import GPT, Block, Config
from tsai_gpt.tokenizer import Tokenizer

In [10]:
# Fix the seed through Lightning so the sampled generations below can be repeated.
# NOTE(review): the execution counter shows this cell was run after later cells; run it
# right after the imports on a fresh kernel so the seed actually precedes sampling.
L.seed_everything(1234)

Seed set to 1234


1234

In [3]:
def multinomial_num_samples_1(probs: torch.Tensor) -> torch.Tensor:
    if torch._dynamo.is_compiling():
        # Faster alternative to `torch.multinomial(probs, num_samples=1)` that is also CUDAGraph friendly
        distribution = torch.empty_like(probs).exponential_(1)
        return torch.argmax(probs / distribution, dim=-1, keepdim=True)
    return torch.multinomial(probs, num_samples=1)


def sample(logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] = None) -> torch.Tensor:
    """Choose the next token index from the logits at ``logits[0, -1]``.

    Args:
        logits: Tensor indexed with ``[0, -1]``; presumably shaped (batch, sequence, vocab) — confirm
            against the model's output.
        temperature: Divisor applied to the logits before the softmax. A value that is not greater
            than zero selects the argmax instead of sampling.
        top_k: When given, all logits outside the ``top_k`` largest are replaced by ``-inf``.

    Returns:
        Tensor with a single element holding the chosen index.
    """
    last_step = logits[0, -1]
    if top_k is not None:
        # scatter the k best values into a -inf tensor; `torch.where` (as in nanogpt) would
        # repeat top-k collisions
        k = min(top_k, last_step.size(-1))
        best_values, best_indices = torch.topk(last_step, k)
        last_step = torch.full_like(last_step, float("-inf")).scatter_(-1, best_indices, best_values)
    if not temperature > 0.0:
        # greedy selection
        return torch.argmax(last_step, dim=-1, keepdim=True)
    # temperature-scaled sampling from the resulting distribution
    probs = torch.nn.functional.softmax(last_step / temperature, dim=-1)
    return multinomial_num_samples_1(probs)


def next_token(model: GPT, input_pos: torch.Tensor, x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
    """Run one forward pass and sample the token that follows.

    Args:
        model: Model invoked as ``model(x, input_pos)`` to obtain logits.
        input_pos: Position tensor handed to the model together with ``x``.
        x: Input token ids fed to the model.
        **kwargs: Passed through unchanged to ``sample`` (e.g. ``temperature``, ``top_k``).

    Returns:
        The sampled token, cast to the type of ``x``.
    """
    sampled = sample(model(x, input_pos), **kwargs)
    return sampled.type_as(x)


@torch.inference_mode()
def generate(
    model: GPT,
    prompt: torch.Tensor,
    max_returned_tokens: int,
    *,
    temperature: float = 1.0,
    top_k: Optional[int] = None,
    eos_id: Optional[int] = None,
) -> torch.Tensor:
    """Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.

    The implementation of this function is modified from A. Karpathy's nanoGPT.

    Args:
        model: The model to use.
        prompt: Tensor of shape (T) with indices of the prompt sequence.
        max_returned_tokens: The maximum number of tokens to return (given plus generated).
        temperature: Scales the predicted logits by 1 / temperature.
        top_k: If specified, only sample among the tokens with the k highest probabilities.
        eos_id: If specified, stop generating any more token once the <eos> token is triggered. The <eos>
            token itself is included in the returned sequence.

    Returns:
        1-D tensor with the prompt followed by the generated tokens.

    Raises:
        AssertionError: If ``max_returned_tokens`` does not exceed the prompt length.
        NotImplementedError: If ``model.max_seq_length`` is smaller than ``max_returned_tokens - 1``.
    """
    T = prompt.size(0)
    assert max_returned_tokens > T, (
        f"max_returned_tokens ({max_returned_tokens}) must be greater than the prompt length ({T})"
    )
    if model.max_seq_length < max_returned_tokens - 1:
        # rolling the kv cache based on the `input_pos` value would be necessary. However, doing so would introduce a
        # data dependency on the `input_pos` tensor and impact model compilation. Since this setting is uncommon, we do
        # not support it to avoid negatively impacting the overall speed
        raise NotImplementedError(f"max_seq_length {model.max_seq_length} needs to be >= {max_returned_tokens - 1}")

    device = prompt.device
    tokens = [prompt]
    input_pos = torch.tensor([T], device=device)
    # first step: feed the whole prompt at positions 0..T-1 and sample the first new token
    token = next_token(
        model, torch.arange(0, T, device=device), prompt.view(1, -1), temperature=temperature, top_k=top_k
    ).clone()
    tokens.append(token)
    # the very first generated token may already be <eos>; stop here instead of decoding past it
    if eos_id is not None and token == eos_id:
        return torch.cat(tokens)
    # remaining steps: feed only the latest token at the next position
    for _ in range(2, max_returned_tokens - T + 1):
        token = next_token(model, input_pos, token.view(1, -1), temperature=temperature, top_k=top_k).clone()
        tokens.append(token)
        if eos_id is not None and token == eos_id:
            break
        input_pos = input_pos.add_(1)
    return torch.cat(tokens)

In [13]:
"""
quantize (Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]], optional): quantization method to use. Defaults to None.
    - "bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq": 4-bit quantization with bitsandbytes
    - "bnb.int8": 8-bit quantization with bitsandbytes
    - "gptq.int4": 4-bit quantization with GPTQ
    For more details see: https://github.com/facebookresearch/bitsandbytes, https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md
strategy (str, optional): Fabric strategy setting. Defaults to "auto".
devices (int, optional): number of devices to be used. Defaults to 1.
precision (Optional[str], optional): Fabric precision setting. Defaults to None.
"""

# --- Inference configuration -------------------------------------------------------------------
chptk_path: str = "saved_model/last-iter-015000-ckpt.pth"  # checkpoint passed to `load_checkpoint`
tokenizer_path: str = "tokenizer_Llama-2-7b-chat-hf"  # directory handed to `Tokenizer`
quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]] = None
strategy: str = "auto"
devices: int = 1
precision: Optional[str] = None

# --- Precision / quantization plugin -----------------------------------------------------------
precision = precision or get_default_supported_precision(training=False)
plugins = None
if quantize is not None:
    if devices > 1:
        # `NotImplemented` is a comparison sentinel and is not callable; the exception class is
        # `NotImplementedError`
        raise NotImplementedError("Multi-GPU quantization is not supported yet.")
    if quantize.startswith("bnb."):
        if "mixed" in precision:
            raise ValueError("Quantization and mixed precision is not supported.")
        dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision]
        # strip the "bnb." prefix to get the bitsandbytes mode name
        plugins = BitsandbytesPrecision(quantize[4:], dtype)
        # the plugin takes over precision handling, so no precision string is passed to Fabric
        precision = None

if strategy == "fsdp":
    strategy = FSDPStrategy(auto_wrap_policy={Block}, cpu_offload=False)

# --- Fabric, tokenizer and model config --------------------------------------------------------
fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, plugins=plugins)
fabric.launch()

# use the configured path instead of repeating the literal so the two cannot drift apart
tokenizer = Tokenizer(Path(tokenizer_path))
config = Config.from_name("pythia-160m")

# --- Model instantiation -----------------------------------------------------------------------
fabric.print(f"Loading model from {chptk_path}", file=sys.stderr)
t0 = time.perf_counter()
with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"):
    model = GPT(config)
fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)
with fabric.init_tensor():
    # enable the kv cache
    model.set_kv_cache(batch_size=1)

model.eval()
model = fabric.setup_module(model)

# --- Weights -----------------------------------------------------------------------------------
t0 = time.perf_counter()
load_checkpoint(fabric, model, chptk_path)
fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)

Loading model from saved_model/last-iter-015000-ckpt.pth
Time to instantiate model: 0.70 seconds.
Time to load the model weights: 0.88 seconds.


In [16]:
def generate_from_prompt(
    prompt: str = "",
    *,
    num_samples: int = 1,
    max_new_tokens: int = 500,
    top_k: int = 200,
    temperature: float = 0.8,
):
    """Generate and print text continuations of a prompt with the loaded model.

    Relies on the notebook-level ``tokenizer``, ``fabric`` and ``model`` objects. The decoded text
    is printed through ``fabric.print``; timing and memory statistics go to stderr. The kv cache is
    not explicitly reset between samples.

    Args:
        prompt (str, optional): Prompt string to be used for generating samples. Defaults to "".
        num_samples (int, optional): Number of samples to be generated. Defaults to 1.
        max_new_tokens (int, optional): Number of generation steps to take. Defaults to 500.
        top_k (int, optional): Number of most likely tokens to consider when sampling. Defaults to 200.
        temperature (float, optional): Controls randomness of the sampling process. Defaults to 0.8.
    """
    prompt_ids = tokenizer.encode(prompt, device=fabric.device)
    n_prompt_tokens = prompt_ids.size(0)
    total_tokens = n_prompt_tokens + max_new_tokens
    with fabric.init_tensor():
        # cap max_seq_length at what this call needs to limit memory usage
        model.max_seq_length = total_tokens

    for run_no in range(1, num_samples + 1):
        started = time.perf_counter()
        output_ids = generate(model, prompt_ids, total_tokens, temperature=temperature, top_k=top_k)
        elapsed = time.perf_counter() - started
        fabric.print(tokenizer.decode(output_ids))
        n_generated = output_ids.size(0) - n_prompt_tokens
        fabric.print(
            f"Time for inference {run_no}: {elapsed:.02f} sec total, {n_generated / elapsed:.02f} tokens/sec",
            file=sys.stderr,
        )
    if fabric.device.type == "cuda":
        fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr)


In [17]:
# Sci-fi prompt sampled with the function's default settings.
generate_from_prompt(
    prompt=(
        "In a galaxy far, far away, an intergalactic council convenes to discuss "
        "the rising cost of lightsaber batteries. Among them is an unlikely "
        "representative: a droid with a penchant for economics..."
    ),
)

In a galaxy far, far away, an intergalactic council convenes to discuss the rising cost of lightsaber batteries. Among them is an unlikely representative: a droid with a penchant for economics...
The sunlight, like a mountain of lithium, is the largest stream. The sky is the same type of storm: one of the stars in the mountain stars is a crucial stream you head to the sky. It was the largest speed of space for stars in the world. The sunlight is then the middle of a rayon. Here is the moonlight on the Earth of summer.
And it is the same type of sky as the sunlight turns out to be the sunseted away. The sky will only be that of the world.
To be sure, viewers are constantly talking about the cold weather and the sunlight, and they're beginning to see the cold weather.
So, you see the sunlight on the moonlight and it reaches the sunlight. There is a massive perimeter, and you look across the sky. The sunlight makes you warm up on the moonlight.
What the way the sun will be is, the sunligh

Time for inference 1: 26.72 sec total, 18.71 tokens/sec
Memory used: 0.79 GB


In [19]:
# Same sci-fi prompt, sampled at a lower temperature.
generate_from_prompt(
    prompt=(
        "In a galaxy far, far away, an intergalactic council convenes to discuss "
        "the rising cost of lightsaber batteries. Among them is an unlikely "
        "representative: a droid with a penchant for economics..."
    ),
    temperature=0.5,
)

In a galaxy far, far away, an intergalactic council convenes to discuss the rising cost of lightsaber batteries. Among them is an unlikely representative: a droid with a penchant for economics...
Very few people have been able to say that the Earth's energy costs are a more expensive, uncommon, and uncommon.
A major change of time, the electrical infrastructure, the electrical infrastructure, and the electrical infrastructure, is a growing threat to the planet.
A strong, sharp, and uncommon sense of power, the electrical infrastructure is a fact that the electrical infrastructure is a major part of the Earth's energy plan.
The scientists agree that the electrical infrastructure is a part of the Earth's energy project, a direct-engineered solution to a more efficient, efficient and efficient solution.
The project is a major concern: a new, uncommon task force at the Earth's energy costs is to make it easier for the electricity of the solar energy sector to produce a solar energy infrast

Time for inference 1: 28.01 sec total, 17.85 tokens/sec
Memory used: 0.79 GB


In [20]:
# Detective-story prompt sampled with the function's default settings.
generate_from_prompt(
    prompt=(
        "As Sherlock Holmes and Dr. Watson enter the world of social media "
        "influencers, they find their first case: the mysterious disappearance "
        "of a famous TikTok star's like button."
    ),
)

As Sherlock Holmes and Dr. Watson enter the world of social media influencers, they find their first case: the mysterious disappearance of a famous TikTok star's like button. At the point of this, a former victim of a former tad, and what the GOP is looking for, I'm not sure what it's happening, but I'm trying to bring a big fight to the world of social media influencers. They are everywhere, but with these two-time friends and spending a good time. They seem to be a 'competent' star to have such diverse channels as the GOP 'makeup' of a single killer.
Still, thank you for posting this series on this show, and be a penn by the way. This is one of the most amazing things in the world. The role of TikTok is a national music festival, the national television community, the media and the world.
We describe how TikTok is the best song on TikTok.
This year, we were a long time, and the band was part of a television show that brought the audience to the right place. Once we began to explore t

Time for inference 1: 6.48 sec total, 77.18 tokens/sec
Memory used: 0.79 GB


As Sherlock Holmes and Dr. Watson enter the world of social media influencers, they find their first case: the mysterious disappearance of a famous TikTok star's like button. So, it's a humiliating and challenging story of the ties, the people who hilariously make the world the way they live. That's why we've found that the camera still does.
(D) In the background of the 2008 novel, the reality show becomes the story of a fictional story of the TikTok star's novel. The ties of the TikTok event are an extremely vulnerable, but it's important to remember that the story of a younger man's imagination is not just a way to understand the story of his own.
It's about the ties between the TikTok star and the world.
While we'll have to make the story of the TikTok star's story, we'll throw in the world of the horror and the world.
The TikTok star is the greatest hero. Despite his story of the tragedy, it's worth a lot of fun to find an out story.
To see, the TikTok star is a fictional story. W

Time for inference 1: 6.26 sec total, 79.84 tokens/sec
Memory used: 0.79 GB
