<a href="https://colab.research.google.com/github/eng-accelerator/ai-accelerator/blob/main/Day_8/day_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and imports

In [None]:
! pip install transformers diffusers pillow gradio torch datasets evaluate accelerate ftfy pyarrow --quiet

In [None]:
import torch
from PIL import Image
import gradio as gr
import requests
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from io import BytesIO

from torch.nn.attention import SDPBackend, sdpa_kernel
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from diffusers import (
    StableDiffusionXLPipeline,
    StableDiffusionPipeline,
    StableDiffusionImg2ImgPipeline,
    StableDiffusionInpaintPipeline,
    EulerDiscreteScheduler,
    EulerAncestralDiscreteScheduler,
    DDIMScheduler,
    LMSDiscreteScheduler,
    DPMSolverMultistepScheduler
)

from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    pipeline,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    infer_device,
    BitsAndBytesConfig,
)

# Pick the device used by the diffusers cells: the GPU when PyTorch can see
# one, otherwise the CPU.
if torch.cuda.is_available():
    torch_device = "cuda"
else:
    torch_device = "cpu"

# Pipelines

### Text generation

In [None]:
# Build a text-generation pipeline without naming a checkpoint (the library
# picks its default model for the task), then complete a short prompt.
generator = pipeline(task="text-generation")
generator("C++ is ")

### Do more with pipeline - explore yourself!

**How to experiment.**


There are two places to experiment: when creating the [pipeline](https://github.com/huggingface/transformers/blob/v4.56.2/src/transformers/pipelines/__init__.py#L637) and when generating the actual output (text, image, etc.).


List of tasks that can be done with pipeline - figure this out. Experiment with each yourself.

In [None]:
from transformers import pipeline

# Knob 1: arguments passed when the pipeline is created (here, the checkpoint).
generator = pipeline(task="text-generation", model="gpt2")

# Knob 2: arguments passed at generation time.
generation_kwargs = {
    "max_new_tokens": 5,        # cap on newly generated tokens per sequence
    "num_return_sequences": 2,  # number of completions to return
}
generator("C++ is", **generation_kwargs)

### Images and pipeline

In [None]:
from transformers import pipeline

# The image-classification pipeline is given an image URL directly below.
cat_image_url = "https://media.istockphoto.com/id/1443562748/photo/cute-ginger-cat.jpg?s=1024x1024&w=is&k=20&c=QaEkKC7lFEBrzzPftMRBVuOZq4FNOnUjOV1VqTmpMfY="

image_classifier = pipeline(
    task="image-classification",
    model="google/vit-base-patch16-224",
)
result = image_classifier(cat_image_url)
print(result)

### Multimodal example

In [None]:
# Visual question answering: the pipeline receives an image together with a
# free-text question about it.
vqa_pipeline = pipeline(
    task="visual-question-answering",
    model="Salesforce/blip-vqa-capfilt-large",
)

# NOTE(review): baby_goat.jpg is opened from the current working directory;
# nothing in this notebook downloads it, so it must be uploaded beforehand.
image = Image.open("baby_goat.jpg")
question = "Is there an elephant?"

# top_k=1 asks for only the single best answer.
vqa_pipeline(image, question, top_k=1)

Gradio app!

In [None]:
def answer_question(image, question):
    """Answer a free-text question about an image with the VQA pipeline.

    Args:
        image: The image to query (the Gradio UI passes a PIL image), or
            ``None`` when nothing has been uploaded.
        question: The question text; may be ``None`` or blank.

    Returns:
        The top answer string from ``vqa_pipeline``, or a short prompt asking
        for the missing input when the image or the question is absent.
    """
    # Guard against a missing image and against a None/blank question, so the
    # callback never raises AttributeError on `None.strip()`.
    if image is None or not question or not question.strip():
        return "Please provide an image and a question."
    outputs = vqa_pipeline(image, question, top_k=1)
    return outputs[0]["answer"]


# Two-column layout: inputs (image, question, button) on the left and the
# answer on the right.
with gr.Blocks() as demo:
    gr.Markdown("# 🖼️ Visual Question Answering with BLIP")

    with gr.Row():
        with gr.Column():
            image_box = gr.Image(type="pil", label="Upload an Image")
            question_box = gr.Textbox(label="Enter your question")
            ask_button = gr.Button("Get Answer")

        with gr.Column():
            answer_box = gr.Textbox(label="Answer")

    # Clicking the button sends both inputs to answer_question and writes the
    # returned string into the answer textbox.
    ask_button.click(
        fn=answer_question,
        inputs=[image_box, question_box],
        outputs=answer_box,
    )

# Start the app with Gradio's debug mode enabled.
demo.launch(debug=True)


# Auto-classes

### AutoTokenizer

In [None]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = [
    "I am learning Operating Systems and it's so fun.",
    "I write terrible C++ code!",
]

# One call does everything: tokenize, pad the batch, truncate to max_length
# and return PyTorch tensors.
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    max_length=15,
    return_tensors="pt",
)
print(inputs)

# The same work in two explicit steps for a single sentence:
# text -> tokens, then tokens -> vocabulary ids.
sequence = raw_inputs[0]
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

print("--------TOKENIZED SENTENCE--------")
print(tokens)

print("\n\n--------TOKENS MAPPED TO THEIR IDS--------")
print(ids)

**TAKE A PAUSE!**

**SPECIAL TOKENS!**

In [None]:
# Ids with no surrounding special tokens (the "already tokenized" case).
plain_ids = [1045, 2572, 4083, 4082, 3001, 1998, 2009, 1005, 1055, 2061, 4569, 1012]
decoded_string = tokenizer.decode(plain_ids)
print(f"DECODING input_ids of ALREADY TOKENIZED INPUT:\n{decoded_string}\n\n")

# The same ids wrapped in 101 ... 102 (the "raw text" case); decoding shows
# which special tokens those two ids stand for.
with_special_ids = [101, *plain_ids, 102]
decoded_string = tokenizer.decode(with_special_ids)
print(f"DECODING input_ids of RAW TOKENIZED INPUT:\n{decoded_string}\n\n")

**Padding - ways to do it**

In [None]:
# `sequences` was not defined anywhere in the notebook; define the batch here
# so this cell runs on a fresh kernel.
sequences = [
    "I am learning Operating Systems and it's so fun.",
    "I write terrible C++ code!",
]

# Will pad the sequences up to the maximum sequence length
model_inputs = tokenizer(sequences, padding="longest")

# Will pad the sequences up to the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, padding="max_length")

# Will pad the sequences up to the specified max length
# (no truncation is requested here, so longer sequences are not shortened)
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)

### Models

In [None]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# 1) Load through AutoModel and inspect the hidden states it returns.
model = AutoModel.from_pretrained(checkpoint)
outputs = model(**inputs)
hidden_shape = outputs.last_hidden_state.shape
print(f"shape of output tensor from distilbert-base-uncased-finetuned-sst-2-english:\n{hidden_shape}\n\n")
# bs, seq_len, dim
# torch.Size([2, 14, 768])


# 2) Load the same checkpoint through the task-specific class, which returns
#    classification logits instead.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
logits = outputs.logits

print("--------SHAPE OF LOGIT TENSOR--------")
print(logits.shape)

print("\n\n--------LOGIT TENSOR--------")
print(logits)


# Logits -> probabilities -> index of the most likely class per example.
probabilities = torch.softmax(logits, dim=-1)
predictions = probabilities.argmax(dim=1)
print("\n\n--------PREDICTIONS--------")
print(predictions)
print(f"labels according to the model config:\n{model.config.id2label}")

# Fast inference

### Precision (dtype)

In [None]:
### Precision (dtype)

# Load the checkpoint in half precision (float16) and let device_map="auto"
# decide where the weights are placed. `dtype=` is used here to match the
# other transformers `from_pretrained` calls in this notebook.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "bigscience/T0pp",
    device_map="auto",
    dtype=torch.float16,
)

### torch.compile + channels last + dynamic compilation

In [None]:
# Inductor compiler flags set before compiling the UNet and VAE decoder.
torch._inductor.config.conv_1x1_as_mm = True
torch._inductor.config.coordinate_descent_tuning = True
torch._inductor.config.epilogue_fusion = False
torch._inductor.config.coordinate_descent_check_all_directions = True

# Named `sdxl_pipeline` rather than `pipeline` so the `pipeline` factory
# imported from transformers is not shadowed for the rest of the notebook.
sdxl_pipeline = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# Switch the UNet and the VAE to channels-last memory format.
sdxl_pipeline.unet.to(memory_format=torch.channels_last)
sdxl_pipeline.vae.to(memory_format=torch.channels_last)

# Compile the UNet and the VAE decode function; dynamic=True requests
# dynamic-shape compilation.
sdxl_pipeline.unet = torch.compile(
    sdxl_pipeline.unet, mode="max-autotune", fullgraph=True, dynamic=True
)
sdxl_pipeline.vae.decode = torch.compile(
    sdxl_pipeline.vae.decode,
    mode="max-autotune",
    fullgraph=True, dynamic=True
)

prompt = "A rabbit in a garden, warm and muted colors, detailed, 8k"
sdxl_pipeline(prompt, num_inference_steps=30).images[0]

### Speculative decoding

The assistant and LLM model must also share the same tokenizer to avoid re-encoding and decoding tokens.

In [None]:
device = infer_device()

# Main model and assistant model both come from the OPT family, and a single
# tokenizer is used for the prompt and for decoding.
main_checkpoint = "facebook/opt-1.3b"
assistant_checkpoint = "facebook/opt-125m"

tokenizer = AutoTokenizer.from_pretrained(main_checkpoint)
inputs = tokenizer("just some sample text", return_tensors="pt").to(device)

model = AutoModelForCausalLM.from_pretrained(main_checkpoint, dtype="auto").to(device)
assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint).to(device)

# Passing `assistant_model` to generate() is what enables assisted decoding.
outputs = model.generate(
    **inputs,
    assistant_model=assistant_model,
    do_sample=True,
    temperature=0.7,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

### Prompt lookup decoding

Works best for input-grounded tasks (e.g. summarization) where there are overlapping words between the input and the output!

In [None]:
device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
inputs = tokenizer("more sample text", return_tensors="pt").to(device)

model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", dtype="auto").to(device)

# No assistant model is loaded here: generate() below is only given
# `prompt_lookup_num_tokens`, so a second model would be downloaded and held
# in memory without ever being used.
outputs = model.generate(**inputs, prompt_lookup_num_tokens=3)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

### Attention

In [None]:
# flash attn 2
quant_config = BitsAndBytesConfig(load_in_8bit=True)

# option 1: pick the attention implementation at load time
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b",
    quantization_config=quant_config,
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)

# option 2: load first, then switch the attention implementation.
# NOTE(review): the method is spelled `set_attn_implementation` in
# transformers; `set_attention_implementation` is expected to raise
# AttributeError - confirm against the installed version.
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b",
    quantization_config=quant_config,
    dtype=torch.bfloat16
)
model.set_attn_implementation("flash_attention_2")



# torch sdpa
device = infer_device()

# Tokenize with Gemma's own tokenizer. Reusing `inputs` from an earlier cell
# would feed ids produced by a different model's tokenizer.
gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
gemma_inputs = gemma_tokenizer("just some sample text", return_tensors="pt").to(device)

# Put the model on the same device as its inputs.
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b",
    dtype=torch.bfloat16,
).to(device)

# Restrict scaled-dot-product attention to the flash-attention backend for
# the duration of this generate() call.
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    outputs = model.generate(**gemma_inputs)

# Accelerate

In [None]:
# The code below is just for demonstration and understanding purposes - it won't run unless a PyTorch model MyModelClass is defined, trained and saved in a file called ckpt_file
#################################### FOCUS HERE ################################

# just a skeleton
# Step 1: instantiate the model inside the init_empty_weights() context.
with init_empty_weights():
    model = MyModelClass(...)

# Step 2: load the checkpoint into the model; device_map="auto" leaves the
# placement of the weights to accelerate.
model = load_checkpoint_and_dispatch(
    model, checkpoint=ckpt_file, device_map="auto"
)

################################################################################

# Named `sample_input` so the builtin `input()` is not shadowed.
sample_input = torch.randn(2, 3)
# Move the input to the device of the model's first parameter; the full
# device object is used so a device index (e.g. cuda:1) is preserved.
first_param_device = next(model.parameters()).device
sample_input = sample_input.to(first_param_device)
output = model(sample_input)


That was PyTorch. What about HF models (using accelerate with the auto-classes API we just studied)?

In [None]:
# this would work
from transformers import AutoModelForSeq2SeqLM

# With the auto-classes, passing device_map="auto" to from_pretrained is all
# that is needed; no explicit init_empty_weights / dispatch calls are written.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "bigscience/T0pp",
    device_map="auto",
)

# Diffusers


In [None]:
# -------------------------------------
# 1. TEXT-TO-IMAGE GENERATION
# -------------------------------------

# Use float16 only on GPU. `torch_device` can fall back to "cpu", and
# diffusers warns that pipelines loaded in float16 cannot run on the CPU.
pipe_dtype = torch.float16 if torch_device == "cuda" else torch.float32

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=pipe_dtype
).to(torch_device)

# Example prompt
prompt = "A majestic lion wearing a crown, photorealistic, 4k, highly detailed"
image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]

display(image)

"""
📝 Parameters explained:
- prompt: text description of the image
- num_inference_steps: how many denoising steps (higher = better quality, slower)
- guidance_scale: how strongly the prompt guides generation (7-8 is common; higher = more prompt fidelity, lower = more creativity)
"""



In [None]:
# -------------------------------------
# 2. IMAGE-TO-IMAGE GENERATION
# -------------------------------------

# Use float16 only on GPU. `torch_device` can fall back to "cpu", and
# diffusers warns that pipelines loaded in float16 cannot run on the CPU.
pipe_dtype = torch.float16 if torch_device == "cuda" else torch.float32

img2img_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=pipe_dtype
).to(torch_device)


url = "https://images.unsplash.com/photo-1480497490787-505ec076689f?q=80&w=2069&auto=format&fit=crop&ixlib=rb-4.1.0&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
# Fail fast on a hung or failed download instead of handing an error page
# to PIL.
response = requests.get(url, timeout=30)
response.raise_for_status()
init_image = Image.open(BytesIO(response.content)).convert("RGB").resize((512, 512))

prompt = "A futuristic city skyline painted on the mountain"
strength = 0.7  # how much noise to add: 0 = almost same as input, 1 = ignore input
num_inference_steps = 40

img2img = img2img_pipe(
    prompt=prompt,
    image=init_image,
    strength=strength,
    num_inference_steps=num_inference_steps,
    guidance_scale=7.5
).images[0]

display(init_image)
display(img2img)

"""
📝 Parameters explained:
- image: input image you want to transform
- strength: controls how much noise is added
    - low strength (0.2-0.4): keeps input structure, small edits
    - high strength (0.7-0.9): more creative, diverges from input
"""



In [None]:
# -------------------------------------
# 3. INPAINTING
# -------------------------------------

# Use float16 only on GPU. `torch_device` can fall back to "cpu", and
# diffusers warns that pipelines loaded in float16 cannot run on the CPU.
pipe_dtype = torch.float16 if torch_device == "cuda" else torch.float32

inpaint_pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=pipe_dtype
).to(torch_device)


url = "https://images.unsplash.com/photo-1480497490787-505ec076689f?q=80&w=2069&auto=format&fit=crop&ixlib=rb-4.1.0&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
# Fail fast on a hung or failed download instead of handing an error page
# to PIL.
response = requests.get(url, timeout=30)
response.raise_for_status()
init_image = Image.open(BytesIO(response.content)).convert("RGB").resize((512, 512))

# Mask: white = repainted from the prompt, black = kept from the input image
# NOTE(review): this URL appears to point at an ordinary photo rather than a
# black/white mask - confirm it is the intended mask image.
mask_url = "https://images.unsplash.com/photo-1573865526739-10659fec78a5?q=80&w=1015&auto=format&fit=crop&ixlib=rb-4.1.0&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
mask_response = requests.get(mask_url, timeout=30)
mask_response.raise_for_status()
mask_image = Image.open(BytesIO(mask_response.content)).convert("RGB").resize((512, 512))

prompt = "Replace the masked area with a cute cat sitting there"
inpaint = inpaint_pipe(
    prompt=prompt,
    image=init_image,
    mask_image=mask_image,
    num_inference_steps=40,
    guidance_scale=7.5
).images[0]

display(init_image)
display(mask_image)
display(inpaint)

"""
📝 Parameters explained:
- image: input image
- mask_image: white = areas repainted by generation, black = preserved areas
- num_inference_steps: steps of denoising (same as before)
- guidance_scale: prompt strength (same as before)
"""


### Schedulers

In [None]:
# =====================================
# 4. EXPLORING DIFFERENT SCHEDULERS
# =====================================



prompt = "A futuristic cyberpunk cityscape at night, neon lights, highly detailed"
num_inference_steps = 30
guidance_scale = 7.5
seed = 0  # every scheduler starts from the same noise, so only the scheduler differs

model_id = "runwayml/stable-diffusion-v1-5"

schedulers = {
    "Euler": EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler"),
    "Euler Ancestral": EulerAncestralDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler"),
    "DDIM": DDIMScheduler.from_pretrained(model_id, subfolder="scheduler"),
    "LMS": LMSDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler"),
    "DPM-Solver++": DPMSolverMultistepScheduler.from_pretrained(model_id, subfolder="scheduler"),
}

# Use float16 only on GPU. `torch_device` can fall back to "cpu", and
# diffusers warns that pipelines loaded in float16 cannot run on the CPU.
pipe_dtype = torch.float16 if torch_device == "cuda" else torch.float32

# Load the pipeline once; only the scheduler changes between runs, so there
# is no need to reload all the model weights on every iteration.
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=pipe_dtype
).to(torch_device)

images = {}

for name, scheduler in schedulers.items():
    # Swap the scheduler on the already-loaded pipeline.
    pipe.scheduler = scheduler

    # Fresh generator with the same seed for each run.
    noise_generator = torch.Generator(device=torch_device).manual_seed(seed)

    print(f"\n\n---------------------------Generating with {name} scheduler---------------------------------")
    image = pipe(
        prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        generator=noise_generator
    ).images[0]

    images[name] = image
    display(image)

"""
📝 Key Notes on Schedulers:

1. Euler:
   - Fast and stable
   - Great default choice
   - Good balance of quality and speed

2. Euler Ancestral:
   - More creative / adds variation
   - Can be less deterministic
   - Useful when you want diverse samples

3. DDIM (Denoising Diffusion Implicit Model):
   - Deterministic (same seed → same output)
   - Faster sampling
   - Slightly less detailed sometimes

4. LMS (Linear Multistep):
   - Sharp results
   - Popular for high-quality outputs
   - Slower than Euler

5. DPM-Solver++:
   - Modern, very efficient
   - High-quality images in fewer steps
   - Good for faster inference with quality

👉 Tip: Try lowering num_inference_steps (e.g. 15) with DPM-Solver++ and compare!
"""
