<a href="https://colab.research.google.com/github/deepshikha04yadav/Text-to-Image/blob/main/text-to-image.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
!git clone https://github.com/deepshikha04yadav/Text-to-Image

fatal: destination path 'Text-to-Image' already exists and is not an empty directory.


In [None]:
# Colab: Install all dependencies
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install diffusers==0.21.0 transformers==4.30.2 accelerate==0.20.3 safetensors==0.3.1 xformers==0.0.20 Pillow==9.5.0 numpy==1.24.4 matplotlib==3.7.2 gradio==4.0.0

In [11]:
import warnings
warnings.filterwarnings("ignore", message="A matching Triton is not available")

import torch
import torch.nn.functional as F
from torch import autocast
import numpy as np
from PIL import Image
import os
import time
import gc
from typing import Optional, Tuple, List
from datetime import datetime
from importlib.metadata import version

from diffusers import (
    StableDiffusionPipeline,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    DPMSolverMultistepScheduler,
    DDIMScheduler,
    LMSDiscreteScheduler
)
import gradio as gr

In [None]:
# Core Stable Diffusion Generator class
class StableDiffusionGenerator:
    def __init__(self, model_id: str = "runwayml/stable-diffusion-v1-5", device: str = "auto"):
        try:
            self.device = self._setup_device(device)
            self.dtype = torch.float16 if self.device.type == "cuda" else torch.float32

            print(f"üöÄ Initializing Stable Diffusion on {self.device}")
            print(f"üìä Using precision: {self.dtype}")

            torch_version = version("torch")
            diffusers_version = version("diffusers")
            print(f"üì¶ PyTorch version: {torch_version}")
            print(f"üì¶ Diffusers version: {diffusers_version}")

            self.pipe = self._load_pipeline(model_id)
            self.current_scheduler = "euler_a"
            self.schedulers = {
                "euler_a": ("Euler Ancestral", "Fast, good for creative images"),
                "euler": ("Euler", "Deterministic, consistent results"),
                "ddim": ("DDIM", "Classic, good quality, slower"),
                "dpm_solver": ("DPM Solver", "High quality, efficient"),
                "lms": ("LMS", "Linear multistep, stable")
            }
            print("‚úÖ Stable Diffusion Generator Ready!")
            print(f"üìù Available Schedulers: {list(self.schedulers.keys())}")
        except Exception as e:
            print(f"‚ùå Initialization Error: {str(e)}")
            print("Please ensure Visual C++ Redistributable 2015-2022 is installed")
            raise

    def _setup_device(self, device: str) -> torch.device:
        if device == "auto":
            if torch.cuda.is_available():
                device = "cuda"
                print(f"üéØ GPU Detected: {torch.cuda.get_device_name(0)}")
                print(f"üíæ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB")
            else:
                device = "cpu"
                print("üíª Using CPU (GPU not available)")
        return torch.device(device)

    def _load_pipeline(self, model_id: str) -> StableDiffusionPipeline:
        try:
            pipe = StableDiffusionPipeline.from_pretrained(
                model_id,
                torch_dtype=self.dtype,
                safety_checker=None,
                requires_safety_checker=False,
            )
            print("üîß Applying Memory Optimizations...")
            pipe.enable_attention_slicing()
            print("  ‚úì Attention Slicing: Enabled")
            pipe.enable_vae_slicing()
            print("  ‚úì VAE Slicing: Enabled")
            try:
                pipe.enable_xformers_memory_efficient_attention()
                print("  ‚úì XFormers Attention: Enabled")
            except Exception as e:
                print(f"  ‚ö† XFormers: Not available ({e})")
            if self.device.type == "cuda":
                try:
                    pipe = pipe.to(self.device)
                    print("  ‚úì Full GPU Loading: Success")
                except RuntimeError as e:
                    print("  ‚ö† GPU Memory Limited: Using CPU Offload")
                    pipe.enable_model_cpu_offload()
            else:
                pipe.enable_sequential_cpu_offload()
                print("  ‚úì CPU Sequential Offload: Enabled")
            return pipe
        except Exception as e:
            raise RuntimeError(f"Failed to load model: {e}")

    def set_scheduler(self, scheduler_name: str) -> bool:
        if scheduler_name not in self.schedulers:
            print(f"‚ùå Unknown scheduler: {scheduler_name}")
            return False
        if scheduler_name == self.current_scheduler:
            return True
        scheduler_map = {
            "euler_a": EulerAncestralDiscreteScheduler,
            "euler": EulerDiscreteScheduler,
            "ddim": DDIMScheduler,
            "dpm_solver": DPMSolverMultistepScheduler,
            "lms": LMSDiscreteScheduler
        }
        try:
            scheduler_class = scheduler_map[scheduler_name]
            self.pipe.scheduler = scheduler_class.from_config(self.pipe.scheduler.config)
            self.current_scheduler = scheduler_name
            name, desc = self.schedulers[scheduler_name]
            print(f"üîÑ Scheduler Changed: {name} ({desc})")
            return True
        except Exception as e:
            print(f"‚ùå Scheduler Error: {e}")
            return False

    def generate_image(
        self,
        prompt: str,
        negative_prompt: str = "",
        width: int = 512,
        height: int = 512,
        num_inference_steps: int = 20,
        guidance_scale: float = 7.5,
        seed: Optional[int] = None,
        scheduler: str = "euler_a"
    ) -> Tuple[Image.Image, dict]:
        if not prompt.strip():
            raise ValueError("Prompt cannot be empty")
        self.set_scheduler(scheduler)
        if seed is None:
            seed = torch.randint(0, 2**32, (1,)).item()
        generator = torch.Generator(device=self.device)
        generator.manual_seed(seed)
        width = (width // 8) * 8
        height = (height // 8) * 8
        print(f"üé® Generating: '{prompt[:50]}...'")
        print(f"üìè Size: {width}x{height}, Steps: {num_inference_steps}, CFG: {guidance_scale}")
        print(f"üé≤ Seed: {seed}, Scheduler: {scheduler}")
        start_time = time.time()
        try:
            with torch.inference_mode():
                if self.device.type == "cuda" and self.dtype == torch.float16:
                    with autocast(self.device.type):
                        result = self.pipe(
                            prompt=prompt,
                            negative_prompt=negative_prompt if negative_prompt else None,
                            width=width,
                            height=height,
                            num_inference_steps=num_inference_steps,
                            guidance_scale=guidance_scale,
                            generator=generator
                        )
                else:
                    result = self.pipe(
                        prompt=prompt,
                        negative_prompt=negative_prompt if negative_prompt else None,
                        width=width,
                        height=height,
                        num_inference_steps=num_inference_steps,
                        guidance_scale=guidance_scale,
                        generator=generator
                    )
            generation_time = time.time() - start_time
            metadata = {
                "prompt": prompt,
                "negative_prompt": negative_prompt,
                "width": width,
                "height": height,
                "steps": num_inference_steps,
                "guidance_scale": guidance_scale,
                "scheduler": scheduler,
                "seed": seed,
                "generation_time": round(generation_time, 2),
                "device": str(self.device),
                "dtype": str(self.dtype)
            }
            print(f"‚úÖ Generated in {generation_time:.2f}s")
            return result.images[0], metadata
        except torch.cuda.OutOfMemoryError:
            self._cleanup_memory()
            raise RuntimeError(
                "GPU Out of Memory! Try: reducing image size, fewer steps, "
                "or use CPU mode. Current settings may be too demanding."
            )
        except Exception as e:
            raise RuntimeError(f"Generation failed: {str(e)}")
        finally:
            self._cleanup_memory()

    def _cleanup_memory(self):
        gc.collect()
        if self.device.type == "cuda":
            torch.cuda.empty_cache()

    def get_memory_usage(self) -> dict:
        memory_info = {}
        if self.device.type == "cuda":
            memory_info = {
                "allocated_gb": torch.cuda.memory_allocated() / 1024**3,
                "reserved_gb": torch.cuda.memory_reserved() / 1024**3,
                "max_allocated_gb": torch.cuda.max_memory_allocated() / 1024**3,
                "total_gb": torch.cuda.get_device_properties(0).total_memory / 1024**3
            }
        else:
            memory_info = {"device": "cpu", "note": "CPU memory tracking not available"}
        return memory_info

    def save_image(self, image: Image.Image, metadata: dict, output_dir: str = "outputs") -> str:
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"sd_gen_{timestamp}_s{metadata['seed']}_{metadata['width']}x{metadata['height']}.png"
        filepath = os.path.join(output_dir, filename)
        image.save(filepath)
        metadata_file = filepath.replace('.png', '_metadata.txt')
        with open(metadata_file, 'w') as f:
            f.write("Stable Diffusion Generation Metadata\n")
            f.write("=" * 40 + "\n")
            for key, value in metadata.items():
                f.write(f"{key}: {value}\n")
        print(f"üíæ Saved: {filepath}")
        return filepath

In [None]:

from utils.text_preprocessing import AdvancedTextPreprocessor

prompt = "a cyberpunk city at night with neon lights"
# self.preprocessor = AdvancedTextPreprocessor()
preprocessor = AdvancedTextPreprocessor()
result = preprocessor.preprocess_for_stable_diffusion(prompt)

print(result["prompt"])
print(result["negative_prompt"])

a cyberpunk city at night with neon lights, highly detailed, high quality, sharp focus
low quality, blurry, bad anatomy, worst quality, low resolution, bad proportions, jpeg artifacts, ugly, deformed, distorted, disfigured


In [None]:
# Gradio UI class for Stable Diffusion
class StableDiffusionUI:
    def __init__(self):
        self.generator = None
        self.gallery_images = []
        self.generation_history = []

    def initialize_generator(self, model_choice: str, device_choice: str) -> str:
        try:
            model_map = {
                "Stable Diffusion 1.5 (Recommended)": "runwayml/stable-diffusion-v1-5",
                "Stable Diffusion 2.1": "stabilityai/stable-diffusion-2-1",
                "Realistic Vision (RealVisXL)": "SG161222/RealVisXL_V4.0"
            }
            device_map = {
                "Auto (Recommended)": "auto",
                "GPU (CUDA)": "cuda",
                "CPU (Slower)": "cpu"
            }
            model_id = model_map.get(model_choice, "runwayml/stable-diffusion-v1-5")
            device = device_map.get(device_choice, "auto")
            self.generator = StableDiffusionGenerator(model_id=model_id, device=device)
            memory_info = self.generator.get_memory_usage()
            memory_text = f"Memory Usage: {memory_info}" if memory_info else "Ready!"
            return f"‚úÖ Model loaded successfully!\n{memory_text}"
        except Exception as e:
            return f"‚ùå Initialization failed: {str(e)}"

    def generate_image(
        self,
        prompt: str,
        negative_prompt: str,
        width: int,
        height: int,
        steps: int,
        guidance: float,
        scheduler: str,
        seed: int,
        save_image: bool
    ) -> Tuple[Optional[Image.Image], str, str]:
        if self.generator is None:
            return None, "‚ùå Please initialize the model first!", ""
        if not prompt.strip():
            return None, "‚ùå Please enter a prompt!", ""
        try:
            seed = None if seed == -1 else int(seed)
            image, metadata = self.generator.generate_image(
                prompt=prompt,
                negative_prompt=negative_prompt,
                width=width,
                height=height,
                num_inference_steps=steps,
                guidance_scale=guidance,
                scheduler=scheduler,
                seed=seed
            )
            info_text = self._format_generation_info(metadata)
            saved_path = ""
            if save_image:
                saved_path = self.generator.save_image(image, metadata)
            self.generation_history.append(metadata)
            self.gallery_images.append(image)
            if len(self.gallery_images) > 10:
                self.gallery_images = self.gallery_images[-10:]
                self.generation_history = self.generation_history[-10:]
            return image, info_text, saved_path
        except Exception as e:
            return None, f"‚ùå Generation failed: {str(e)}", ""

    def _format_generation_info(self, metadata: dict) -> str:
        return f"""
‚úÖ Generation Complete!

üéØ **Parameters Used:**
‚Ä¢ Prompt: {metadata['prompt'][:100]}{'...' if len(metadata['prompt']) > 100 else ''}
‚Ä¢ Size: {metadata['width']} √ó {metadata['height']} pixels
‚Ä¢ Steps: {metadata['steps']} (more steps = higher quality, slower)
‚Ä¢ Guidance Scale: {metadata['guidance_scale']} (higher = follows prompt more closely)
‚Ä¢ Scheduler: {metadata['scheduler']}
‚Ä¢ Seed: {metadata['seed']} (for reproducible results)

‚è±Ô∏è **Performance:**
‚Ä¢ Generation Time: {metadata['generation_time']}s
‚Ä¢ Device: {metadata['device']}
‚Ä¢ Precision: {metadata['dtype']}
"""

    def get_example_prompts(self) -> list:
        return [
            ["a serene mountain landscape at sunrise, photorealistic, highly detailed", "blurry, low quality"],
            ["portrait of a wise old wizard, fantasy art, digital painting", "ugly, deformed"],
            ["cyberpunk cityscape at night, neon lights, futuristic", "daytime, bright"],
            ["cute cartoon cat wearing a hat, kawaii style", "realistic, scary"],
            ["abstract geometric patterns, colorful, modern art", "representational, dull colors"]
        ]

    def show_scheduler_info(self, scheduler: str) -> str:
        scheduler_info = {
            "euler_a": "**Euler Ancestral**: Fast and creative, adds slight randomness for variety",
            "euler": "**Euler**: Deterministic and consistent, same seed = same result",
            "ddim": "**DDIM**: Classic scheduler, high quality but slower",
            "dpm_solver": "**DPM Solver**: Efficient high-quality generation",
            "lms": "**LMS**: Linear multistep, very stable results"
        }
        return scheduler_info.get(scheduler, "Scheduler information not available")

    def get_memory_info(self) -> str:
        if self.generator is None:
            return "Model not loaded"
        try:
            memory_info = self.generator.get_memory_usage()
            if 'allocated_gb' in memory_info:
                return f"""
GPU Memory Usage:
‚Ä¢ Allocated: {memory_info['allocated_gb']:.2f}GB
‚Ä¢ Reserved: {memory_info['reserved_gb']:.2f}GB
‚Ä¢ Total Available: {memory_info['total_gb']:.2f}GB
‚Ä¢ Usage: {(memory_info['allocated_gb']/memory_info['total_gb']*100):.1f}%
                """
            else:
                return "CPU mode - memory tracking not available"
        except:
            return "Memory info unavailable"

    def create_interface(self) -> gr.Blocks:
        with gr.Blocks(
            title="üé® Educational Stable Diffusion Generator",
            theme=gr.themes.Soft()
        ) as interface:
            gr.Markdown("""
            # üé® Educational Stable Diffusion Text-to-Image Generator
            **Learn Generative AI concepts while creating images!**
            """)
            with gr.Tab("üöÄ Setup & Generation"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### üîß Model Setup")
                        model_choice = gr.Dropdown(
                            choices=[
                                "Stable Diffusion 1.5 (Recommended)",
                                "Stable Diffusion 2.1",
                                "Realistic Vision (RealVisXL)"
                            ],
                            value="Stable Diffusion 1.5 (Recommended)",
                            label="Model Selection"
                        )
                        device_choice = gr.Dropdown(
                            choices=[
                                "Auto (Recommended)",
                                "GPU (CUDA)",
                                "CPU (Slower)"
                            ],
                            value="Auto (Recommended)",
                            label="Device Selection"
                        )
                        init_btn = gr.Button("üöÄ Initialize Model", variant="primary")
                        init_status = gr.Textbox(
                            label="Initialization Status",
                            placeholder="Click Initialize Model to start",
                            lines=3
                        )
                    with gr.Column():
                        gr.Markdown("### üìä System Info")
                        memory_btn = gr.Button("üìä Check Memory Usage")
                        memory_info = gr.Textbox(
                            label="Memory Information",
                            placeholder="Click to check memory usage",
                            lines=6
                        )
                gr.Markdown("### ‚ú® Image Generation")
                with gr.Row():
                    with gr.Column():
                        prompt = gr.Textbox(
                            label="üéØ Prompt (Describe what you want)",
                            placeholder="a beautiful landscape painting, oil on canvas, detailed",
                            lines=3
                        )
                        negative_prompt = gr.Textbox(
                            label="üö´ Negative Prompt (What to avoid)",
                            placeholder="blurry, low quality, bad anatomy",
                            lines=2
                        )
                        generate_btn = gr.Button("üé® Generate Image", variant="primary", size="lg")
                    with gr.Column():
                        with gr.Accordion("üîß Advanced Settings", open=True):
                            with gr.Row():
                                width = gr.Slider(256, 1024, 512, step=64, label="Width")
                                height = gr.Slider(256, 1024, 512, step=64, label="Height")
                            with gr.Row():
                                steps = gr.Slider(10, 100, 20, step=1, label="Inference Steps")
                                guidance = gr.Slider(1.0, 20.0, 7.5, step=0.5, label="Guidance Scale")
                            scheduler = gr.Dropdown(
                                choices=["euler_a", "euler", "ddim", "dpm_solver", "lms"],
                                value="euler_a",
                                label="Scheduler"
                            )
                            scheduler_info = gr.Textbox(
                                label="Scheduler Information",
                                interactive=False,
                                lines=2
                            )
                            with gr.Row():
                                seed = gr.Number(-1, label="Seed")
                                save_image = gr.Checkbox(True, label="üíæ Save Generated Images")
                with gr.Row():
                    output_image = gr.Image(label="üñºÔ∏è Generated Image", type="pil")
                with gr.Row():
                    generation_info = gr.Textbox(
                        label="üìù Generation Information",
                        lines=10,
                        interactive=False
                    )
                    saved_path = gr.Textbox(
                        label="üíæ Saved File Path",
                        interactive=False
                    )
            with gr.Tab("üìö Learning Resources"):
                gr.Markdown("""
                ## üß† Understanding Stable Diffusion
                ### What is Diffusion?
                Diffusion models learn to gradually remove noise from random data.
                ### Key Components:
                **üéØ CLIP (Text Encoder)**
                **üßÆ U-Net (Denoising Network)**
                **üé® VAE (Variational Autoencoder)**
                **‚öôÔ∏è Schedulers**
                ### Parameter Guide:
                **Steps (10-100)**: More steps = higher quality but slower generation
                **Guidance Scale (1-20)**: Higher values make the AI follow your prompt more strictly
                **Seed**: Controls randomness - same seed + settings = same image
                **Resolution**: Higher resolution = more detail but needs more GPU memory
                """)
            with gr.Tab("üñºÔ∏è Examples & Gallery"):
                gr.Markdown("### üé® Example Prompts to Try")
                examples = gr.Examples(
                    examples=self.get_example_prompts(),
                    inputs=[prompt, negative_prompt],
                    label="Click any example to load it"
                )
                gr.Markdown("### üñºÔ∏è Recent Generations")
                gallery = gr.Gallery(
                    value=[],
                    label="Your Generated Images",
                    show_label=True,
                    elem_id="gallery",
                    columns=3,
                    rows=2,
                    object_fit="contain",
                    height="auto"
                )
            # Event handlers
            init_btn.click(
                fn=self.initialize_generator,
                inputs=[model_choice, device_choice],
                outputs=init_status
            )
            generate_btn.click(
                fn=self.generate_image,
                inputs=[prompt, negative_prompt, width, height, steps, guidance, scheduler, seed, save_image],
                outputs=[output_image, generation_info, saved_path]
            ).then(
                fn=lambda: self.gallery_images,
                outputs=gallery
            )
            scheduler.change(
                fn=self.show_scheduler_info,
                inputs=scheduler,
                outputs=scheduler_info
            )
            memory_btn.click(
                fn=self.get_memory_info,
                outputs=memory_info
            )
        return interface

In [None]:
# Launch the Gradio interface
ui = StableDiffusionUI()
interface = ui.create_interface()
interface.launch(
    share=True,  # Set to True for public sharing
    server_name="0.0.0.0",
    server_port=7860,
    debug=False,
    show_error=True
)

In [14]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("thedevastator/rsicd-image-caption-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'rsicd-image-caption-dataset' dataset.
Path to dataset files: /kaggle/input/rsicd-image-caption-dataset


In [20]:
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "data/"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "thedevastator/rsicd-image-caption-dataset",
  file_path)

print("First 5 records:", df.head())


ValueError: Unsupported file extension: ''. Supported file extensions are: .csv, .tsv, .json, .jsonl, .xml, .parquet, .feather, .sqlite, .sqlite3, .db, .db3, .s3db, .dl3, .xls, .xlsx, .xlsm, .xlsb, .odf, .ods, .odt

In [None]:
from torch.utils.data import Dataset
from PIL import Image
import os

class TextImageDataset(Dataset):
    def __init__(self, image_dir, captions_dict, transform=None):
        self.image_dir = image_dir
        self.captions = captions_dict  # {filename: caption}
        self.image_files = list(captions_dict.keys())
        self.transform = transform

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_name = self.image_files[idx]
        img_path = os.path.join(self.image_dir, img_name)

        image = Image.open(img_path).convert("RGB")
        caption = self.captions[img_name]

        if self.transform:
            image = self.transform(image)

        return image, caption


In [None]:
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

dataset = TextImageDataset(
    image_dir="data/images",
    captions_dict=your_caption_dictionary,
    transform=transform
)

train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=2
)

In [None]:
from models.attention_gan import AttentionTextToImageGAN, weights_init
from models.attention_trainer import AttentionGANTrainer
from utils.text_embedding import TextEmbedder

# Create model
model = AttentionTextToImageGAN(
    latent_dim=100,
    text_embedding_dim=768,
    num_attention_blocks=2,
    attention_heads=8
)

# Initialize
model.generator.apply(weights_init)
model.discriminator.apply(weights_init)

# Train
embedder = TextEmbedder()
trainer = AttentionGANTrainer(model, embedder)
trainer.train(train_loader, num_epochs=100)

In [None]:
!python example_usage.py

ATTENTION-BASED TEXT-TO-IMAGE GAN - COMPREHENSIVE EXAMPLE

PART 1: ATTENTION MECHANISMS EXPLAINED

üß† ATTENTION MECHANISMS IN OUR GAN:

1. SELF-ATTENTION (SAGAN-style)
   Purpose: Allow pixels to attend to all other pixels
   Benefit: Captures long-range dependencies
   
   Example: When generating a dog
   - Without: Ears and tail might not match body
   - With: All body parts are coherent
   
   Applied at: 8x8, 32x32 resolutions

2. CROSS-ATTENTION (Image ‚Üî Text)
   Purpose: Image regions attend to relevant text features
   Benefit: Better text-image alignment
   
   Example: "red car on left, blue house on right"
   - Without: Colors and positions might be mixed
   - With: Left region focuses on "red car" text
          Right region focuses on "blue house" text
   
   Applied at: 16x16, plus in residual blocks

3. CBAM (Channel + Spatial Attention)
   Purpose: Focus on important channels and locations
   Benefit: Better detail and feature emphasis
   
   Example: Generating sun