In [1]:
!pip install --upgrade parallel_llm==0.6.25

Collecting parallel_llm==0.6.25
  Downloading parallel_llm-0.6.25-py3-none-any.whl.metadata (35 kB)
Collecting protobuf<4.0.0,>=3.20.0 (from parallel_llm==0.6.25)
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.2.0->parallel_llm==0.6.25)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.2.0->parallel_llm==0.6.25)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.2.0->parallel_llm==0.6.25)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.2.0->parallel_llm==0.6.25)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
"""
Training script for Unimodal (Text-only) Parallel-LLM using WikiText-2.
Demonstrates distributed training with automatic batch size scaling.
"""
import os
import sys

# Quieten TensorFlow/CUDA start-up logging; this has to happen before the ML
# libraries below are imported.
os.environ.update(
    TF_CPP_MIN_LOG_LEVEL='3',
    TF_ENABLE_ONEDNN_OPTS='0',
)

import warnings

# Hide DeprecationWarning noise for the rest of the session
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, DistributedSampler
from transformers import AutoTokenizer
from datasets import load_dataset
from itertools import islice
from parallel_llm.core import DiffusionTransformer, ModelConfig
from parallel_llm.training import DistributedTrainer, TrainingConfig
from parallel_llm.utils import TextDataset

def setup_distributed():
    """Initialise torch.distributed when started by a launcher and return the local rank.

    A distributed launch is detected through the ``LOCAL_RANK`` environment
    variable. When it is absent nothing is initialised and 0 is returned, so
    the caller treats this process as the main one.

    Returns:
        int: The value of ``LOCAL_RANK``, or 0 outside a distributed launch.
    """
    if "LOCAL_RANK" not in os.environ:
        return 0  # DistributedTrainer will handle multi-GPU auto-detection

    local_rank = int(os.environ["LOCAL_RANK"])
    use_cuda = torch.cuda.is_available()

    if use_cuda:
        # Bind the device before creating the group so NCCL uses the right GPU
        torch.cuda.set_device(local_rank)

    # Re-running the cell in the same kernel must not initialise the group twice
    if not dist.is_initialized():
        # NCCL needs CUDA devices; gloo is the backend that also works on CPU
        dist.init_process_group(backend="nccl" if use_cuda else "gloo")

    return local_rank

def main():
    """Train a small text-only DiffusionTransformer on a WikiText-2 subset.

    The function reports the visible GPUs, initialises the distributed
    environment, builds the model and training configs, loads the first 1000
    non-empty WikiText-2 records, and runs ``DistributedTrainer.train()``.
    Progress messages are printed only when the local rank is 0.
    """
    print("="*60)
    print("Parallel-LLM Unimodal Training Example (WikiText-2)")
    print("="*60)

    # Detect available GPUs
    cuda_available = torch.cuda.is_available()
    num_gpus = torch.cuda.device_count() if cuda_available else 0
    if num_gpus > 0:
        print(f"\n🎮 Detected {num_gpus} GPU(s)")
        for i in range(num_gpus):
            gpu_name = torch.cuda.get_device_name(i)
            print(f"   GPU {i}: {gpu_name}")
    else:
        print("\n💻 No GPU detected, using CPU")

    # Ask about bf16 only when CUDA is present: on some PyTorch releases the
    # query inspects the current CUDA device and fails on CPU-only hosts.
    use_bf16 = cuda_available and torch.cuda.is_bf16_supported()

    # 1. Setup Distributed Environment
    local_rank = setup_distributed()
    is_main_process = local_rank == 0

    if is_main_process:
        print("\nStarting Unimodal Training")

    # 2. Configuration
    # Model Configuration (GPU-friendly size)
    model_config = ModelConfig(
        vocab_size=32000,      # Will be updated after loading tokenizer
        hidden_size=768,       # Reduced from 2048
        num_hidden_layers=12,  # Reduced from 22
        num_attention_heads=12, # Reduced from 32
        num_diffusion_steps=10, # Reduced from 64
        use_flash_attention=cuda_available,
        dtype=torch.bfloat16 if use_bf16 else torch.float32
    )

    # Training Configuration
    num_train_steps = 100
    train_config = TrainingConfig(
        output_dir="./checkpoints/unimodal_wikitext",
        num_train_steps=num_train_steps,
        batch_size=4,  # Adjust based on VRAM (4 fits on 16GB with grad checkpointing)
        learning_rate=3e-4,
        # Warm up over the first 10% of the run; a warm-up as long as the run
        # itself would mean the peak learning rate is never used.
        warmup_steps=max(1, num_train_steps // 10),
        use_fsdp=False, # Enable if multiple GPUs available
        mixed_precision="bf16" if use_bf16 else "fp16",
        gradient_checkpointing=True, # Save memory
        logging_steps=10,
        # Keep the interval within the run length so a periodic checkpoint can
        # be written (NOTE(review): confirm whether the trainer also saves at the end).
        save_steps=min(500, num_train_steps),
        eval_steps=200,
        use_torch_compile=True
    )

    # 3. Data Preparation
    if is_main_process:
        print("\n[Step 1/5] Loading tokenizer (TinyLlama)...")

    tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    tokenizer.pad_token = tokenizer.eos_token
    model_config.vocab_size = tokenizer.vocab_size

    if is_main_process:
        print(f"✓ Tokenizer loaded: {tokenizer.vocab_size:,} tokens")

    if is_main_process:
        print("\n[Step 2/5] Loading WikiText-2 dataset...")

    # Load streaming to avoid memory issues
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train", streaming=True)

    # Take a subset for this example. The raw WikiText-2 split contains blank
    # separator lines; skip them so no sample consists of padding only.
    non_empty_rows = (row for row in dataset if row["text"].strip())
    train_data = list(islice(non_empty_rows, 1000))

    if is_main_process:
        print(f"✓ Dataset loaded: {len(train_data)} training samples")

    train_dataset = TextDataset(
        dataset=train_data,
        tokenizer=tokenizer,
        max_length=512
    )

    sampler = None
    if "LOCAL_RANK" in os.environ and dist.is_initialized():
        sampler = DistributedSampler(train_dataset)

    # DataLoader rejects shuffle=True together with a sampler or an iterable
    # dataset, so shuffle only in the plain map-style, single-process case.
    shuffle = sampler is None and not isinstance(
        train_dataset, torch.utils.data.IterableDataset
    )

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=train_config.batch_size,
        sampler=sampler,
        shuffle=shuffle,
        num_workers=2,
        pin_memory=cuda_available  # pinning is only useful for CUDA transfers
    )

    # 4. Initialize Model
    if is_main_process:
        print("\n[Step 3/5] Initializing DiffusionTransformer model (may take 30-60s)...")

    model = DiffusionTransformer(model_config)

    if is_main_process:
        num_params = sum(p.numel() for p in model.parameters()) / 1e6
        print(f"✓ Model initialized: {num_params:.1f}M parameters")

    # 5. Setup Distributed Trainer
    if is_main_process:
        print("\n[Step 4/5] Setting up DistributedTrainer...")

    trainer = DistributedTrainer(
        model=model,
        train_config=train_config,
        model_config=model_config,
        train_dataloader=train_dataloader
    )

    # 6. Start Training
    if is_main_process:
        print("\n[Step 5/5] Starting training loop...")
        print("="*60)

    try:
        trainer.train()
    finally:
        # Release the process group created by setup_distributed(), if any
        if "LOCAL_RANK" in os.environ and dist.is_initialized():
            dist.destroy_process_group()

    if is_main_process:
        print("Training complete!")

if __name__ == "__main__":
    main()


In [None]:
"""
Inference script for Unimodal Parallel-LLM using the TinyLlama tokenizer.
This script demonstrates the parallel token generation API with a reduced-size model
(hidden size 768, 12 layers; smaller than TinyLlama-1.1B) that fits comfortably on a Tesla P100 (16GB).
"""
import sys
import os
import torch
from transformers import AutoTokenizer
from parallel_llm.core import DiffusionTransformer, ModelConfig
from parallel_llm.inference import ParallelGenerator, GenerationConfig

def main():
    """Generate text from a prompt with a freshly initialised DiffusionTransformer.

    The function loads the TinyLlama tokenizer (falling back to the gpt2
    tokenizer if that fails), builds a reduced-size model on the available
    device, configures a ``ParallelGenerator``, runs a short warm-up call and
    then prints the decoded generation. No checkpoint is loaded, so the model
    weights are whatever ``DiffusionTransformer(config)`` initialises them to.
    """
    print("="*60)
    print("Parallel-LLM Unimodal Inference Example (TinyLlama-1.1B Config)")
    print("="*60)

    # Check for CUDA
    cuda_available = torch.cuda.is_available()
    device = "cuda" if cuda_available else "cpu"
    print(f"Using device: {device}")

    if cuda_available:
        props = torch.cuda.get_device_properties(device)
        print(f"GPU: {props.name} | VRAM: {props.total_memory / 1024**3:.2f} GB")

    # Ask about bf16 only when CUDA is present: on some PyTorch releases the
    # query inspects the current CUDA device and fails on CPU-only hosts.
    use_bf16 = cuda_available and torch.cuda.is_bf16_supported()

    # 1. Load Tokenizer (Real-world tokenizer)
    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    print(f"\n[1/4] Loading tokenizer from {model_id}...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
    except Exception as e:
        # Best-effort fallback; note that the vocabulary (and therefore the
        # model's vocab_size below) changes with the tokenizer.
        print(f"Failed to load tokenizer: {e}")
        print("Fallback to gpt2 tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # 2. Initialize Model (Reduced size for GPU compatibility)
    print("\n[2/4] Initializing DiffusionTransformer with GPU-friendly config...")
    # Smaller config than TinyLlama-1.1B; the actual parameter count is printed below
    config = ModelConfig(
        vocab_size=tokenizer.vocab_size,
        hidden_size=768,            # Reduced from 2048
        num_hidden_layers=12,       # Reduced from 22
        num_attention_heads=12,     # Reduced from 32
        num_diffusion_steps=10,     # Reduced from 64
        use_flash_attention=cuda_available,
        dtype=torch.bfloat16 if use_bf16 else torch.float32
    )

    model = DiffusionTransformer(config)
    model.to(device)
    model.eval()

    print(f"Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M")

    # 3. Configure Generation
    print("\n[3/4] Configuring Parallel Generation...")
    gen_config = GenerationConfig(
        max_new_tokens=128,
        num_parallel_tokens=64,  # Generate 64 tokens in parallel
        num_refinement_steps=5,
        temperature=0.8,
        top_k=50,
        repetition_penalty=1.2,  # Prevent token repetition
        use_adaptive_steps=True,
        use_torch_compile=cuda_available  # Enable torch.compile for speed
    )

    generator = ParallelGenerator(
        model=model,
        config=gen_config,
        use_kv_cache=True,
        use_cuda_graphs=cuda_available
    )

    # 4. Run Inference
    print("\n[4/4] Running Inference...")
    prompt = "The future of artificial intelligence is"
    print(f"Prompt: {prompt}")

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Warmup
    print("Warming up...")
    with torch.no_grad():
        _ = generator.generate(input_ids, max_new_tokens=10, use_autoregressive=True)

    # Actual generation
    print("Generating...")
    print("Mode: Autoregressive (generates tokens one-by-one for coherent output)")
    with torch.no_grad():
        output_ids = generator.generate(input_ids, use_autoregressive=True)

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print("\nGenerated Text:")
    print("-" * 40)
    print(output_text)
    print("-" * 40)
    # The model above is never restored from a checkpoint, so tell the reader
    # why the text is incoherent instead of leaving that unexplained.
    # TODO: load trained weights here once the checkpoint format is confirmed.
    print("\nNote: this script does not load a checkpoint; the model uses randomly "
          "initialized weights, so the generated text is not meaningful.")

if __name__ == "__main__":
    main()


In [3]:
"""
Training script for Multimodal (Text + Image) Parallel-LLM using Conceptual Captions.
Demonstrates training a model that can handle both text and image inputs using real-world data.
"""
import os
import sys

# Silence third-party logging and warnings; the environment variables are set
# before the libraries that read them are imported further down.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',                   # TensorFlow logging
    'TF_ENABLE_ONEDNN_OPTS': '0',                  # oneDNN off
    'CUDA_LAUNCH_BLOCKING': '0',                   # keep CUDA launches asynchronous
    'WANDB_SILENT': 'true',                        # WandB output
    'TRANSFORMERS_NO_ADVISORY_WARNINGS': 'true',   # transformers advisory warnings
})

import warnings

# Blanket suppression first ...
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# ... followed by the same per-category and per-message filters as before
for _category in (DeprecationWarning, FutureWarning, UserWarning):
    warnings.filterwarnings('ignore', category=_category)
for _pattern in ('.*protobuf.*', '.*pydantic.*', '.*Fast image processor.*'):
    warnings.filterwarnings('ignore', message=_pattern)
del _category, _pattern

import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, DistributedSampler
from transformers import AutoTokenizer, AutoImageProcessor
from datasets import load_dataset
from parallel_llm.core import DiffusionTransformer, MultimodalConfig
from parallel_llm.training import DistributedTrainer, TrainingConfig
from parallel_llm.utils import MultimodalDataset

def setup_distributed():
    """Initialise torch.distributed when started by a launcher and return the local rank.

    A distributed launch is detected through the ``LOCAL_RANK`` environment
    variable. When it is absent a notice is printed, nothing is initialised
    and 0 is returned, so the caller treats this process as the main one.

    Returns:
        int: The value of ``LOCAL_RANK``, or 0 outside a distributed launch.
    """
    if "LOCAL_RANK" not in os.environ:
        print("Not running in distributed mode. Using single GPU/CPU.")
        return 0

    local_rank = int(os.environ["LOCAL_RANK"])
    use_cuda = torch.cuda.is_available()

    if use_cuda:
        # Bind the device before creating the group so NCCL uses the right GPU
        torch.cuda.set_device(local_rank)

    # Re-running the cell in the same kernel must not initialise the group twice
    if not dist.is_initialized():
        # NCCL needs CUDA devices; gloo is the backend that also works on CPU
        dist.init_process_group(backend="nccl" if use_cuda else "gloo")

    return local_rank

def main():
    """Train a reduced-size multimodal DiffusionTransformer on Conceptual Captions.

    The function initialises the distributed environment, builds the
    multimodal model config and the training config, streams the first 2000
    Conceptual Captions records into a ``MultimodalDataset``, and runs
    ``DistributedTrainer.train()``. Progress messages are printed only when
    the local rank is 0.
    """
    print("="*60)
    print("Parallel-LLM Multimodal Training Example (Conceptual Captions)")
    print("="*60)

    local_rank = setup_distributed()
    is_main_process = local_rank == 0

    if is_main_process:
        print("Starting Multimodal Training")

    # Ask about bf16 only when CUDA is present: on some PyTorch releases the
    # query inspects the current CUDA device and fails on CPU-only hosts.
    cuda_available = torch.cuda.is_available()
    use_bf16 = cuda_available and torch.cuda.is_bf16_supported()

    # 1. Multimodal Configuration (GPU-friendly size)
    model_config = MultimodalConfig(
        # Text parameters (reduced size)
        vocab_size=32000,
        hidden_size=768,       # Reduced from 2048
        num_hidden_layers=12,  # Reduced from 22

        # Vision parameters (smaller ViT)
        vision_encoder="vit",
        image_size=224,
        patch_size=16,
        vision_hidden_size=384,  # Reduced from 768

        # Fusion parameters
        fusion_type="cross_attention",
        num_cross_attention_layers=4,

        # Training objectives
        use_contrastive=True,
        contrastive_temperature=0.07,

        # Same value the multimodal inference cell uses, so the trained model
        # and the inference model are configured alike.
        num_diffusion_steps=10,  # Reduced from 64

        dtype=torch.bfloat16 if use_bf16 else torch.float32
    )

    train_config = TrainingConfig(
        output_dir="./checkpoints/multimodal_cc",
        num_train_steps=1000,
        batch_size=2,  # Smaller batch size for multimodal
        learning_rate=1e-4,
        warmup_steps=100,
        mixed_precision="bf16" if use_bf16 else "fp16",
        gradient_checkpointing=True,
        use_fsdp=False,
        logging_steps=10,
        save_steps=500,
        eval_steps=200,
        use_torch_compile=True
    )

    # 2. Data Preparation
    if is_main_process:
        print("Loading processors and dataset...")

    tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    tokenizer.pad_token = tokenizer.eos_token
    model_config.vocab_size = tokenizer.vocab_size

    image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

    # Load Conceptual Captions (streaming) and keep a small subset for the demo.
    # Note: the records carry image URLs, so images have to be downloaded,
    # which can be slow/flaky. This script does not filter or validate images
    # itself; fetching is left to MultimodalDataset (see image_column below).
    dataset = load_dataset("conceptual_captions", split="train", streaming=True)
    dataset = dataset.take(2000) # Take a small subset for demo start

    train_dataset = MultimodalDataset(
        dataset=dataset,
        tokenizer=tokenizer,
        image_processor=image_processor,
        text_column="caption",
        image_column="image_url", # MultimodalDataset handles URL downloading if column is URL
        max_length=128
    )

    sampler = None
    if "LOCAL_RANK" in os.environ and dist.is_initialized():
        sampler = DistributedSampler(train_dataset)

    # DataLoader rejects shuffle=True together with a sampler or an iterable
    # dataset, so shuffle only in the plain map-style, single-process case.
    shuffle = sampler is None and not isinstance(
        train_dataset, torch.utils.data.IterableDataset
    )

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=train_config.batch_size,
        sampler=sampler,
        shuffle=shuffle,
        num_workers=4,
        pin_memory=cuda_available  # pinning is only useful for CUDA transfers
    )

    # 3. Model & Trainer
    if is_main_process:
        print("Initializing Multimodal DiffusionTransformer...")

    model = DiffusionTransformer(model_config)

    trainer = DistributedTrainer(
        model=model,
        train_config=train_config,
        model_config=model_config,
        train_dataloader=train_dataloader
    )

    # 4. Train
    if is_main_process:
        print("Starting training...")

    try:
        trainer.train()
    finally:
        # Release the process group created by setup_distributed(), if any
        if "LOCAL_RANK" in os.environ and dist.is_initialized():
            dist.destroy_process_group()

if __name__ == "__main__":
    main()


E0000 00:00:1763821281.702159      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763821281.816027      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Parallel-LLM Multimodal Training Example (Conceptual Captions)
Not running in distributed mode. Using single GPU/CPU.
Starting Multimodal Training
Loading processors and dataset...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


README.md: 0.00B [00:00, ?B/s]

Initializing Multimodal DiffusionTransformer...


<IPython.core.display.Javascript object>

Starting training...


Training:   0%|          | 0/1000 [00:00<?, ?it/s]W1122 14:22:20.164000 48 torch/_logging/_internal.py:1089] [0/0] Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
Training: 100%|██████████| 1000/1000 [07:33<00:00,  2.21it/s]


In [5]:
"""
Inference script for Multimodal Parallel-LLM using the TinyLlama tokenizer and the ViT image processor.
This script demonstrates generating text descriptions from images using a reduced-size model
(text hidden size 768 with 12 layers, vision hidden size 384) rather than the full TinyLlama-1.1B (Text) and ViT-Base (Vision) dimensions.
"""
import sys
import os
import torch
import requests
from PIL import Image
from transformers import AutoTokenizer, AutoImageProcessor
from parallel_llm.core import DiffusionTransformer, MultimodalConfig
from parallel_llm.inference import ParallelGenerator, GenerationConfig

def main():
    """Caption an image with a freshly initialised multimodal DiffusionTransformer.

    The function loads the TinyLlama tokenizer and the ViT image processor,
    builds a reduced-size multimodal model on the available device, downloads
    a sample image (falling back to a flat grey image on failure), and prints
    the decoded generation for the prompt "A picture of". No checkpoint is
    loaded, so the model weights are whatever ``DiffusionTransformer(config)``
    initialises them to.
    """
    print("="*60)
    print("Parallel-LLM Multimodal Inference Example (TinyLlama + ViT)")
    print("="*60)

    # Check for CUDA
    cuda_available = torch.cuda.is_available()
    device = "cuda" if cuda_available else "cpu"
    print(f"Using device: {device}")

    # Ask about bf16 only when CUDA is present: on some PyTorch releases the
    # query inspects the current CUDA device and fails on CPU-only hosts.
    # The result is computed once so the model and its inputs share one dtype.
    use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
    model_dtype = torch.bfloat16 if use_bf16 else torch.float32

    # 1. Load Processors (Real-world models)
    text_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    vision_model_id = "google/vit-base-patch16-224"

    print(f"\n[1/4] Loading processors...")
    print(f"Text: {text_model_id}")
    print(f"Vision: {vision_model_id}")

    try:
        tokenizer = AutoTokenizer.from_pretrained(text_model_id)
        image_processor = AutoImageProcessor.from_pretrained(vision_model_id)
    except Exception as e:
        print(f"Failed to load processors: {e}")
        return

    # 2. Initialize Model (Reduced size for GPU compatibility)
    print("\n[2/4] Initializing Multimodal DiffusionTransformer...")
    config = MultimodalConfig(
        # Text parameters (reduced size)
        vocab_size=tokenizer.vocab_size,
        hidden_size=768,            # Reduced from 2048
        num_hidden_layers=12,       # Reduced from 22

        # Vision parameters (smaller ViT)
        vision_encoder="vit",
        image_size=224,
        patch_size=16,
        vision_hidden_size=384,     # Reduced from 768

        # Fusion parameters
        fusion_type="cross_attention",
        num_cross_attention_layers=4,

        # General
        num_diffusion_steps=10,     # Reduced from 64
        use_flash_attention=cuda_available,
        dtype=model_dtype
    )

    model = DiffusionTransformer(config)
    model.to(device)
    model.eval()

    print(f"Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M")

    # 3. Prepare Inputs
    print("\n[3/4] Preparing Inputs...")
    try:
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        print(f"Downloading image from {url}...")
        # A timeout keeps the cell from hanging forever on a stalled connection
        with requests.get(url, stream=True, timeout=10) as response:
            # Turn HTTP error pages into exceptions instead of decoding them
            response.raise_for_status()
            # Image.open is lazy; convert() forces the full decode while the
            # connection is open and inside this try, and guarantees 3 channels.
            image = Image.open(response.raw).convert("RGB")
    except Exception as e:
        print(f"Failed to download image: {e}")
        print("Using mock image...")
        image = Image.new('RGB', (224, 224), color=(128, 128, 128))

    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device=device, dtype=model_dtype)

    prompt = "A picture of"
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # 4. Run Inference
    print("\n[4/4] Running Inference...")

    gen_config = GenerationConfig(
        max_new_tokens=64,
        num_parallel_tokens=64,  # Generate all tokens in parallel
        num_refinement_steps=5,  # Use 5 refinement steps for better quality
        temperature=0.7,
        repetition_penalty=1.2,  # Prevent token repetition
        confidence_threshold=0.5,  # Moderate confidence threshold
        use_torch_compile=cuda_available
    )

    generator = ParallelGenerator(model, gen_config)

    print("Generating caption...")
    print("Mode: Autoregressive (generates tokens one-by-one for coherent output)")
    with torch.no_grad():
        output_ids = generator.generate(
            input_ids,
            pixel_values=pixel_values,
            use_autoregressive=True  # Enable autoregressive mode
        )

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(f"\nResult: {output_text}")
    # The model above is never restored from a checkpoint, so tell the reader
    # why the text is incoherent instead of leaving that unexplained.
    # TODO: load trained weights here once the checkpoint format is confirmed.
    print("\nNote: this script does not load a checkpoint; the model uses randomly "
          "initialized weights, so the generated text is not meaningful.")

if __name__ == "__main__":
    main()


Parallel-LLM Multimodal Inference Example (TinyLlama + ViT)
Using device: cuda

[1/4] Loading processors...
Text: TinyLlama/TinyLlama-1.1B-Chat-v1.0
Vision: google/vit-base-patch16-224


Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.



[2/4] Initializing Multimodal DiffusionTransformer...
Model parameters: 319.33M

[3/4] Preparing Inputs...
Downloading image from http://images.cocodataset.org/val2017/000000039769.jpg...

[4/4] Running Inference...
Generating caption...
Mode: Autoregressive (generates tokens one-by-one for coherent output)

Result: A picture of exposedälliętn并வчко Hermтельной sixth julio" julioкор whoseoval Jław Alle^{\ admitted контра uz Rah Jügel eclipse контра парativo пар Partarsi機行matrixerrрд парók Keepero dla Indeed melhor контра precedinggrund паррдmatrixрдourt вели uzerr hack eclipse Part Rahókommenacco
