In [None]:
import os

from huggingface_hub import login

# SECURITY: never hard-code an access token in a notebook or source file.
# The token is read from the HF_TOKEN environment variable instead; when the
# variable is unset, `token=None` makes `login()` prompt for it interactively.
# NOTE(review): any token that was previously committed in this cell should be
# treated as leaked and revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import sys
import glob
import json
import functools
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from tqdm.auto import tqdm
from transformers import T5EncoderModel, AutoTokenizer
from diffusers import UNet2DConditionModel

# Compatibility shim: in newer diffusers releases `randn_tensor` lives in
# diffusers.utils.torch_utils. Code that still looks it up on `diffusers.utils`
# needs the old attribute restored, so re-export it there when it is missing.
import diffusers.utils

try:
    # Plain attribute probe; raises AttributeError on the newer layout.
    getattr(diffusers.utils, "randn_tensor")
except AttributeError:
    from diffusers.utils.torch_utils import randn_tensor

    diffusers.utils.randn_tensor = randn_tensor

# Fix for transformers passing deprecated 'use_auth_token' to newer huggingface_hub
import huggingface_hub

# Keep a handle on the real implementation so the wrapper can delegate to it.
_original_hf_hub_download = huggingface_hub.file_download.hf_hub_download


@functools.wraps(_original_hf_hub_download)
def _patched_hf_hub_download(*args, **kwargs):
    """Delegate to the real ``hf_hub_download``, translating ``use_auth_token``.

    The deprecated ``use_auth_token`` keyword is always removed before
    delegating. Its value is forwarded as ``token`` only when the caller did
    not already supply a non-``None`` ``token``; an explicit ``token`` must
    not be clobbered by a legacy argument (which callers often pass as
    ``None``).

    Args:
        *args: Positional arguments forwarded unchanged.
        **kwargs: Keyword arguments forwarded after the translation above.

    Returns:
        Whatever the original ``hf_hub_download`` returns.
    """
    if "use_auth_token" in kwargs:
        legacy_token = kwargs.pop("use_auth_token")
        if kwargs.get("token") is None:
            kwargs["token"] = legacy_token
    return _original_hf_hub_download(*args, **kwargs)


# Install the wrapper under both public access paths.
# NOTE(review): modules that already executed
# `from huggingface_hub import hf_hub_download` before this point keep their
# own reference to the unpatched function -- confirm the patch is applied
# early enough for the callers that need it.
huggingface_hub.file_download.hf_hub_download = _patched_hf_hub_download
huggingface_hub.hf_hub_download = _patched_hf_hub_download

print("✅ Applied huggingface_hub compatibility patch (use_auth_token -> token)")

# ------------------------------------------------------------------------------
# 1. Dynamic Path Setup (Modified as requested)
# ------------------------------------------------------------------------------
# Default paths
models_file = "models.py"
config_file = "diffusion_model_config.json"
subfolder_name = "original_files"

# Check for models.py: look in the working directory first, then in the
# subfolder. The subfolder result is remembered so the import below matches
# the location that actually exists.
_subfolder_models_path = os.path.join(subfolder_name, models_file)
_models_in_subfolder = os.path.exists(_subfolder_models_path)

if os.path.exists(models_file):
    print(f"✅ Found '{models_file}' in root.")
elif _models_in_subfolder:
    print(f"✅ Found '{models_file}' in '{subfolder_name}'. Adding to system path...")
    # Guard against stacking duplicate entries when the cell is re-run.
    if subfolder_name not in sys.path:
        sys.path.append(subfolder_name)
else:
    raise FileNotFoundError(f"CRITICAL: Could not find '{models_file}' in the root directory OR inside '{subfolder_name}/'.")

# Check for Config: verify the file exists before claiming it was found.
# The subfolder location is preferred; the root is a fallback.
UNET_CONFIG_PATH = os.path.join(subfolder_name, config_file)
if os.path.exists(UNET_CONFIG_PATH):
    print(f"✅ Found config in '{subfolder_name}/{config_file}'.")
elif os.path.exists(config_file):
    UNET_CONFIG_PATH = config_file
    print(f"✅ Found config '{config_file}' in root.")
else:
    raise FileNotFoundError(f"CRITICAL: Could not find '{config_file}' in the root directory OR inside '{subfolder_name}/'.")

# Now we can safely import. The package import is kept whenever the subfolder
# copy exists; a root-only models.py is imported as a top-level module
# instead of failing on a package that is not there.
# NOTE: the package name below is spelled out and must match `subfolder_name`.
if _models_in_subfolder:
    from original_files.models import AudioDiffusion
else:
    from models import AudioDiffusion

# ------------------------------------------------------------------------------
# 1b. Create Local Scheduler Config (avoids downloading from gated HF repo)
# ------------------------------------------------------------------------------
# Directory that receives the generated scheduler_config.json.
LOCAL_SCHEDULER_DIR = "local_config/scheduler"
os.makedirs(LOCAL_SCHEDULER_DIR, exist_ok=True)

# Scheduler settings serialized verbatim to disk below.
# NOTE(review): some keys (set_alpha_to_one, skip_prk_steps, steps_offset)
# look like leftovers from other scheduler classes -- confirm they are
# ignored by the class named in "_class_name".
scheduler_config = {
    "_class_name": "DDPMScheduler",
    "_diffusers_version": "0.11.1",
    "beta_end": 0.012,
    "beta_schedule": "scaled_linear",
    "beta_start": 0.00085,
    "clip_sample": False,
    "num_train_timesteps": 1000,
    "prediction_type": "v_prediction",
    "set_alpha_to_one": False,
    "skip_prk_steps": True,
    "steps_offset": 1,
    "trained_betas": None,
    "variance_type": "fixed_small",
}

# Serialize first, then write the text in one call.
_scheduler_json_text = json.dumps(scheduler_config, indent=4)
with open(os.path.join(LOCAL_SCHEDULER_DIR, "scheduler_config.json"), "w") as fh:
    fh.write(_scheduler_json_text)

print(f"✅ Created local scheduler config at: {LOCAL_SCHEDULER_DIR}")

# ------------------------------------------------------------------------------
# 2. Custom Dataset for Pre-computed Latents
# ------------------------------------------------------------------------------
class TangoLatentDataset(Dataset):
    """Dataset of pre-computed latent tensors paired with text captions.

    Expected layout under ``root_dir``::

        tango-dataset/
            latent_vectors/<id>.pt
            captions/<id>.txt

    Each item is a dict ``{"latent": ..., "caption": str}``. A missing caption
    file yields an empty caption and a printed warning.
    """

    def __init__(self, root_dir):
        """
        Args:
            root_dir (str): The 'father folder' containing the 'tango-dataset' folder.

        Raises:
            FileNotFoundError: If the ``latent_vectors`` folder does not exist.
        """
        self.base_path = os.path.join(root_dir, "tango-dataset")
        self.latents_path = os.path.join(self.base_path, "latent_vectors")
        self.captions_path = os.path.join(self.base_path, "captions")

        if not os.path.exists(self.latents_path):
            raise FileNotFoundError(f"Latents folder not found at: {self.latents_path}")

        # Sorted so that index -> file mapping is stable between runs.
        self.latent_files = sorted(glob.glob(os.path.join(self.latents_path, "*.pt")))
        print(f"Dataset loaded: Found {len(self.latent_files)} samples.")

    def __len__(self):
        """Return the number of ``.pt`` latent files discovered."""
        return len(self.latent_files)

    def __getitem__(self, idx):
        """Load the latent and caption for sample ``idx``.

        Returns:
            dict: ``{"latent": latent, "caption": caption}``. Tensor latents
            are cast to float and have leading singleton dimensions removed
            until at most 3 dimensions remain.

        Raises:
            ValueError: If a tensor latent still has more than 3 dimensions
                after the leading singleton dimensions are removed.
        """
        latent_file = self.latent_files[idx]
        # Strip only the trailing extension; str.replace would also remove
        # any ".pt" occurring in the middle of the file name.
        file_id, _ = os.path.splitext(os.path.basename(latent_file))

        # Load Latent (Map to CPU to avoid GPU saturation during loading)
        # SECURITY: torch.load unpickles the file; only load latents from a
        # trusted source (consider `weights_only=True` where the installed
        # torch version supports it).
        latent = torch.load(latent_file, map_location="cpu")

        # Load Caption
        caption_file = os.path.join(self.captions_path, f"{file_id}.txt")
        caption = ""
        try:
            with open(caption_file, "r", encoding="utf-8") as f:
                caption = f.read().strip()
        except FileNotFoundError:
            print(f"Warning: No caption found for {file_id}")

        if isinstance(latent, torch.Tensor):
            latent = latent.float()
            # Drop the batch dimension(s) saved by the encoding step so that
            # unsqueeze(0) / torch.stack in collate_fn yields a 4D input
            # (per the original note, latents are stored as (1, 8, 256, 16)).
            # squeeze(0) is a no-op unless the leading dimension is 1, so the
            # loop must also test the leading size or it would never end.
            while latent.dim() > 3 and latent.shape[0] == 1:
                latent = latent.squeeze(0)
            if latent.dim() > 3:
                raise ValueError(
                    f"Latent in '{latent_file}' has shape {tuple(latent.shape)}; "
                    "expected at most 3 dimensions after removing leading "
                    "singleton dimensions."
                )

        return {"latent": latent, "caption": caption}

def collate_fn(batch):
    """Combine dataset items into one batch.

    Args:
        batch: Sequence of dicts, each with a ``"latent"`` tensor and a
            ``"caption"`` entry.

    Returns:
        tuple: ``(latents, captions)`` where ``latents`` is the per-item
        latents stacked along a new leading dimension and ``captions`` is the
        list of captions in the same order.
    """
    latent_list = []
    caption_list = []
    for sample in batch:
        latent_list.append(sample["latent"])
        caption_list.append(sample["caption"])
    stacked = torch.stack(latent_list)
    return stacked, caption_list

# ------------------------------------------------------------------------------
# 3. Smart Model Loader
# ------------------------------------------------------------------------------
def load_tango_model(config_path, device):
    """Build an ``AudioDiffusion`` model and move it to ``device``.

    Args:
        config_path: Path passed to ``AudioDiffusion`` as
            ``unet_model_config_path``.
        device: Target passed to ``.to(...)`` on the constructed model.

    Returns:
        The result of ``AudioDiffusion(...).to(device)``.
    """
    print(">>> Loading Tango Model components...")
    text_encoder_id = "google/flan-t5-large"

    # Use the local scheduler config to avoid 401 from gated HF repos
    scheduler_source = "local_config"

    # Probe the local cache for the T5 encoder. The probe only reports the
    # cache state; the loaded object is discarded immediately and any failure
    # is treated as "not cached".
    try:
        print(f"   Checking local cache for {text_encoder_id}...")
        T5EncoderModel.from_pretrained(text_encoder_id, local_files_only=True)
        print("   -> Found in local cache.")
    except Exception:
        print("   -> Not found locally. It will be downloaded by AudioDiffusion.")

    # Initialize AudioDiffusion; no pretrained UNet name is given, only the
    # config path, and the text encoder is requested frozen.
    diffusion_kwargs = {
        "text_encoder_name": text_encoder_id,
        "scheduler_name": scheduler_source,
        "unet_model_name": None,
        "unet_model_config_path": config_path,
        "freeze_text_encoder": True,
    }
    tango_model = AudioDiffusion(**diffusion_kwargs)
    return tango_model.to(device)

# Final status line of the setup cell; it is only reached if none of the
# statements above raised.
print("\nSUCCESS: Environment ready. Use 'UNET_CONFIG_PATH' in the next cells.")

Disabling PyTorch because PyTorch >= 2.1 is required but found 1.13.1
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


AttributeError: module 'torch' has no attribute 'xpu'

In [11]:
# --- USER INPUT HERE ---
FATHER_FOLDER_PATH = "/home/yitshag/test_uv/output_data"  # Folder containing 'tango-dataset'

# 1. Init: the accelerator decides which device everything is moved to.
accelerator = Accelerator()
device = accelerator.device

# One-sample smoke test: a single forward and backward pass through the model.
# Any failure is reported with a full traceback instead of aborting the cell.
try:
    # 2. Load Dataset
    dataset = TangoLatentDataset(FATHER_FOLDER_PATH)

    if len(dataset) == 0:
        print("Error: Dataset is empty.")
    else:
        # 3. Load Model (Using the path detected in Cell 1)
        print(f"Using config: {UNET_CONFIG_PATH}")
        model = load_tango_model(UNET_CONFIG_PATH, device)
        model.train()

        # 4. Get 1 Sample and add a leading batch dimension by hand.
        sample = dataset[0]
        dummy_latents = sample["latent"].unsqueeze(0).to(device)
        dummy_captions = [sample["caption"]]

        print("\n--- SANITY CHECK START ---")
        print(f"Latent Shape: {dummy_latents.shape}")

        # 5. Forward & Backward
        with accelerator.accumulate(model):
            loss = model(dummy_latents, dummy_captions)
            print(f"✓ Forward pass successful. Loss: {loss.item()}")

            accelerator.backward(loss)
            print("✓ Backward pass successful.")

        print("--- SANITY CHECK PASSED ---\n")

except Exception as err:
    print(f"\n❌ Sanity Check Failed: {err}")
    import traceback

    traceback.print_exc()

Dataset loaded: Found 3224 samples.
Using config: original_files/diffusion_model_config.json
>>> Loading Tango Model components...
   Checking local cache for google/flan-t5-large...
   -> Not found locally. Downloading...
UNet initialized randomly.


Downloading tokenizer_config.json: 2.54kB [00:00, 11.6MB/s]
Downloading spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 2.95MB/s]
Downloading tokenizer.json: 2.42MB [00:00, 9.88MB/s]
Downloading (…)cial_tokens_map.json: 2.20kB [00:00, 7.71MB/s]
Downloading config.json: 100%|██████████| 662/662 [00:00<00:00, 5.49MB/s]
Downloading pytorch_model.bin: 100%|██████████| 3.13G/3.13G [00:45<00:00, 68.6MB/s]
Some weights of the model checkpoint at google/flan-t5-large were not used when initializing T5EncoderModel: ['decoder.block.20.layer.2.DenseReluDense.wo.weight', 'decoder.block.14.layer.0.SelfAttention.q.weight', 'decoder.block.20.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.3.layer.1.EncDecAttention.v.weight', 'decoder.block.1.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.6.layer.1.layer_norm.weight', 'decoder.block.22.layer.1.EncDecAttention.v.weight', 'decoder.block.2.layer.1.EncDecAttention.q.weight', 'decoder.block.14.layer.2.DenseReluDense.wi_1.weight', 'decoder.b


--- SANITY CHECK START ---
Latent Shape: torch.Size([1, 1, 8, 256, 16])


  deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False)



❌ Sanity Check Failed: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [1, 1, 8, 256, 16]


Traceback (most recent call last):
  File "/tmp/ipykernel_845783/2494034777.py", line 28, in <module>
    loss = model(dummy_latents, dummy_captions)
  File "/home/yitshag/test_uv/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/yitshag/test_uv/original_files/models.py", line 181, in forward
    model_pred = self.unet(
  File "/home/yitshag/test_uv/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/yitshag/test_uv/.venv/lib/python3.10/site-packages/diffusers/models/unet_2d_condition.py", line 899, in forward
    sample = self.conv_in(sample)
  File "/home/yitshag/test_uv/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/yitshag/test_uv/.venv/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 463,

In [10]:
import os
import json
from diffusers import UNet2DConditionModel
from transformers import T5EncoderModel, AutoTokenizer
from original_files.models import AudioDiffusion


# 1. Create the Local Scheduler Config
# Writing the config to disk here bypasses the need to download it from
# Hugging Face.
local_scheduler_dir = "local_config/scheduler"
os.makedirs(local_scheduler_dir, exist_ok=True)

# Settings written verbatim into scheduler_config.json.
scheduler_config = {
    "_class_name": "DDPMScheduler",
    "_diffusers_version": "0.11.1",
    "beta_end": 0.012,
    "beta_schedule": "scaled_linear",
    "beta_start": 0.00085,
    "clip_sample": False,
    "num_train_timesteps": 1000,
    "prediction_type": "v_prediction",
    "set_alpha_to_one": False,
    "skip_prk_steps": True,
    "steps_offset": 1,
    "trained_betas": None,
    "variance_type": "fixed_small",
}

# Serialize first, then write the text in one call.
_scheduler_json_text = json.dumps(scheduler_config, indent=4)
with open(os.path.join(local_scheduler_dir, "scheduler_config.json"), "w") as fh:
    fh.write(_scheduler_json_text)

print(f"✅ Created local scheduler config at: {local_scheduler_dir}")

# 2. Redefine the Loader to use the Local Config
def load_tango_model(config_path, device):
    """Build an ``AudioDiffusion`` model that uses the local scheduler config.

    Args:
        config_path: Path passed to ``AudioDiffusion`` as
            ``unet_model_config_path``.
        device: Target passed to ``.to(...)`` on the constructed model.

    Returns:
        The result of ``AudioDiffusion(...).to(device)``.
    """
    print(">>> Loading Tango Model components...")

    text_encoder_id = "google/flan-t5-large"
    # Name of the local folder tree that holds the generated scheduler config.
    scheduler_source = "local_config"

    # Check T5 Cache: purely informational; the loaded object is discarded
    # and any failure is reported as "not cached".
    try:
        print(f"   Checking local cache for {text_encoder_id}...")
        T5EncoderModel.from_pretrained(text_encoder_id, local_files_only=True)
        print("   -> Found in local cache.")
    except Exception:
        print("   -> Not found locally. Downloading...")

    # Initialize Model with LOCAL scheduler
    diffusion_kwargs = {
        "text_encoder_name": text_encoder_id,
        "scheduler_name": scheduler_source,
        "unet_model_name": None,
        "unet_model_config_path": config_path,
        "freeze_text_encoder": True,
    }
    tango_model = AudioDiffusion(**diffusion_kwargs)
    return tango_model.to(device)

# Status line for this cell, printed once the scheduler config has been
# written and load_tango_model has been redefined above.
print("✅ Setup Fixed. You can now run the Sanity Check.")

✅ Created local scheduler config at: local_config/scheduler
✅ Setup Fixed. You can now run the Sanity Check.
