# Introduction to Large Audio Language Models

**Laboratory session: AuGI - Towards audio general intelligence**

September 5th, 2025

## Introduction and Objectives

1. Setting up the Environment
2. Exploring the CLAP text-audio encoder
3. Exploring Audio Flamingo 2
4. Exploring MMAU

In [None]:
# Fetch the tutorial repository; the cells below rely on its assets/, src/ and helper scripts.
!git clone https://github.com/ferugit/JSALT-LALMs-tutorial.git

In [None]:
# Move to the repository root so that the relative paths used below
# (requirements.txt, assets/, src/, *.sh) resolve.

# Local: presumably for a notebook opened from a sub-folder of an existing
# checkout - uncomment this line and comment out the remote one instead.
#%cd ..

# Remote: enter the directory created by the `git clone` cell
%cd JSALT-LALMs-tutorial

In [None]:
# Install the Python dependencies listed in the repository's requirements.txt.
!pip install -r requirements.txt

In [None]:
# Download the Qwen2.5-0.5B model through the repository's helper script.
# NOTE(review): presumably this is the language model used by Audio Flamingo 2 - confirm in the script.
!./download_hf_model.sh

In [None]:
# Download the AF2 model components: CLAP encoder, Audio Transformer and XATTN.
# The quoted argument is a placeholder: replace it with your own Hugging Face
# token before running this cell.
!./download_af2.sh "YOUR_HF_TOKEN_HERE"

# CLAP: Audio Encoder

![CLAP Architecture](../assets/clap-arch.png)

In [None]:
import os
import sys
import yaml
import umap
import IPython

import torch
import torchaudio
import numpy as np

#from src.audio_flamingo_2.my_laion_clap.CLAP.src import laion_clap as local_clap
import laion_clap

import matplotlib.pyplot as plt

In [None]:
%%capture
# %%capture (must stay on the first line of the cell) hides the output
# produced while the model is built and its checkpoint is loaded.
# Create the CLAP wrapper with feature fusion disabled.
model = laion_clap.CLAP_Module(enable_fusion=False)
# NOTE(review): no checkpoint path is given - presumably laion_clap falls back
# to its default pretrained checkpoint; confirm against the library docs.
model.load_ckpt()

## Zero-shot classification

![Zero-shot classification](../assets/zero-shot_classification.png)

In [None]:
# Relative paths of the example audio clips used throughout this lab
(
    cat_filename,
    dog_filename,
    another_dog_filename,
    breaking_filename,
    cough_filename,
    music_filename,
    role_filename,
) = (
    "assets/cat.wav",
    "assets/dog_barking.wav",
    "assets/dog.wav",
    "assets/breaking.wav",
    "assets/cough.wav",
    "assets/dance_matisse_musiclm.wav",
    "assets/role.wav",
)

In [None]:
# Render an inline audio player for the cat clip.
IPython.display.Audio(cat_filename)

In [None]:
# Render an inline audio player for the dog-barking clip.
IPython.display.Audio(dog_filename)

In [None]:
# Encode the cat and dog clips with CLAP; no gradients are tracked
audio_file = [cat_filename, dog_filename]
with torch.no_grad():
    audio_embed = model.get_audio_embedding_from_filelist(
        x=audio_file,
        use_tensor=True,
    )

# Print the last 20 dimensions of each embedding, then the tensor shape
print(audio_embed[:, -20:])
print(audio_embed.shape)

In [None]:
# Encode two text prompts with CLAP; no gradients are tracked.
# Note the order: the dog prompt comes first, the cat prompt second.
text_data = ["This is a sound of a dog", "This is a sound of a cat"]
with torch.no_grad():
    text_embed = model.get_text_embedding(
        text_data,
        use_tensor=True,
    )

# Print the last 20 dimensions of each embedding, then the tensor shape
print(text_embed[:, -20:])
print(text_embed.shape)

In [None]:
# Audio-text score matrix: entry (i, j) is the dot product between
# audio embedding i and text embedding j.
similarity = torch.matmul(audio_embed, text_embed.t())
print("Similarity matrix:\n", similarity)

In [None]:
# Stack audio and text embeddings row-wise and move them to NumPy.
# Row order: cat audio, dog audio, dog text, cat text.
embeddings = torch.cat((audio_embed, text_embed), dim=0).cpu().numpy()
labels = ['audio_cat', 'audio_dog', 'text_dog', 'text_cat']

# Project the embeddings to 2-D with UMAP (fixed random_state)
reducer = umap.UMAP(n_neighbors=2, random_state=1)
embeddings_2d = reducer.fit_transform(embeddings)

# Plot the projected embeddings, one labelled marker per row
plt.figure(figsize=(8, 6))

for (x_pos, y_pos), label in zip(embeddings_2d, labels):
    plt.scatter(x_pos, y_pos, label=label)
    plt.text(x_pos + 0.01, y_pos + 0.01, label)

# Dashed segments join each audio point to the text point of the same class:
# rows 0 and 3 are the cat pair, rows 1 and 2 the dog pair.
plt.plot(embeddings_2d[[0, 3], 0], embeddings_2d[[0, 3], 1], 'r--', label='cat distance')
plt.plot(embeddings_2d[[1, 2], 0], embeddings_2d[[1, 2], 1], 'b--', label='dog distance')

plt.legend()
plt.title('2D Visualization of Audio and Text Embeddings with Distances')
plt.xlabel('UMAP-1')
plt.ylabel('UMAP-2')
plt.show()

In [None]:
# Cosine similarity module.
# dim=0 because the cells below pass it two single embedding vectors
# (one row indexed out of each embedding tensor), not batches.
cos_sim = torch.nn.CosineSimilarity(dim=0)

In [None]:
# Calculate the cosine distance between the dog-barking clip and a dog-bark caption
audio_file = [dog_filename]
with torch.no_grad():
    audio_embed = model.get_audio_embedding_from_filelist(x=audio_file, use_tensor=True)
    text_embed = model.get_text_embedding("This is a dog barking", use_tensor=True)

# [-1] takes the last row of each tensor (a single clip was encoded)
similarity = cos_sim(audio_embed[-1], text_embed[-1])
# Cosine distance is defined here as one minus cosine similarity
distance = 1 - similarity
print("Cosine Distance:", distance.item())

In [None]:
# Render an inline audio player for the cough clip.
IPython.display.Audio(cough_filename)

In [None]:
# Calculate the cosine distance between the cough clip and the dog-bark caption
audio_file = [cough_filename]
with torch.no_grad():
    audio_embed = model.get_audio_embedding_from_filelist(x=audio_file, use_tensor=True)
    text_embed = model.get_text_embedding("This is a dog barking", use_tensor=True)

# [-1] takes the last row of each tensor (a single clip was encoded)
similarity = cos_sim(audio_embed[-1], text_embed[-1])
# Cosine distance is defined here as one minus cosine similarity
distance = 1 - similarity
print("Cosine Distance:", distance.item())

In [None]:
# Release the CLAP model and its embeddings before working with Audio Flamingo 2
import gc

del model
del audio_embed
del text_embed

# `del` only removes the names. Run a garbage-collection pass and return
# PyTorch's cached GPU blocks to the driver, so that other processes (such as
# the inference shell script launched in a later cell) can use that memory.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Audio Flamingo 2

![AF2 Architecture](../assets/af2_arch.png)

In [None]:
# Print the AF2 inference configuration (the same YAML file is parsed further below).
!cat src/audio_flamingo_2/config/inference_2.yaml

In [None]:
# Print the contents of the single-inference launcher script.
!cat run_af2_single_inference.sh

In [None]:
# Run the AF2 single-inference launcher script in a shell.
!./run_af2_single_inference.sh

## Now load the model

In [None]:
import json

import src.audio_flamingo_2.factory as factory
from src.audio_flamingo_2.inference_utils import read_audio, load_audio, predict, get_num_windows
from src.audio_flamingo_2.utils import Dict2Class, float32_to_int16, int16_to_float32, get_autocast, get_cast_dtype
from safetensors.torch import load_file

In [None]:
# Load the config file. The context manager closes the file handle, which the
# previous bare open() call left open.
# NOTE(review): yaml.safe_load would be enough if the config only contains
# plain YAML types; FullLoader is kept to preserve the current parsing.
with open("src/audio_flamingo_2/config/inference_2.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

#print(config)
data_config = config['data_config']
model_config = config['model_config']
clap_config = config['clap_config']
model_args = Dict2Class(config['train_config'])

# Autocast context and cast dtype derived from the configured precision
autocast = get_autocast(
    model_args.precision, cache_enabled=(not model_args.fsdp)
)
cast_dtype = get_cast_dtype(model_args.precision)

# Get the device: GPU when available, CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model and its tokenizer from the configuration
model, tokenizer = factory.create_model_and_transforms(
    **model_config,
    clap_config=clap_config,
    use_local_files=True,
    gradient_checkpointing=False,
    freeze_lm_embeddings=True,
    device=device,
)

print("Model and tokenizer created successfully.")

print("Loading trained weights...")

# CLAP, tokenizer and LLM are pretrained.
# XATTN and Transformer are not. We need to load the pretrained weights.
model = model.to(device)
model.eval()

# Locate the checkpoint directory and its chunk index
ckpt_path = config['inference_config']['pretrained_path']
metadata_path = os.path.join(ckpt_path, "safe_ckpt/metadata.json")

# Load metadata; iterating it yields the chunk names
with open(metadata_path, "r", encoding="utf-8") as metadata_file:
    metadata = json.load(metadata_file)

# Reconstruct the full state_dict by merging every SafeTensors chunk
state_dict = {}

for chunk_name in metadata:
    chunk_path = f"safe_ckpt/{chunk_name}.safetensors"
    chunk_tensors = load_file(os.path.join(ckpt_path, chunk_path))

    # Merge tensors into state_dict
    state_dict.update(chunk_tensors)

# strict=False: the checkpoint need not cover every parameter of the model;
# the keys that did not match are printed below for inspection.
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

print("Missing keys:", missing_keys)
print("Unexpected keys:", unexpected_keys)
print("Model loaded successfully.")

In [None]:
# Generation settings, forwarded to predict() as inference_kwargs
decoding = dict(
    do_sample=False,  # Set to True for sampling, False for greedy/beam search
    temperature=0.0,
    num_beams=1,
    top_k=30,
    top_p=0.95,
    num_return_sequences=1,
)

question = "What is the gender of the person?"

# Perform inference: ask the question about the cough clip
result = predict(
    cough_filename,
    question,
    clap_config,
    inference_kwargs=decoding,
    cast_dtype=cast_dtype,
    device=device,
    tokenizer=tokenizer,
    model=model,
)

# Report the prompt, the audio file and the returned result
print("Inference completed.\n\n")
print("*" * 50)
print("Prompt:", question)
print("Audio path:", cough_filename)
print("Inference result:", result)

In [None]:
# Generation settings, forwarded to predict() as inference_kwargs
decoding = dict(
    do_sample=False,  # Set to True for sampling, False for greedy/beam search
    temperature=0.0,
    num_beams=1,
    top_k=30,
    top_p=0.95,
    num_return_sequences=1,
)

question = "Is the person ill?"

# Perform inference: ask the question about the cough clip
result = predict(
    cough_filename,
    question,
    clap_config,
    inference_kwargs=decoding,
    cast_dtype=cast_dtype,
    device=device,
    tokenizer=tokenizer,
    model=model,
)

# Report the prompt, the audio file and the returned result
print("Inference completed.\n\n")
print("*" * 50)
print("Prompt:", question)
print("Audio path:", cough_filename)
print("Inference result:", result)

# MMAU Benchmark

![MMAU Benchmark](../assets/mmau_hero.jpg)

In [None]:
# Generation settings, forwarded to predict() as inference_kwargs
decoding = {
    "do_sample": False,  # Set to True for sampling, False for greedy/beam search
    "temperature": 0.0,
    "num_beams": 1,
    "top_k": 30,
    "top_p": 0.95,
    "num_return_sequences": 1,
}

# Multiple-choice question about the relationship between two speakers
question =  """How are the two speakers connected?
      (A). rental agent-tenant
      (B). curator-artist
      (C). author-editor
      (D). flight instructor-student pilot"""

# The question concerns two speakers, so it is asked about the role clip
# rather than the cough clip used by the previous cells. A single variable
# keeps the predict() call and the printed path in sync.
audio_path = role_filename

# Perform inference
result = predict(
    audio_path,
    question,
    clap_config,
    inference_kwargs=decoding,
    cast_dtype=cast_dtype,
    device=device,
    tokenizer=tokenizer,
    model=model
)
print("Inference completed.\n\n")
print("*" * 50)
print("Prompt:", question)
print("Audio path:", audio_path)
print("Inference result:", result)