# JSALT 2025 - Introduction to Large Audio Language Models

**Laboratory session: AuGI - Towards audio general intelligence**

June 20th, 2025

## Introduction and Objectives

1. Setting up the Environment
2. Exploring Audio Flamingo 2
3. Exploring MMAU
4. Preparing AQA data
5. Simple training/fine-tuning


## Materials

In [None]:
# Local: move up to the parent directory so the relative paths used below resolve
%cd ..

# Remote: uncomment the two lines below to clone the repository and work inside it
#!git clone https://github.com/ferugit/JSALT-LALMs-tutorial.git
#%cd JSALT-LALMs-tutorial

In [None]:
!pip install -r requirements.txt

In [None]:
# Download Qwen2.5-0.5B model
!./download_hf_model.sh

In [None]:
# Download AF2 model: CLAP encoder, Audio Transformer and XATTN
# (replace the placeholder below with your own Hugging Face access token)
!./download_af2.sh "YOUR_HF_TOKEN_HERE"

In [None]:
!cat run_af2_single_inference.sh

In [None]:
!cat src/audio_flamingo_2/config/inference.yaml

In [None]:
!./run_af2_single_inference.sh

# CLAP: Audio Encoder

![CLAP Architecture](../assets/clap-arch.png)

In [None]:
import os
import sys
import yaml
import umap
import IPython

import torch
import torchaudio
import numpy as np

#from src.audio_flamingo_2.my_laion_clap.CLAP.src import laion_clap as local_clap
import laion_clap

import matplotlib.pyplot as plt

In [None]:
%%capture
# Build the CLAP wrapper with feature fusion disabled, then load its checkpoint.
# NOTE(review): load_ckpt() is called without a path; presumably it falls back
# to a default pretrained checkpoint (confirm against the laion_clap docs).
model = laion_clap.CLAP_Module(enable_fusion=False)
model.load_ckpt()

## Zero-shot classification

![Zero-shot classification](../assets/zero-shot_classification.png)

In [None]:
# Let's listen to some audio clips.
# Relative paths of the example clips; they resolve against the current
# working directory.

# Animal sounds
cat_filename, dog_filename, another_dog_filename = (
    "assets/cat.wav",
    "assets/dog_barking.wav",
    "assets/dog.wav",
)

# Other sounds and music
breaking_filename, cough_filename, music_filename = (
    "assets/breaking.wav",
    "assets/cough.wav",
    "assets/dance_matisse_musiclm.wav",
)

In [None]:
# Inline audio player for the cat clip (rendered because it is the cell's last expression).
IPython.display.Audio(cat_filename)

In [None]:
# Inline audio player for the dog-barking clip (rendered because it is the cell's last expression).
IPython.display.Audio(dog_filename)

In [None]:
# Embed the cat and dog clips with CLAP's audio encoder.
# The list order (cat, dog) fixes the row order of `audio_embed`.
audio_file = [cat_filename, dog_filename]

with torch.no_grad():  # inference only, so skip autograd bookkeeping
    audio_embed = model.get_audio_embedding_from_filelist(
        x=audio_file,
        use_tensor=True,
    )

# Peek at the last 20 components of each embedding, then show the tensor shape.
print(audio_embed[:, -20:])
print(audio_embed.shape)

In [None]:
# Embed the text prompts with CLAP's text encoder.
# The prompt order is (dog, cat), the reverse of the (cat, dog) order used
# for the audio clips, so matching pairs do not share a row index.
text_data = [
    "This is a sound of a dog",
    "This is a sound of a cat",
]

with torch.no_grad():  # inference only, so skip autograd bookkeeping
    text_embed = model.get_text_embedding(text_data, use_tensor=True)

# Peek at the last 20 components of each embedding, then show the tensor shape.
print(text_embed[:, -20:])
print(text_embed.shape)

In [None]:
# Pairwise dot products: rows index the audio clips, columns index the text prompts.
# NOTE(review): this equals cosine similarity only if both sets of embeddings
# are L2-normalised; confirm against the laion_clap implementation.
similarity = torch.matmul(audio_embed, text_embed.T)
print("Similarity matrix:\n", similarity)

In [None]:
# Stack the embeddings (audio rows first, then text rows) into one NumPy array.
# The labels follow the row order: audio is (cat, dog), text is (dog, cat).
embeddings = torch.cat([audio_embed, text_embed], dim=0).cpu().numpy()
labels = ['audio_cat', 'audio_dog', 'text_dog', 'text_cat']

# Project to 2D with UMAP; random_state pins the seed used for the projection.
reducer = umap.UMAP(n_neighbors=2, random_state=1)
embeddings_2d = reducer.fit_transform(embeddings)

# Plot embedding distances
plt.figure(figsize=(8, 6))

# One labelled marker per embedding; the text is nudged slightly off the marker.
for (x_pos, y_pos), label in zip(embeddings_2d, labels):
    plt.scatter(x_pos, y_pos, label=label)
    plt.text(x_pos + 0.01, y_pos + 0.01, label)

# Dashed connectors between each audio clip and its matching text prompt:
# (audio row, text row, line style, legend entry).
matched_pairs = [
    (0, 3, 'r--', 'cat distance'),
    (1, 2, 'b--', 'dog distance'),
]
for audio_idx, text_idx, line_style, legend_label in matched_pairs:
    endpoints = embeddings_2d[[audio_idx, text_idx]]
    plt.plot(endpoints[:, 0], endpoints[:, 1], line_style, label=legend_label)

plt.legend()
plt.title('2D Visualization of Audio and Text Embeddings with Distances')
plt.xlabel('UMAP-1')
plt.ylabel('UMAP-2')
plt.show()

In [None]:
# Cosine similarity reduced over dim 0, i.e. it compares two 1-D vectors
# and returns a scalar tensor. eps is spelled out at its default value.
cos_sim = torch.nn.CosineSimilarity(dim=0, eps=1e-8)

In [None]:
# Calculate the cosine distance between the dog-barking clip and a matching caption.
audio_file = [dog_filename]

with torch.no_grad():  # inference only, so skip autograd bookkeeping
    audio_embed = model.get_audio_embedding_from_filelist(
        x=audio_file,
        use_tensor=True,
    )
    text_embed = model.get_text_embedding("This is a dog barking", use_tensor=True)

# [-1] picks the last row of each batch (only one clip is embedded here).
similarity = cos_sim(audio_embed[-1], text_embed[-1])
distance = 1 - similarity  # cosine distance = 1 - cosine similarity
print("Cosine Distance:", distance.item())

In [None]:
# Inline audio player for the cough clip (rendered because it is the cell's last expression).
IPython.display.Audio(cough_filename)

In [None]:
# Calculate the cosine distance between the cough clip and the "dog barking"
# caption: a deliberately mismatched audio/text pair.
audio_file = [cough_filename]

with torch.no_grad():  # inference only, so skip autograd bookkeeping
    audio_embed = model.get_audio_embedding_from_filelist(
        x=audio_file,
        use_tensor=True,
    )
    text_embed = model.get_text_embedding("This is a dog barking", use_tensor=True)

# [-1] picks the last row of each batch (only one clip is embedded here).
similarity = cos_sim(audio_embed[-1], text_embed[-1])
distance = 1 - similarity  # cosine distance = 1 - cosine similarity
print("Cosine Distance:", distance.item())

# Audio Flamingo 2

![AF2 Architecture](../assets/af2_arch.png)

In [None]:
!cat src/audio_flamingo_2/config/inference.yaml

In [None]:
!cat run_af2_single_inference.sh

In [None]:
!./run_af2_single_inference.sh