## Install dependencies

In [23]:
! pip install --upgrade --quiet transformers==4.50.3

In [17]:
from huggingface_hub import notebook_login
from huggingface_hub.utils import HfFolder

# Only prompt for Hugging Face credentials when no token is stored yet.
if HfFolder.get_token() is None:
    notebook_login()

## Load and play cough audio recording

In [18]:
# Audio clip geometry shared by every cell below.
SAMPLE_RATE = 16_000  # Hz: samples per second
CLIP_DURATION = 2     # seconds of audio per clip
CLIP_LENGTH = CLIP_DURATION * SAMPLE_RATE  # samples per clip

In [None]:
# Fetch the sample cough recording from Wikimedia Commons into the working
# directory. wget's -nc (no-clobber) flag skips the download when the file is
# already present, so re-running this cell is cheap.
!wget -nc https://upload.wikimedia.org/wikipedia/commons/b/be/Woman_coughing_three_times.wav

In [19]:
from pathlib import Path

from scipy.io import wavfile

# Prefer the local recording, but fall back to the sample fetched by the
# download cell so a fresh "Restart & Run All" does not stop on a file that
# nothing in this notebook creates.
PREFERRED_AUDIO = 'unhealthy.wav'
FALLBACK_AUDIO = 'Woman_coughing_three_times.wav'

audio_path = PREFERRED_AUDIO
if not Path(audio_path).is_file():
    print(f"'{PREFERRED_AUDIO}' not found; using '{FALLBACK_AUDIO}' instead")
    audio_path = FALLBACK_AUDIO

# Load file: wavfile.read accepts a path directly and returns the file's
# sample rate together with the raw sample array.
original_sampling_rate, audio_array = wavfile.read(audio_path)

print(f"Sample Rate: {original_sampling_rate} Hz")
print(f"Data Shape: {audio_array.shape}")
print(f"Data Type: {audio_array.dtype}")


Sample Rate: 22050 Hz
Data Shape: (218295,)
Data Type: int16


In [20]:
import importlib
from IPython.display import Audio, display

# Resolve the HeAR audio helpers by dotted module path; `audio_utils` is kept
# as a module handle because later cells pull further helpers from it.
audio_utils = importlib.import_module("hear.python.data_processing.audio_utils")
resample_audio_and_convert_to_mono = audio_utils.resample_audio_and_convert_to_mono

# Bring the recording to SAMPLE_RATE. Note that `audio_array` is overwritten,
# so from here on it holds the resampled signal rather than the raw samples.
audio_array = resample_audio_and_convert_to_mono(
    audio_array=audio_array,
    sampling_rate=original_sampling_rate,
    new_sampling_rate=SAMPLE_RATE,
)

# Inline player for the resampled audio.
display(Audio(audio_array, rate=SAMPLE_RATE))

## Load Event Detector and HeAR Models

In [24]:
import os
import numpy as np
import tensorflow as tf
from huggingface_hub import (
    HfFolder,
    from_pretrained_keras,
    notebook_login,
    snapshot_download,
)
from transformers import ViTConfig, ViTModel

# Get your token (this should work if you've logged in with notebook_login)
token = HfFolder.get_token()

# Ensure the token is being used
if token:
    print("Token found, attempting to use it for authentication")
else:
    print("No token found. Please run the notebook_login() cell again")
    notebook_login()
    token = HfFolder.get_token()
    if not token:
        # The login prompt may not have been completed yet when this line runs.
        # Fail with an actionable message instead of the TypeError that
        # assigning None to os.environ would raise.
        raise RuntimeError(
            "No Hugging Face token available. Complete the login prompt above, "
            "then re-run this cell."
        )

# NOTE(review): huggingface_hub documents HF_TOKEN as its token environment
# variable; HUGGINGFACE_TOKEN is kept here unchanged — confirm whether anything
# actually reads it. Authentication below relies on the explicit `token`
# argument, not on this variable.
os.environ["HUGGINGFACE_TOKEN"] = token

# Download the models from HuggingFace Hub
hugging_face_repo = "google/hear"
local_snapshot_path = snapshot_download(repo_id=hugging_face_repo, token=token)
print(f"Saved {hugging_face_repo} to {local_snapshot_path}\n")

# Constants for event detector
EVENT_DETECTOR = "event_detector_small"  # Can use "event_detector_large" for better accuracy
LABEL_LIST = ['Cough', 'Snore', 'Baby Cough', 'Breathe', 'Sneeze', 'Throat Clear', 'Laugh', 'Speech']
DETECTION_THRESHOLD = 0.5  # Probability threshold for detection

# Load HeAR model (large ViT model) from the root of the local snapshot
print("\nLoading HeAR model")
hear_model = from_pretrained_keras(local_snapshot_path)
hear_infer = hear_model.signatures["serving_default"]

# Load frontend model for efficiently computing spectrogram features
frontend_path = os.path.join("event_detector/", "spectrogram_frontend")
print(f"\nLoading frontend model from: {frontend_path}")
frontend_model = from_pretrained_keras(
    os.path.join(local_snapshot_path, frontend_path)
)

# Load event detector model
event_detector_path = os.path.join("event_detector/", EVENT_DETECTOR)
print(f"\nLoading detector model from: {event_detector_path}")
event_detector = from_pretrained_keras(
    os.path.join(local_snapshot_path, event_detector_path)
)

# For the ViT model loaded directly from transformers: the architecture is
# spelled out here and handed to from_pretrained together with the weights repo.
configuration = ViTConfig(
    image_size=(192, 128),
    hidden_size=1024,
    num_hidden_layers=24,
    num_attention_heads=16,
    intermediate_size=1024 * 4,
    hidden_act="gelu_fast",
    hidden_dropout_prob=0.0,
    attention_probs_dropout_prob=0.0,
    initializer_range=0.02,
    layer_norm_eps=1e-6,
    pooled_dim=512,
    patch_size=16,
    num_channels=1,
    qkv_bias=True,
    encoder_stride=16,
    pooler_act='linear',
    pooler_output_size=512,
)
loaded_model = ViTModel.from_pretrained(
    "google/hear-pytorch",
    config=configuration,
    token=token,
)

Token found, attempting to use it for authentication


Fetching 24 files: 100%|██████████| 24/24 [01:00<00:00,  2.52s/it]
config.json not found in C:\Users\suvan\.cache\huggingface\hub\models--google--hear\snapshots\9b2eb2853c426676255cc6ac5804b7f1fe8e563f


Saved google/hear to C:\Users\suvan\.cache\huggingface\hub\models--google--hear\snapshots\9b2eb2853c426676255cc6ac5804b7f1fe8e563f


Loading HeAR model


ImportError: Called a TensorFlow-specific function but could not import it.

## Utility Functions for Visualization

In [None]:
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio
import warnings

# Suppress common warnings
warnings.filterwarnings("ignore", category=UserWarning, module="soundfile")
warnings.filterwarnings("ignore", module="librosa")

def plot_waveform(sound, sr, title, figsize=(12, 4), color='blue', alpha=0.7):
    """Draw the time-domain waveform of an audio signal and show the figure.

    Args:
        sound: Audio samples; must expose ``shape`` and ``dtype``, which are
            echoed in the figure title.
        sr: Sampling rate handed to librosa for the time axis.
        title: First line of the figure title.
        figsize: Matplotlib figure size.
        color: Colour of the waveform.
        alpha: Opacity of the waveform.
    """
    fig, ax = plt.subplots(figsize=figsize)
    librosa.display.waveshow(sound, sr=sr, color=color, alpha=alpha, ax=ax)

    # Labels are set after waveshow so they replace any it applied itself.
    ax.set_title(f"{title}\nshape={sound.shape}, sr={sr}, dtype={sound.dtype}")
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Amplitude")
    ax.grid(True)

    fig.tight_layout()
    plt.show()

def plot_detection_scores(scores_batch, label_list, title, figsize=(12, 4), cmap='Blues'):
    """Show a heatmap of detection scores: one row per label, one column per clip.

    Args:
        scores_batch: Score matrix indexed as [clip, label]; it is transposed
            so labels run down the y axis.
        label_list: Label names, in the same order as the score columns.
        title: Prefix for the figure title.
        figsize: Matplotlib figure size.
        cmap: Colormap for the heatmap.
    """
    fig, ax = plt.subplots(figsize=figsize)
    score_grid = np.transpose(scores_batch)
    n_rows = score_grid.shape[0]
    n_clips = score_grid.shape[1]

    # Pin the colour range to [0, 1] so figures are comparable across files.
    heatmap = ax.imshow(score_grid, aspect='auto', cmap=cmap, vmin=0, vmax=1)

    # y axis: one tick per label, multi-word labels wrapped onto separate lines.
    ax.set_yticks(np.arange(len(label_list)))
    ax.set_yticklabels([label.replace(' ', '\n') for label in label_list])
    for row in range(1, n_rows):
        ax.axhline(y=row - 0.5, color='gray', linestyle='--')
    ax.grid(axis='y', which='major', color='white', alpha=0)

    # x axis: one tick per clip, with dashed separators between clips.
    ax.set_xticks(np.arange(n_clips))
    ax.set_xticklabels([f'Clip {col+1}' for col in range(n_clips)])
    ax.set_xlabel("Time Step")
    for col in range(1, n_clips):
        ax.axvline(x=col - 0.5, color='gray', linestyle='--')

    ax.set_title(f"{title} - Sound Event Detections")
    # Colorbar ticks span the same fixed 0..1 scale as the image.
    fig.colorbar(heatmap, ax=ax, ticks=[0, 0.2, 0.4, 0.6, 0.8, 1.0])
    fig.tight_layout()
    plt.show()

def plot_spectrogram(sound, sr, title, figsize=(12, 4), n_fft=2048, hop_length=256, n_mels=128, cmap='viridis'):
    """Show the Mel spectrogram of an audio signal, in dB relative to its peak.

    Args:
        sound: Audio samples passed to librosa as ``y``.
        sr: Sampling rate of ``sound``.
        title: Prefix for the figure title.
        figsize: Matplotlib figure size.
        n_fft: FFT window size for the Mel spectrogram.
        hop_length: Hop between successive frames, in samples.
        n_mels: Number of Mel bands.
        cmap: Colormap for the spectrogram image.
    """
    fig, ax = plt.subplots(figsize=figsize)

    mel_power = librosa.feature.melspectrogram(
        y=sound,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
    )
    # Convert power to decibels, using the spectrogram's maximum as reference.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    librosa.display.specshow(
        mel_db,
        sr=sr,
        hop_length=hop_length,
        x_axis='time',
        y_axis='mel',
        cmap=cmap,
        ax=ax,
    )
    ax.set_title(f"{title} - Mel Spectrogram")
    fig.tight_layout()
    plt.show()

## Analyze Audio File for Health Sounds

In [None]:
# This function analyzes an audio file for health-related sounds and generates HeAR embeddings for detected clips
def analyze_audio_file(file_path, overlap_percent=50, plot=True):
    """
    Detect health-related sounds in an audio file and embed the matching clips.

    The file is loaded as mono, resampled to SAMPLE_RATE when needed, cut into
    CLIP_DURATION-second clips with the requested overlap, and scored by the
    event detector. Clips in which at least one health-related label scores
    above DETECTION_THRESHOLD are passed to the HeAR model for embedding.

    Args:
        file_path: Path to the audio file
        overlap_percent: Percentage of overlap between adjacent clips. Must
            leave a step of at least one sample between clips (i.e. be
            below 100).
        plot: Whether to show visualization plots (waveform, audio player and
            detection-score heatmap)

    Returns:
        Dictionary containing detection results and HeAR embeddings, always
        with the same keys:
            file_name: Base name of ``file_path``.
            total_clips: Number of clips the audio was split into.
            health_clips_count: Number of clips with a health-related detection.
            health_clip_indices: Indices of those clips within the batch.
            health_detections: Per detected clip, a ``{label: score}`` dict.
            hear_embeddings: HeAR model output for the detected clips, or
                ``None`` when nothing was detected.
            all_detection_scores: Detector scores for every clip.

    Raises:
        ValueError: If ``overlap_percent`` leaves no positive step between clips.
    """
    # The clip geometry depends only on constants and the overlap, so validate
    # it up front, before any file loading or model work.
    frame_length = int(CLIP_DURATION * SAMPLE_RATE)
    frame_step = int(frame_length * (1 - overlap_percent / 100))
    if frame_step < 1:
        raise ValueError(
            f"overlap_percent={overlap_percent} leaves no step between clips; "
            "use a value below 100"
        )

    print(f"\nAnalyzing file: {file_path}")
    file_name = os.path.basename(file_path)

    # Load and preprocess audio (sr=None keeps the file's native rate)
    audio, original_sr = librosa.load(file_path, sr=None, mono=True)
    if original_sr != SAMPLE_RATE:
        print(f"Resampling from {original_sr}Hz to {SAMPLE_RATE}Hz")
        audio = librosa.resample(audio, orig_sr=original_sr, target_sr=SAMPLE_RATE)

    # Display audio information
    print(f"Audio duration: {len(audio)/SAMPLE_RATE:.2f} seconds")

    # Show audio visualization if requested
    if plot:
        plot_waveform(audio, SAMPLE_RATE, title=f"Audio Waveform: {file_name}")
        display(Audio(data=audio, rate=SAMPLE_RATE))

    # Zero-pad audio shorter than one clip so at least one full clip exists
    if len(audio) < frame_length:
        audio = np.pad(audio, (0, frame_length - len(audio)), mode='constant')

    # Create overlapping clips.
    # NOTE(review): tf.signal.frame is called without pad_end, so samples after
    # the last full clip are not analysed — confirm this is intended.
    audio_clip_batch = tf.signal.frame(audio, frame_length, frame_step)
    total_clips = len(audio_clip_batch)
    print(f"Number of audio clips in batch: {total_clips}")

    # Run event detection on all clips
    print(f"Running event detection on audio clips")
    detection_scores_batch = event_detector(audio_clip_batch)["scores"].numpy()

    # Show detection scores
    if plot:
        plot_detection_scores(
            detection_scores_batch,
            LABEL_LIST,
            title=f'Event Detection: {file_name}'
        )

    # Health-related labels to focus on, resolved once to their score columns
    health_labels = {'Cough', 'Snore', 'Baby Cough', 'Breathe', 'Sneeze', 'Throat Clear'}
    health_label_columns = [
        (label_idx, label)
        for label_idx, label in enumerate(LABEL_LIST)
        if label in health_labels
    ]

    # Find clips with health-related sounds
    health_clip_indices = []
    health_detections = []
    for clip_idx, scores in enumerate(detection_scores_batch):
        clip_detections = {
            label: float(scores[label_idx])
            for label_idx, label in health_label_columns
            if scores[label_idx] > DETECTION_THRESHOLD
        }
        if clip_detections:
            health_clip_indices.append(clip_idx)
            health_detections.append(clip_detections)

    # Both outcomes share one result schema so callers can rely on the keys.
    results = {
        "file_name": file_name,
        "total_clips": total_clips,
        "health_clips_count": len(health_clip_indices),
        "health_clip_indices": health_clip_indices,
        "health_detections": health_detections,
        "hear_embeddings": None,
        "all_detection_scores": detection_scores_batch
    }

    if not health_clip_indices:
        print("No health-related sounds detected in this audio file.")
        return results

    print(f"Found {len(health_clip_indices)} clips with health-related sounds")

    # Select the detected clips in one indexing step for batch processing
    health_clips_array = audio_clip_batch.numpy()[health_clip_indices]

    # Generate embeddings for health clips using the HeAR model
    print("Generating HeAR embeddings for detected health clips")
    results["hear_embeddings"] = hear_infer(x=health_clips_array)["output_0"].numpy()

    print(f"Analysis complete. Found {len(health_clip_indices)} clips with health sounds.")
    return results

# Run the detection + embedding pipeline on the sample cough recording.
cough_file = 'Woman_coughing_three_times.wav'
results = analyze_audio_file(file_path=cough_file, overlap_percent=50)


## Analyze Your Own Audio

In [None]:
# To analyze your own audio file, upload it to the notebook and run:
# results = analyze_audio_file('your_audio_file.wav')

# You can also download and analyze additional examples.
# wget's -nc (no-clobber) flag skips files that are already present in the
# working directory, so re-running this cell does not download them again.
!wget -nc https://upload.wikimedia.org/wikipedia/commons/c/cc/Man_coughing.ogg
!wget -nc https://upload.wikimedia.org/wikipedia/commons/d/d0/Sneezing.ogg

# Run the analysis on each extra example in turn; `results` is reassigned on
# every iteration, so afterwards it holds the output for the last file only.
examples = ['Man_coughing.ogg', 'Sneezing.ogg']
for example in examples:
    results = analyze_audio_file(file_path=example)


## Compute embeddings from a specific timestamp

In [None]:
import torch

preprocess_audio = audio_utils.preprocess_audio

# This index corresponds to a cough and was determined by hand. In practice, you
# would need a detector.
START = 0

# Add batch dimension: one clip of CLIP_LENGTH samples starting at START
input_tensor = np.expand_dims(audio_array[START: START + CLIP_LENGTH], axis=0)


def infer(audio_array):
    """Run the PyTorch HeAR model on a batch of raw audio clips.

    Args:
        audio_array: Tensor of audio clips handed to ``preprocess_audio``
            (assumed to be shaped (batch, CLIP_LENGTH) — TODO confirm against
            the helper's requirements).

    Returns:
        The model output object (``return_dict=True``), including hidden states.
    """
    # Inference only: disable autograd so no gradient graph is built.
    with torch.no_grad():
        # Call the module itself rather than .forward() so any registered
        # hooks run as usual.
        return loaded_model(
            preprocess_audio(audio_array),
            return_dict=True,
            output_hidden_states=True,
        )


# Call inference on a float32 copy of the clip
output = infer(torch.tensor(input_tensor, dtype=torch.float32))

# Extract the embedding vector as a flat NumPy array
embedding_vector = output.pooler_output.detach().cpu().numpy().flatten()
print("Size of embedding vector:", len(embedding_vector))

# Plot the embedding vector
plt.figure(figsize=(12, 4))
plt.plot(embedding_vector)
plt.title('Embedding Vector')
plt.xlabel('Index')
plt.ylabel('Value')
plt.grid(True)
plt.show()


## Compare Similarity Between Health Sounds

This section lets you compare the similarity between different health sounds based on their HeAR embeddings.


In [None]:
from scipy.spatial import distance

def compare_audio_similarity(file_paths, overlap_percent=50):
    """
    Score how alike several recordings are, using their mean HeAR embeddings.

    Each file is run through ``analyze_audio_file`` with plots disabled. Files
    with at least one detected health clip are represented by the mean of
    their clip embeddings; files without detections are left out. Every
    ordered pair of the remaining files gets a cosine similarity
    (``1 - cosine distance``), which is also printed.

    Args:
        file_paths: List of paths to audio files
        overlap_percent: Percentage of overlap between adjacent clips

    Returns:
        Nested dictionary ``{file_name: {other_file_name: similarity}}``,
        keyed by the files' base names
    """
    print("Analyzing files and generating embeddings...")

    # One mean embedding per file that produced any health-related clip.
    file_embeddings = {}
    for path in file_paths:
        analysis = analyze_audio_file(path, overlap_percent=overlap_percent, plot=False)
        if analysis.get("health_clips_count", 0) > 0:
            file_embeddings[os.path.basename(path)] = np.mean(analysis["hear_embeddings"], axis=0)

    # Cosine similarity for every ordered pair of distinct files.
    similarities = {
        name_a: {
            name_b: 1 - distance.cosine(vector_a, vector_b)
            for name_b, vector_b in file_embeddings.items()
            if name_a != name_b
        }
        for name_a, vector_a in file_embeddings.items()
    }

    print("\nSimilarity scores between files:")
    for name_a, row in similarities.items():
        for name_b, score in row.items():
            print(f"  {name_a} vs {name_b}: {score:.3f}")

    return similarities

# Example: compare the three sample recordings using the default overlap.
file_paths = [
    'Woman_coughing_three_times.wav',
    'Man_coughing.ogg',
    'Sneezing.ogg',
]
similarities = compare_audio_similarity(file_paths=file_paths)
