# Imports

# Dev

## clap-ipa


# Misc

In [None]:
import torch  # required below for torch.device / torch.no_grad
import torch.nn.functional as F
from clap.encoders import PhoneEncoder, SpeechEncoder
from transformers import AutoProcessor, DebertaV2Tokenizer

# Quick-start sketch: embed an audio clip and an IPA string with the CLAP-IPA
# encoders and compare the two embeddings with cosine similarity.

# Initialize device
device = torch.device("cpu")

# Load encoders, switch them to eval mode and move them to the device
speech_encoder = SpeechEncoder.from_pretrained('anyspeech/clap-ipa-tiny-speech')
phone_encoder = PhoneEncoder.from_pretrained('anyspeech/clap-ipa-tiny-phone')
phone_encoder.eval().to(device)
speech_encoder.eval().to(device)

# Initialize tokenizer and processor
tokenizer = DebertaV2Tokenizer.from_pretrained('charsiu/IPATokenizer')
processor = AutoProcessor.from_pretrained('openai/whisper-tiny')

# Process inputs
# NOTE(review): `some_audio` and `some_ipa_string` are placeholders that are
# not defined anywhere in this cell -- assign them before running it.
audio_input = processor(some_audio)
ipa_input = tokenizer(some_ipa_string)

# Generate embeddings (no gradients are needed for inference)
with torch.no_grad():
    speech_embed = speech_encoder(audio_input)
    phone_embed = phone_encoder(ipa_input)

# Calculate similarity along the last (embedding) dimension
similarity = F.cosine_similarity(speech_embed, phone_embed, dim=-1)

In [None]:
import functools
import os
import time
from typing import Dict, List, Tuple

import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
from clap.encoders import PhoneEncoder, SpeechEncoder
from transformers import AutoProcessor, DebertaV2Tokenizer

# Global configuration
# DEVICE is the torch device that the encoders and every input tensor below
# are moved to; it is fixed to the CPU here and echoed once at import time.
DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")

def measure_time(func):
    """Decorator that prints how long each call to *func* takes.

    The wrapped callable keeps the original's ``__name__``, docstring and
    other metadata, and returns the original's result unchanged. The elapsed
    time is printed in milliseconds after the call returns; nothing is
    printed when *func* raises.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, so the measurement
        # cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_time = (time.perf_counter() - start_time) * 1000
        print(f"{func.__name__} took {execution_time:.2f} ms")
        return result
    return wrapper

class AudioProcessor:
    """Score how well an IPA transcription matches an audio clip.

    Bundles the CLAP-IPA speech and phone encoders with the Whisper processor
    and the IPA tokenizer. ``initialize_models`` must be called before any
    other method; until then every attribute is ``None``.
    """

    # Sampling rate (Hz) declared to the processor in ``process_audio``.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self):
        self.speech_encoder = None
        self.phone_encoder = None
        self.tokenizer = None
        self.processor = None

    @measure_time
    def initialize_models(self) -> None:
        """Initialize and configure all required models and tokenizers."""
        # Load models
        self.speech_encoder = SpeechEncoder.from_pretrained('anyspeech/clap-ipa-tiny-speech')
        self.phone_encoder = PhoneEncoder.from_pretrained('anyspeech/clap-ipa-tiny-phone')

        # Configure models for inference
        self._setup_inference_mode()

        # Load tokenizers
        self.tokenizer = DebertaV2Tokenizer.from_pretrained('charsiu/IPATokenizer')
        self.processor = AutoProcessor.from_pretrained('openai/whisper-tiny')

    def _setup_inference_mode(self) -> None:
        """Put both encoders in eval mode on ``DEVICE`` and freeze their weights."""
        for model in (self.speech_encoder, self.phone_encoder):
            model.eval()
            model.to(DEVICE)
            # Freeze only these models' parameters rather than calling
            # torch.set_grad_enabled(False), which would switch autograd off
            # for the whole process as a hidden side effect.
            model.requires_grad_(False)

    @staticmethod
    def _resample_linear(sample: np.ndarray, source_rate: int, target_rate: int) -> np.ndarray:
        """Resample a mono waveform to *target_rate* by linear interpolation."""
        if source_rate == target_rate or sample.size == 0:
            return sample
        target_length = max(1, int(round(sample.shape[0] * target_rate / source_rate)))
        source_times = np.arange(sample.shape[0]) / source_rate
        target_times = np.arange(target_length) / target_rate
        # Lightweight fallback (no anti-aliasing filter); np.interp returns
        # float64, so cast back to the float32 the rest of the pipeline uses.
        return np.interp(target_times, source_times, sample).astype(np.float32)

    @measure_time
    def process_audio(self, audio_path: str, cache_dir: str = None) -> Dict[str, torch.Tensor]:
        """Load an audio file and prepare it as speech-encoder input.

        Args:
            audio_path: Path of the audio file, read with ``soundfile``.
            cache_dir: Currently unused; kept so existing callers keep working.

        Returns:
            The processor's output tensors, each moved to ``DEVICE``.
        """
        sample, sample_rate = sf.read(audio_path, dtype='float32')

        # soundfile returns (frames, channels) for multi-channel files, but a
        # single mono waveform is what gets handed to the processor below.
        if sample.ndim > 1:
            sample = sample.mean(axis=1)

        # The processor is told the audio is TARGET_SAMPLE_RATE Hz, so make
        # that true instead of silently mislabeling files recorded at
        # another rate.
        sample = self._resample_linear(sample, sample_rate, self.TARGET_SAMPLE_RATE)

        audio_input = self.processor(
            sample,
            sampling_rate=self.TARGET_SAMPLE_RATE,
            return_tensors='pt',
            return_attention_mask=True
        )
        return {k: v.to(DEVICE) for k, v in audio_input.items()}

    @measure_time
    def process_ipa(self, ipa_list: List[str]) -> torch.Tensor:
        """Tokenize IPA text and prepare it as phone-encoder input.

        Args:
            ipa_list: IPA fragments; they are concatenated without a
                separator before tokenization.

        Returns:
            A 1-D tensor of token ids on ``DEVICE`` (no special tokens added).

        Raises:
            ValueError: If the concatenated IPA text is empty.
        """
        ipa_string = ''.join(ipa_list)
        if not ipa_string:
            # An empty id list would yield a zero-length tensor and fail
            # later with a far less readable error.
            raise ValueError("ipa_list must contain at least one IPA symbol")

        token_ids = self.tokenizer(
            ipa_string,
            return_attention_mask=False,
            return_token_type_ids=False,
            add_special_tokens=False
        )['input_ids']
        return torch.tensor(token_ids, dtype=torch.long, device=DEVICE)

    @measure_time
    def compute_similarity(
        self,
        audio_input: Dict[str, torch.Tensor],
        transcript_tokens: torch.Tensor
    ) -> float:
        """Compute similarity between audio and IPA transcription.

        Args:
            audio_input: Tensors as returned by ``process_audio``.
            transcript_tokens: Token ids as returned by ``process_ipa``.

        Returns:
            Cosine similarity between the mean speech feature vector and the
            mean phone feature vector, as a Python float.
        """
        # Callers may pass tensors created elsewhere; make sure they live on
        # the same device as the encoders.
        audio_input = {k: v.to(DEVICE) for k, v in audio_input.items()}
        transcript_tokens = transcript_tokens.to(DEVICE)

        with torch.inference_mode():
            speech_features = self._get_speech_features(audio_input)
            phone_features = self._get_phone_features(transcript_tokens)

            # Average each feature sequence over its first dimension,
            # keeping a leading axis of size 1 for cosine_similarity.
            speech_embed = torch.mean(speech_features, dim=0, keepdim=True)
            phone_embed = torch.mean(phone_features, dim=0, keepdim=True)

            similarity = F.cosine_similarity(speech_embed, phone_embed, dim=1)

        return similarity.item()

    def _get_speech_features(self, audio_input: Dict[str, torch.Tensor]) -> torch.Tensor:
        """Return the speech encoder's ``last_hidden_state`` with dim 0 squeezed."""
        # squeeze(0) drops the leading axis when it has size 1
        # (presumably the batch axis -- confirm against the encoder).
        return self.speech_encoder(**audio_input, return_dict=True).last_hidden_state.squeeze(0)

    def _get_phone_features(self, transcript_tokens: torch.Tensor) -> torch.Tensor:
        """Return the phone encoder's ``last_hidden_state`` with dim 0 squeezed."""
        # The tokens get a leading axis for the encoder call, which is
        # squeezed away again from the output.
        return self.phone_encoder(transcript_tokens.unsqueeze(0)).last_hidden_state.squeeze(0)

def main():
    """Run the demo: load models, encode one clip and one IPA string, report the score."""
    pipeline = AudioProcessor()

    # Model and tokenizer loading
    print("\nInitializing models and tokenizers...")
    pipeline.initialize_models()

    # Audio -> encoder inputs
    print("\nProcessing audio...")
    features = pipeline.process_audio("data/audio/intro_model_chatGPT_35.mp3")

    # IPA fragments -> token ids
    print("\nProcessing IPA string...")
    tokens = pipeline.process_ipa(['dʒi', 'pi', 'ti'])

    # Time only the similarity computation
    print("\nComputing similarity...")
    began = time.time()
    score = pipeline.compute_similarity(features, tokens)
    elapsed_ms = (time.time() - began) * 1000

    # Report
    print("\nResults:")
    print(f"Similarity score: {score:.4f}")
    print(f"Total inference time: {elapsed_ms:.2f} ms")


if __name__ == "__main__":
    main()

# Ref

- https://github.com/search?q=Keyword+spotting&type=repositories&s=stars&o=desc
  - https://github.com/wenet-e2e/wekws
  - https://github.com/harvard-edge/multilingual_kws
  - https://github.com/lingjzhu/clap-ipa
- https://paperswithcode.com/task/keyword-spotting
- https://huggingface.co/anyspeech?sort_models=likes#models
- https://huggingface.co/search/full-text?q=keyword+spotting&type=model&type=space