In [31]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf
import numpy as np

In [32]:
librispeech_path = "/data/LLMs/librispeech/"

dataset = torchaudio.datasets.LIBRISPEECH(
    root=librispeech_path,  # or the full path to the parent directory
    url="train-clean-100",
    download=False
)

waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id = dataset[111]

def compute_waveform_length(waveform, sample_rate=16000):
    
    waveform_len = waveform.shape[1]
    print(f"{waveform_len / sample_rate} seconds") 
    
compute_waveform_length(waveform)
print(transcript)

2.81 seconds
WAS ASCENDING THE STAIRS LEADING TO DEBRAY'S APARTMENTS


In [20]:
from speechbrain.lobes.models.huggingface_transformers.hubert import HuBERT

hubert_path = "facebook/hubert-large-ls960-ft"
model_hubert = HuBERT(hubert_path, save_path="/data/LLMs/")

In [21]:
fea_hubert = model_hubert(waveform)

In [34]:
import torch
import torch.nn as nn
from torchaudio.models import Conformer

class LightweightHuBERTConformer(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Core Conformer Encoder
        self.conformer = Conformer(
            input_dim=1024,        # HuBERT feature dimension
            num_heads=4,           # Reduced from typical 8-16 heads
            ffn_dim=512,           # Balanced capacity vs size (original: 2048)
            num_layers=6,          # 6 layers vs typical 12-16
            depthwise_conv_kernel_size=31,
            dropout=0.1,
            use_group_norm=True,   # Better for small batches
            convolution_first=True # Better convergence per [2][4]
        )
        
        # Dimensionality reduction
        self.projection = nn.Sequential(
            nn.Linear(1024, 256),
            nn.GELU(),
            nn.LayerNorm(256)
        )

    def forward(self, features, lengths):
        """
        Args:
            features: (B, T, 1024) HuBERT features
            lengths: (B,) sequence lengths
        Returns:
            (B, T, 256) compressed representations
        """
        # Conformer processing
        x, _ = self.conformer(features, lengths)  # [B, T, 1024]
        
        # Projection to target dimension
        return self.projection(x)  # [B, T, 256]

In [35]:
model = LightweightHuBERTConformer()

In [36]:
output_model = model(fea_hubert, torch.tensor([fea_hubert.shape[1]]))

TypeError: LightweightHuBERTConformer.forward() missing 1 required positional argument: 'lengths'

In [2]:
import os
import torch
import torchaudio
import soundfile as sf
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Load the phoneme-based Wav2Vec2 model
model_name = "facebook/wav2vec2-lv-60-espeak-cv-ft"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

# Path to LibriSpeech dataset
librispeech_dir = "/data/LLMs/librispeech/LibriSpeech/train-clean-100"

# Function to process an audio file and extract phoneme representations
def extract_wav2vec_phonemes(audio_path):
    # Load and resample audio if needed
    speech, sample_rate = sf.read(audio_path)
    if sample_rate != 16000:
        speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(torch.tensor(speech)).numpy()

    # Tokenize input
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)

    # Get model output (hidden states, not logits)
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_states = outputs.last_hidden_state  # (batch_size, time_steps, hidden_size)

    return last_hidden_states

# Process all .flac files in LibriSpeech directory
flac_files = []
for root, _, files in os.walk(librispeech_dir):
    for file in files:
        if file.endswith(".flac"):
            flac_files.append(os.path.join(root, file))

# Extract phonemes from all files
for audio_file in flac_files[:10]:  # Process a subset first
    embedding = extract_wav2vec_phonemes(audio_file)
    print(f"File: {audio_file}")
    


AttributeError: 'CausalLMOutput' object has no attribute 'last_hidden_state'