In [3]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf
import numpy as np

In [4]:
import re
from g2p_en import G2p
# Build the grapheme-to-phoneme converter once; convert_to_phonemes() calls this shared instance.
g2p = G2p()

# Phoneme label inventory (39 symbols). A label's position in this list is its index.
PHONE_DEF = (
    'AA AE AH AO AW '
    'AY B CH D DH '
    'EH ER EY F G '
    'HH IH IY JH K '
    'L M N NG OW '
    'OY P R S SH '
    'T TH UH UW V '
    'W Y Z ZH'
).split()
# Same inventory with the silence / word-boundary label appended as the last entry.
PHONE_DEF_SIL = [*PHONE_DEF, 'SIL']

def phoneToId(p):
    """Return the zero-based position of phoneme label ``p`` in ``PHONE_DEF_SIL``.

    Raises:
        ValueError: if ``p`` is not one of the labels in ``PHONE_DEF_SIL``.
    """
    phone_index = PHONE_DEF_SIL.index(p)
    return phone_index

def convert_to_phonemes(transcript, maxSeqLen=500):
    """Convert a text transcript into a fixed-length array of phoneme class IDs.

    The transcript is stripped of everything except letters, hyphens, spaces
    and apostrophes, lower-cased, and passed through the module-level ``g2p``
    converter. Stress digits are removed from each phoneme, a ``'SIL'`` label
    is inserted for every space token ``g2p`` emits, and one more ``'SIL'`` is
    appended at the end.

    Args:
        transcript: Raw transcript string.
        maxSeqLen: Length of the returned array. Defaults to 500.

    Returns:
        ``np.ndarray`` of dtype int32 and shape ``(maxSeqLen,)``. The first
        ``len(phonemes)`` entries are ``phoneToId(p) + 1`` (so IDs start at 1);
        the remaining entries are 0 and act as padding.

    Raises:
        ValueError: if the transcript produces more than ``maxSeqLen`` labels,
            or if a phoneme is not present in ``PHONE_DEF_SIL``.
    """
    cleaned = re.sub(r'[^a-zA-Z\- \']', '', transcript.strip())
    cleaned = cleaned.replace('--', '').lower()
    addInterWordSymbol = True

    phonemes = []
    for p in g2p(cleaned):
        # A space token from g2p marks a word boundary.
        if addInterWordSymbol and p == ' ':
            phonemes.append('SIL')
        p = re.sub(r'[0-9]', '', p)  # Remove stress
        if re.match(r'[A-Z]+', p):  # Only keep phonemes (drops spaces / punctuation)
            phonemes.append(p)

    # Add one SIL symbol at the end so every word, including the last, is followed by one.
    if addInterWordSymbol:
        phonemes.append('SIL')

    seqLen = len(phonemes)
    # Fail with a readable message instead of a NumPy broadcast error when the
    # label sequence does not fit; silently truncating would corrupt the targets.
    if seqLen > maxSeqLen:
        raise ValueError(
            f"Transcript yields {seqLen} phoneme labels, which exceeds maxSeqLen={maxSeqLen}"
        )

    seqClassIDs = np.zeros([maxSeqLen]).astype(np.int32)
    # +1 keeps 0 free as the padding value.
    seqClassIDs[0:seqLen] = [phoneToId(p) + 1 for p in phonemes]
    return seqClassIDs


In [7]:
from speechbrain.lobes.models.huggingface_transformers.hubert import HuBERT

# Identifier of the HuBERT checkpoint handed to the SpeechBrain wrapper.
hubert_path = "facebook/hubert-large-ls960-ft"
# Build the HuBERT encoder. save_path is presumably where the wrapper keeps the
# downloaded checkpoint files, and freeze=True presumably disables gradient
# updates for the encoder — confirm both against the SpeechBrain documentation.
model_hubert = HuBERT(hubert_path, save_path="/data/LLMs/", freeze=True)



In [None]:
# Open the local LibriSpeech "train-clean-100" split and inspect its first item.
# The names defined here (dataset, waveform, transcript, ...) are reused by later cells.

librispeech_path = "/data/LLMs/librispeech/"

dataset = torchaudio.datasets.LIBRISPEECH(
    root=librispeech_path,  # directory under which the dataset is looked up; nothing is downloaded (download=False)
    url="train-clean-100",
    download=False
)

sample = dataset[0]

# Unpack the six fields of one dataset item
waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id = sample
print(transcript)

In [None]:
from torch.utils.data import DataLoader
import torchaudio.transforms as T

# Custom collate function with padding
def collate_fn(batch):
    """Collate dataset items into a zero-padded waveform batch.

    Args:
        batch: Sequence of dataset items; ``item[0]`` is the waveform (its
            leading dimension is squeezed away) and ``item[2]`` is the
            transcript string.

    Returns:
        Tuple ``(padded_waveforms, lengths, transcripts)``:
        ``padded_waveforms`` is the batch-first tensor produced by
        ``pad_sequence``, ``lengths`` holds each waveform's unpadded size along
        its first dimension, and ``transcripts`` is a list with one
        ``convert_to_phonemes`` result per item.
    """
    clips = []
    clip_sizes = []
    phoneme_targets = []
    for item in batch:
        clip = item[0].squeeze(0)  # drop the channel dimension
        clips.append(clip)
        clip_sizes.append(clip.shape[0])
        phoneme_targets.append(convert_to_phonemes(item[2]))

    lengths = torch.tensor(clip_sizes)

    # Right-pad every clip with zeros up to the longest clip in the batch.
    padded_waveforms = torch.nn.utils.rnn.pad_sequence(clips, batch_first=True)

    return padded_waveforms, lengths, phoneme_targets

# Create the DataLoader: shuffled batches of 8 utterances, padded by collate_fn.
# collate_fn (and therefore the phoneme conversion) runs inside the 4 worker processes.
dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=4
)

# Smoke test: push one padded batch through HuBERT and print the resulting shapes.
for waveforms, lengths, transcripts in dataloader:
    # Make sure the encoder receives float32 samples.
    waveforms = waveforms.float()

    # SpeechBrain encoders take *relative* lengths (fraction of the padded
    # width). Passing them lets the wrapper mask the zero padding instead of
    # encoding it as if it were audio.
    wav_lens = lengths.float() / waveforms.shape[1]

    # Forward pass (no gradients needed for feature extraction)
    with torch.no_grad():
        embeddings = model_hubert(waveforms, wav_lens)

    print(f"Input shape: {waveforms.shape}")
    print(f"Embeddings shape: {embeddings.shape}")
    break  # one batch is enough for the check

In [None]:
# Encode the single utterance held in `waveform` (unpacked from dataset[0] in an earlier cell).
fea_hubert = model_hubert(waveform)

In [None]:
import torch
import torch.nn as nn
from torchaudio.models import Conformer

class LightweightHuBERTConformer(nn.Module):
    """Small Conformer encoder followed by a linear projection.

    Takes frame-level features (1024-wide by default) and returns a
    lower-dimensional representation per frame. With the default arguments
    the encoder has 6 layers, 4 attention heads, a 512-wide feed-forward
    module, GroupNorm in the convolution module and convolution-first blocks.

    Args:
        input_dim: Size of the last dimension of the input features.
        num_heads: Number of attention heads in each Conformer layer.
        ffn_dim: Hidden size of the Conformer feed-forward modules.
        num_layers: Number of Conformer layers.
        depthwise_conv_kernel_size: Kernel size of the depthwise convolution.
        dropout: Dropout probability used inside the Conformer.
        use_group_norm: Use GroupNorm instead of BatchNorm1d in the
            convolution module.
        convolution_first: Apply the convolution module ahead of the
            attention module.
        output_dim: Size of the last dimension of the projected output.
    """

    def __init__(self, input_dim=1024, num_heads=4, ffn_dim=512, num_layers=6, depthwise_conv_kernel_size=31,
                 dropout=0.1, use_group_norm=True, convolution_first=True, output_dim=256):
        super().__init__()

        # Core Conformer encoder; every hyper-parameter comes from the constructor.
        self.conformer = Conformer(
            input_dim=input_dim,
            num_heads=num_heads,
            ffn_dim=ffn_dim,
            num_layers=num_layers,
            depthwise_conv_kernel_size=depthwise_conv_kernel_size,
            dropout=dropout,
            use_group_norm=use_group_norm,
            convolution_first=convolution_first,
        )

        # Dimensionality reduction: input_dim -> output_dim per frame.
        self.projection = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.GELU(),
            nn.LayerNorm(output_dim)
        )

    def forward(self, features, lengths):
        """Encode features and project them to ``output_dim``.

        Args:
            features: (B, T, input_dim) input features.
            lengths: (B,) number of valid frames for each sequence.
        Returns:
            (B, T, output_dim) compressed representations.
        """
        # The Conformer also returns output lengths; they are not needed here.
        x, _ = self.conformer(features, lengths)  # [B, T, input_dim]

        # Projection to the target dimension
        return self.projection(x)  # [B, T, output_dim]

In [None]:
# Sanity check on feature scale: std of the values at index 0 along dim 1
# (presumably the first frame — confirm the feature layout) of the HuBERT output.
torch.std(fea_hubert[:, 0, :])

In [None]:
# Instantiate the Conformer head with its default hyper-parameters.
# NOTE: the name `model` is rebound to a Wav2Vec2ForCTC instance later in this
# notebook; re-run this cell before using the Conformer again.
model = LightweightHuBERTConformer()

In [None]:
# Run the Conformer head on the HuBERT features. The length tensor has one entry,
# the full size of dim 1, which assumes fea_hubert is a batch of one unpadded utterance.
output_model = model(fea_hubert, torch.tensor([fea_hubert.shape[1]]))

In [None]:
# Display the shape of the projected output.
output_model.shape

In [None]:
import os
import torch
import torchaudio
import soundfile as sf
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Load the phoneme-based Wav2Vec2 model and its matching processor.
# NOTE: this rebinds `model`, which an earlier cell assigned to a
# LightweightHuBERTConformer instance.
model_name = "facebook/wav2vec2-lv-60-espeak-cv-ft"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

# Directory that is walked for .flac files (LibriSpeech train-clean-100 split)
librispeech_dir = "/data/LLMs/librispeech/LibriSpeech/train-clean-100"

# Function to process an audio file and extract phoneme representations
def extract_wav2vec_phonemes(audio_path):
    """Return the final-layer hidden states of the Wav2Vec2 model for one audio file.

    Uses the module-level ``processor`` and ``model``. The audio is read with
    ``soundfile``, reduced to mono, and resampled to 16 kHz when needed.

    Args:
        audio_path: Path to an audio file that ``soundfile`` can read.

    Returns:
        Tensor of shape (1, time_steps, hidden_size) with the output of the
        last encoder layer (hidden states, not logits).
    """
    # Read as float32 so the samples match the dtype of torchaudio's resampling kernel.
    speech, sample_rate = sf.read(audio_path, dtype="float32")

    # soundfile returns (frames, channels) for multi-channel files; average to mono
    # so that resampling and the processor both see a 1-D signal.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # Resample audio if needed
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        speech = resampler(torch.tensor(speech)).numpy()

    # Tokenize input
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)

    # The CTC model's output exposes `logits` and, only on request, `hidden_states`;
    # it has no `last_hidden_state`, so ask for hidden states and take the last one.
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    last_hidden_states = outputs.hidden_states[-1]  # (batch_size, time_steps, hidden_size)
    return last_hidden_states

# Collect the path of every .flac file found under the LibriSpeech directory
flac_files = []
for root, _, files in os.walk(librispeech_dir):
    flac_files.extend(
        os.path.join(root, name) for name in files if name.endswith(".flac")
    )

# Run the extractor on the first 10 discovered files only, as a smoke test.
# NOTE: the returned hidden states are bound to `embedding` but are neither
# stored nor inspected here; only the file path is printed.
for audio_file in flac_files[:10]:  # Process a subset first
    embedding = extract_wav2vec_phonemes(audio_file)
    print(f"File: {audio_file}")
    
