In [1]:
# Notebook 1: extract_features_part1.ipynb
import os
import torch
import torchaudio
import pandas as pd
import numpy as np
from tqdm import tqdm
from speechbrain.pretrained import SpeakerRecognition

# --------- Load CSV & Prepare Partial Data ---------
# Read the clip metadata and keep only rows that have both a speaker id and a clip path.
df = pd.read_csv("D:/Projects/speech_project/other.csv")
df = df.loc[:, ['client_id', 'path']].dropna()

# Resolve every clip name against the clips directory, then drop rows whose file is missing.
df['full_path'] = df['path'].map(
    lambda clip: os.path.join("D:/Projects/speech_project/clips", clip)
)
df = df[df['full_path'].map(os.path.exists)]

# Keep speakers with at least 2 clips
speaker_counts = df['client_id'].value_counts()
df = df[df['client_id'].isin(speaker_counts.index[speaker_counts.to_numpy() >= 2])]

# Encode labels: remaining speaker ids, sorted, are numbered consecutively from 0.
sorted_speakers = sorted(df['client_id'].unique())
label_to_id = dict(zip(sorted_speakers, range(len(sorted_speakers))))
df['label'] = df['client_id'].map(label_to_id)

# Take the second half of the rows (position len(df)//2 onward), not the first half.
midpoint = len(df) // 2
df_part = df.iloc[midpoint:]

# --------- Load ECAPA-TDNN Model ---------
# Speaker model built from the "speechbrain/spkrec-ecapa-voxceleb" source,
# with "speechbrain_model" passed as its save directory.
speaker_model = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="speechbrain_model",
)

# --------- Feature Extraction ---------
def extract_embedding(audio_path, sample_rate=16000, duration=3.0):
    """Return a speaker embedding for one audio file, computed on a fixed-length clip.

    The file is loaded, averaged over dim 0 (the channel axis in torchaudio's
    default channels-first layout), resampled to ``sample_rate`` when needed,
    and then zero-padded or truncated to exactly ``duration`` seconds before
    being passed to the module-level ``speaker_model``.

    Args:
        audio_path: Path of the audio file, read with ``torchaudio.load``.
        sample_rate: Sampling rate in Hz that the waveform is converted to.
        duration: Length in seconds that every clip is padded or cut to.

    Returns:
        The output of ``speaker_model.encode_batch`` for the clip, squeezed
        and converted to a NumPy array.
    """
    waveform, sr = torchaudio.load(audio_path)
    mono = waveform.mean(dim=0)

    # Building a Resample transform is only worthwhile when the rates differ;
    # for equal rates the transform would return its input unchanged anyway.
    if sr != sample_rate:
        mono = torchaudio.transforms.Resample(sr, sample_rate)(mono)

    fixed_len = int(sample_rate * duration)
    missing = fixed_len - mono.size(0)
    if missing > 0:
        # Short clip: append zeros on the right up to the fixed length.
        mono = torch.nn.functional.pad(mono, (0, missing))
    else:
        # Long (or exact) clip: keep only the leading fixed_len samples.
        mono = mono[:fixed_len]

    with torch.no_grad():
        embedding = speaker_model.encode_batch(mono.unsqueeze(0))

    # .cpu() keeps the NumPy conversion valid if the model output lives on
    # another device; it is a no-op for tensors that are already on the CPU.
    return embedding.squeeze().cpu().numpy()

# --------- Extract & Save ---------
# Compute one embedding per clip in df_part. A clip that fails is reported and
# skipped so that a single bad file does not abort the whole run.
features, labels = [], []
for audio_path, label in tqdm(zip(df_part['full_path'], df_part['label']), total=len(df_part)):
    try:
        emb = extract_embedding(audio_path)
    except Exception as e:
        print(f"Failed: {audio_path} - {e}")
    else:
        features.append(emb)
        labels.append(label)

# Save to disk (append mode)
features = np.array(features)
labels = np.array(labels)

if len(features) == 0:
    # With no new embeddings, `features` has shape (0,). Concatenating that with
    # previously saved 2-D features raises, and saving it would write an empty
    # array, so the files on disk are left exactly as they are.
    print("No embeddings extracted; features.npy and labels.npy left unchanged.")
else:
    if os.path.exists("features.npy"):
        # Append this run's results after whatever an earlier run stored.
        prev_feat = np.load("features.npy")
        prev_lab = np.load("labels.npy")
        features = np.concatenate([prev_feat, features])
        labels = np.concatenate([prev_lab, labels])

    np.save("features.npy", features)
    np.save("labels.npy", labels)


  if ismodule(module) and hasattr(module, '__file__'):
  from speechbrain.pretrained import SpeakerRecognition
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
100%|██████████| 13509/13509 [1:38:49<00:00,  2.28it/s]  
