## (Model Name): Real-Time Single Channel Speaker Counting 

In [None]:
import torch
import pandas as pd
import math
import soundfile as sf
import re
from pathlib import Path
import librosa

from src.utils.SpectrogramExtractor import SpectrogramExtractor

## Data prep

1. Collect the paths of the raw recordings and their metadata files.
2. Split each raw recording into 1-second mono clips.
3. Create a spectrogram for each 1-second clip.
4. Use the metadata to label each clip with the number of speakers active in it.

In [3]:
# Step 1 - collect paths

# Collect list of paths in libricss dataset. Returns the raw recording wav path & the meeting_info txt path.
# For each session there is (raw_recording.wav, meeting_info.txt)
def collect_paths(root_dir):
    """Collect (raw recording, meeting info) path pairs under ``root_dir``.

    The expected layout is ``root_dir/<overlap_dir>/<session_dir>/`` where each
    session directory holds ``record/raw_recording.wav`` and
    ``transcription/meeting_info.txt``. Sessions missing either file are
    skipped silently.

    Args:
        root_dir: Dataset root directory (str or Path).

    Returns:
        A list of ``(raw_path, info_path)`` tuples of ``Path`` objects, sorted
        by path so the order is identical on every run and filesystem.

    Raises:
        FileNotFoundError: If ``root_dir`` does not exist (raised by
            ``Path.iterdir``).
    """
    pairs = []

    # iterdir() yields entries in arbitrary order; sort for reproducibility.
    for overlap_dir in sorted(Path(root_dir).iterdir()):
        if not overlap_dir.is_dir():
            continue

        for session_dir in sorted(overlap_dir.iterdir()):
            # Stray files next to the session folders are not sessions.
            if not session_dir.is_dir():
                continue

            raw_path = session_dir / "record" / "raw_recording.wav"
            info_path = session_dir / "transcription" / "meeting_info.txt"
            if raw_path.exists() and info_path.exists():
                pairs.append((raw_path, info_path))

    return pairs

In [4]:
# Step 2 - split recordings into 1s mono clips


# Split a wav into chunks of length clip_dur. If clip_dur isn't a factor of the audio's length, the end is discarded.
# Necessary so spectrogram images are always same size. CNNs struggle with varying size inputs.

def split_wav_into_clips(wav_path, clip_dur=1.0, sr=16000):
    """Load a wav file as mono audio and cut it into equal-length clips.

    Args:
        wav_path: Path of the wav file to load.
        clip_dur: Length of each clip in seconds.
        sr: Sample rate passed to ``librosa.load``.

    Returns:
        A list of consecutive, non-overlapping slices of the loaded signal,
        each ``int(clip_dur * sr)`` samples long. Samples left over after the
        last full clip are dropped so every clip has the same length.
    """
    # mono=True asks librosa to do the mono conversion on load.
    samples, _ = librosa.load(wav_path, sr=sr, mono=True)

    samples_per_clip = int(clip_dur * sr)
    full_clips = len(samples) // samples_per_clip

    clips = []
    for start in range(0, full_clips * samples_per_clip, samples_per_clip):
        clips.append(samples[start : start + samples_per_clip])
    return clips


def wavs_to_clips(root_dir="data/libricss", out_dir="data/clips", clip_dur=1.0, sr=16000):
    """Split every raw recording under ``root_dir`` into clips saved as wav files.

    Each clip is written to ``out_dir`` as
    ``<condition>_<session>_clip<idx>.wav``, where ``<condition>`` is the name
    of the folder two levels above ``record/`` and ``<session>`` is the
    ``session<N>`` part of the session folder name (or the whole folder name
    when it contains no such part).

    Args:
        root_dir: Dataset root handed to ``collect_paths``.
        out_dir: Destination directory; created if missing.
        clip_dur: Clip length in seconds.
        sr: Sample rate used both to load the recording and to write the
            clips, so the two can never disagree.

    Returns:
        None. The clips are written to disk.
    """
    # Destination dir
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    session_re = re.compile(r"(session\d+)")

    for raw_path, _ in collect_paths(root_dir):
        # raw_path is <root>/<condition>/<session_folder>/record/raw_recording.wav
        condition = raw_path.parents[2].name
        session_folder = raw_path.parents[1].name
        m = session_re.search(session_folder)
        session = m.group(1) if m else session_folder

        # Split the recording into fixed-length mono clips
        clips = split_wav_into_clips(raw_path, clip_dur=clip_dur, sr=sr)

        # Save clips
        for idx, clip in enumerate(clips):
            fname = f"{condition}_{session}_clip{idx}.wav"
            sf.write(out_dir / fname, clip, sr)

# wavs_to_clips()

In [5]:
# Step 3 - turn each clip into spectrogram
def clips_to_specs():
    """Turn every wav clip in ``data/clips`` into a saved spectrogram tensor.

    For each ``*.wav`` file the spectrogram is computed with
    ``SpectrogramExtractor`` and stored in ``data/spectrograms`` under the
    same file name with a ``.pt`` suffix.
    """
    to_spectrogram = SpectrogramExtractor()

    source_dir = Path("data/clips")
    target_dir = Path("data/spectrograms")
    target_dir.mkdir(parents=True, exist_ok=True)

    for clip_path in source_dir.glob("*.wav"):
        # Per the original note the extractor returns [1, n_mels, time_frames]
        # (TODO confirm against SpectrogramExtractor); squeeze(0) drops that
        # leading channel dim, leaving [n_mels, time_frames].
        spectrogram = to_spectrogram(str(clip_path)).squeeze(0)

        # Persist as a PyTorch tensor next to the other spectrograms.
        target_name = clip_path.with_suffix(".pt").name
        torch.save(spectrogram, target_dir / target_name)

# clips_to_specs()

In [6]:
# Step 4 - labels for each spectrogram


# 1) Define a function that finds the number of speakers in each time window
def count_speakers_per_window(meeting_file, time_window=1.0):
    """Count the distinct speakers active in each consecutive time window.

    The file is read as whitespace-separated rows after one header line; the
    first two columns are the start and end time of an utterance in seconds.
    Window ``i`` covers ``[i * time_window, (i + 1) * time_window)``.

    NOTE(review): the third column is assumed to be the speaker id (the
    LibriCSS ``meeting_info.txt`` layout) and is used so that two utterances
    by the same speaker in one window count once — confirm against the data.
    Rows with only two columns are each treated as a separate speaker.

    Args:
        meeting_file: Path of the meeting info text file.
        time_window: Window length in seconds; must be positive.

    Returns:
        A list with one speaker count per window, covering time 0 up to the
        latest utterance end. Empty if the file has no utterance rows.

    Raises:
        ValueError: If ``time_window`` is not positive or a time column is
            not a number.
        IndexError: If a non-blank row has fewer than two columns.
    """
    if time_window <= 0:
        raise ValueError(f"time_window must be positive, got {time_window!r}")

    # Parse (start, end, speaker) from meeting_file
    segments = []
    with open(meeting_file, "r") as f:
        next(f, None)  # Skip header line; tolerate a completely empty file
        for line_no, line in enumerate(f):
            parts = line.split()
            if not parts:
                continue  # blank line, e.g. trailing newline at end of file
            start_time = float(parts[0])
            end_time = float(parts[1])
            # Fall back to a per-row id when no speaker column is present.
            speaker = parts[2] if len(parts) > 2 else ("line", line_no)
            segments.append((start_time, end_time, speaker))
    if not segments:
        return []

    # Collect the set of speakers active in each window
    max_time = max(end for _, end, _ in segments)
    num_windows = math.ceil(max_time / time_window)
    active = [set() for _ in range(num_windows)]

    for start, end, speaker in segments:
        # Clamp at 0 so a negative start cannot index from the end of the list.
        first_idx = max(int(start // time_window), 0)
        # Exclusive upper bound: an utterance ending exactly on a window
        # boundary does not spill into the next window.
        stop_idx = min(math.ceil(end / time_window), num_windows)

        for i in range(first_idx, stop_idx):
            active[i].add(speaker)

    return [len(speakers) for speakers in active]



# 2) Build a dict {spectrogram path -> its label, from 1)}


# Collect Key paths
# Collect Key paths
specs_dir = Path("data/spectrograms")
spec_to_label = {}


# Collect value paths: "<condition>_<session>" -> speaker count per window
counts_map = {}
session_re = re.compile(r"(session\d+)")
for _, info_path in collect_paths("data/libricss"):
    # NOTE: despite its name, `speaker` holds the name of the folder two
    # levels above transcription/ (e.g. "0L" in the output below).
    speaker = info_path.parents[2].name
    session_folder = info_path.parents[1].name

    m = session_re.search(session_folder)
    session = m.group(1) if m else session_folder

    key = f"{speaker}_{session}"
    counts_map[key] = count_speakers_per_window(info_path)


# Create key/value pair
# glob() order depends on the filesystem; sort so the rows of df come out in
# the same order on every run.
for spec_path in sorted(specs_dir.glob("*.pt")):
    stem = spec_path.stem
    parts = stem.rsplit("_clip", 1)
    if len(parts) != 2:
        continue  # not a "<key>_clip<idx>.pt" file

    key, idx_str = parts
    idx = int(idx_str)

    if key not in counts_map:
        raise KeyError(f"No counts for session key '{key}'")
    counts = counts_map[key]

    if idx < 0 or idx >= len(counts):
        raise IndexError(f"Clip index {idx} out of range for '{key}'")

    spec_to_label[spec_path] = counts[idx]


# 3) Result
records = [
    {"spectrogram": path.name, "speaker_count": label}
    for path, label in spec_to_label.items()
]
# Explicit columns keep the schema stable even when no spectrograms were found.
df = pd.DataFrame(records, columns=["spectrogram", "speaker_count"])
df.head()



# 4) Save mapping to csv
# df.to_csv("data/spectrogram_labels.csv", index=False)

Unnamed: 0,spectrogram,speaker_count
0,0L_session0_clip0.pt,0
1,0L_session0_clip1.pt,0
2,0L_session0_clip10.pt,1
3,0L_session0_clip100.pt,1
4,0L_session0_clip101.pt,1


## Final DataFrame: Labelled Spectrograms

In [7]:
# Show the first 10 rows of the labelled-spectrogram DataFrame.
df.head(10)

Unnamed: 0,spectrogram,speaker_count
0,0L_session0_clip0.pt,0
1,0L_session0_clip1.pt,0
2,0L_session0_clip10.pt,1
3,0L_session0_clip100.pt,1
4,0L_session0_clip101.pt,1
5,0L_session0_clip102.pt,1
6,0L_session0_clip103.pt,1
7,0L_session0_clip104.pt,1
8,0L_session0_clip105.pt,1
9,0L_session0_clip106.pt,1
