In [1]:
import os
import numpy as np
import pandas as pd
import glob
from tqdm.auto import tqdm

import torch
import torchaudio

from sklearn.model_selection    import train_test_split
from sklearn.preprocessing      import StandardScaler
from sklearn.pipeline           import Pipeline
from sklearn.ensemble           import RandomForestClassifier
from sklearn.neighbors          import KNeighborsClassifier
from sklearn.svm                import SVC
from sklearn.semi_supervised    import LabelPropagation
from sklearn.metrics            import classification_report, confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt

DEVICE         = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_CSV       = './Data/data_labeled_filtered.csv'
AUDIO_ROOT     = './Data/Audio'

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
import os
import pandas as pd
import torch
import torchaudio
from torchaudio.functional import detect_pitch_frequency, spectrogram
from tqdm import tqdm
import warnings

# --- Configuration ---
# Constants
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DATA_CSV = './Data/data_labeled_filtered.csv' # Path to your input CSV
AUDIO_ROOT = './Data/Audio' # Root directory containing audio subfolders/files
OUTPUT_CSV = './Data/data_labeled_filtered_with_pitch.csv' # Path to save the output CSV

# --- FFmpeg Backend Check (Optional but Recommended) ---
try:
    # Check if ffmpeg backend is available
    torchaudio.utils.ffmpeg_available()
    # You might want to explicitly set the backend, though often not necessary if ffmpeg is in PATH
    # torchaudio.set_audio_backend("ffmpeg")
    print("FFmpeg backend found.")
except RuntimeError:
    warnings.warn(
        "FFmpeg backend not available. MP3 loading might fail. "
        "Please install FFmpeg and ensure it's in your system's PATH."
    )
except AttributeError:
     # Older torchaudio versions might not have ffmpeg_available()
     print("Could not check for FFmpeg backend (might be an older torchaudio version). Assuming it might work.")


# --- Load Data ---
print(f"Loading data from {DATA_CSV}...")
try:
    df = pd.read_csv(DATA_CSV)
except FileNotFoundError:
    print(f"Error: Input CSV file not found at {DATA_CSV}")
    exit()

# Initialize the new column, overwriting if it exists
df['pitch_mean'] = None
# Ensure the pitch column is float type to handle NaN/None properly
df['pitch_mean'] = df['pitch_mean'].astype(float)

print(f"Using device: {DEVICE}")

# --- Pitch Extraction Function ---
def extract_pitch(audio_path):
    """
    Extracts the mean pitch (F0) from an audio file.
    Returns the mean pitch frequency (float) or None if pitch cannot be detected or an error occurs.
    """
    try:
        waveform, sample_rate = torchaudio.load(audio_path)
    except Exception as e:
        print(f"Error loading audio file {os.path.basename(audio_path)}: {e}")
        return None

    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    waveform = waveform.to(DEVICE)

    # --- Method 1: detect_pitch_frequency (Preferred) ---
    try:
        # --- ADJUSTMENT ---
        # Instead of strictly 30ms, let's try a fixed window size like 1024 samples.
        # This is common in FFT-based methods and might avoid internal size conflicts.
        # 1024 samples at 48kHz is ~21.3ms, which is still reasonable for pitch.
        # You could also try 2048 (~42.6ms) if 1024 is too short for low pitches.
        # Let's start with 1024.
        win_length = 1024

        # Keep the hop time at 10ms
        frame_time = 0.01
        # The hop_length in samples would be int(sample_rate * frame_time)
        # e.g., 480 samples at 48kHz

        # Ensure win_length is not smaller than what the function requires internally
        # (Usually not an issue unless win_length is made very small)
        # And ensure win_length <= waveform length
        if win_length > waveform.shape[-1]:
             print(f"Warning: win_length ({win_length}) > waveform length ({waveform.shape[-1]}) for {os.path.basename(audio_path)}. Skipping pitch detection.")
             # Or adjust win_length = waveform.shape[-1] if you want to process very short files
             return None


        pitch = detect_pitch_frequency(
            waveform,
            sample_rate=sample_rate,
            frame_time=frame_time,  # Corresponds to hop_length = int(sample_rate * frame_time)
            win_length=win_length,  # Using the adjusted window length
            freq_low=50.0,
            freq_high=500.0
        ).squeeze()

        pitch = pitch[pitch > 0]

        if pitch.numel() > 0:
            return pitch.mean().item()
        else:
            return None

    except Exception as e:
        # Print the specific error type and message for better debugging
        print(f"detect_pitch_frequency failed for {os.path.basename(audio_path)}: {type(e).__name__}: {e}. Trying fallback.")

        # --- Method 2: Spectrogram-based Fallback (Less Accurate) ---
        # (Fallback code remains the same as before)
        try:
            n_fft = 1024
            spec_win_length = 400 # Can keep this independent or link to pitch win_length
            spec_hop_length = 100
            window = torch.hann_window(spec_win_length).to(DEVICE)

            spec = spectrogram(
                waveform=waveform, pad=0, window=window, n_fft=n_fft,
                hop_length=spec_hop_length, win_length=spec_win_length,
                power=1, normalized=False,
            ).squeeze(0)

            freqs = torch.fft.rfftfreq(n_fft, 1/sample_rate).to(DEVICE) # Move freqs to device
            spec_sum_over_time = spec.sum(dim=-1)

            valid_freq_indices = torch.where((freqs >= 50.0) & (freqs <= 500.0))[0]
            if len(valid_freq_indices) > 0:
                 max_mag_idx_in_valid = torch.argmax(spec_sum_over_time[valid_freq_indices])
                 main_freq_idx = valid_freq_indices[max_mag_idx_in_valid]
            else:
                 main_freq_idx = torch.argmax(spec_sum_over_time)

            dominant_freq = freqs[main_freq_idx].item()
            return dominant_freq

        except Exception as fallback_e:
            print(f"Spectrogram fallback also failed for {os.path.basename(audio_path)}: {fallback_e}")
            return None

# --- Iterate and Extract Pitch ---
print("Starting pitch extraction...")
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting pitch"):
    # Construct full audio path, handle potential missing values in 'path' column
    relative_path = row.get('path') # Use .get for safety if 'path' column might be missing
    if pd.isna(relative_path):
        # print(f"Warning: Missing audio path for row index {idx}. Skipping.")
        continue # Skip row if path is NaN or None

    audio_file = os.path.join(AUDIO_ROOT, relative_path)

    if os.path.exists(audio_file):
        # extract_pitch handles internal errors and returns None on failure
        pitch_value = extract_pitch(audio_file)
        # Use .loc for potentially faster assignment, especially on large dataframes
        df.loc[idx, 'pitch_mean'] = pitch_value
    else:
        print(f"Warning: Audio file not found at {audio_file} for row index {idx}. Skipping.")
        # df.loc[idx, 'pitch_mean'] = None # Already initialized to None, but explicit is ok

# --- Save Output ---
print(f"\nSaving results to {OUTPUT_CSV}...")
try:
    df.to_csv(OUTPUT_CSV, index=False)
    print("Pitch extraction complete.")
except Exception as e:
    print(f"Error saving CSV file: {e}")

Could not check for FFmpeg backend (might be an older torchaudio version). Assuming it might work.
Loading data from ./Data/data_labeled_filtered.csv...
Using device: cuda
Starting pitch extraction...


Extracting pitch:   0%|          | 5/172158 [00:02<22:14:10,  2.15it/s]

detect_pitch_frequency failed for common_voice_en_19687174.mp3: RuntimeError: maximum size for tensor at dimension 1 is 1006 but size is 1024. Trying fallback.


Extracting pitch:   0%|          | 6/172158 [00:02<21:32:57,  2.22it/s]

detect_pitch_frequency failed for common_voice_en_18421093.mp3: RuntimeError: maximum size for tensor at dimension 1 is 948 but size is 1024. Trying fallback.


Extracting pitch:   0%|          | 7/172158 [00:03<20:58:00,  2.28it/s]

detect_pitch_frequency failed for common_voice_en_18421094.mp3: RuntimeError: maximum size for tensor at dimension 1 is 874 but size is 1024. Trying fallback.


Extracting pitch:   0%|          | 8/172158 [00:03<20:52:19,  2.29it/s]

detect_pitch_frequency failed for common_voice_en_18421095.mp3: RuntimeError: maximum size for tensor at dimension 1 is 963 but size is 1024. Trying fallback.


Extracting pitch:   0%|          | 9/172158 [00:04<20:37:15,  2.32it/s]

detect_pitch_frequency failed for common_voice_en_18421096.mp3: RuntimeError: maximum size for tensor at dimension 1 is 946 but size is 1024. Trying fallback.


Extracting pitch:   0%|          | 15/172158 [00:06<19:10:32,  2.49it/s]

detect_pitch_frequency failed for common_voice_en_18841725.mp3: RuntimeError: maximum size for tensor at dimension 1 is 987 but size is 1024. Trying fallback.


Extracting pitch:   0%|          | 16/172158 [00:06<19:11:34,  2.49it/s]

detect_pitch_frequency failed for common_voice_en_19093168.mp3: RuntimeError: maximum size for tensor at dimension 1 is 1013 but size is 1024. Trying fallback.


Extracting pitch:   0%|          | 17/172158 [00:07<19:13:57,  2.49it/s]

detect_pitch_frequency failed for common_voice_en_19093169.mp3: RuntimeError: maximum size for tensor at dimension 1 is 994 but size is 1024. Trying fallback.


Extracting pitch:   0%|          | 21/172158 [00:09<21:01:11,  2.27it/s]


KeyboardInterrupt: 