In [1]:
import gc
import io
import json
import os
import subprocess

import librosa
import numpy as np
import soundfile as sf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from transformers import (AutoFeatureExtractor, MCTCTFeatureExtractor, ParakeetFeatureExtractor, SeamlessM4TFeatureExtractor)

  from .autonotebook import tqdm as notebook_tqdm
  import pynvml  # type: ignore[import]


In [2]:
class Song:
    """One audio clip plus the metadata needed to file its features.

    Attributes:
        name: Identifier of the source recording (``process_song`` passes the
            lower-cased file stem).
        audio: Waveform samples of the clip; set to ``None`` once
            ``extract_all_features`` has run.
        lang: Language label of the clip.
        samplerate: Sampling rate of ``audio`` in Hz.
        length: Clip length as given by the caller (``process_song`` passes seconds).
        split: Dataset split the clip is assigned to.
    """

    # Feature extractors are built lazily and shared by every clip. Building
    # them inside `extract_all_features` repeated the same construction (two of
    # them through `from_pretrained`) once per clip.
    _extractor_cache = {}

    def __init__(self, name, audio_vector, lang, samplerate, length, split):
        self.name = name
        self.audio = audio_vector
        self.lang = lang
        self.samplerate = samplerate
        self.length = length
        self.split = split

    @classmethod
    def _get_extractor(cls, key):
        """Return the shared feature extractor for `key`, creating it on first use.

        Raises:
            KeyError: if `key` is not one of the known learned-feature names.
        """
        if key not in cls._extractor_cache:
            if key == 'mctct':
                extractor = MCTCTFeatureExtractor()
            elif key == 'parakeet':
                extractor = ParakeetFeatureExtractor()
            elif key == 'seamlessM4T':
                extractor = SeamlessM4TFeatureExtractor.from_pretrained("facebook/hf-seamless-m4t-medium")
            elif key == 'whisper':
                extractor = AutoFeatureExtractor.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id")
            else:
                raise KeyError(f"Unknown feature extractor: {key}")
            cls._extractor_cache[key] = extractor
        return cls._extractor_cache[key]

    def _resample_audio(self, target_sr):
        """Return the clip resampled from ``self.samplerate`` to `target_sr` Hz."""
        return librosa.resample(y=self.audio, orig_sr=self.samplerate, target_sr=target_sr)

    def extract_all_features(self):
        """Extract every feature type in one pass, then release the audio.

        Returns:
            dict mapping feature name ('stft', 'mel_specs', 'mfccs', 'mctct',
            'parakeet', 'seamlessM4T', 'whisper') to the feature computed for
            this clip.

        Raises:
            ValueError: if the audio was already released by an earlier call.
        """
        if self.audio is None:
            raise ValueError(
                f"Audio for '{self.name}' was already released; "
                "extract_all_features() can only be called once per clip"
            )

        # Initialize
        features = {}

        # Traditional features, all computed on the 12 kHz signal
        resample_12k = self._resample_audio(12000)
        features['stft'] = librosa.stft(y=resample_12k, n_fft=512, hop_length=256)
        features['mel_specs'] = librosa.feature.melspectrogram(y=resample_12k, sr=12000, hop_length=256, n_fft=512, n_mels=96)
        features['mfccs'] = librosa.feature.mfcc(y=resample_12k, sr=12000, hop_length=256, n_fft=512)
        del resample_12k

        # Learned features, all computed on the 16 kHz signal
        resample_16k = self._resample_audio(16000)

        # MCTCT
        features['mctct'] = self._get_extractor('mctct')(
            raw_speech=resample_16k, sampling_rate=16000, return_tensors="pt")['input_features'][0]

        # Parakeet
        features['parakeet'] = self._get_extractor('parakeet')(
            resample_16k, sampling_rate=16000, return_tensors="pt")['input_features'][0]

        # SeamlessM4T
        features['seamlessM4T'] = self._get_extractor('seamlessM4T')(
            resample_16k, sampling_rate=16000, return_tensors="pt")['input_features'][0]

        # Whisper (transposed, as in the original pipeline)
        features['whisper'] = self._get_extractor('whisper')(
            resample_16k, sampling_rate=16000, return_tensors="pt")['input_features'][0].T
        del resample_16k

        # Release the raw audio now that every feature has been computed
        self.audio = None
        gc.collect()

        return features

In [3]:
def process_song(path, lang, sr=22050, clip_len=15):
    """Cut the central two minutes of one audio file into fixed-length clips.

    The file is decoded with librosa; if that fails, it is decoded by piping
    it through ffmpeg as mono WAV at `sr`. Files that cannot be decoded, are
    empty, or are shorter than 120 seconds yield no clips.

    Args:
        path: Path of the audio file.
        lang: Language label stored on every clip.
        sr: Sampling rate, in Hz, the audio is loaded at.
        clip_len: Length of each clip in seconds.

    Returns:
        List of `Song` objects (at most 8), each carrying its train /
        validation / test assignment; an empty list if the file is unusable.
    """
    window_seconds = 120
    split_assignments = ['train', 'test', 'train', 'validation', 'train', 'test', 'train', 'train']

    try:
        y, _ = librosa.load(path, sr=sr, mono=True)
    except Exception as load_err:
        # Fallback decoder: let ffmpeg emit mono WAV at the target rate on stdout.
        try:
            proc = subprocess.run(
                ['ffmpeg', '-v', 'error', '-nostdin', '-i', path,
                 '-ac', '1', '-ar', str(sr), '-f', 'wav', '-'],
                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            y, _ = sf.read(io.BytesIO(proc.stdout), dtype='float32')
            if y.ndim > 1:
                y = np.mean(y, axis=1)
        except Exception as ffmpeg_err:
            # Both decoders failed: report it instead of dropping the file silently.
            print(f"\nSkipping {path}: could not decode audio "
                  f"({load_err}; ffmpeg fallback: {ffmpeg_err})")
            return []

    if y is None or len(y) == 0:
        return []

    # Work in whole samples. Going through float seconds and int() truncation
    # could make the window one sample short and silently drop the last clip.
    window_samples = int(window_seconds * sr)
    if len(y) < window_samples:
        return []

    start = (len(y) - window_samples) // 2
    y = y[start:start + window_samples]

    # Only full-length clips are kept.
    clip_size = int(clip_len * sr)
    clips = [y[i:i + clip_size] for i in range(0, len(y) - clip_size + 1, clip_size)]

    if len(clips) == 0:
        return []

    name = os.path.splitext(os.path.basename(path))[0].lower()

    return [Song(name, clip, lang, sr, clip_len, split_assignments[i])
            for i, clip in enumerate(clips[:len(split_assignments)])]



In [None]:
def _load_completed_languages(checkpoint_file):
    """Return the languages recorded as finished in `checkpoint_file` ([] if it is absent)."""
    if not os.path.exists(checkpoint_file):
        return []
    with open(checkpoint_file, 'r') as f:
        completed = json.load(f).get('completed_languages', [])
    print(f"Resuming from checkpoint. Already completed: {completed}")
    return completed


def _stack_or_empty(items):
    """Turn a list of per-clip features into one array; an empty list becomes an empty array."""
    return np.array(items) if items else np.array([])


def _extract_language_features(lang, feature_types, splits):
    """Extract features and labels for every '.mp3' file in the folder of one language."""
    lang_features = {split: {feat: [] for feat in feature_types} for split in splits}
    lang_labels = {split: [] for split in splits}

    folder = f'./{lang}'
    # Sorted so the order of clips in the saved arrays is reproducible
    # (os.listdir returns entries in arbitrary order).
    files = sorted(f for f in os.listdir(folder) if f.endswith('.mp3'))

    for file_idx, file in enumerate(files):
        path = os.path.join(folder, file)

        try:
            for clip in process_song(path, lang):
                # Extract ALL features at once, then file them under the clip's split
                all_clip_features = clip.extract_all_features()
                for feat_name, feat_data in all_clip_features.items():
                    lang_features[clip.split][feat_name].append(feat_data)
                lang_labels[clip.split].append(clip.lang)

            gc.collect()

            if file_idx % 10 == 0:
                print(f"  {lang}: {file_idx}/{len(files)} files", end='\r')

        except Exception as e:
            # One bad file must not abort the whole language
            print(f"\nError processing {file}: {e}")
            continue

    return lang_features, lang_labels


def _save_language_files(lang, feature_types, lang_features, lang_labels):
    """Write one '.npz' per feature type plus one labels '.npz' for a single language."""
    for feat in feature_types:
        lang_data = {
            'train_features': _stack_or_empty(lang_features['train'][feat]),
            'val_features': _stack_or_empty(lang_features['validation'][feat]),
            'test_features': _stack_or_empty(lang_features['test'][feat])
        }
        np.savez_compressed(f'{feat}_features_{lang}.npz', **lang_data)
        del lang_data
        gc.collect()

    np.savez_compressed(
        f'labels_{lang}.npz',
        train_labels=lang_labels['train'],
        val_labels=lang_labels['validation'],
        test_labels=lang_labels['test']
    )


def _combine_feature(feat, languages):
    """Concatenate one feature type across the per-language files of `languages`."""
    parts = {'train_features': [], 'val_features': [], 'test_features': []}
    for lang in languages:
        lang_filename = f'{feat}_features_{lang}.npz'
        if not os.path.exists(lang_filename):
            continue
        # NOTE: allow_pickle=True can execute arbitrary code when loading an
        # untrusted file; it is used here only on files written by this pipeline.
        # The `with` block closes the archive so it can be deleted afterwards.
        with np.load(lang_filename, allow_pickle=True) as lang_data:
            for key, chunks in parts.items():
                arr = lang_data[key]
                if len(arr) > 0:
                    chunks.append(arr)
    return {key: np.concatenate(chunks, axis=0) if chunks else np.array([])
            for key, chunks in parts.items()}


def build_dataset_by_language_batch():
    """
    Process languages one at a time with INCREMENTAL SAVING.

    Each language is extracted, written to per-language '.npz' files and
    recorded in 'preprocessing_checkpoint.json', so an interrupted run resumes
    with the next unfinished language. Afterwards the per-language files are
    merged, one feature type at a time, into '<feature>_features.npz', the
    labels are one-hot encoded into 'labels.npz', and a summary is written to
    'metadata.json'.

    The per-language files and the checkpoint are removed only if every
    feature type was combined and saved successfully; otherwise they are kept
    so the function can simply be re-run.

    Returns:
        dict with keys 'feature_types', 'languages' and 'splits' (the same
        content that is written to 'metadata.json').
    """

    languages = ['patois', 'mandarin', 'english', 'spanish', 'hindi', 'pidgin']
    feature_types = ['stft', 'mel_specs', 'mfccs', 'mctct', 'parakeet', 'seamlessM4T', 'whisper']
    splits = ['train', 'validation', 'test']

    # Checkpoint file to track progress
    checkpoint_file = 'preprocessing_checkpoint.json'
    completed_languages = _load_completed_languages(checkpoint_file)

    # Process each language separately
    for lang_idx, lang in enumerate(languages):
        # Skip if already completed
        if lang in completed_languages:
            print(f"\n{'='*60}")
            print(f"Skipping {lang} (already completed)")
            print(f"{'='*60}")
            continue

        print(f"\n{'='*60}")
        print(f"Processing language {lang_idx+1}/{len(languages)}: {lang}")
        print(f"{'='*60}")

        lang_features, lang_labels = _extract_language_features(lang, feature_types, splits)

        print(f"\n✓ Completed processing {lang}")

        # SAVE IMMEDIATELY after each language to prevent data loss
        print(f"Saving {lang} features incrementally...")
        try:
            _save_language_files(lang, feature_types, lang_features, lang_labels)

            # Update checkpoint only after every file of this language is on disk
            completed_languages.append(lang)
            with open(checkpoint_file, 'w') as f:
                json.dump({'completed_languages': completed_languages}, f, indent=2)

            print(f"✓ Saved {lang} features and updated checkpoint")

        except Exception as e:
            print(f"❌ Error saving {lang}: {e}")
            raise  # Re-raise to stop and fix issue

        # Clear memory before the next language
        del lang_features, lang_labels
        gc.collect()

    # Now combine all language files into final feature files
    print("\n" + "="*60)
    print("Combining all languages into final feature files...")
    print("="*60)

    # Labels are small, so they are gathered for all languages up front
    all_labels = {'train': [], 'validation': [], 'test': []}
    for lang in completed_languages:
        print(f"Loading {lang}...")
        lang_labels_file = f'labels_{lang}.npz'
        if os.path.exists(lang_labels_file):
            with np.load(lang_labels_file, allow_pickle=True) as lang_labels_data:
                all_labels['train'].extend(lang_labels_data['train_labels'])
                all_labels['validation'].extend(lang_labels_data['val_labels'])
                all_labels['test'].extend(lang_labels_data['test_labels'])

    # Concatenate and save final files, one feature type at a time so only a
    # single feature type is held in memory at once
    print("\nConcatenating and saving final feature files...")
    failed_features = []
    for feat in feature_types:
        try:
            final_data = _combine_feature(feat, completed_languages)

            filename = f'{feat}_features.npz'
            np.savez_compressed(filename, **final_data)
            print(f"✓ Saved {feat}: train={final_data['train_features'].shape}, val={final_data['val_features'].shape}, test={final_data['test_features'].shape}")

            del final_data
            gc.collect()
        except Exception as e:
            print(f"❌ Error saving {feat}: {e}")
            failed_features.append(feat)

    # Encode and save final labels (same train / validation / test order as the features)
    all_labels_flat = all_labels['train'] + all_labels['validation'] + all_labels['test']
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(all_labels_flat)
    y_onehot = to_categorical(y_encoded)

    train_size = len(all_labels['train'])
    val_size = len(all_labels['validation'])

    labels_data = {
        'labels_inorder': list(encoder.classes_),
        'train_labels': y_onehot[:train_size],
        'val_labels': y_onehot[train_size:train_size+val_size],
        'test_labels': y_onehot[train_size+val_size:]
    }

    np.savez_compressed('labels.npz', **labels_data)
    print(f"\n✓ Saved labels: {len(encoder.classes_)} classes")

    # Save metadata
    metadata = {
        'feature_types': feature_types,
        'languages': list(encoder.classes_),
        'splits': {'train': train_size, 'val': val_size, 'test': len(all_labels['test'])}
    }

    with open('metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)

    if failed_features:
        # Deleting the per-language files now would throw away the only copy of
        # the features that could not be combined.
        print(f"\n⚠ Keeping per-language files and checkpoint; failed to combine: {failed_features}")
        return metadata

    # Clean up temporary language-specific files
    print("\nCleaning up temporary language-specific files...")
    for lang in completed_languages:
        for feat in feature_types:
            lang_filename = f'{feat}_features_{lang}.npz'
            if os.path.exists(lang_filename):
                os.remove(lang_filename)
        lang_labels_file = f'labels_{lang}.npz'
        if os.path.exists(lang_labels_file):
            os.remove(lang_labels_file)

    # Remove checkpoint
    if os.path.exists(checkpoint_file):
        os.remove(checkpoint_file)

    print(f"\n{'='*60}")
    print("✓ All features saved and temporary files cleaned up!")
    print(f"{'='*60}")

    return metadata


In [1]:
# NOTE(review): the recorded run of this cell raised NameError because `metadata`
# is only assigned by the preprocessing cell further down; run that cell first.
metadata

NameError: name 'metadata' is not defined

In [None]:
# Run the full pipeline; `metadata` holds the summary dict the function returns.
print("Starting optimized preprocessing...")
metadata = build_dataset_by_language_batch()

# Report completion together with the returned summary.
print("\nPreprocessing complete!", f"Metadata: {metadata}", sep="\n")

Starting Colab-optimized preprocessing...

Processing language 1/6: patois
  patois: 100/105 files
✓ Completed patois

Processing language 2/6: mandarin
  mandarin: 90/95 files
✓ Completed mandarin

Processing language 3/6: english
  english: 90/93 files
✓ Completed english

Processing language 4/6: spanish
  spanish: 30/101 files

[src/libmpg123/layer3.c:INT123_do_layer3():1844] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x77bd7d56 at offset 945978.
Note: Trying to resync...
Note: Skipped 134 bytes in input.
Note: Illegal Audio-MPEG-Header 0x0e9b81e3 at offset 1082816.
Note: Trying to resync...
Note: Skipped 98 bytes in input.
Note: Illegal Audio-MPEG-Header 0xd30bc33e at offset 3202978.
Note: Trying to resync...
Note: Skipped 152 bytes in input.


  spanish: 100/101 files
✓ Completed spanish

Processing language 5/6: hindi
  hindi: 80/81 files
✓ Completed hindi

Processing language 6/6: pidgin
  pidgin: 60/100 files