# Preprocessing for Uzbek STT Dataset

This notebook preprocesses the raw audio files to create the final dataset. The main steps are detecting speech (removing silence), cutting the audio into 3-10 second clips, and converting each clip to 16kHz mono WAV.

## Setup

In [None]:
import os
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
import pandas as pd

## Parameters

In [None]:
# Input / output locations
RAW_AUDIO_DIR = "raw_audio"
OUTPUT_DIR = "clips"

# Preprocessing settings (durations and lengths are in milliseconds)
TARGET_SAMPLE_RATE = 16_000  # Hz; 16kHz is standard for speech
MIN_DURATION = 3_000  # shortest clip to keep: 3 seconds
MAX_DURATION = 10_000  # longest clip to keep: 10 seconds
SILENCE_THRESH = -40  # silence threshold in dB
MIN_SILENCE_LEN = 500  # minimum silence length (ms) handed to detect_nonsilent

# Make sure the clip folder exists before any clip is exported
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Processing Functions

In [None]:
def load_audio(file_path):
    """Read the WAV file at ``file_path`` via ``AudioSegment.from_wav`` and return it."""
    return AudioSegment.from_wav(file_path)

def detect_speech_segments(audio, min_silence_len=500, silence_thresh=-40):
    """Locate the non-silent stretches of ``audio``.

    Both thresholds are forwarded unchanged to pydub's ``detect_nonsilent``
    and its result is returned as-is; callers in this notebook unpack each
    entry as a ``(start_ms, end_ms)`` pair.
    """
    return detect_nonsilent(
        audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh
    )

def resample_and_convert_to_mono(audio, target_rate=16000):
    """Return ``audio`` with its frame rate set to ``target_rate`` and one channel.

    The frame rate is changed first, then the channel count, and the object
    returned by the second call is handed back to the caller.
    """
    return audio.set_frame_rate(target_rate).set_channels(1)

def filter_by_duration(segments, audio, min_dur=3000, max_dur=10000):
    """Keep segments lasting between ``min_dur`` and ``max_dur``, splitting longer ones.

    Args:
        segments: iterable of ``(start_ms, end_ms)`` pairs.
        audio: not used by this function; kept so existing callers that pass
            it positionally keep working.
        min_dur: shortest length to keep, in the same unit as the bounds
            (milliseconds with the notebook defaults, i.e. 3 seconds).
        max_dur: longest length to keep (10 seconds by default).

    Returns:
        A list of ``(start, end)`` tuples in input order. Segments shorter
        than ``min_dur`` are dropped. Segments longer than ``max_dur`` are cut
        into the fewest equal-length chunks that each fit within ``max_dur``;
        a chunk is kept only if it is at least ``min_dur`` long.
    """
    valid_segments = []

    for start_ms, end_ms in segments:
        duration = end_ms - start_ms

        if duration < min_dur:
            continue

        if duration <= max_dur:
            valid_segments.append((start_ms, end_ms))
            continue

        # Ceiling division. The previous `int(duration / max_dur) + 1` produced
        # one chunk too many whenever duration was an exact multiple of
        # max_dur, yielding needlessly short chunks that could all fall below
        # min_dur and be discarded.
        num_chunks = int(-(-duration // max_dur))

        # Integer arithmetic keeps the boundaries exact: the first chunk starts
        # at start_ms, the last ends at end_ms, and neighbours share an edge.
        bounds = [
            start_ms + (i * duration) // num_chunks
            for i in range(num_chunks + 1)
        ]
        for chunk_start, chunk_end in zip(bounds, bounds[1:]):
            if chunk_end - chunk_start >= min_dur:
                valid_segments.append((chunk_start, chunk_end))

    return valid_segments

## Main Processing

In [None]:
def process_audio_file(audio_path, output_dir, clip_counter):
    """Cut one recording into clips and write them to ``output_dir``.

    The file is loaded, its speech segments are detected and filtered by
    duration using the notebook-level settings, and every surviving segment is
    resampled to mono and exported as ``clip_NNN.wav``, numbered from
    ``clip_counter`` upwards.

    Returns:
        ``(metadata, next_counter)``: one dict per exported clip (file name,
        duration in seconds, source file name) and the counter value to use
        for the next clip.
    """
    source_name = os.path.basename(audio_path)
    print(f"Processing: {source_name}")

    audio = load_audio(audio_path)
    print(f"  Original: {len(audio)/1000:.2f}s, {audio.frame_rate}Hz, {audio.channels} channels")

    speech_spans = detect_speech_segments(
        audio, min_silence_len=MIN_SILENCE_LEN, silence_thresh=SILENCE_THRESH
    )
    print(f"  Found {len(speech_spans)} speech segments")

    kept_spans = filter_by_duration(
        speech_spans, audio, min_dur=MIN_DURATION, max_dur=MAX_DURATION
    )
    print(f"  {len(kept_spans)} valid segments")

    records = []
    # enumerate hands out consecutive clip numbers starting at clip_counter
    for clip_number, (begin_ms, finish_ms) in enumerate(kept_spans, start=clip_counter):
        clip_filename = f"clip_{clip_number:03d}.wav"

        # Slice first, then resample: only the clip itself is converted
        mono_clip = resample_and_convert_to_mono(
            audio[begin_ms:finish_ms], TARGET_SAMPLE_RATE
        )
        mono_clip.export(os.path.join(output_dir, clip_filename), format="wav")

        records.append({
            'file_name': clip_filename,
            'duration_seconds': round((finish_ms - begin_ms) / 1000, 2),
            'source_file': source_name
        })

    print(f"  Created {len(records)} clips\n")
    return records, clip_counter + len(records)

## Process All Files

In [None]:
# Get all audio files.
# os.listdir returns names in arbitrary order, and clip numbers (and so the
# clip file names) follow the processing order. Sorting makes the numbering
# reproducible across runs and machines.
audio_files = sorted(f for f in os.listdir(RAW_AUDIO_DIR) if f.endswith('.wav'))

print(f"Found {len(audio_files)} files\n")

# Process each one, threading the running clip counter through so that clip
# numbers stay unique across source files
all_metadata = []
clip_counter = 1

for audio_file in audio_files:
    audio_path = os.path.join(RAW_AUDIO_DIR, audio_file)
    metadata, clip_counter = process_audio_file(audio_path, OUTPUT_DIR, clip_counter)
    all_metadata.extend(metadata)

print(f"Done! Total clips: {len(all_metadata)}")

## Save Metadata

In [None]:
# Save to CSV.
# The columns are named explicitly so that the CSV still gets a header row,
# and `df` still has these columns, when no clips were produced. For a
# non-empty list the resulting frame is the same as before.
df = pd.DataFrame(
    all_metadata,
    columns=['file_name', 'duration_seconds', 'source_file'],
)
df.to_csv('preprocessed_metadata.csv', index=False)

print("Saved metadata")
print("Next: manually transcribe the clips")

## Summary

In [None]:
print("\nStats:")
print(f"Total clips: {len(df)}")

if len(df) == 0:
    # A frame built from an empty metadata list has no 'duration_seconds'
    # column, so the aggregations below would raise KeyError.
    print("No clips were created - nothing to summarize")
else:
    durations = df['duration_seconds']
    print(f"Total duration: {durations.sum():.2f}s")
    print(f"Average duration: {durations.mean():.2f}s")
    print(f"Min: {durations.min():.2f}s, Max: {durations.max():.2f}s")
    # Report the configured rate rather than a hard-coded "16kHz", so the
    # message stays true if TARGET_SAMPLE_RATE is changed.
    print(f"\nAll clips are {TARGET_SAMPLE_RATE / 1000:g}kHz mono WAV")

## Notes

The preprocessing steps:
1. Load raw audio
2. Detect speech (remove silence)
3. Keep segments of 3-10 seconds (shorter segments are dropped; longer ones are split into chunks)
4. Resample to 16kHz
5. Convert to mono
6. Save as WAV

After this, I manually transcribed all clips and split them 80/20 for train/val.