In [1]:
import os
import pandas as pd
import torch
import whisper
from tqdm import tqdm
import librosa
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [2]:
# ===== CONFIGURATION =====
# Absolute paths - these will always work
BASE_DIR = Path(r"C:\Users\Heidi\OneDrive\Desktop\Projects\smg_labs\caribbean-asr-hackathon")
AUDIO_DIR = BASE_DIR / "data" / "raw" / "Audio"
SPLITS_DIR = BASE_DIR / "data" / "splits"

# Model settings
MODEL_SIZE = 'base'  # or 'tiny', 'small', 'medium'
LANGUAGE = 'en'

# Load your splits
training_data = pd.read_csv(SPLITS_DIR / "train.csv")
evaluation_data = pd.read_csv(SPLITS_DIR / "eval.csv")
holdout_data = pd.read_csv(SPLITS_DIR / "holdout.csv")

print(f"Train: {len(training_data)} samples")
print(f"Eval: {len(evaluation_data)} samples")
print(f"Holdout: {len(holdout_data)} samples")
print(f"\nAudio directory: {AUDIO_DIR}")
print(f"Audio directory exists: {AUDIO_DIR.exists()}")

Train: 14891 samples
Eval: 2979 samples
Holdout: 1986 samples

Audio directory: C:\Users\Heidi\OneDrive\Desktop\Projects\smg_labs\caribbean-asr-hackathon\data\raw\Audio
Audio directory exists: True


In [3]:
# Quick check: Do your eval IDs have audio files?
print("Checking first 5 eval samples...")
for i, row in evaluation_data.head(5).iterrows():
    audio_id = row['ID']
    audio_path = AUDIO_DIR / f"{audio_id}.wav"
    exists = audio_path.exists()
    print(f"{audio_id}: {'‚úì EXISTS' if exists else '‚úó MISSING'}")

Checking first 5 eval samples...
ID_LALDGM: ‚úì EXISTS
ID_HABWMO: ‚úì EXISTS
ID_EGLPMS: ‚úì EXISTS
ID_AWHRUX: ‚úì EXISTS
ID_RIZPOC: ‚úì EXISTS


In [9]:
"""
Simple check: Do the IDs in eval.csv have corresponding audio files?
"""

import pandas as pd
from pathlib import Path
import os

# Your paths
AUDIO_DIR = Path(r"C:\Users\Heidi\OneDrive\Desktop\Projects\smg_labs\caribbean-asr-hackathon\data\raw\Audio")
EVAL_CSV = Path(r"C:\Users\Heidi\OneDrive\Desktop\Projects\smg_labs\caribbean-asr-hackathon\data\splits\eval.csv")

print("Checking eval.csv IDs against Audio directory...")
print()

# Load eval data
eval_df = pd.read_csv(EVAL_CSV)
print(f"eval.csv has {len(eval_df)} rows")

# Check first 10 IDs
print(f"\nChecking first 10 IDs from eval.csv:")
missing_count = 0

for i, row in eval_df.head(10).iterrows():
    audio_id = row['ID']
    audio_path = AUDIO_DIR / f"{audio_id}.wav"
    exists = audio_path.exists()
    
    status = "‚úì FOUND" if exists else "‚úó MISSING"
    print(f"  {audio_id}: {status}")
    
    if not exists:
        missing_count += 1

# Check all IDs
print(f"\nChecking all {len(eval_df)} IDs...")
all_missing = []
for idx, row in eval_df.iterrows():
    audio_id = row['ID']
    audio_path = AUDIO_DIR / f"{audio_id}.wav"
    if not audio_path.exists():
        all_missing.append(audio_id)

print(f"\nRESULTS:")
print(f"  Total IDs in eval.csv: {len(eval_df)}")
print(f"  IDs with audio files:  {len(eval_df) - len(all_missing)}")
print(f"  IDs WITHOUT audio:     {len(all_missing)}")

if all_missing:
    print(f"\n‚ö†Ô∏è PROBLEM: {len(all_missing)} IDs in eval.csv don't have audio files!")
    print(f"\nFirst 10 missing IDs:")
    for mid in all_missing[:10]:
        print(f"  {mid}")
    
    print(f"\nüîß SOLUTION: Run create_proper_splits.py to fix this")
else:
    print(f"\n‚úì All IDs in eval.csv have audio files!")

Checking eval.csv IDs against Audio directory...

eval.csv has 2979 rows

Checking first 10 IDs from eval.csv:
  ID_LALDGM: ‚úì FOUND
  ID_HABWMO: ‚úì FOUND
  ID_EGLPMS: ‚úì FOUND
  ID_AWHRUX: ‚úì FOUND
  ID_RIZPOC: ‚úì FOUND
  ID_FJEYAO: ‚úì FOUND
  ID_BOGKAR: ‚úì FOUND
  ID_APTPAE: ‚úì FOUND
  ID_KKHXPY: ‚úì FOUND
  ID_RREHAI: ‚úì FOUND

Checking all 2979 IDs...

RESULTS:
  Total IDs in eval.csv: 2979
  IDs with audio files:  2979
  IDs WITHOUT audio:     0

‚úì All IDs in eval.csv have audio files!


In [10]:
class WhisperBaseline:
    """
    Baseline ASR using Whisper models
    Conservative preprocessing: resample, mono, normalize only
    """
    
    def __init__(self, model_size='base', device=None):
        """
        Initialize Whisper model
        
        Args:
            model_size: 'tiny', 'base', 'small', 'medium', 'large'
            device: 'cuda' or 'cpu' (auto-detected if None)
        """
        self.model_size = model_size
        
        # Auto-detect device
        if device is None:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            self.device = device
            
        print(f"Loading Whisper {model_size} model on {self.device}...")
        self.model = whisper.load_model(model_size, device=self.device)
        print(f"Model loaded successfully!")
        
    def preprocess_audio(self, audio_path, target_sr=16000):
        """
        Conservative preprocessing as per team strategy
        - Resample to 16kHz (Whisper's native rate)
        - Convert to mono
        - Normalize amplitude
        """
        try:
            # Load audio
            audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
            
            # Normalize to [-1, 1] range
            if np.max(np.abs(audio)) > 0:
                audio = audio / np.max(np.abs(audio))
            
            return audio
            
        except Exception as e:
            print(f"Error processing {audio_path}: {e}")
            return None
    
    def transcribe_file(self, audio_path, language='en'):
        """
        Transcribe a single audio file
        FIXED: Uses librosa to load audio first
        """
        try:
            # Convert to Path object
            audio_path = Path(audio_path)
            
            # Check if file exists
            if not audio_path.exists():
                return ""
            
            # Load audio with librosa (handles OneDrive paths correctly)
            import librosa
            audio, sr = librosa.load(str(audio_path), sr=16000, mono=True)
            
            # Transcribe using audio array instead of file path
            result = self.model.transcribe(
                audio,  # ‚Üê Pass array, not path!
                language=language,
                task='transcribe',
                fp16=(self.device == 'cuda'),
                verbose=False
            )
            
            return result['text'].strip()
            
        except Exception as e:
            print(f"Error transcribing {audio_path}: {e}")
            return ""
    
    def transcribe_dataset(self, csv_path, audio_dir, output_path, language='en'):
        """
        Transcribe entire dataset
        
        Args:
            csv_path: Path to Test.csv
            audio_dir: Directory containing audio files
            output_path: Path to save submission CSV
            language: Language code
        """
        # Load test data
        print(f"Loading test data from {csv_path}...")
        df = pd.read_csv(csv_path)
        print(f"Found {len(df)} audio files to transcribe")
        
        # Initialize results
        transcriptions = []
        
        # Process each audio file
        print("Starting transcription...")
        for idx, row in tqdm(df.iterrows(), total=len(df)):
            audio_id = row['ID']
            audio_path = str(Path(audio_dir).resolve() / f"{audio_id}.wav")
            
            # Check if file exists
            if not os.path.exists(audio_path):
                print(f"Warning: Audio file not found: {audio_path}")
                transcriptions.append("")
                continue
            
            # Transcribe
            transcription = self.transcribe_file(audio_path, language=language)
            transcriptions.append(transcription)
            
            # Progress update every 100 files
            if (idx + 1) % 100 == 0:
                print(f"Processed {idx + 1}/{len(df)} files")
        
        # Create submission dataframe
        submission_df = pd.DataFrame({
            'ID': df['ID'],
            'Transcription': transcriptions
        })
        
        # Save submission
        submission_df.to_csv(output_path, index=False)
        print(f"\nSubmission saved to {output_path}")
        print(f"Total files processed: {len(submission_df)}")
        print(f"Non-empty transcriptions: {sum(1 for t in transcriptions if t)}")
        
        return submission_df

In [11]:
def calculate_wer(reference, hypothesis):
    """
    Calculate Word Error Rate
    WER = (S + D + I) / N
    where S=substitutions, D=deletions, I=insertions, N=words in reference
    """
    ref_words = reference.lower().split()
    hyp_words = hypothesis.lower().split()
    
    # Dynamic programming matrix
    d = [[0] * (len(hyp_words) + 1) for _ in range(len(ref_words) + 1)]
    
    for i in range(len(ref_words) + 1):
        d[i][0] = i
    for j in range(len(hyp_words) + 1):
        d[0][j] = j
    
    for i in range(1, len(ref_words) + 1):
        for j in range(1, len(hyp_words) + 1):
            if ref_words[i-1] == hyp_words[j-1]:
                d[i][j] = d[i-1][j-1]
            else:
                d[i][j] = min(
                    d[i-1][j] + 1,      # deletion
                    d[i][j-1] + 1,      # insertion
                    d[i-1][j-1] + 1     # substitution
                )
    
    wer = d[len(ref_words)][len(hyp_words)] / len(ref_words) if ref_words else 0
    return wer

In [12]:
def validate_on_train_split(model_size='base', train_df=None, val_df=None, language='en', audio_dir=Path('../../data/raw/Audio')):
    """
    Create validation split from training data to estimate WER
    This helps predict public leaderboard performance
    
    Args:
        model_size: Whisper model size
        train_df: Training dataframe (not used, kept for compatibility)
        val_df: Validation dataframe
        language: Language code
        audio_dir: Directory containing audio files
    """
    print(f"\n{'='*60}")
    print(f"VALIDATION ON TRAIN SPLIT - Model: {model_size}")
    print(f"{'='*60}\n")
    
    # Initialize model
    baseline = WhisperBaseline(model_size=model_size)
    
    print(f"Validation set size: {len(val_df)} samples")
    print(f"Audio directory: {audio_dir}")
    
    # Transcribe validation samples
    wer_scores = []
    files_not_found = 0
    
    for idx, row in tqdm(val_df.iterrows(), total=len(val_df)):
        audio_id = row['ID']
        reference = row['Transcription']
        audio_path = str(Path(audio_dir).resolve() / f"{audio_id}.wav")
        
        if not os.path.exists(audio_path):
            files_not_found += 1
            if files_not_found <= 5:  # Only print first 5 missing files
                print(f"Warning: Audio file not found: {audio_path}")
            continue
        
        # Get prediction
        hypothesis = baseline.transcribe_file(audio_path, language=language)
        
        # Calculate WER
        wer = calculate_wer(reference, hypothesis)
        wer_scores.append(wer)
    
    # Print results
    print(f"\n{'='*60}")
    print(f"VALIDATION RESULTS - {model_size.upper()}")
    print(f"{'='*60}")
    
    if len(wer_scores) == 0:
        print(f"ERROR: No audio files were found!")
        print(f"Files not found: {files_not_found}")
        print(f"Please check the audio_dir path: {audio_dir}")
        print(f"{'='*60}\n")
        return None, []
    
    avg_wer = np.mean(wer_scores)
    median_wer = np.median(wer_scores)
    
    print(f"Files processed: {len(wer_scores)}")
    print(f"Files not found: {files_not_found}")
    print(f"Average WER: {avg_wer:.4f}")
    print(f"Median WER:  {median_wer:.4f}")
    print(f"Min WER:     {np.min(wer_scores):.4f}")
    print(f"Max WER:     {np.max(wer_scores):.4f}")
    print(f"{'='*60}\n")
    
    return avg_wer, wer_scores

In [13]:
print("\n" + "="*60)
print("CARIBBEAN VOICES ASR - BASELINE")
print("="*60 + "\n")
    
# Configuration
MODEL_SIZE = 'base'  # Start with 'base', then try 'small', 'medium'
LANGUAGE = 'en'      # English
    
TEST_CSV = Path("../../data/splits/eval.csv")
AUDIO_DIR = Path("../../data/raw/Audio")
OUTPUT_CSV = f'submission_whisper_{MODEL_SIZE}_baseline.csv'
    
# Check GPU availability
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
print()


CARIBBEAN VOICES ASR - BASELINE

PyTorch version: 2.9.1+cpu
CUDA available: False



In [14]:
# Run validation first
print("Step 1: Validate on training split...")
print("This helps estimate your public leaderboard score")
    
# Run validation
avg_wer, wer_scores = validate_on_train_split(
    model_size=MODEL_SIZE,
    train_df=training_data,      # Not used but kept for compatibility
    val_df=evaluation_data,       # Your validation set
    language=LANGUAGE,
    audio_dir=AUDIO_DIR           # Absolute path!
)

Step 1: Validate on training split...
This helps estimate your public leaderboard score

VALIDATION ON TRAIN SPLIT - Model: base

Loading Whisper base model on cpu...
Model loaded successfully!
Validation set size: 2979 samples
Audio directory: ..\..\data\raw\Audio


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 462/462 [00:01<00:00, 303.67frames/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 572/572 [00:01<00:00, 364.27frames/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 254/254 [00:01<00:00, 225.66frames/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1302/1302 [00:01<00:00, 700.98frames/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 344/344 [00:01<00:00, 284.75frames/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 896/896 [00:01<00:00, 600.81frames/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 994/994 [00:01<00:00, 519.20frames/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 322/322 [00:00<00:00, 362.46frames/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 320/320 [00:00<00:00, 372.13frames/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 326/326 [00:01<00:00, 293.27frames/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 412/412 [00:01<00:00, 380.66frames/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1058/1058 [00:02<00:00, 453.20frames/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 552/552 [00:01<00:00, 413.24frames/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚


VALIDATION RESULTS - BASE
Files processed: 2979
Files not found: 0
Average WER: 0.1644
Median WER:  0.1176
Min WER:     0.0000
Max WER:     2.6429




