In [None]:
# ==========================================
# 0. Install libraries
# ==========================================
!pip install -q transformers librosa soundfile

print("Libraries installed.")

In [None]:
# ==========================================
# 1. Mount Google Drive (force remount to be safe)
# ==========================================
from google.colab import drive

# Mount point under which the Drive contents become visible to later cells.
DRIVE_MOUNT_POINT = '/content/drive'

# force_remount=True re-mounts even when a mount already exists.
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

In [None]:
# ==========================================
# 2. Unzip Audio.zip (robust)
# ==========================================
import os
import shutil
import zipfile

# Where the archive lives on Drive, where it is staged locally, and where
# its contents are unpacked (relative to the current working directory).
audio_dir = 'audio_files'
local_zip_path = '/content/Audio.zip'
drive_zip_path = '/content/drive/MyDrive/Hackathon/Audio.zip'

# Fail fast with a clear message when the archive is absent from Drive.
zip_is_on_drive = os.path.exists(drive_zip_path)
if not zip_is_on_drive:
    raise FileNotFoundError(f"Could not find Audio.zip at: {drive_zip_path}")
print(f"Found zip on Drive: {drive_zip_path}")

# Stage a local copy so extraction does not read through the Drive mount
# (the original author's stated reason: avoiding Drive I/O issues).
shutil.copy(drive_zip_path, local_zip_path)
print(f"Copied zip to local path: {local_zip_path}")

# Make sure the destination exists, then unpack the staged archive into it.
os.makedirs(audio_dir, exist_ok=True)
with zipfile.ZipFile(local_zip_path, 'r') as archive:
    archive.extractall(path=audio_dir)
print(f"Audio files extracted to: {audio_dir}")

In [None]:
# ==========================================
# 3. Load Test.csv
# ==========================================
import pandas as pd

# The test manifest lives next to the audio archive on Drive.
test_csv_path = '/content/drive/MyDrive/Hackathon/Test.csv'
test_df = pd.read_csv(test_csv_path)

# Show the first rows and the row count as a quick sanity check.
print("Test DataFrame head:", test_df.head(), sep="\n")
print("Total test files:", test_df.shape[0])

In [None]:
# ==========================================
# 4. Load pretrained ASR model (no fine-tuning)
# ==========================================
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

MODEL_CHECKPOINT = "facebook/wav2vec2-base-960h"

# Processor and CTC model are both loaded from the same checkpoint.
processor = Wav2Vec2Processor.from_pretrained(MODEL_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_CHECKPOINT)

# Prefer the GPU when one is available; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = "cuda"
else:
    device = "cpu"

# Move the weights to the chosen device and switch to evaluation mode.
model.to(device).eval()

print("Using GPU:", use_cuda)
print("Model loaded on:", device)

if not use_cuda:
    cpu_warning = (
        "⚠️ WARNING: Running on CPU will be very slow for ~8,000 files. "
        "In Colab, go to Runtime → Change runtime type → set Hardware accelerator to GPU."
    )
    print(cpu_warning)

In [None]:
# ==========================================
# 5. Transcribe test audio
# ==========================================
import librosa
import time

AUDIO_SR = 16000       # target sample rate
MAX_SECONDS = 30       # clips longer than this are truncated before inference

# Maximum number of samples kept per clip; loop-invariant, so computed once.
max_len = int(MAX_SECONDS * AUDIO_SR)

predictions = []       # exactly one entry per row of test_df, in row order
n_skipped = 0          # files that received an empty transcription

start_time = time.time()
n_files = len(test_df)

# Iterate over the ID column directly rather than with iterrows():
#  - iterrows() does not preserve dtypes across a row, so a numeric ID could be
#    upcast (e.g. 123 -> 123.0) and yield a file name that does not exist;
#  - the index label it returns is only a positional counter when the frame
#    has a default RangeIndex, so enumerate() is used for progress instead.
for position, audio_id in enumerate(test_df["ID"]):
    # Progress log every 100 files (emitted before any skip so it never goes silent)
    if position % 100 == 0:
        elapsed = time.time() - start_time
        print(f"Processing file {position}/{n_files}  |  Elapsed: {elapsed:.1f}s")

    audio_path = os.path.join(audio_dir, f"{audio_id}.wav")

    if not os.path.exists(audio_path):
        print(f"Warning: missing audio file: {audio_path}")
        predictions.append("")
        n_skipped += 1
        continue

    # Load audio. A single corrupt or undecodable file must not abort the whole
    # run, so it is handled like a missing file: warn and emit an empty
    # transcription. Only the load is guarded; inference errors still propagate
    # because they usually indicate a systemic problem (e.g. out of memory).
    try:
        speech_array, sr = librosa.load(audio_path, sr=AUDIO_SR)
    except Exception as load_error:
        print(f"Warning: could not read audio file: {audio_path} ({load_error})")
        predictions.append("")
        n_skipped += 1
        continue

    # An empty waveform cannot be transcribed; keep row alignment with "".
    if len(speech_array) == 0:
        print(f"Warning: empty audio file: {audio_path}")
        predictions.append("")
        n_skipped += 1
        continue

    # Truncate for safety/speed (slicing is a no-op for shorter clips)
    speech_array = speech_array[:max_len]

    # Prepare inputs
    inputs = processor(
        speech_array,
        sampling_rate=AUDIO_SR,
        return_tensors="pt",
        padding=True,
    )

    # Move every input tensor to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference (no gradients needed)
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy decoding: take the highest-scoring token id at each step
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True
    )[0]

    predictions.append(transcription.strip())

total_time = time.time() - start_time

# The submission cell pairs predictions with test_df["ID"] by position.
assert len(predictions) == n_files, "expected exactly one prediction per test row"

print("Number of predictions:", len(predictions))
print(f"Total inference time: {total_time/60:.1f} minutes")
if n_skipped:
    print(f"Files without a transcription (missing, unreadable or empty): {n_skipped}")


print("\nSample predictions:")
for i in range(min(5, len(predictions))):
    print(f"ID: {test_df['ID'].iloc[i]}, Pred: '{predictions[i]}'")

In [None]:
# ==========================================
# 6. Create SampleSubmission.csv
# ==========================================
submission_path = "SampleSubmission.csv"

# Pair each test ID with its transcription; both are in test_df row order.
submission_columns = {
    "ID": test_df["ID"],
    "Transcription": predictions,
}
submission_df = pd.DataFrame(submission_columns)

# Write without the index so the file contains only the two columns.
submission_df.to_csv(submission_path, index=False)

print(f"Saved submission file to: {submission_path}", submission_df.head(), sep="\n")