- ✅ Transcribes audio using Whisper
- ✅ Supports .wav, .mp3, .mp4, and .m4a files whose names start with "VR0"
- ✅ Extracts audio duration using pydub + ffmpeg
- ✅ Saves each transcript as a .txt file
- ✅ Compiles all transcripts into a single cookie_transcripts.csv file with three columns: participant_id (the first 6 characters of the filename), transcript_raw, and audio_duration_sec

In [None]:
from faster_whisper import WhisperModel
from pydub.utils import mediainfo
import pandas as pd
import os

# Load the "base" Whisper model with compute_type "int8".
model = WhisperModel("base", compute_type="int8")

# Current working directory; used as the base for the audio folder path.
path = os.getcwd()

# Input recordings are read from <cwd>/audio_files.
audio_folder = os.path.join(path, "audio_files")

# Per-recording .txt transcripts go here; create the folder if it is missing.
output_txt_folder = "cookie_transcripts_txt"
os.makedirs(output_txt_folder, exist_ok=True)

def get_audio_duration(file_path):
    """Return the duration of an audio file in seconds, rounded to 2 decimals.

    The value is taken from the ``duration`` entry of the metadata that
    ``mediainfo`` reports for ``file_path``.

    Args:
        file_path: Path of the audio file to inspect.

    Returns:
        The duration in seconds as a float rounded to two decimal places, or
        ``None`` when the metadata could not be read or contains no usable
        ``duration`` value.
    """
    try:
        info = mediainfo(file_path)
        duration = float(info["duration"])
    except (KeyError, TypeError, ValueError, OSError) as err:
        # Best-effort lookup: a missing probe binary (OSError), an absent
        # "duration" key, or a non-numeric value must not abort the whole
        # batch. Unlike a bare ``except``, this leaves KeyboardInterrupt and
        # SystemExit alone and reports why the duration is missing.
        print(f"Warning: could not read duration of {file_path}: {err!r}")
        return None
    return round(duration, 2)
    
# Collects one row per transcribed audio file (participant ID, transcript
# text, and audio duration in seconds).
all_transcripts = []

# os.listdir() returns entries in arbitrary order; sorting makes the processing
# order (and therefore the row order of the collected results) reproducible.
for filename in sorted(os.listdir(audio_folder)):
    # Only handle participant recordings, i.e. names starting with "VR0".
    if not filename.startswith("VR0"):
        continue
    # Compare the extension in lowercase so files such as "x.WAV" or "x.M4A"
    # are not silently skipped.
    if not filename.lower().endswith((".wav", ".mp3", ".mp4", ".m4a")):
        continue

    file_path = os.path.join(audio_folder, filename)
    # The participant ID is the first six characters of the filename.
    participant_id = filename[:6]

    print(f"Transcribing {filename}...")

    # Transcribe using Whisper and join the stripped text of every segment
    # into one space-separated string.
    segments, info = model.transcribe(file_path)
    transcript_text = " ".join(segment.text.strip() for segment in segments)

    # Save individual .txt transcript.
    # NOTE: the file name uses only the participant ID, so two recordings
    # whose names share the same first six characters write to the same .txt
    # file and the later one overwrites the earlier one.
    txt_path = os.path.join(output_txt_folder, f"{participant_id}.txt")
    # Explicit UTF-8: the platform default encoding may be unable to encode
    # non-ASCII characters that appear in a transcript.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(transcript_text)

    # Get audio duration (None when it cannot be determined).
    duration_sec = get_audio_duration(file_path)

    # Save row
    all_transcripts.append({
        "participant_id": participant_id,
        "transcript_raw": transcript_text,
        "audio_duration_sec": duration_sec,
    })
            

# Save full CSV.
# The column names are passed explicitly so the CSV still gets its header row
# when all_transcripts is empty (no matching audio files); without them an
# empty list produces a DataFrame with no columns and a header-less file.
df = pd.DataFrame(
    all_transcripts,
    columns=["participant_id", "transcript_raw", "audio_duration_sec"],
)
df.to_csv("cookie_transcripts.csv", index=False)
print("✅ Done! Transcripts saved to 'cookie_transcripts.csv' and .txt files created.")

Transcribing VR0071.wav...
Transcribing VR0297 Cookie Theft.m4a...
Transcribing VR0193 Cookie Theft.m4a...
Transcribing VR0090_Cookie_trimmed (1).m4a...
Transcribing VR0096_Cookie.m4a...
Transcribing VR0125 Cookie Theft.m4a...
Transcribing VR0307 Cookie Theft.m4a...
Transcribing VR0187 Cookie Theft.m4a...
Transcribing VR0011_Screening.wav...
Transcribing VR0261 Cookie Theft.m4a...
Transcribing VR0026_Cookie_trimmed (1).wav...
Transcribing VR0094_Cookie_trimmed (1).m4a...
Transcribing VR0135 Cookie Theft.m4a...
Transcribing VR0130 Cookie Theft.m4a...
Transcribing VR0098_Cookie_trimmed (1).m4a...
Transcribing VR0009_Cookie.m4a...
Transcribing VR0475 cookie theft.m4a...
Transcribing VR0174 Cookie Theft.m4a...
Transcribing VR0111_Cookie_trimmed (1).m4a...
Transcribing VR0192 Cookie Theft.m4a...
Transcribing VR0513.m4a...
Transcribing VR0282 Cookie Theft.m4a...
Transcribing VR0004_Cookie.m4a...
Transcribing VR0093_Cookie.wav...
Transcribing VR0105 Cookie Theft.m4a...
Transcribing VR0127_Coo