# core

> Download a YouTube video's audio, split it at silences, and transcribe the chunks with Gemini

In [None]:
#| default_exp core 

In [None]:
#| export
import os
import re
from pathlib import Path
from fastcore.all import *
from tqdm import tqdm   
import subprocess
import logging
from rich import print

from google import genai
from google.genai import types
import asyncio

import ffmpeg

In [None]:
#| exports
# Gemini API key read from the environment; `os.getenv` yields `None` when the variable is unset
gemini_api_key = os.getenv("GEMINI_API_KEY")

In [None]:
#| exports
# Log at INFO level and above, formatting each record as "<timestamp> - <level> - <message>"
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
#| exports
def download_audio(
    vid_id: str, # YouTube video ID
    dest_dir: Path # Output directory (created, along with any missing parents, if absent)
    ):
    "Download the audio of a YouTube video to `<dest_dir>/<vid_id>.mp3` and return that path"
    logging.info(f"Downloading audio for video {vid_id}")
    dest_dir = Path(dest_dir)
    # `parents=True` so that a nested destination works even when its parent does not exist yet
    dest_dir.mkdir(parents=True, exist_ok=True)
    out_file = dest_dir/f'{vid_id}.mp3'
    if out_file.exists():
        # A file left by an earlier run is reused as-is instead of being downloaded again
        logging.info(f"Using existing audio file {out_file}")
        return out_file
    # `check=True` raises `subprocess.CalledProcessError` when yt-dlp exits with a non-zero status
    subprocess.run(['yt-dlp', '-x', '--audio-format', 'mp3', f'https://www.youtube.com/watch?v={vid_id}', '-o', str(out_file)], check=True)
    logging.info(f"Downloaded audio to {out_file}")
    return out_file

In [None]:
#| eval: False
# Example: download the audio of one video into `../_audio`
video_id = 'GJ0u09SIPh4'
download_audio(video_id, Path('../_audio'))

2025-07-20 19:09:25,533 - INFO - Downloading audio for video GJ0u09SIPh4


[youtube] Extracting URL: https://www.youtube.com/watch?v=GJ0u09SIPh4
[youtube] GJ0u09SIPh4: Downloading webpage
[youtube] GJ0u09SIPh4: Downloading tv client config
[youtube] GJ0u09SIPh4: Downloading player 69b31e11-main
[youtube] GJ0u09SIPh4: Downloading tv player API JSON
[youtube] GJ0u09SIPh4: Downloading ios player API JSON
[youtube] GJ0u09SIPh4: Downloading m3u8 information
[info] GJ0u09SIPh4: Downloading 1 format(s): 251
[download] Destination: ../_audio/GJ0u09SIPh4.webm
[download] 100% of   83.98MiB in 00:00:08 at 9.79MiB/s     
[ExtractAudio] Destination: ../_audio/GJ0u09SIPh4.mp3


2025-07-20 19:10:17,770 - INFO - Downloaded audio to ../_audio/GJ0u09SIPh4.mp3


Deleting original file ../_audio/GJ0u09SIPh4.webm (pass -k to keep)


Path('../_audio/GJ0u09SIPh4.mp3')

In [None]:
#| exports
def detect_silence(
    audio_file:Path, # Audio file to analyse
    noise:str='-30dB', # Value passed as the `noise` option of ffmpeg's `silencedetect` filter
    min_duration:float=0.5 # Value passed as the `d` option of `silencedetect` (in seconds)
    ):
    "Run ffmpeg's `silencedetect` filter over `audio_file` and return the raw `(stdout, stderr)` pair from ffmpeg"
    stream = ffmpeg.input(str(audio_file))
    stream = stream.filter('silencedetect', noise=noise, d=min_duration)
    # Output goes to ffmpeg's `null` format: the run is only needed for the filter's log output
    stream = stream.output('null', f='null')
    # Only stderr is captured here; `parse_silence_ends` extracts the `silence_end` values from it
    out, err = ffmpeg.run(stream, capture_stderr=True)
    return out, err

In [None]:
#| eval: False
# Example: keep only the second element of the returned pair (ffmpeg's stderr)
_, err = detect_silence(Path('../_audio/GJ0u09SIPh4.mp3'))

In [None]:
#| exports
def parse_silence_ends(stderr_output:"bytes|str"):
    "Return every `silence_end` value found in ffmpeg stderr output, as floats in order of appearance"
    if isinstance(stderr_output, (bytes, bytearray)):
        # The log may contain bytes that are not valid UTF-8 (e.g. echoed file metadata);
        # replace them instead of raising, since the timestamps themselves are plain ASCII
        stderr_output = stderr_output.decode('utf-8', errors='replace')
    pattern = r'silence_end: ([\d.]+)'
    return [float(match) for match in re.findall(pattern, stderr_output)]

In [None]:
#| eval: False
# Example: parse the silence end times and display the first ten
ends = parse_silence_ends(err); L(ends)[:10]

(#10) [0.513563,15.558687,26.482021,29.918437,32.245583,34.150583,35.980771,36.597167,39.411437,43.585812]

In [None]:
#| exports
def find_split_points(
    silence_ends:list[float], # silence ends
    total_len:float, # total length of the audio (in seconds)
    chunk_len:float=600 # length of the chunks (in seconds); must be positive
    ):
    "Find points to split audio based on silence detection, aiming for chunks of `chunk_len` seconds"
    if chunk_len <= 0:
        raise ValueError(f"chunk_len must be positive, got {chunk_len}")
    splits,target = [0],chunk_len
    for t in silence_ends:
        # A silence ending at (or past) the end of the audio would only add an empty final chunk
        if t >= total_len:
            continue
        if t < target:
            continue
        splits.append(t)
        # Advance the target beyond `t`. After a stretch longer than `chunk_len` without any
        # silence, a single increment would leave the target behind `t`, and the very next
        # silence would then cut a tiny chunk.
        while target <= t:
            target += chunk_len
    splits.append(total_len) # final chunk
    return splits

In [None]:
#| exports
def get_audio_duration(audio_file:"Path|str"):
    "Return the duration of `audio_file` in seconds, as reported by `ffmpeg.probe`"
    format_info = ffmpeg.probe(str(audio_file))['format']
    # The probed value is converted with `float` before being returned
    seconds = float(format_info['duration'])
    return seconds

In [None]:
#| eval: False
# Example: total duration in seconds of the downloaded file
tot_len = get_audio_duration(Path('../_audio/GJ0u09SIPh4.mp3')); tot_len

6995.976

In [None]:
#| eval: False
# Example: split points computed with the default `chunk_len`
soft_splits = find_split_points(ends, tot_len); soft_splits

[0,
 603.482062,
 1202.536562,
 1802.256479,
 2401.709521,
 3004.959437,
 3605.712229,
 4206.138958,
 4800.153625,
 5400.729625,
 6003.723708,
 6610.651771,
 6995.976]

In [None]:
#| exports
def get_mime_type(f):
    "Return 'audio/mpeg' when `f` has a `.mp3` suffix (case-insensitive), 'audio/mp4' otherwise"
    suffix = Path(f).suffix.lower()
    if suffix != '.mp3':
        return 'audio/mp4'
    return 'audio/mpeg'

In [None]:
#| exports
def split_audio(
    fname:"Path", # Audio file to split (a `str` path is accepted too)
    splits:"list", # List of timestamps in seconds to split at
    dest_dir:'str|Path'="_audio_chunks"): # Directory to save chunks (created with missing parents)
    "Split audio file into chunks based on split points and return the chunk paths in order"
    # Coerce up front so that `.stem` and `/` work whichever path type the caller passed
    fname, dest_dir = Path(fname), Path(dest_dir)
    dest_dir.mkdir(parents=True, exist_ok=True)
    chunks = []
    # Consecutive pairs of split points delimit the chunks: (s0, s1), (s1, s2), ...
    bounds = zip(splits, splits[1:])
    n_chunks = max(len(splits)-1, 0)  # never hand tqdm a negative total for empty `splits`
    for i, (start_time, end_time) in tqdm(enumerate(bounds, start=1), total=n_chunks):
        output_path = dest_dir/f"{fname.stem}_chunk_{i:02d}.mp3"
        chunks.append(output_path)
        # `acodec='copy'` copies the audio stream instead of re-encoding it;
        # `overwrite_output` replaces chunks left by a previous run
        (ffmpeg
         .input(str(fname), ss=start_time, t=end_time - start_time)
         .output(str(output_path), acodec='copy')
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True))
    return chunks

In [None]:
#| eval: False
# Example: write the chunks of the downloaded file under `../_audio/_audio_chunks`
split_audio(Path('../_audio/GJ0u09SIPh4.mp3'), soft_splits, dest_dir='../_audio/_audio_chunks')

100%|██████████| 12/12 [00:02<00:00,  5.70it/s]


[Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_01.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_02.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_03.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_04.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_05.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_06.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_07.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_08.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_09.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_10.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_11.mp3'),
 Path('../_audio/_audio_chunks/GJ0u09SIPh4_chunk_12.mp3')]

In [None]:
#| exports
async def transcribe_audio(
    chunks_dir:str|Path,  # Directory containing audio chunks
    dest_file:str|Path, # File to save transcript to
    model:str='gemini-2.0-flash-001', # Gemini model to use
    max_concurrent:int=3,   # Max concurrent transcriptions
    prompt:str="Please transcribe this audio file:" # Custom prompt for transcription
) -> str:
    "Transcribe the `*.mp3` chunks in `chunks_dir` concurrently, save the combined transcript to `dest_file` and return it"
    # The semaphore caps how many chunks are read and sent at the same time
    semaphore = asyncio.Semaphore(max_concurrent)
    # Raises `KeyError` when `GEMINI_API_KEY` is not set in the environment
    client = genai.Client(api_key=os.environ['GEMINI_API_KEY'])

    async def _transcribe_chunk(chunk_path):
        "Send one chunk to the model and return its transcript text ('' when no text comes back)"
        async with semaphore:
            audio_part = types.Part.from_bytes(
                mime_type=get_mime_type(chunk_path),
                data=chunk_path.read_bytes()
            )
            response = await client.aio.models.generate_content(
                model=model,
                contents=[prompt, audio_part]
            )
        text = response.text
        if text is None:
            # A response without text must not abort the whole run at the join below,
            # after every other chunk has already been transcribed
            logging.warning(f"No transcript text returned for {chunk_path}")
            return ''
        return text

    # Chunks are processed in sorted filename order; `asyncio.gather` returns results in the
    # order of its arguments, so the transcript follows that same order
    chunks = sorted(Path(chunks_dir).glob("*.mp3"))
    transcripts = await asyncio.gather(*(_transcribe_chunk(chunk) for chunk in chunks))

    full_transcript = '\n'.join(transcripts)
    dest_path = Path(dest_file)
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the platform default encoding may be unable to represent non-ASCII text
    dest_path.write_text(full_transcript, encoding='utf-8')
    return full_transcript

In [None]:
#| eval: False
# Example: transcribe the chunks with a prompt stating that the audio is a French academic course
transcript = await transcribe_audio(
    chunks_dir="../_audio/_audio_chunks", 
    dest_file="../_transcripts/transcript.txt",
    prompt="Please transcribe this audio file verbatim. Note that this is an academic course in French from College de France. The transcript should be in French."
)

2025-07-20 19:21:07,363 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:07,403 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:07,446 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:21,297 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:21,307 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:21,745 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:21,758 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:22,070 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent "HTTP/1.1 200 OK"
2025-07-20 19:21:22,075 - INFO - AFC is enabled with max remote calls: 10.
2025-07-20 19:21:35,576 - INFO - HTTP Request: POST https://g

In [None]:
#| eval: False
# Preview the first 1000 characters of the transcript
print(transcript[:1000])

In [None]:
#| eval: False
# Preview the last 1000 characters of the transcript
print(transcript[-1000:])