## Step 1: Setup (Run this first!) ⚙️

Click the ▶️ button to install the required software. This may take a minute.

In [None]:
import warnings
warnings.filterwarnings("ignore", category=SyntaxWarning)

# Install required packages
!pip install -q google-genai pydub ipywidgets

# Import necessary libraries
import os
import time
import mimetypes
from pathlib import Path
from google.colab import files
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output, Markdown
from google import genai
from google.genai import types
from pydub import AudioSegment

# ============================================
# SUPPORTED FILE FORMATS
# ============================================

# Audio file extensions accepted by the notebook, mapped to the MIME type
# that is sent to Gemini for that extension.
SUPPORTED_AUDIO_FORMATS = dict([
    ('.mp3', 'audio/mpeg'),
    ('.wav', 'audio/wav'),
    ('.m4a', 'audio/mp4'),
    ('.flac', 'audio/flac'),
    ('.ogg', 'audio/ogg'),
    ('.webm', 'audio/webm'),
    ('.aac', 'audio/aac'),
])

# Video file extensions accepted by the notebook, mapped to their MIME type.
SUPPORTED_VIDEO_FORMATS = dict([
    ('.mp4', 'video/mp4'),
    ('.mov', 'video/quicktime'),
    ('.avi', 'video/x-msvideo'),
    ('.mkv', 'video/x-matroska'),
    ('.webm', 'video/webm'),
])

# Every supported extension in one lookup table. Video entries are applied
# after audio entries, so '.webm' (present in both tables) resolves to
# 'video/webm' here.
ALL_SUPPORTED_FORMATS = dict(SUPPORTED_AUDIO_FORMATS)
ALL_SUPPORTED_FORMATS.update(SUPPORTED_VIDEO_FORMATS)

# ============================================
# CREATE FOLDER STRUCTURE
# ============================================

# Working directories used by the notebook, keyed by their role.
FOLDERS = dict(
    media='media_files',
    transcriptions='transcriptions',
    prompts='prompts',
    temp='temp_segments',
)

# Ensure each working directory exists; already-existing ones are left alone.
for folder_path in FOLDERS.values():
    os.makedirs(folder_path, exist_ok=True)

# ============================================
# CREATE PROMPT FILES
# ============================================

PROMPT_FILES = {
    "1_full_transcription.md": {
        "title": "Full Audio Transcription",
        "description": "Detailed word-for-word transcription with timestamps and speaker labels",
        "auto_split": True,
        "content": """# Full Audio Transcription

## Role and Objective
- Faithfully transcribe audio recordings into a publication-ready, accurate, and well-structured transcript.

## Instructions
- Transcribe exactly what is spoken without summarising or paraphrasing.
- Use standard punctuation and sentence case; break into paragraphs at topic or speaker shifts.
- Label each speaker consistently as Speaker 1:, Speaker 2:, etc.
- Insert a timestamp at the start of every speaker turn in the format [hh:mm:ss].
- For unclear audio, use [inaudible hh:mm:ss]. If unsure about a word or name, bracket with a question mark, e.g., [Kandahar?].
- Mark non-speech events (e.g., [overlapping speech], [laughter], [applause], [music]) in square brackets.
- Omit routine filler words ("um", "uh", repeated false starts) unless their inclusion changes the meaning of the sentence.
- Normalize numbers and dates for clarity (e.g., "twenty-five" ‚Üí "25", "first of May 2024" ‚Üí "1 May 2024").
- Preserve names and terms as heard; if unsure of spelling, use [term?].
- Maintain any code-switching or language changes as spoken; do not translate.
- Transcribe profanity, slurs, and sensitive language exactly as spoken.
- After completing the transcription, validate the output to ensure it matches the defined formatting conventions and is free of omissions, correcting any errors identified before finalizing the output.

## Output Format
- Each speaker turn starts on a new line with a timestamp [hh:mm:ss], speaker label, and the transcript.
- Clearly indicate non-speech and unclear audio using the conventions above.
- Separate paragraphs (speaker turns or topic shifts) with a blank line.
- Output should be in plain text or Markdown with appropriate spacing.
"""
    },
    "2_meeting_minutes.md": {
        "title": "Meeting Minutes",
        "description": "Summarized meeting notes with decisions, action items, and next steps",
        "auto_split": False,
        "content": """# Meeting Minutes

## Role and Objective
- Generate succinct, decision-oriented meeting minutes focused on actionable outcomes and relevant context.

## Instructions
- Summarize, do not transcribe. Capture only essential information for clarity and accountability.

## Scope
Include:
- Header details (title, date/time, location, chair, note-taker, attendees, apologies)
- Agenda coverage
- Announcements
- Decisions
- Action items (specifying owner and due date)
- Key risks/issues
- Dependencies
- Open questions
- Next steps/next meeting

Maintain only the context necessary to understand each decision, with brief rationale. Omit small talk and verbatim digressions.

## Participants & Timing
- List all attendees, apologies, chair, and note-taker.
- Add a `[hh:mm:ss]` timestamp at the start of any decision, action, or announcement if available in the input.

## Editing Rules
- Capture the core point, not all rhetoric; avoid unintended paraphrasing or misrepresentation.
- Normalize numbers and dates (e.g., 15 September 2025, 14:00‚Äì15:00 CEST).
- Use consistent speaker names/roles. If unknown, default to "Participant 1", "Participant 2", etc.
- For unclear audio, insert `[inaudible hh:mm:ss]`; for overlapping speakers, insert `[crosstalk]`.
- If any action item is missing an owner or deadline, set as Owner: TBD / Due: TBD and flag this instance.
"""
    },
    "3_interview.md": {
        "title": "Interview Transcription",
        "description": "Q&A format with interviewer/interviewee labels and emotional context",
        "auto_split": True,
        "content": """# Interview Transcription

## Role and Objective
Please transcribe this interview accurately and professionally.

## Instructions
- Clearly distinguish between interviewer and interviewee
- Format in a question-and-answer structure when possible
- Include emotional context (laughter, pauses) in [brackets]
- Maintain the conversational flow and natural speech patterns
- Preserve the tone and style of both speakers
- Note any significant pauses or interruptions
- Keep the chronological order of the conversation
- Use timestamps [hh:mm:ss] at speaker changes
"""
    },
    "4_lecture.md": {
        "title": "Lecture/Educational Content",
        "description": "Structured notes with key concepts, definitions, and Q&A sections",
        "auto_split": True,
        "content": """# Lecture Transcription

## Role and Objective
Transcribe the educational content accurately, focusing strictly on the key concepts and main points.

## Instructions
- Structure the transcript in clear paragraphs
- Only include slide references or visual descriptions when explicitly mentioned in the material
- Note audience questions and responses in a separate section
- Preserve all academic terminology and technical language precisely; do not simplify unless specifically requested
- Organize the material logically for educational clarity
- Highlight major concepts and definitions

## Focus Areas
Extract only the central ideas and supporting points emphasized by the speaker:
- Thesis and key claims
- Evidence and examples
- Methodologies
- Conclusions
- Implications or limitations

## Output Format
```
# Summary (‚â§ 200 words)

## Core Takeaways (5-8 bullets)

## Key Points by Section

## Definitions & Concepts

## Evidence & Examples

## Q&A (if any)

## Keywords/Tags
```
"""
    },
    "5_qa_summary.md": {
        "title": "Q&A Summary",
        "description": "Extract and condense only questions and answers from recordings",
        "auto_split": False,
        "content": """# Q&A-Focused Transcription (Extract & Condense)

## Role and Objective
Produce a concise Q&A transcript from audio recordings by extracting and condensing only the essential questions and answers.

## Instructions
- Include only questions and answers in the transcript
- Omit introductions, bios, housekeeping comments, and small talk
- For each question, summarize to the essential inquiry in 1‚Äì2 sentences, retaining key names, citations, numbers, and dates
- For each answer, distill the main claim(s) and provide up to 3‚Äì4 supporting points or examples

## Speakers & Timestamps
- Label each turn as: `[hh:mm:ss] Q (Name/Audience #):` and `[hh:mm:ss] A (Name/Role):`
- If the speaker is unnamed, use Audience 1, Audience 2, etc.

## Output Format
- Output must be strictly in Markdown
- Each Q and A block appears on its own line
- Insert a single blank line between each Q/A pair
"""
    },
    "6_translation.md": {
        "title": "Full Audio Translation (to English)",
        "description": "Translate non-English audio to English with cultural context notes",
        "auto_split": True,
        "content": """# Full Audio Translation (to English)

## Role and Objective
- Faithfully transcribe and translate audio recordings into a publication-ready, accurate, and well-structured English transcript.

## Instructions
- Translate all spoken content into English, regardless of the original language(s)
- Maintain the original meaning and tone as closely as possible while producing natural, fluent English
- Use standard punctuation and sentence case; break into paragraphs at topic or speaker shifts
- Label each speaker consistently as Speaker 1:, Speaker 2:, etc.
- Insert a timestamp at the start of every speaker turn in the format [hh:mm:ss]
- For unclear audio, use [inaudible hh:mm:ss]. If unsure about a word or name, bracket with a question mark, e.g., [Kandahar?]
- Mark non-speech events (e.g., [overlapping speech], [laughter], [applause], [music]) in square brackets

## Language Handling
- When the original language changes (code-switching), indicate the original language in brackets, e.g., [in French:] before the translated text if relevant for context
- For culturally specific terms, idiomatic expressions, or words with no direct English equivalent, provide the English translation followed by the original term in parentheses, e.g., "religious endowment (waqf)", "neighborhood (mahalla)"
"""
    }
}

# Write every built-in prompt template to the prompts folder as UTF-8 text.
# Files are opened for writing, so an existing file of the same name is replaced.
for filename, prompt_data in PROMPT_FILES.items():
    Path(FOLDERS['prompts'], filename).write_text(prompt_data['content'], encoding='utf-8')

# ============================================
# LOAD PROMPTS FROM FILES
# ============================================

def load_prompts():
    """Read the known prompt templates back from the prompts folder.

    Scans the prompts folder for ``*.md`` files in sorted order and keeps
    only those whose file name is a key of ``PROMPT_FILES``; other files
    are ignored.

    Returns:
        dict: Maps each prompt title to a dict with the keys
        ``description`` and ``auto_split`` (taken from ``PROMPT_FILES``),
        ``content`` (the text currently stored in the file) and
        ``filepath`` (the file's path as a string).
    """
    loaded = {}

    for md_path in sorted(Path(FOLDERS['prompts']).glob('*.md')):
        meta = PROMPT_FILES.get(md_path.name)
        if meta is None:
            # Not one of the built-in templates.
            continue

        # Content comes from disk so that edits made to the file are picked up.
        text = md_path.read_text(encoding='utf-8')
        loaded[meta['title']] = {
            'description': meta['description'],
            'auto_split': meta['auto_split'],
            'content': text,
            'filepath': str(md_path),
        }

    return loaded

PROMPTS = load_prompts()

# Print folder structure
print("‚úÖ Setup complete!")
print()
print("üìÅ Folder structure created:")
print("   ‚îú‚îÄ‚îÄ üìÇ media_files/      ‚Üê Upload your audio/video here")
print("   ‚îú‚îÄ‚îÄ üìÇ transcriptions/   ‚Üê Output files saved here")
print("   ‚îú‚îÄ‚îÄ üìÇ prompts/          ‚Üê Editable prompt templates")
print("   ‚îÇ   ‚îú‚îÄ‚îÄ 1_full_transcription.md")
print("   ‚îÇ   ‚îú‚îÄ‚îÄ 2_meeting_minutes.md")
print("   ‚îÇ   ‚îú‚îÄ‚îÄ 3_interview.md")
print("   ‚îÇ   ‚îú‚îÄ‚îÄ 4_lecture.md")
print("   ‚îÇ   ‚îú‚îÄ‚îÄ 5_qa_summary.md")
print("   ‚îÇ   ‚îî‚îÄ‚îÄ 6_translation.md")
print("   ‚îî‚îÄ‚îÄ üìÇ temp_segments/    ‚Üê Temporary audio segments")
print()
print("üéµ Supported audio formats:", ", ".join(sorted(SUPPORTED_AUDIO_FORMATS.keys())))
print("üé¨ Supported video formats:", ", ".join(sorted(SUPPORTED_VIDEO_FORMATS.keys())))
print()
print("üí° Tip: You can edit the prompt files in the 'prompts' folder to customize them!")

## Step 2: Enter Your API Key 🔑

Enter your Google Gemini API key below. 

**Don't have one?** Get it free at: https://aistudio.google.com/app/api-keys

Your API key is entered securely (hidden like a password).

In [None]:
# Create a secure password field for the API key
api_key_input = widgets.Password(
    placeholder='Paste your API key here',
    description='API Key:',
    layout=widgets.Layout(width='500px'),
    style={'description_width': '80px'}
)

api_key_status = widgets.HTML(value="")

def validate_api_key(change):
    """Update the status label as the API key field changes.

    Args:
        change: Widget change notification; ``change['new']`` is the
            current text of the key field.

    A value longer than 20 characters is reported as entered; anything
    shorter prompts the user to paste the full key. Only the length is
    checked here.
    """
    key_looks_complete = len(change['new']) > 20
    if not key_looks_complete:
        api_key_status.value = "<span style='color: orange;'>‚è≥ Please enter your full API key</span>"
        return
    api_key_status.value = "<span style='color: green;'>‚úÖ API key entered</span>"

api_key_input.observe(validate_api_key, names='value')

# Check for Colab Secret
try:
    from google.colab import userdata
    secret_key = userdata.get('GEMINI_API_KEY')
    if secret_key:
        api_key_input.value = secret_key
        api_key_status.value = "<span style='color: green;'>‚úÖ API key loaded from Colab Secrets</span>"
except Exception:
    pass # Secret not found or error accessing it

display(HTML("<b>Enter your Gemini API key:</b>"))
display(api_key_input)
display(api_key_status)
display(HTML("<br><i>üí° Tip: Your key starts with 'AIza...'</i>"))
display(HTML("<i>(Or add 'GEMINI_API_KEY' to your Colab Secrets for auto-loading)</i>"))

## Step 3: Upload Your Media File(s) 📁

Click the button below to select and upload your audio or video file(s).

**Supported audio formats:** MP3, WAV, M4A, FLAC, OGG, WEBM, AAC

**Supported video formats:** MP4, MOV, AVI, MKV, WEBM

In [None]:
# Store uploaded files
uploaded_files = []

upload_status = widgets.HTML(value="")

def upload_media_files(b):
    """Handle a click on the upload button.

    Opens the Colab upload dialog, saves every file with a supported
    extension into the media folder, and replaces the global
    ``uploaded_files`` list with the saved paths. Files with an unsupported
    extension are skipped and listed in the status message. If nothing is
    uploaded, ``uploaded_files`` is left unchanged.

    Args:
        b: The button instance passed by the click callback (unused).

    Any exception raised during the upload is reported in ``upload_status``
    instead of propagating.
    """
    global uploaded_files
    import html  # local import: only needed to escape text placed into HTML

    upload_status.value = "<span style='color: blue;'>üì§ Upload dialog opened... Select your file(s)</span>"

    try:
        uploaded = files.upload()

        if not uploaded:
            upload_status.value = "<span style='color: orange;'>‚ö†Ô∏è No files uploaded</span>"
            return

        uploaded_files = []
        valid_files = []
        invalid_files = []

        for filename, content in uploaded.items():
            ext = Path(filename).suffix.lower()
            if ext in ALL_SUPPORTED_FORMATS:
                # Save file to media folder
                filepath = os.path.join(FOLDERS['media'], filename)
                with open(filepath, 'wb') as f:
                    f.write(content)
                uploaded_files.append(filepath)
                valid_files.append((filename, ext))
            else:
                invalid_files.append(filename)

        # File names are escaped before being placed into the HTML status so
        # that characters such as '&', '<' and '>' are shown literally.
        parts = []
        if valid_files:
            parts.append(f"<span style='color: green;'>‚úÖ Uploaded {len(valid_files)} file(s) to <code>media_files/</code>:</span><br>")
            for name, ext in valid_files:
                icon = "üé¨" if ext in SUPPORTED_VIDEO_FORMATS else "üéµ"
                parts.append(f"&nbsp;&nbsp;&nbsp;{icon} {html.escape(name)}<br>")
        if invalid_files:
            parts.append(f"<span style='color: red;'>‚ùå Skipped {len(invalid_files)} unsupported file(s):</span><br>")
            for name in invalid_files:
                parts.append(f"&nbsp;&nbsp;&nbsp;‚ö†Ô∏è {html.escape(name)}<br>")
            parts.append(f"<br><i>Supported: {', '.join(sorted(ALL_SUPPORTED_FORMATS.keys()))}</i>")

        upload_status.value = "".join(parts)
    except Exception as e:
        # Exception text often contains angle brackets; escape it so the
        # message is displayed rather than parsed as markup.
        upload_status.value = f"<span style='color: red;'>‚ùå Error: {html.escape(str(e))}</span>"

upload_button = widgets.Button(
    description='üìÅ Upload Audio/Video Files',
    button_style='primary',
    layout=widgets.Layout(width='250px', height='40px')
)
upload_button.on_click(upload_media_files)

display(upload_button)
display(upload_status)
display(HTML("<br><i>üí° Files will be saved to the <code>media_files/</code> folder</i>"))

## Step 4: Choose Your Settings 🎛️

Select the transcription style and options below.

In [None]:
# ============================================
# SETTINGS WIDGETS
# ============================================

# Model selection (Gemini 3 Pro uses thinking_level, Flash uses budget)
model_dropdown = widgets.Dropdown(
    options=[
        ('Gemini 3 Pro (Highest quality)', 'gemini-3-pro-preview'),
        ('Gemini 3 Flash (Faster, good quality)', 'gemini-3-flash-preview'),
    ],
    value='gemini-3-pro-preview',
    description='AI Model:',
    style={'description_width': '100px'},
    layout=widgets.Layout(width='450px')
)

# Model info display
model_info = widgets.HTML(value="")

def update_model_info(change):
    """Show the info panel that matches the selected model.

    Args:
        change: Mapping whose ``'new'`` entry is the selected model id.

    Model ids containing ``"3-pro"`` get the Gemini 3 Pro panel; every
    other id gets the Gemini 3 Flash panel.
    """
    chosen_model = change['new']
    if "3-pro" not in chosen_model:
        model_info.value = """
        <div style='background: #e3f2fd; padding: 10px; border-radius: 5px; margin: 5px 0;'>
        ‚ö° <b>Gemini 3 Flash</b>: Thinking level MINIMAL, Temperature 0.2<br>
        üéµ Best for: Standard recordings, faster processing
        </div>
        """
    else:
        model_info.value = """
        <div style='background: #e8f5e9; padding: 10px; border-radius: 5px; margin: 5px 0;'>
        üß† <b>Gemini 3 Pro</b>: Thinking level LOW, Temperature 0.1<br>
        üéµ Best for: Complex audio, multiple speakers, challenging accents
        </div>
        """

model_dropdown.observe(update_model_info, names='value')
update_model_info({'new': model_dropdown.value})

# Reload prompts button (in case user edited them)
def reload_prompts_click(b):
    """Reload the prompt templates from disk and refresh the style dropdown.

    Rebinds the global ``PROMPTS`` to a fresh ``load_prompts()`` result,
    updates the dropdown options, and keeps the previously selected style
    selected when it still exists after the reload.

    Args:
        b: The button instance passed by the click callback (unused).
    """
    global PROMPTS
    previous_choice = prompt_dropdown.value

    PROMPTS = load_prompts()
    available = list(PROMPTS.keys())
    prompt_dropdown.options = available

    # Reassigning `options` makes the dropdown fall back to its first entry,
    # so put the user's earlier choice back if it is still available.
    if previous_choice in available:
        prompt_dropdown.value = previous_choice

    prompt_status.value = "<span style='color: green;'>‚úÖ Prompts reloaded from files!</span>"

reload_button = widgets.Button(
    description='üîÑ Reload Prompts',
    button_style='',
    layout=widgets.Layout(width='150px')
)
reload_button.on_click(reload_prompts_click)

prompt_status = widgets.HTML(value="")

# Prompt selection
prompt_dropdown = widgets.Dropdown(
    options=list(PROMPTS.keys()),
    value=list(PROMPTS.keys())[0] if PROMPTS else None,
    description='Style:',
    style={'description_width': '100px'},
    layout=widgets.Layout(width='350px')
)

# Prompt description display
prompt_description = widgets.HTML(
    value=f"<i>üìù {PROMPTS[list(PROMPTS.keys())[0]]['description']}</i>" if PROMPTS else ""
)

# Prompt file path display
prompt_filepath = widgets.HTML(
    value=f"<code style='font-size: 11px;'>üìÑ {PROMPTS[list(PROMPTS.keys())[0]]['filepath']}</code>" if PROMPTS else ""
)

# Preview prompt button and output
preview_output = widgets.Output(layout=widgets.Layout(max_height='300px', overflow='auto'))

def preview_prompt_click(b):
    """Print the selected prompt file into the preview output area.

    The preview area is cleared first. Nothing further is printed when no
    style is selected or the selection is not a key of ``PROMPTS``.

    Args:
        b: The button instance passed by the click callback (unused).
    """
    with preview_output:
        clear_output()

        style_name = prompt_dropdown.value
        if not style_name or style_name not in PROMPTS:
            return

        prompt_path = PROMPTS[style_name]['filepath']
        print(f"üìÑ Prompt file: {prompt_path}\n")
        print("=" * 50)
        # Read from disk rather than from PROMPTS so the file's current
        # content is what gets shown.
        with open(prompt_path, 'r', encoding='utf-8') as handle:
            print(handle.read())

preview_button = widgets.Button(
    description='üëÅÔ∏è Preview Prompt',
    button_style='info',
    layout=widgets.Layout(width='150px')
)
preview_button.on_click(preview_prompt_click)

def update_prompt_description(change):
    """Refresh the widgets that depend on the selected transcription style.

    Args:
        change: Widget change notification; ``change['new']`` is the newly
            selected prompt title.

    For a title present in ``PROMPTS`` this updates the description and
    file-path labels, sets the split checkbox to the prompt's
    ``auto_split`` value, and clears the preview area. Unknown titles are
    ignored.
    """
    selected = change['new']
    if selected not in PROMPTS:
        return

    entry = PROMPTS[selected]
    desc = entry['description']
    filepath = entry['filepath']
    auto_split = entry['auto_split']

    prompt_description.value = f"<i>üìù {desc}</i>"
    prompt_filepath.value = f"<code style='font-size: 11px;'>üìÑ {filepath}</code>"

    # Follow the prompt's own recommendation for splitting.
    split_checkbox.value = auto_split

    # A preview of the previous selection would be stale; empty the area.
    with preview_output:
        clear_output()

prompt_dropdown.observe(update_prompt_description, names='value')

# Audio splitting options
split_checkbox = widgets.Checkbox(
    value=True,
    description='Split long files into segments (recommended for files > 10 min)',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='500px')
)

segment_slider = widgets.IntSlider(
    value=10,
    min=5,
    max=30,
    step=5,
    description='Segment length (minutes):',
    style={'description_width': '180px'},
    layout=widgets.Layout(width='400px')
)

# Custom prompt option
use_custom_prompt = widgets.Checkbox(
    value=False,
    description='Use custom prompt instead',
    style={'description_width': 'initial'}
)

custom_prompt_text = widgets.Textarea(
    placeholder='Enter your custom transcription instructions here...\n\nExample: Please transcribe this audio in French, focusing on technical terminology.',
    layout=widgets.Layout(width='500px', height='150px'),
    disabled=True
)

def toggle_custom_prompt(change):
    """Switch the UI between the custom prompt and the template prompts.

    Args:
        change: Widget change notification; ``change['new']`` is the new
            state of the "use custom prompt" checkbox.

    When the checkbox is ticked the custom text area is enabled and the
    template dropdown, preview button and reload button are disabled;
    unticking reverses this.
    """
    custom_enabled = change['new']
    custom_prompt_text.disabled = not custom_enabled
    for template_widget in (prompt_dropdown, preview_button, reload_button):
        template_widget.disabled = custom_enabled

use_custom_prompt.observe(toggle_custom_prompt, names='value')

# Display all settings
display(HTML("<h3>ü§ñ Select AI Model</h3>"))
display(model_dropdown)
display(model_info)

display(HTML("<h3>üìã Select Transcription Style</h3>"))
display(HTML("<p><i>üí° You can edit the prompt files in the <code>prompts/</code> folder and click 'Reload Prompts'</i></p>"))
display(widgets.HBox([prompt_dropdown, reload_button]))
display(prompt_status)
display(prompt_description)
display(prompt_filepath)
display(widgets.HBox([preview_button]))
display(preview_output)

display(HTML("<br>"))
display(use_custom_prompt)
display(custom_prompt_text)

display(HTML("<h3>‚úÇÔ∏è File Splitting Options</h3>"))
display(split_checkbox)
display(segment_slider)
display(HTML("<i>üí° Splitting helps with long recordings and improves accuracy.</i>"))

## Step 5: Start Transcription! 🚀

Click the button below to start transcribing your audio file(s).

In [None]:
# ============================================
# TRANSCRIPTION ENGINE
# ============================================

class ColabMediaTranscriber:
    """
    Audio/Video Transcriber for Google Colab using Gemini API.

    Supports audio files (with optional splitting via pydub) and video files
    (with optional splitting via ffmpeg). The prompt is passed to the model
    as the system instruction of the generation config; the user turn only
    carries the media plus a short fixed request.
    """

    # Fixed user-turn text; the detailed instructions live in the system instruction.
    _USER_PROMPT = "Please perform complete transcription."

    # Source extension (without the dot, lower-case) -> format name given to
    # AudioSegment.export(). Extensions not listed here are exported as MP3.
    _EXPORT_FORMATS = {
        'm4a': 'mp4', 'mp4': 'mp4', 'mp3': 'mp3',
        'wav': 'wav', 'flac': 'flac', 'ogg': 'ogg',
    }

    def __init__(self, api_key, model='gemini-3-pro-preview', prompt_file=None, custom_prompt=None):
        """Create the API client and build the generation config.

        Args:
            api_key: Gemini API key used to create the client.
            model: Model id to call.
            prompt_file: Path to a prompt file used as system instruction.
            custom_prompt: Prompt text that takes precedence over
                ``prompt_file`` when non-empty.
        """
        self.api_key = api_key
        self.model = model
        self.client = genai.Client(api_key=self.api_key)
        self.prompt_file = prompt_file
        self.custom_prompt = custom_prompt
        # Must come last: building the config reads the prompt attributes above.
        self.generation_config = self._setup_generation_config()

    def _get_system_instruction(self):
        """Return the system instruction text.

        Precedence: custom prompt, then the content of the prompt file, then
        a generic default sentence.

        Raises:
            Exception: Whatever reading the prompt file raised, re-raised
                after printing an error line.
        """
        if self.custom_prompt:
            return self.custom_prompt

        if self.prompt_file:
            try:
                with open(self.prompt_file, 'r', encoding='utf-8') as f:
                    return f.read()
            except Exception as e:
                print(f"‚ùå Error reading prompt file: {e}")
                raise

        # Default fallback
        return "Transcribe this audio/video accurately. Extract all spoken content."

    def _setup_generation_config(self):
        """Configure generation settings based on model type.

        Model ids containing "3-pro": thinking_level "low", temperature 0.1.
        All other model ids: thinking_level "minimal", temperature 0.2.

        Returns:
            types.GenerateContentConfig: Config including the system
            instruction and the safety settings.
        """
        config_params = {
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 65535,
            "response_mime_type": "text/plain",
            "system_instruction": self._get_system_instruction(),
        }

        if "3-pro" in self.model.lower():
            # Gemini 3 Pro: Use thinking_level (not budget), low temperature
            config_params["temperature"] = 0.1
            config_params["thinking_config"] = types.ThinkingConfig(thinking_level="low")
        else:
            # Gemini 3 Flash: Use thinking_level minimal, slightly higher temperature
            config_params["temperature"] = 0.2
            config_params["thinking_config"] = types.ThinkingConfig(thinking_level="minimal")

        # Safety settings for archival content: no blocking in any of these
        # categories, so that sensitive language in recordings is transcribed.
        config_params["safety_settings"] = [
            types.SafetySetting(
                category=category,
                threshold=types.HarmBlockThreshold.BLOCK_NONE
            )
            for category in (
                types.HarmCategory.HARM_CATEGORY_HARASSMENT,
                types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
                types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
                types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            )
        ]

        return types.GenerateContentConfig(**config_params)

    @staticmethod
    def _extract_text(response):
        """Return the stripped text of a model response.

        Raises:
            RuntimeError: If the response carries no text at all, instead of
                failing later with an unrelated attribute error.
        """
        text = getattr(response, 'text', None)
        if text is None:
            raise RuntimeError(
                "The model returned no text (the response may be empty or blocked)."
            )
        return text.strip()

    @classmethod
    def _export_target(cls, ext):
        """Map a source extension to ``(export_format, segment_extension)``.

        Known extensions keep their original suffix. Unknown ones are
        exported as MP3 and therefore get a ``.mp3`` suffix, so the file
        name always matches the data written.
        """
        key = ext.lstrip('.').lower()
        if key in cls._EXPORT_FORMATS:
            return cls._EXPORT_FORMATS[key], ext
        return 'mp3', '.mp3'

    def is_video_file(self, file_path):
        """Check if file is a video file (by extension, case-insensitive)."""
        ext = Path(file_path).suffix.lower()
        return ext in SUPPORTED_VIDEO_FORMATS

    def get_mime_type(self, file_path):
        """Get MIME type for a file from its extension.

        Unknown extensions fall back to 'audio/mpeg'.
        """
        ext = Path(file_path).suffix.lower()
        return ALL_SUPPORTED_FORMATS.get(ext, 'audio/mpeg')

    def split_audio(self, audio_file_path, segment_minutes=10):
        """Split audio into segments and save them to the temp folder.

        Args:
            audio_file_path: Path of the audio file to split.
            segment_minutes: Maximum length of one segment, in minutes.

        Returns:
            list: Paths of the segment files in order, or a one-element list
            with the original path when the file is not longer than one
            segment or when splitting fails for any reason.
        """
        try:
            segment_ms = segment_minutes * 60 * 1000
            audio = AudioSegment.from_file(audio_file_path)
            total_ms = len(audio)

            if total_ms <= segment_ms:
                return [audio_file_path]

            source = Path(audio_file_path)
            # Decided once per file: the export format and the suffix that
            # matches it.
            export_format, segment_ext = self._export_target(source.suffix)

            segments = []
            for i, start in enumerate(range(0, total_ms, segment_ms), start=1):
                end = min(start + segment_ms, total_ms)
                # Save segments to temp folder
                segment_path = os.path.join(
                    FOLDERS['temp'], f"{source.stem}_segment_{i:02d}{segment_ext}"
                )
                audio[start:end].export(segment_path, format=export_format)
                segments.append(segment_path)

            return segments
        except Exception as e:
            print(f"‚ö†Ô∏è Could not split audio: {e}. Processing as single file.")
            return [audio_file_path]

    def split_video(self, video_file_path, segment_minutes=10):
        """Split video into segments using ffmpeg.

        Args:
            video_file_path: Path of the video file to split.
            segment_minutes: Maximum length of one segment, in minutes.

        Returns:
            list: Paths of the segment files in order, or a one-element list
            with the original path when ffmpeg is missing, the duration
            cannot be determined, the video is not longer than one segment,
            or splitting fails.
        """
        try:
            import subprocess
            import math

            # Check if ffmpeg is available
            try:
                subprocess.run(["ffmpeg", "-version"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
            except FileNotFoundError:
                print("‚ö†Ô∏è ffmpeg not found. Cannot split video. Processing as single file.")
                return [video_file_path]

            # Get video duration using ffprobe
            try:
                result = subprocess.run(
                    ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", video_file_path],
                    stdout=subprocess.PIPE,
                    # Keep diagnostics out of stdout so that it contains only
                    # the duration value parsed below.
                    stderr=subprocess.PIPE,
                    check=True
                )
                duration = float(result.stdout)
            except Exception as e:
                print(f"‚ö†Ô∏è Could not determine video duration: {e}. Processing as single file.")
                return [video_file_path]

            segment_seconds = segment_minutes * 60
            if duration <= segment_seconds:
                return [video_file_path]

            segments = []
            base_name = Path(video_file_path).stem
            ext = Path(video_file_path).suffix

            num_segments = math.ceil(duration / segment_seconds)

            print(f"   ‚úÇÔ∏è Splitting video into {num_segments} segments...")

            for i in range(num_segments):
                start_time = i * segment_seconds
                segment_path = os.path.join(FOLDERS['temp'], f"{base_name}_segment_{i+1:02d}{ext}")

                # Use ffmpeg to split (stream copy for speed)
                # -ss before -i is faster but less accurate keyframes.
                # -avoid_negative_ts 1 shifts timestamps to be positive.
                cmd = [
                    "ffmpeg", "-y",
                    "-ss", str(start_time),
                    "-t", str(segment_seconds),
                    "-i", video_file_path,
                    "-c", "copy",
                    "-avoid_negative_ts", "1",
                    "-loglevel", "error",
                    segment_path
                ]
                subprocess.run(cmd, check=True)
                segments.append(segment_path)

            return segments

        except Exception as e:
            print(f"‚ö†Ô∏è Could not split video: {e}. Processing as single file.")
            return [video_file_path]

    def transcribe_with_bytes(self, file_path):
        """Transcribe a media file by sending bytes directly (for smaller files).

        Returns:
            str: The stripped response text.

        Raises:
            RuntimeError: If the model response contains no text.
        """
        with open(file_path, 'rb') as f:
            media_bytes = f.read()

        media_part = types.Part.from_bytes(
            data=media_bytes,
            mime_type=self.get_mime_type(file_path)
        )

        response = self.client.models.generate_content(
            model=self.model,
            contents=[media_part, self._USER_PROMPT],
            config=self.generation_config
        )

        return self._extract_text(response)

    def transcribe_with_upload(self, file_path):
        """Transcribe a media file by uploading first (for larger files like videos).

        The uploaded remote file is deleted afterwards on a best-effort
        basis, whether or not transcription succeeded.

        Returns:
            str: The stripped response text.

        Raises:
            Exception: If the uploaded file ends in the FAILED state.
            RuntimeError: If the model response contains no text.
        """
        print("   ‚îî‚îÄ üì§ Uploading file to Gemini...")

        # Upload the file
        uploaded_file = self.client.files.upload(
            file=file_path,
            config=types.UploadFileConfig(
                display_name=Path(file_path).name,
                mime_type=self.get_mime_type(file_path)
            )
        )

        try:
            # Wait for processing if needed
            while uploaded_file.state == 'PROCESSING':
                print("   ‚îî‚îÄ ‚è≥ Processing upload...")
                time.sleep(2)
                uploaded_file = self.client.files.get(name=uploaded_file.name)

            if uploaded_file.state == 'FAILED':
                raise Exception(f"File upload failed: {uploaded_file.error}")

            print("   ‚îî‚îÄ ‚úÖ Upload complete, transcribing...")

            response = self.client.models.generate_content(
                model=self.model,
                contents=[uploaded_file, self._USER_PROMPT],
                config=self.generation_config
            )
        finally:
            # Clean up the uploaded file even when polling or generation
            # raised, so failed runs do not leave remote files behind.
            try:
                self.client.files.delete(name=uploaded_file.name)
            except Exception:
                pass  # Ignore cleanup errors

        return self._extract_text(response)

    def transcribe(self, file_path, use_upload=False):
        """Transcribe a single media file.

        Video files, and any file when ``use_upload`` is true, go through
        the upload path; everything else is sent inline as bytes.
        """
        if use_upload or self.is_video_file(file_path):
            return self.transcribe_with_upload(file_path)
        else:
            return self.transcribe_with_bytes(file_path)


# ============================================
# TRANSCRIPTION BUTTON AND OUTPUT
# ============================================

output_area = widgets.Output()
transcription_results = {}  # Store results for download

def run_transcription(b):
    """Button callback: transcribe every uploaded file and save the results.

    Reads the API key, model, splitting options and prompt choice from the
    notebook widgets, builds a ``ColabMediaTranscriber``, and processes each
    path in ``uploaded_files``. Each transcription is written to the
    transcriptions folder and then recorded in the global
    ``transcription_results`` dict (reset at the start of every run).
    All progress and error messages are printed into ``output_area``.

    A failure on one file is reported and does not stop the remaining files.

    Args:
        b: The button instance passed by ipywidgets (unused).
    """
    global transcription_results
    transcription_results = {}
    failed_files = []  # Names of files whose transcription or save step raised

    with output_area:
        clear_output()

        # Get API key; pasted keys often carry stray whitespace or a newline
        api_key = (api_key_input.value or "").strip()

        # If widget is empty, try to load from secrets dynamically
        if not api_key:
            try:
                from google.colab import userdata
                api_key = userdata.get('GEMINI_API_KEY')
                if api_key:
                    api_key = api_key.strip()
                    print("üîë Found API key in Colab Secrets")
            except Exception:
                # Secrets are optional; the validation below reports a missing key
                pass

        # Validate inputs
        if not api_key or len(api_key) < 20:
            print("‚ùå Please enter a valid API key in Step 2")
            print("   (Or add 'GEMINI_API_KEY' to Colab Secrets and re-run Step 2)")
            return

        if not uploaded_files:
            print("‚ùå Please upload at least one media file in Step 3")
            return

        # Get settings
        model = model_dropdown.value
        split_enabled = split_checkbox.value
        segment_minutes = segment_slider.value

        # Get prompt
        custom_prompt = None
        prompt_file = None
        selected_prompt_name = "Custom"

        if use_custom_prompt.value and custom_prompt_text.value.strip():
            custom_prompt = custom_prompt_text.value.strip()
            print("üìù Using custom prompt")
        else:
            selected_prompt_name = prompt_dropdown.value
            # Pass the file path (not cached text) so the latest edits are used
            prompt_file = PROMPTS[selected_prompt_name]['filepath']
            print(f"üìù Using: {selected_prompt_name}")
            print(f"   üìÑ From: {prompt_file}")

        print(f"ü§ñ Model: {model}")
        if "3-pro" in model:
            print("üß† Thinking: level=LOW | Temperature: 0.1")
        else:
            print("‚ö° Thinking: dynamic | Temperature: 0.2")
        print(f"‚úÇÔ∏è File splitting: {'Enabled' if split_enabled else 'Disabled'}")
        if split_enabled:
            print(f"   Segment length: {segment_minutes} minutes")
        print("\n" + "="*50)

        try:
            # Initialize transcriber
            transcriber = ColabMediaTranscriber(api_key, model, prompt_file, custom_prompt)
            print("‚úÖ Connected to Gemini API\n")

            # Process each file
            for i, media_file in enumerate(uploaded_files, 1):
                filename = Path(media_file).name
                is_video = transcriber.is_video_file(media_file)
                file_type = "video" if is_video else "audio"
                file_icon = "üé¨" if is_video else "üéµ"

                print(f"\n{file_icon} Processing {file_type} {i}/{len(uploaded_files)}: {filename}")
                print("-" * 40)

                try:
                    if split_enabled:
                        if is_video:
                            segments = transcriber.split_video(media_file, segment_minutes)
                        else:
                            segments = transcriber.split_audio(media_file, segment_minutes)
                    else:
                        segments = [media_file]

                    if len(segments) > 1:
                        print(f"   ‚úÇÔ∏è Split into {len(segments)} segments (saved to temp_segments/)")

                    transcription_parts = []
                    for j, segment in enumerate(segments, 1):
                        if len(segments) > 1:
                            print(f"   ‚è≥ Transcribing segment {j}/{len(segments)}...")
                        else:
                            print("   ‚è≥ Transcribing...")

                        # transcribe() picks upload vs. inline bytes itself,
                        # based on whether the segment is a video file.
                        result = transcriber.transcribe(segment)

                        if len(segments) > 1:
                            transcription_parts.append(f"[Segment {j}]\n{result}")
                            print(f"   ‚úÖ Segment {j} complete")
                        else:
                            transcription_parts.append(result)
                            print("   ‚úÖ Transcription complete")

                    transcription = "\n\n".join(transcription_parts)

                    output_filename = Path(media_file).stem + "_transcription.txt"
                    output_path = os.path.join(FOLDERS['transcriptions'], output_filename)

                    # Save to transcriptions folder
                    with open(output_path, 'w', encoding='utf-8') as f:
                        f.write(f"Transcription of: {filename}\n")
                        f.write(f"Model: {model}\n")
                        f.write(f"Prompt: {selected_prompt_name}\n")
                        f.write("=" * 50 + "\n\n")
                        f.write(transcription)

                    # Record the result only after the file exists on disk, so
                    # the download step never points at a missing file.
                    transcription_results[output_filename] = {
                        'content': transcription,
                        'path': output_path
                    }

                    print(f"\n‚úÖ Saved to: {output_path}")

                except Exception as e:
                    # Keep going with the remaining files; report this one in the summary
                    failed_files.append(filename)
                    print(f"\n‚ùå Error transcribing {filename}: {str(e)}")

            # Summary
            print("\n" + "="*50)
            print("üéâ TRANSCRIPTION COMPLETE!")
            print(f"   Files processed: {len(transcription_results)}")
            if failed_files:
                print(f"   Files failed: {len(failed_files)} ({', '.join(failed_files)})")
            print(f"   üìÅ Output folder: {FOLDERS['transcriptions']}/")
            print("\nüëá Download your transcriptions in the next step")

        except Exception as e:
            print(f"\n‚ùå Error: {str(e)}")
            if "API key" in str(e) or "authentication" in str(e).lower():
                print("\nüí° Please check that your API key is correct.")

# Build the button that starts a transcription run and attach its click callback
transcribe_button = widgets.Button(
    description='üöÄ Start Transcription',
    button_style='success',
    layout=widgets.Layout(width='200px', height='50px')
)
transcribe_button.on_click(run_transcription)

# Render the button, a spacer line, then the area that receives the run's printed output
display(transcribe_button)
display(HTML("<br>"))
display(output_area)

## Step 6: Download Your Transcriptions 📥

After transcription is complete, click below to download your files.

In [None]:
# Output widget that receives the messages printed by the two download callbacks
download_output = widgets.Output()

def download_transcriptions(b):
    """Button callback: download every file recorded by the latest run.

    Walks ``transcription_results`` and hands each saved path to
    ``files.download``. A failure on one file is printed and the loop
    continues with the next one. Output goes to ``download_output``.
    """
    with download_output:
        clear_output()

        if transcription_results:
            print("üì• Preparing downloads...\n")

            for result_name, result in transcription_results.items():
                try:
                    saved_path = result['path']
                    print(f"   Downloading: {result_name}")
                    print(f"   From: {saved_path}")
                    files.download(saved_path)
                except Exception as err:
                    print(f"   ‚ö†Ô∏è Could not download {result_name}: {err}")

            print("\n‚úÖ Downloads initiated! Check your browser's download folder.")
        else:
            print("‚ùå No transcriptions available yet. Please run Step 5 first.")

def download_all_from_folder(b):
    """Button callback: download every ``.txt`` file in the transcriptions folder.

    Unlike ``download_transcriptions`` this does not depend on the in-memory
    results of the latest run; it picks up whatever ``*.txt`` files are on
    disk. A failure on one file is printed and the loop continues. Output
    goes to ``download_output``.

    Args:
        b: The button instance passed by ipywidgets (unused).
    """
    with download_output:
        clear_output()

        transcriptions_path = Path(FOLDERS['transcriptions'])
        # Sorted because glob order is filesystem-dependent; this keeps the
        # download order stable and consistent with list_all_transcriptions.
        txt_files = sorted(transcriptions_path.glob('*.txt'))

        if not txt_files:
            print("‚ùå No transcription files found in the transcriptions folder.")
            return

        print(f"üì• Found {len(txt_files)} file(s) in {FOLDERS['transcriptions']}/\n")

        for filepath in txt_files:
            try:
                print(f"   Downloading: {filepath.name}")
                files.download(str(filepath))
            except Exception as e:
                print(f"   ‚ö†Ô∏è Could not download {filepath.name}: {e}")

        print("\n‚úÖ Downloads initiated! Check your browser's download folder.")

# Button that downloads only the files produced by the most recent run
download_button = widgets.Button(
    description='üì• Download Latest Transcriptions',
    button_style='info',
    layout=widgets.Layout(width='250px', height='40px')
)
download_button.on_click(download_transcriptions)

# Button that downloads every .txt file currently in the transcriptions folder
download_all_button = widgets.Button(
    description='üì• Download All From Folder',
    button_style='',
    layout=widgets.Layout(width='250px', height='40px')
)
download_all_button.on_click(download_all_from_folder)

# Render both buttons side by side, a hint about the save location, then the message area
display(widgets.HBox([download_button, download_all_button]))
display(HTML(f"<br><i>üí° All transcriptions are saved in <code>{FOLDERS['transcriptions']}/</code></i>"))
display(download_output)

## Step 7 (Optional): View Transcription Results 👁️

Preview your transcription directly in this notebook.

In [None]:
# Output widget shared by the preview and file-listing callbacks
results_preview_output = widgets.Output()

def show_preview(b):
    """Button callback: print the start of each transcription from the latest run.

    For every entry in ``transcription_results`` a header with the filename
    and saved path is printed, followed by at most the first 5000 characters
    of the text and, if anything was cut, a note with the remaining count.
    Output goes to ``results_preview_output``.
    """
    with results_preview_output:
        clear_output()

        if not transcription_results:
            print("‚ùå No transcriptions available yet. Please run Step 5 first.")
            return

        preview_limit = 5000
        banner = "=" * 60

        for result_name, result in transcription_results.items():
            text = result['content']
            saved_path = result['path']

            print(banner)
            print(f"üìÑ {result_name}")
            print(f"üìÅ {saved_path}")
            print(banner)
            print(text[:preview_limit])  # Show first 5000 characters

            hidden_chars = len(text) - preview_limit
            if hidden_chars > 0:
                print(f"\n... [Truncated - {hidden_chars} more characters]")
            print("\n")

def list_all_transcriptions(b):
    """Button callback: list the ``.txt`` files in the transcriptions folder.

    Prints each file name with its size in KB, in sorted order, followed by
    the total file count. Output goes to ``results_preview_output``.
    """
    with results_preview_output:
        clear_output()

        folder = Path(FOLDERS['transcriptions'])
        found = list(folder.glob('*.txt'))

        if len(found) == 0:
            print("üìÅ No transcription files found yet.")
            return

        divider = "-" * 40
        print(f"üìÅ Files in {FOLDERS['transcriptions']}/\n")
        print(divider)
        for entry in sorted(found):
            kilobytes = entry.stat().st_size / 1024
            print(f"   üìÑ {entry.name} ({kilobytes:.1f} KB)")
        print(divider)
        print(f"\nTotal: {len(found)} file(s)")

# Button that previews the transcriptions held in memory from the latest run
preview_results_button = widgets.Button(
    description='üëÅÔ∏è Preview Latest Results',
    button_style='',
    layout=widgets.Layout(width='200px', height='35px')
)
preview_results_button.on_click(show_preview)

# Button that lists the .txt files currently saved in the transcriptions folder
list_files_button = widgets.Button(
    description='üìã List All Files',
    button_style='',
    layout=widgets.Layout(width='150px', height='35px')
)
list_files_button.on_click(list_all_transcriptions)

# Render both buttons side by side, then the area that receives their output
display(widgets.HBox([preview_results_button, list_files_button]))
display(results_preview_output)

---

## ℹ️ Help & Troubleshooting

### 📁 Folder Structure

```
├── 📂 media_files/      ← Your uploaded audio/video files
├── 📂 transcriptions/   ← Generated transcription outputs
├── 📂 prompts/          ← Editable prompt templates (Markdown)
│   ├── 1_full_transcription.md
│   ├── 2_meeting_minutes.md
│   ├── 3_interview.md
│   ├── 4_lecture.md
│   ├── 5_qa_summary.md
│   └── 6_translation.md
└── 📂 temp_segments/    ← Temporary audio segments (auto-cleaned)
```

### 🎵 Supported Audio Formats
MP3, WAV, M4A, FLAC, OGG, WEBM, AAC

### 🎬 Supported Video Formats
MP4, MOV, AVI, MKV, WEBM

### ✏️ Customizing Prompts

1. Open the `prompts/` folder in the Colab file browser (left sidebar)
2. Double-click any `.md` file to edit it
3. Save your changes
4. Click **"🔄 Reload Prompts"** in Step 4 to load your edits

### 🤖 Model Selection

| Model | Best For | Speed |
|-------|----------|-------|
| **Gemini 3 Pro** | Complex audio, multiple speakers, challenging accents | Slower |
| **Gemini 3 Flash** | Standard recordings, faster processing | Faster |

### Common Issues:

**"API key not valid"**
- Make sure you copied the entire API key
- Get a new key at: https://aistudio.google.com/app/api-keys

**"File format not supported"**
- Audio: MP3, WAV, M4A, FLAC, OGG, WEBM, AAC
- Video: MP4, MOV, AVI, MKV, WEBM
- Try converting your file to a supported format

**"Transcription takes too long"**
- Try using "Gemini 3 Flash" for faster processing
- Enable file splitting for long audio or video files
- Note: With splitting disabled, each file is sent to Gemini as a single unit

**"Output is not what I expected"**
- Try a different transcription style
- Edit the prompt file in the `prompts/` folder
- Use the custom prompt option for specific needs

**"Video upload failed"**
- Video files are uploaded to Gemini for processing
- Very large videos may take longer to upload and process
- Check your internet connection

---

### About

**ZMO AI Pipelines** created by [Frédérick Madore](https://www.frederickmadore.com/)

Part of the [Leibniz-Zentrum Moderner Orient (ZMO)](https://www.zmo.de/) research tools.

## Step 8 (Optional): Cleanup 🧹

Delete temporary files or clear everything when you're done.

In [None]:
import shutil

# Output widget that receives the messages printed by the cleanup and status callbacks
cleanup_output = widgets.Output()

def clear_folder_contents(folder):
    """Delete everything directly inside ``folder`` and return the entry count.

    Regular files and symlinks are unlinked; real subdirectories are removed
    recursively. ``Path.glob('*')`` also matches hidden entries, so a hidden
    directory (for example a checkpoint folder a notebook environment may
    create) would make a plain ``unlink()`` loop raise part-way through.

    Args:
        folder: Directory whose contents should be removed. The directory
            itself is kept.

    Returns:
        Number of top-level entries removed; 0 if ``folder`` is not a directory.
    """
    folder = Path(folder)
    if not folder.is_dir():
        return 0

    removed = 0
    # Materialize the listing first so deletion does not disturb iteration
    for entry in list(folder.glob('*')):
        if entry.is_dir() and not entry.is_symlink():
            shutil.rmtree(entry)
        else:
            entry.unlink()
        removed += 1
    return removed


def cleanup_temp(b):
    """Button callback: delete only the temporary segment files."""
    with cleanup_output:
        clear_output()
        temp_path = Path(FOLDERS['temp'])
        if temp_path.exists():
            deleted = clear_folder_contents(temp_path)
            print(f"üßπ Deleted {deleted} temporary segment file(s)")
        else:
            print("üìÅ Temp folder is already empty")


def cleanup_media(b):
    """Button callback: delete uploaded media files and reset ``uploaded_files``."""
    global uploaded_files
    with cleanup_output:
        clear_output()
        media_path = Path(FOLDERS['media'])
        if media_path.exists():
            deleted = clear_folder_contents(media_path)
            print(f"üßπ Deleted {deleted} media file(s)")
            # Clear the uploaded files list so later steps do not reference deleted paths
            uploaded_files = []
        else:
            print("üìÅ Media folder is already empty")


def cleanup_transcriptions(b):
    """Button callback: delete all transcription outputs and reset the results dict."""
    global transcription_results
    with cleanup_output:
        clear_output()
        trans_path = Path(FOLDERS['transcriptions'])
        if trans_path.exists():
            deleted = clear_folder_contents(trans_path)
            print(f"üßπ Deleted {deleted} transcription file(s)")
            # Clear the results dict so download/preview do not point at deleted files
            transcription_results = {}
        else:
            print("üìÅ Transcriptions folder is already empty")


def cleanup_all(b):
    """Button callback: delete temp, media and transcription files; keep prompts."""
    global uploaded_files, transcription_results
    with cleanup_output:
        clear_output()
        total_deleted = 0

        for folder_name in ['temp', 'media', 'transcriptions']:
            total_deleted += clear_folder_contents(FOLDERS[folder_name])

        # Clear global state
        uploaded_files = []
        transcription_results = {}

        print(f"üßπ Deleted {total_deleted} file(s) total")
        print("   ‚úÖ Temp segments cleared")
        print("   ‚úÖ Media files cleared")
        print("   ‚úÖ Transcriptions cleared")
        print("   üìÅ Prompts folder preserved")

def show_folder_status(b):
    """Button callback: print entry count and total file size for each folder.

    Every path in ``FOLDERS`` is reported: existing folders show the number
    of direct entries and the combined size of the regular files in KB,
    missing ones are marked as not created. Output goes to ``cleanup_output``.
    """
    with cleanup_output:
        clear_output()
        print("üìä Current folder status:\n")

        for location in FOLDERS.values():
            directory = Path(location)
            if not directory.exists():
                print(f"   üìÇ {location}/ : (not created)")
                continue

            entries = list(directory.glob('*'))
            # Only regular files contribute to the size; subdirectories are counted but not measured
            byte_total = sum(entry.stat().st_size for entry in entries if entry.is_file())
            size_kb = byte_total / 1024
            print(f"   üìÇ {location}/ : {len(entries)} file(s), {size_kb:.1f} KB")

# Create buttons; button_style escalates from plain to warning to danger with the scope of deletion
btn_temp = widgets.Button(description='üóëÔ∏è Delete Temp Files', button_style='', layout=widgets.Layout(width='180px'))
btn_media = widgets.Button(description='üóëÔ∏è Delete Media Files', button_style='warning', layout=widgets.Layout(width='180px'))
btn_trans = widgets.Button(description='üóëÔ∏è Delete Transcriptions', button_style='warning', layout=widgets.Layout(width='180px'))
btn_all = widgets.Button(description='üóëÔ∏è Delete Everything', button_style='danger', layout=widgets.Layout(width='180px'))
btn_status = widgets.Button(description='üìä Show Status', button_style='info', layout=widgets.Layout(width='150px'))

# Attach each button to its callback
btn_temp.on_click(cleanup_temp)
btn_media.on_click(cleanup_media)
btn_trans.on_click(cleanup_transcriptions)
btn_all.on_click(cleanup_all)
btn_status.on_click(show_folder_status)

# Layout: grouped by how destructive the action is, least destructive first
display(HTML("<b>Safe cleanup:</b>"))
display(widgets.HBox([btn_temp, btn_status]))

display(HTML("<br><b>‚ö†Ô∏è Careful - these delete your files:</b>"))
display(widgets.HBox([btn_media, btn_trans]))

display(HTML("<br><b>üî¥ Nuclear option:</b>"))
display(btn_all)
display(HTML("<i>Note: Prompts folder is always preserved</i>"))

# Spacer, then the shared area that receives all cleanup/status messages
display(HTML("<br>"))
display(cleanup_output)