In [20]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib
!pip install --upgrade openai-whisper tqdm google-generativeai requests




In [21]:
to_be_transcribed = "1AKnppHssmAwkjBo2EPI8Twf6782hH2xv"
transcribed = "1AKnppHssmAwkjBo2EPI8Twf6782hH2xv"

In [22]:
import os
import json
from kaggle_secrets import UserSecretsClient
from google.oauth2 import service_account
from googleapiclient.discovery import build

# ──────────────────────────────────────────────────────────────────────────────
# 1. Pull the SERVICE_ACCOUNT JSON from Kaggle Secrets and write it to /kaggle/working
# ──────────────────────────────────────────────────────────────────────────────
s = UserSecretsClient()
sa_json_str = s.get_secret("GDRIVE_SA_JSON")

sa_path = "/kaggle/working/sa_key.json"
with open(sa_path, "w") as f:
    f.write(sa_json_str)

# ──────────────────────────────────────────────────────────────────────────────
# 2. Build the Drive client using that file
# ──────────────────────────────────────────────────────────────────────────────
SCOPES = ["https://www.googleapis.com/auth/drive"]
creds = service_account.Credentials.from_service_account_file(sa_path, scopes=SCOPES)
drive_service = build("drive", "v3", credentials=creds)

# ──────────────────────────────────────────────────────────────────────────────
# 3. Remove the local service account file for security
# ──────────────────────────────────────────────────────────────────────────────
if os.path.exists(sa_path):
    try:
        os.remove(sa_path)
        print(f"✅ Removal successful: '{sa_path}' has been deleted.")
    except Exception as e:
        print(f"❌ Removal failed: {e}")
else:
    print(f"ℹ️ No such file: '{sa_path}' (nothing to remove).")

# ──────────────────────────────────────────────────────────────────────────────
# 4. List files in the shared folder, then set new_files flag accordingly
# ──────────────────────────────────────────────────────────────────────────────
FOLDER_ID = to_be_transcribed  # ← replace with your actual folder ID

def list_files_in_folder(folder_id):
    query = f"'{folder_id}' in parents and trashed=false"
    files = []
    page_token = None
    while True:
        resp = drive_service.files().list(
            q=query,
            spaces="drive",
            fields="nextPageToken, files(id, name)",
            pageToken=page_token
        ).execute()
        files.extend(resp.get("files", []))
        page_token = resp.get("nextPageToken", None)
        if not page_token:
            break
    return files

files = list_files_in_folder(FOLDER_ID)

# Determine whether new files exist
if not files:
    new_files = False
    print(f"⚠️ No files found in folder ID = {FOLDER_ID!r}. new_files = False.")
else:
    new_files = True
    print(f"✅ Found {len(files)} file(s) in folder ID = {FOLDER_ID!r}. new_files = True.")
    for f in files:
        print(f" • {f['name']} (ID={f['id']})")

# ──────────────────────────────────────────────────────────────────────────────
# 5. Continue with the rest of the notebook’s logic, using new_files flag
# ──────────────────────────────────────────────────────────────────────────────

# Example usage:
#if new_files:
    # ... perform download/transcription/etc.
#    pass
#else:
    # ... skip processing, or run alternative steps
#    pass


✅ Removal successful: '/kaggle/working/sa_key.json' has been deleted.
✅ Found 2 file(s) in folder ID = '1AKnppHssmAwkjBo2EPI8Twf6782hH2xv'. new_files = True.
 • 20230730 OpenHCI presentation.m4a (ID=1CnnASu-V5Ta5GN4k5nQLi-cTrTfoJCVL)
 • 20230810 OpenHCI presentation feedback.m4a (ID=1AV3_V3hTZSTomSziIIEtgYpavmP9uxPr)


In [23]:
import datetime, shutil, sys, time
from pathlib import Path
from tqdm import tqdm
import whisper

# ──────────────────────────────────────────────────────────────────────────────
# Assume `new_files` has been set by the previous logic (True/False)
# ──────────────────────────────────────────────────────────────────────────────
if not new_files:
    print(f"[{datetime.datetime.now():%Y-%m-%d %H:%M:%S}] ⚠️ new_files == False → Skipping transcription.")
    # You can add any alternative logic here if needed
else:
    AUDIO_EXT = {'.wav', '.mp3', '.m4a', '.flac', '.ogg', '.webm'}

    def _now():
        """Log-friendly timestamp."""
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def _fmt_ts(seconds: float) -> str:
        """float seconds → HH:MM:SS.mmm string."""
        h, m = divmod(int(seconds), 3600)
        m, s = divmod(m, 60)
        ms = int((seconds - int(seconds)) * 1000)
        return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"

    def save_transcript(result: dict, out_path: Path):
        """
        Write Whisper's segment list to a .txt file.
        Each line: [start → end] text
        """
        with open(out_path, "w", encoding="utf-8") as f:
            for seg in result["segments"]:
                f.write(f"[{_fmt_ts(seg['start'])} → {_fmt_ts(seg['end'])}] "
                        f"{seg['text'].strip()}\n")

    # ─── Ensure a local cache directory in Kaggle working ───────────────────────
    CACHE_DIR = Path("/kaggle/working/whisper_models")
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    # ─── Load Whisper Model, using CACHE_DIR for both load & download ──────────
    start = time.time()
    print(f"[{_now()}] Attempting to load Whisper large-v3 from cache…")
    try:
        model = whisper.load_model("large-v3", download_root=str(CACHE_DIR))
        print(f"[{_now()}] Model loaded from cache or downloaded into '{CACHE_DIR}'.")
    except Exception as e:
        print(f"[{_now()}] ⚠️ Failed to load from cache: {e}")
        print(f"[{_now()}] Downloading Whisper large-v3 into '{CACHE_DIR}'…")
        model = whisper.load_model("large-v3", download_root=str(CACHE_DIR))
        print(f"[{_now()}] Model downloaded and cached at '{CACHE_DIR}'.")

    print(f"[{_now()}] Model ready (took {time.time() - start:.1f} s)\n")

    # ─── Set preferred language (or None for autodetect) ───────────────────────
    PREFERRED_LANGUAGE = "zh"  # or None
    if PREFERRED_LANGUAGE:
        print(f"[{_now()}] 🌐 Preferred language set to '{PREFERRED_LANGUAGE}'")
    else:
        print(f"[{_now()}] 🌐 Using Whisper's automatic language detection")

    # ─── Gather all audio files from the downloaded folder ─────────────────—
    INBOX_DIR = Path("/kaggle/working/from_google_drive")
    TRANSCRIPTS_DIR = Path("/kaggle/working/transcription")
    TRANSCRIPTS_DIR.mkdir(parents=True, exist_ok=True)

    audio_files = [p for p in INBOX_DIR.rglob("*") if p.suffix.lower() in AUDIO_EXT]
    total = len(audio_files)

    if total == 0:
        print(f"[{_now()}] 📂 No audio files found under {INBOX_DIR}")
        print("Please place your audio files in /kaggle/working/from_google_drive and re-run.")
    else:
        print(f"[{_now()}] 🎧 Found {total} audio file(s) under {INBOX_DIR}")

        # ─── Transcription loop with tqdm progress bar over files ───────────────
        for audio in tqdm(audio_files, desc="Transcribing files", unit="file"):
            tqdm.set_description(f"Transcribing: {audio.name}")

            # Build Whisper kwargs
            kwargs = {"word_timestamps": True, "verbose": False}
            if PREFERRED_LANGUAGE:
                kwargs["language"] = PREFERRED_LANGUAGE

            # Transcribe
            result = model.transcribe(str(audio), **kwargs)

            # Save transcript with same stem under TRANSCRIPTS_DIR
            out_txt = TRANSCRIPTS_DIR / f"{audio.stem}.txt"
            save_transcript(result, out_txt)

        print(f"\n[{_now()}] 🎉 All transcription jobs finished! Transcripts are in {TRANSCRIPTS_DIR}")


[2025-06-01 12:27:28] Attempting to load Whisper large-v3 from cache…
[2025-06-01 12:27:55] Model loaded from cache or downloaded into '/kaggle/working/whisper_models'.
[2025-06-01 12:27:55] Model ready (took 26.4 s)

[2025-06-01 12:27:55] 🌐 Preferred language set to 'zh'
[2025-06-01 12:27:55] 📂 No audio files found under /kaggle/working/from_google_drive
Please place your audio files in /kaggle/working/from_google_drive and re-run.


In [24]:
import os
import io
from googleapiclient.http import MediaIoBaseDownload

# ──────────────────────────────────────────────────────────────────────────────
# 1. Replace this with your actual Drive folder ID (the folder containing all files to download)
TO_BE_TRANSCRIBED_ID = to_be_transcribed

# 2. Create a local folder under /kaggle/working
local_root = "/kaggle/working/from_google_drive"
os.makedirs(local_root, exist_ok=True)

# ──────────────────────────────────────────────────────────────────────────────
# 3. List all non-folder files directly under TO_BE_TRANSCRIBED_ID
query_files = (
    f"'{TO_BE_TRANSCRIBED_ID}' in parents and "
    "trashed = false and "
    "mimeType != 'application/vnd.google-apps.folder'"
)
response = drive_service.files().list(
    q=query_files,
    spaces="drive",
    fields="files(id, name)"
).execute()
files = response.get("files", [])

if not files:
    print(f"⚠️ No files found in folder ID = {TO_BE_TRANSCRIBED_ID!r}, skip downloading.")
else:
    print(f"🔎 Found {len(files)} file(s) in folder ID = {TO_BE_TRANSCRIBED_ID!r}:")
    for f in files:
        print(f" • {f['name']}  (ID = {f['id']})")

# ──────────────────────────────────────────────────────────────────────────────
# 4. Download each file into /kaggle/working/from_google_drive
for f in files:
    file_id   = f["id"]
    file_name = f["name"]
    dest_path = os.path.join(local_root, file_name)

    try:
        request = drive_service.files().get_media(fileId=file_id)
        fh = io.FileIO(dest_path, mode="wb")
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        print(f"⬇️  Downloading {file_name!r} …")
        while not done:
            status, done = downloader.next_chunk()
        fh.close()
        print(f"✅ Saved to {dest_path}")
    except Exception as e:
        print(f"❌ Error downloading {file_name!r}: {e}")

🔎 Found 2 file(s) in folder ID = '1AKnppHssmAwkjBo2EPI8Twf6782hH2xv':
 • 20230730 OpenHCI presentation.m4a  (ID = 1CnnASu-V5Ta5GN4k5nQLi-cTrTfoJCVL)
 • 20230810 OpenHCI presentation feedback.m4a  (ID = 1AV3_V3hTZSTomSziIIEtgYpavmP9uxPr)
⬇️  Downloading '20230730 OpenHCI presentation.m4a' …
✅ Saved to /kaggle/working/from_google_drive/20230730 OpenHCI presentation.m4a
⬇️  Downloading '20230810 OpenHCI presentation feedback.m4a' …
✅ Saved to /kaggle/working/from_google_drive/20230810 OpenHCI presentation feedback.m4a


In [25]:
from pathlib import Path
import whisper
import datetime
from tqdm import tqdm

if not new_files:
    print(f"[{datetime.datetime.now():%Y-%m-%d %H:%M:%S}] ⚠️ new_files == False → Skipping transcription.")
else:
    PREFERRED_LANGUAGE = "zh"  # or None

    if PREFERRED_LANGUAGE:
        print(f"[{datetime.datetime.now():%Y-%m-%d %H:%M:%S}] 🌐 Preferred language set to '{PREFERRED_LANGUAGE}'")
    else:
        print(f"[{datetime.datetime.now():%Y-%m-%d %H:%M:%S}] 🌐 Using Whisper's automatic language detection")

    INBOX_DIR       = Path("/kaggle/working/from_google_drive")
    TRANSCRIPTS_DIR = Path("/kaggle/working/transcription")
    TRANSCRIPTS_DIR.mkdir(parents=True, exist_ok=True)

    audio_files = [
        p for p in INBOX_DIR.rglob("*")
        if p.suffix.lower() in AUDIO_EXT
    ]
    total = len(audio_files)

    if total == 0:
        print(f"[{datetime.datetime.now():%Y-%m-%d %H:%M:%S}] 📂 No audio files found under {INBOX_DIR}")
        print("Please place your audio files in /kaggle/working/from_google_drive and re-run.")
    else:
        print(f"[{datetime.datetime.now():%Y-%m-%d %H:%M:%S}] 🎧 Found {total} audio file(s) under {INBOX_DIR}")

        with tqdm(audio_files, desc="Transcribing files", unit="file") as pbar:
            for audio in pbar:
                pbar.set_postfix_str(f"Now: {audio.name}")

                kwargs = {"word_timestamps": True, "verbose": False}
                if PREFERRED_LANGUAGE:
                    kwargs["language"] = PREFERRED_LANGUAGE

                result = model.transcribe(str(audio), **kwargs)
                out_txt = TRANSCRIPTS_DIR / f"{audio.stem}.txt"
                save_transcript(result, out_txt)

        print(f"\n[{datetime.datetime.now():%Y-%m-%d %H:%M:%S}] 🎉 All transcription jobs finished! Transcripts are in {TRANSCRIPTS_DIR}")


[2025-06-01 12:28:04] 🌐 Preferred language set to 'zh'
[2025-06-01 12:28:04] 🎧 Found 2 audio file(s) under /kaggle/working/from_google_drive


Transcribing files:   0%|          | 0/2 [00:00<?, ?file/s, Now: 20230730 OpenHCI presentation.m4a]
  0%|          | 0/95178 [00:00<?, ?frames/s][A
  3%|▎         | 2814/95178 [00:05<03:02, 505.06frames/s][A
  6%|▌         | 5758/95178 [00:12<03:15, 458.00frames/s][A
  9%|▉         | 8650/95178 [00:18<03:04, 469.40frames/s][A
 12%|█▏        | 11650/95178 [00:24<02:57, 471.52frames/s][A
 15%|█▌        | 14650/95178 [00:31<02:53, 463.29frames/s][A
 19%|█▊        | 17650/95178 [00:37<02:42, 478.39frames/s][A
 22%|██▏       | 20570/95178 [00:42<02:31, 492.88frames/s][A
 25%|██▍       | 23570/95178 [00:48<02:22, 501.44frames/s][A
 28%|██▊       | 26516/95178 [00:54<02:16, 504.50frames/s][A
 31%|███       | 29516/95178 [00:59<02:07, 513.80frames/s][A
 34%|███▍      | 32516/95178 [01:05<02:01, 517.39frames/s][A
 37%|███▋      | 35516/95178 [01:10<01:48, 551.73frames/s][A
 40%|████      | 38516/95178 [01:13<01:32, 613.76frames/s][A
 43%|████▎     | 41272/95178 [01:19<01:32, 579.8


[2025-06-01 12:33:58] 🎉 All transcription jobs finished! Transcripts are in /kaggle/working/transcription





In [26]:
import re
from datetime import timedelta, datetime
from pathlib import Path

# Regex to match lines like: [HH:MM:SS.mmm → HH:MM:SS.mmm] text
timestamp_pattern = re.compile(
    r"\[(\d{2}:\d{2}:\d{2}\.\d{3})\s*→\s*(\d{2}:\d{2}:\d{2}\.\d{3})\]\s*(.*)"
)

def parse_time(s: str) -> timedelta:
    """Convert 'HH:MM:SS.mmm' → timedelta."""
    h, m, rest = s.split(":")
    s_part, ms = rest.split(".")
    return timedelta(hours=int(h), minutes=int(m),
                     seconds=int(s_part), milliseconds=int(ms))

def process_transcript(text: str) -> str:
    """
    Group segments every 5 minutes. Output format:
      [HH:MM:SS.mmm]
      <concatenated text for that 5-minute chunk>

    Blank line between chunks.
    """
    lines = text.splitlines()
    segments = []
    for line in lines:
        m = timestamp_pattern.match(line)
        if m:
            start_ts, end_ts, content = m.groups()
            segments.append((parse_time(start_ts), start_ts, content.strip()))

    if not segments:
        return ""  # no timed segments found

    result = []
    buffer = ""
    last_mark_minute = None

    for ts, start_ts_str, content in segments:
        curr_minute = int(ts.total_seconds() // 300)  # chunk index (every 300 sec)
        if last_mark_minute is None or curr_minute != last_mark_minute:
            if buffer:
                result.append(buffer.strip())
                buffer = ""
            result.append(f"[{start_ts_str}]")
            last_mark_minute = curr_minute
        buffer += content + " "

    if buffer:
        result.append(buffer.strip())

    # Combine into blocks of 3 lines: timestamp, paragraph, blank line
    output = []
    for i in range(0, len(result), 2):
        if i + 1 < len(result):
            output.append(result[i])   # timestamp line
            output.append(result[i+1]) # text for that block
            output.append("")          # blank line

    return "\n".join(output).strip()

# ─── Batch processing for Kaggle ─────────────
TRANSCRIPTS_DIR = Path("/kaggle/working/transcription")
PARSED_DIR = Path("/kaggle/working/parsed")
PARSED_DIR.mkdir(parents=True, exist_ok=True)

txt_files = list(TRANSCRIPTS_DIR.glob("*.txt"))
if not txt_files:
    print(f"[{datetime.now():%Y-%m-%d %H:%M:%S}] ⚠️ No .txt files found in {TRANSCRIPTS_DIR}. Skipping parsing.")
else:
    for txtfile in txt_files:
        with txtfile.open(encoding="utf-8") as f:
            text = f.read()

        processed = process_transcript(text)
        out_path = PARSED_DIR / txtfile.name.replace(".txt", "_parsed.txt")

        with out_path.open("w", encoding="utf-8") as f:
            f.write(processed)
        print(f"🔧 Processed {txtfile.name} → {out_path.name}")

    print("\n✅ All transcripts parsed.")


🔧 Processed 20230730 OpenHCI presentation.txt → 20230730 OpenHCI presentation_parsed.txt
🔧 Processed 20230810 OpenHCI presentation feedback.txt → 20230810 OpenHCI presentation feedback_parsed.txt

✅ All transcripts parsed.


In [27]:
from googleapiclient.discovery import build

DOC_ID = '1p44XUpBu7lPjyux4eANd_9FHT5F1UDbgUyx7q6Libvk'

def get_doc_text(doc_id: str, creds) -> str:
    service = build('docs', 'v1', credentials=creds)
    doc = service.documents().get(documentId=doc_id).execute()
    text = []
    for element in doc.get('body', {}).get('content', []):
        if 'paragraph' in element:
            for run in element['paragraph'].get('elements', []):
                txt = run.get('textRun', {}).get('content')
                if txt:
                    text.append(txt)
    return ''.join(text).strip()

# Retrieve the system prompt from the Google Doc
SYSTEM_PROMPT = get_doc_text(DOC_ID, creds)
print("[INFO] SYSTEM_PROMPT loaded from Google Doc. Preview:\n", SYSTEM_PROMPT)


[INFO] SYSTEM_PROMPT loaded from Google Doc. Preview:
 ## System Prompt

You are tasked with summarizing a speech provided in its original language. Create a concise, structured summary using clear and informative markdown formatting. Follow the outline and format precisely. You may use markdown tables, bullet points, paragraphs, or a combination as appropriate.

## Summary Structure

### Title

Provide a concise, relevant title reflecting the key theme or message of the speech.
Please make sure the title starts with a # to be recognized as title.

# Title

### Speaker

* **Name**: \\[Speaker's Name]
* **Affiliation/Role**: \\[Speaker’s Affiliation or Role, if known]
* **Event**: \\[Event or occasion where the speech was given, if applicable]
* **Date**: \\[Date of the speech, if available]

### Overview

Provide a short paragraph summarizing the overall purpose and main points of the speech.

### Key Points

Summarize each major point clearly. You may use markdown tables, bullet point

In [28]:
from pathlib import Path
from kaggle_secrets import UserSecretsClient
import google.generativeai as genai

# Retrieve Gemini API key from Kaggle secrets
s = UserSecretsClient()
api_key = s.get_secret("GEMINI_API_KEY")
if api_key is None:
    raise ValueError("GEMINI_API_KEY not found in Kaggle secrets! Add it via Add-ons → Secrets.")

genai.configure(api_key=api_key)

def generate_summary_with_gemini(speech_text: str, system_prompt: str) -> str:
    model = genai.GenerativeModel("gemini-2.5-flash-preview-05-20")
    full_prompt = system_prompt.strip() + "\n\n" + speech_text.strip()
    try:
        response = model.generate_content(
            full_prompt,
            generation_config=genai.types.GenerationConfig(temperature=0.5),
            stream=False,
        )
        return response.text
    except Exception as e:
        print(f"[ERROR] Gemini API error: {e}")
        return None

def process_all_txt_files(parsed_dir: Path, markdown_dir: Path, system_prompt: str):
    parsed_dir = Path(parsed_dir)
    markdown_dir = Path(markdown_dir)
    markdown_dir.mkdir(parents=True, exist_ok=True)

    txt_files = list(parsed_dir.glob("*.txt"))
    if not txt_files:
        print(f"[INFO] No parsed .txt files found in {parsed_dir}. Skipping summarization.")
        return

    print(f"[INFO] Found {len(txt_files)} .txt files in {parsed_dir}")

    for txt_path in txt_files:
        print(f"\n[INFO] Processing: {txt_path.name}")
        try:
            with open(txt_path, "r", encoding="utf-8") as f:
                speech_text = f.read().strip()
        except Exception as e:
            print(f"[ERROR] Could not read {txt_path}: {e}")
            continue

        if not speech_text:
            print(f"[WARNING] {txt_path.name} is empty, skipping.")
            continue

        summary_md = generate_summary_with_gemini(speech_text, system_prompt)
        if summary_md is None:
            print(f"[ERROR] Gemini API failed for {txt_path.name}, skipping.")
            continue

        md_path = markdown_dir / (txt_path.stem.replace("_parsed", "") + ".md")

        try:
            with open(md_path, "w", encoding="utf-8") as f:
                f.write(summary_md)
            print(f"[INFO] Saved summary → {md_path.name}")
        except Exception as e:
            print(f"[ERROR] Could not save {md_path}: {e}")

# Set your input/output directories and system prompt
PARSED_DIR = Path("/kaggle/working/parsed")
MARKDOWN_DIR = Path("/kaggle/working/markdown")
# SYSTEM_PROMPT = ... # (get from your Google Doc as before)

# Only run if there are files to process
if list(PARSED_DIR.glob("*.txt")):
    process_all_txt_files(PARSED_DIR, MARKDOWN_DIR, SYSTEM_PROMPT)
    print("\n✅ All summaries generated.")
else:
    print(f"[INFO] No .txt files found in {PARSED_DIR}, nothing to summarize.")


[INFO] Found 2 .txt files in /kaggle/working/parsed

[INFO] Processing: 20230810 OpenHCI presentation feedback_parsed.txt
[INFO] Saved summary → 20230810 OpenHCI presentation feedback.md

[INFO] Processing: 20230730 OpenHCI presentation_parsed.txt
[INFO] Saved summary → 20230730 OpenHCI presentation.md

✅ All summaries generated.


In [29]:
import requests
import shutil
from pathlib import Path
from kaggle_secrets import UserSecretsClient

# ─── Retrieve HackMD token from Kaggle secrets ───────────────────────────
s = UserSecretsClient()
hackmd_token = s.get_secret("HACKMD_TOKEN")
if hackmd_token is None:
    raise ValueError("HACKMD_TOKEN not found in Kaggle secrets! Add it via Add-ons → Secrets.")

def upload_to_hackmd(md_content: str, filename: str, api_token: str) -> dict:
    """
    Uploads a single markdown string to HackMD. Returns {"title":..., "url":...} on success.
    """
    # Derive a clean title from the filename
    if filename.endswith('.md'):
        filename = filename[:-3]
    raw_title = filename.replace('_parsed', '').strip()
    title = raw_title.replace('_', ' ').strip()

    # Ensure there's a top-level heading
    md_lines = md_content.lstrip().splitlines()
    if not md_lines or not md_lines[0].strip().startswith("# "):
        md_content = f"# {title}\n\n" + md_content.lstrip()
    else:
        md_lines[0] = f"# {title}"
        md_content = "\n".join(md_lines)

    # Append hashtag if missing
    hashtag = "#whisper-stt-project"
    content_lines = md_content.rstrip().splitlines()
    if not any(line.strip() == hashtag for line in content_lines[-3:]):
        md_content = md_content.rstrip() + "\n\n" + hashtag + "\n"

    url = "https://api.hackmd.io/v1/notes"
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json"
    }
    data = {
        "title": title,
        "content": md_content,
        "readPermission": "guest",
        "writePermission": "signed_in"
    }
    response = requests.post(url, headers=headers, json=data)
    if response.ok:
        note_id = response.json().get("id")
        shared_url = f"https://hackmd.io/{note_id}"
        print(f"[INFO] Uploaded to HackMD: {shared_url}")
        return {"title": title, "url": shared_url}
    else:
        print(f"[ERROR] HackMD upload failed for {filename}: {response.status_code} {response.text}")
        return None

def batch_upload_markdown_and_move(markdown_dir: Path, uploaded_dir: Path, hackmd_token: str) -> list:
    """
    For each .md in markdown_dir: upload via upload_to_hackmd → move file to uploaded_dir.
    Returns list of {"title":..., "url":...}.
    """
    markdown_dir = Path(markdown_dir)
    uploaded_dir = Path(uploaded_dir)
    uploaded_dir.mkdir(parents=True, exist_ok=True)

    md_files = list(markdown_dir.glob("*.md"))
    if not md_files:
        print(f"[INFO] No markdown files found in {markdown_dir}, skipping upload.")
        return []

    print(f"[INFO] Found {len(md_files)} markdown files to upload.")

    shared_links = []
    for md_file in md_files:
        print(f"[INFO] Processing: {md_file.name}")
        try:
            with open(md_file, "r", encoding="utf-8") as f:
                md_content = f.read()
        except Exception as e:
            print(f"[ERROR] Could not read {md_file.name}: {e}")
            continue

        result = upload_to_hackmd(md_content, md_file.name, hackmd_token)
        if result:
            shared_links.append(result)
            dest_file = uploaded_dir / md_file.name
            try:
                shutil.move(str(md_file), dest_file)
                print(f"[INFO] Moved {md_file.name} → {dest_file}")
            except Exception as e:
                print(f"[ERROR] Failed to move {md_file.name}: {e}")
    return shared_links

# ─── Set directories and run HackMD upload if there are files ──────────
MARKDOWN_DIR = Path("/kaggle/working/markdown")
UPLOADED_DIR = Path("/kaggle/working/uploaded")

if list(MARKDOWN_DIR.glob("*.md")):
    shared_links = batch_upload_markdown_and_move(MARKDOWN_DIR, UPLOADED_DIR, hackmd_token)
    print("\n✅ All markdown files uploaded to HackMD.")
else:
    print(f"[INFO] No .md files found in {MARKDOWN_DIR}, nothing to upload.")


[INFO] Found 2 markdown files to upload.
[INFO] Processing: 20230730 OpenHCI presentation.md
[INFO] Uploaded to HackMD: https://hackmd.io/B3tbAXfyR4mt-ZSwjbx1zQ
[INFO] Moved 20230730 OpenHCI presentation.md → /kaggle/working/uploaded/20230730 OpenHCI presentation.md
[INFO] Processing: 20230810 OpenHCI presentation feedback.md
[INFO] Uploaded to HackMD: https://hackmd.io/dnuWXreWSiGDWSxF2GLUBw
[INFO] Moved 20230810 OpenHCI presentation feedback.md → /kaggle/working/uploaded/20230810 OpenHCI presentation feedback.md

✅ All markdown files uploaded to HackMD.


In [34]:
# ╔══════════════════════════════════════════════════════════════════╗
# 0) CONSTANT GOOGLE-DRIVE IDS (do NOT change names)                 #
# ╚══════════════════════════════════════════════════════════════════╝
INBOX_ID        = "1AKnppHssmAwkjBo2EPI8Twf6782hH2xv"  # to_be_transcribed
ARCHIVE_ID      = "1iuVCOQ6dpg0tff6bHxmUpl4WDIhVybWO"  # transcribed
PROCESSED_ID    = "1zpXQm4PKSD2PXSxmK3Q45VH2wSDbTGcr"  # processed data (text/md)

# ╔══════════════════════════════════════════════════════════════════╗
# 1) LOCAL WORKING PATHS                                             #
# ╚══════════════════════════════════════════════════════════════════╝
from pathlib import Path, PurePath
import shutil, datetime
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from googleapiclient.errors import HttpError

WORKING      = Path("/kaggle/working")
TRANS_DIR    = WORKING / "transcription"
PARSED_DIR   = WORKING / "parsed"
UPLOADED_MD  = WORKING / "uploaded"            # markdown from HackMD step
AUDIO_LOCAL  = WORKING / "from_google_drive"   # downloaded audio

drive = build("drive", "v3", credentials=creds)

def log(msg): print(f"[{datetime.datetime.now():%H:%M:%S}] {msg}")

# ╔══════════════════════════════════════════════════════════════════╗
# 2) HELPERS                                                         #
# ╚══════════════════════════════════════════════════════════════════╝
def ensure_subfolder(parent_id: str, name: str) -> str:
    """Return id of subfolder 'name' under parent, creating if absent."""
    q = (f"'{parent_id}' in parents and mimeType='application/vnd.google-apps.folder' "
         f"and name='{name}' and trashed=false")
    res = drive.files().list(q=q, spaces="drive", fields="files(id)").execute()
    if res["files"]:
        return res["files"][0]["id"]
    meta = {"name": name,
            "mimeType": "application/vnd.google-apps.folder",
            "parents": [parent_id]}
    return drive.files().create(body=meta, fields="id").execute()["id"]

def upload_file(local: Path, parent_id: str):
    media = MediaFileUpload(local, resumable=False)
    meta  = {"name": local.name, "parents": [parent_id]}
    drive.files().create(body=meta, media_body=media, fields="id").execute()
    log(f"  ↳ uploaded {local.name}")

def move_audio(audio_name: str):
# Move one audio file (exact name) from inbox → archive; silent if not found.
    q = (f"'{INBOX_ID}' in parents and name='{audio_name}' and trashed=false")
    res = drive.files().list(
        q=q, spaces="drive", fields="files(id)"
    ).execute().get("files", [])
    
    if not res:
        return                              # silent if not found
    fid = res[0]["id"]
    drive.files().update(
        fileId=fid,
        addParents=ARCHIVE_ID,
        removeParents=INBOX_ID,
        fields="id"
    ).execute()
    log(f"  ↳ moved {audio_name} → transcribed")

# ╔══════════════════════════════════════════════════════════════════╗
# 3) PROCESS MARKDOWN FILES                                          #
# ╚══════════════════════════════════════════════════════════════════╝
md_files = list(UPLOADED_MD.glob("*.md"))
if not md_files:
    log("ℹ️  No markdown files in /uploaded – nothing to sync.")
else:
    for md in md_files:
        stem = md.stem
        folder_id = ensure_subfolder(PROCESSED_ID, stem)
        log(f"📂 Drive subfolder '{stem}' (id {folder_id})")

        txt_path    = TRANS_DIR  / f"{stem}.txt"
        parsed_path = PARSED_DIR / f"{stem}_parsed.txt"

        for p in (txt_path, parsed_path, md):
            if p.exists():
                upload_file(p, folder_id)

        # Move corresponding audio (same base name, keep original extension)
        for audio_local in AUDIO_LOCAL.glob(f"{stem}.*"):
            if audio_local.is_file():
                move_audio(audio_local.name)
                break   # move only first match

        # Verify contents
        present = {f["name"] for f in drive.files().list(
            q=f"'{folder_id}' in parents and trashed=false",
            spaces="drive", fields="files(name)").execute()["files"]}
        expected = {txt_path.name, parsed_path.name, md.name}
        if expected - present:
            log(f"  ✖ missing {expected - present}")
        else:
            log("  ✅ files verified")

        md.unlink(missing_ok=True)   # remove local markdown

# ╔══════════════════════════════════════════════════════════════════╗
# 4) MOVE EXTRA AUDIO BASED ON LOCAL COPIES (if any)                 #
# ╚══════════════════════════════════════════════════════════════════╝
for audio_local in AUDIO_LOCAL.glob("*"):
    if audio_local.is_file():
        move_audio(audio_local.name)

# ╔══════════════════════════════════════════════════════════════════╗
# 5) CLEAN KAGGLE WORKSPACE (keep whisper_models)                    #
# ╚══════════════════════════════════════════════════════════════════╝
log("🧹 Cleaning /kaggle/working (keeping whisper_models)")
for item in WORKING.iterdir():
    if item.name == "whisper_models":
        continue
    try:
        shutil.rmtree(item) if item.is_dir() else item.unlink()
    except Exception as e:
        log(f"  ✖ could not delete {item}: {e}")
log("✅ All done")


[12:44:52] 📂 Drive subfolder '20230730 OpenHCI presentation' (id 1DzGdfx42hZ_Q8OEa6sWHrM0LPdntOaKx)
[12:44:53]   ↳ uploaded 20230730 OpenHCI presentation.txt
[12:44:54]   ↳ uploaded 20230730 OpenHCI presentation_parsed.txt
[12:44:56]   ↳ uploaded 20230730 OpenHCI presentation.md
[12:44:57]   ↳ moved 20230730 OpenHCI presentation.m4a → transcribed
[12:44:57]   ✅ files verified
[12:44:58] 📂 Drive subfolder '20230810 OpenHCI presentation feedback' (id 1CSfBnxnsWyQkf8rfDOk28RFVXbL45O1O)
[12:45:00]   ↳ uploaded 20230810 OpenHCI presentation feedback.txt
[12:45:01]   ↳ uploaded 20230810 OpenHCI presentation feedback_parsed.txt
[12:45:02]   ↳ uploaded 20230810 OpenHCI presentation feedback.md
[12:45:04]   ↳ moved 20230810 OpenHCI presentation feedback.m4a → transcribed
[12:45:04]   ✅ files verified
[12:45:05] 🧹 Cleaning /kaggle/working (keeping whisper_models)
[12:45:05] ✅ All done


In [35]:
# ╔══════════════════════════════════════════════════════════════════╗
#  EMAIL HACKMD LINKS  (only if something was uploaded)              #
# ╚══════════════════════════════════════════════════════════════════╝
from kaggle_secrets import UserSecretsClient
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.header import Header

# --- Skip entirely if no links were produced -----------------------
if not (globals().get("shared_links") and shared_links):
    print("[INFO] No uploaded Markdown links – skipping email step.")
else:
    # --- Retrieve secrets ------------------------------------------
    s = UserSecretsClient()
    email_user = s.get_secret("EMAIL_USER")
    email_pass = s.get_secret("EMAIL_PASS")
    email_to   = s.get_secret("EMAIL_TO")

    if not all([email_user, email_pass, email_to]):
        print("[WARN] Email secrets missing – email not sent.")
    else:
        # --- Build email body --------------------------------------
        subject = "📝 Your Uploaded HackMD Speech Summaries"
        body_lines = [
            "Hello,",
            "",
            "Your audio files were transcribed with Whisper and",
            "summarized using Gemini Flash 2.5. The summaries are now",
            "available on HackMD:",
            ""
        ] + [f"- {link['title']}: {link['url']}" for link in shared_links] + [
            "",
            "If you have questions just reply to this email.",
            "",
            "Best regards,",
            "Whisper-STT Bot"
        ]
        body = "\n".join(body_lines)

        # --- Compose & send ---------------------------------------
        msg             = MIMEMultipart()
        msg["From"]     = email_user
        msg["To"]       = email_to
        msg["Subject"]  = Header(subject, "utf-8")
        msg.attach(MIMEText(body, "plain", "utf-8"))

        try:
            with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
                server.login(email_user, email_pass)
                server.send_message(msg)
            print("[INFO] Email sent successfully.")
        except Exception as e:
            print(f"[ERROR] Email send failed: {e}")


[INFO] Email sent successfully.
