In [1]:
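# 🛠️ One-time setup (uncomment and run once; the later cells also import python-dotenv, google-generativeai and requests)
#%pip install -q --upgrade openai-whisper tqdm python-dotenv google-generativeai requests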

In [2]:
import datetime, shutil, sys
from pathlib import Path
from tqdm import tqdm
import whisper
import time

# === Path configuration (edit BASE_DIR if you like) ===
# All working folders of the pipeline live under this one project root.
BASE_DIR = Path(r"C:/Users/galen/Downloads/whisper-stt-project")
INBOX_DIR = BASE_DIR.joinpath("inbox")              # audio waiting to be transcribed
PROCESSED_DIR = BASE_DIR.joinpath("processed")      # audio moved here after transcription
TRANSCRIPTS_DIR = BASE_DIR.joinpath("transcripts")  # one .txt transcript per audio file
MODEL_CACHE_DIR = BASE_DIR.joinpath("models")       # download root for the Whisper weights

# File suffixes (lower-case, dot included) that are treated as audio.
AUDIO_EXT = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".webm"}

# Make sure the whole folder tree exists before anything reads or writes it.
for folder in [INBOX_DIR, PROCESSED_DIR, TRANSCRIPTS_DIR, MODEL_CACHE_DIR]:
    folder.mkdir(parents=True, exist_ok=True)

print("✅ Folder structure ready")

def _now():
    """Log-friendly timestamp."""
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

def _fmt_ts(seconds: float) -> str:
    """float seconds → HH:MM:SS.mmm string."""
    h, m = divmod(int(seconds), 3600)
    m, s = divmod(m, 60)
    ms   = int((seconds - int(seconds)) * 1000)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"

def save_transcript(result: dict, out_path: Path):
    """
    Dump the segments of a Whisper result into a UTF-8 text file.
    One line per segment, shaped as: [start → end] text
    """
    with open(out_path, "w", encoding="utf-8") as handle:
        handle.writelines(
            "[{} → {}] {}\n".format(
                _fmt_ts(segment["start"]),
                _fmt_ts(segment["end"]),
                segment["text"].strip(),
            )
            for segment in result["segments"]
        )


# Load the model once up front and report how long the load took.
load_started = time.time()
print(f"[{_now()}] Loading Whisper large-v3 …")
model = whisper.load_model(
    "large-v3",
    download_root=str(MODEL_CACHE_DIR),  # weights are stored under the project folder
)
load_seconds = time.time() - load_started
print(f"[{_now()}] Model ready (took {load_seconds:.1f} s)\n")


✅ Folder structure ready
[2025-05-28 03:57:16] Loading Whisper large-v3 …
[2025-05-28 03:57:28] Model ready (took 11.8 s)



In [3]:
# 👇 Set to "zh", "en", "ja", etc. or keep None for auto-detect
PREFERRED_LANGUAGE = "zh"

# Echo the choice so the cell output records which mode this run uses.
print(
    f"🌐 Preferred language set to '{PREFERRED_LANGUAGE}'"
    if PREFERRED_LANGUAGE
    else "🌐 Using Whisper's automatic language detection"
)

🌐 Preferred language set to 'zh'


In [4]:
# Collect the audio waiting in the inbox. Only regular files qualify, so a
# folder whose name happens to end in an audio suffix is never handed to
# Whisper. iterdir() yields entries in arbitrary order, so sort by name to keep
# the processing order (and the "(idx/total)" log) stable between runs.
audio_files = sorted(
    (p for p in INBOX_DIR.iterdir()
     if p.is_file() and p.suffix.lower() in AUDIO_EXT),
    key=lambda p: p.name,
)
total = len(audio_files)

if total == 0:
    print("📂 No audio in inbox. Drop files there and re-run this cell.")
    # sys.exit() raises SystemExit, so nothing below runs for an empty inbox.
    sys.exit()

# --- build kwargs (identical for every file, so built once) ---
kwargs = dict(word_timestamps=True, verbose=True)  # verbose prints segments live
if PREFERRED_LANGUAGE:
    kwargs["language"] = PREFERRED_LANGUAGE

for idx, audio in enumerate(audio_files, 1):
    print(f"\n[{_now()}] ▶️  ({idx}/{total})  {audio.name}")

    if PREFERRED_LANGUAGE:
        print(f"[{_now()}] 🔧 Forcing language = '{PREFERRED_LANGUAGE}'")
    else:
        print(f"[{_now()}] 🔍 Auto language detection")

    # --- transcribe (prints out as it goes) ---
    result = model.transcribe(str(audio), **kwargs)

    # --- save & move ---
    # The transcript is written first; the audio only leaves the inbox once
    # its transcript exists.
    out_txt = TRANSCRIPTS_DIR / f"{audio.stem}.txt"
    save_transcript(result, out_txt)
    print(f"[{_now()}] 📝 Saved transcript → {out_txt.name}")

    shutil.move(str(audio), PROCESSED_DIR / audio.name)
    print(f"[{_now()}] ✔️ Moved audio to processed/")

print("\n🎉 All files done!")



[2025-05-28 03:57:28] ▶️  (1/3)  20240927 新生出遊分享.m4a
[2025-05-28 03:57:28] 🔧 Forcing language = 'zh'




[00:00.000 --> 00:01.480] 已經PGY的
[00:01.480 --> 00:02.340] PGY就是
[00:03.200 --> 00:04.580] 一學期畢業之後
[00:04.580 --> 00:05.980] 過分科助研室
[00:05.980 --> 00:07.320] 有點像辭息的感覺
[00:08.960 --> 00:09.400] 好吧
[00:11.100 --> 00:13.200] 那你可以更多解釋
[00:13.200 --> 00:16.240] 剛才聽齊蓁
[00:16.740 --> 00:17.760] 看到采宜的分享
[00:17.760 --> 00:18.300] 我就覺得
[00:19.180 --> 00:20.680] 好懷念大學生活
[00:20.680 --> 00:22.920] 我會這樣建議
[00:23.360 --> 00:25.500] 在我看來就是大一到大六過來
[00:25.500 --> 00:26.640] 大思考 曲亂曲
[00:26.640 --> 00:27.740] 我都覺得超顯到
[00:27.740 --> 00:29.420] 先講一下今天經歷了什麼
[00:29.420 --> 00:32.180] 我早上七點去上班
[00:32.180 --> 00:33.880] 我在小兒心臟科
[00:33.880 --> 00:35.400] 所以今天有一台導管
[00:35.400 --> 00:36.400] 去辦房做導管
[00:36.400 --> 00:40.920] 小朋友就先天解剖異常
[00:40.920 --> 00:42.560] 所以一個小時的導管變四個小時
[00:42.560 --> 00:43.080] 所以我原本
[00:43.860 --> 00:45.060] 提早下導管
[00:45.060 --> 00:47.900] 做PBT的算盤就打壞了
[00:47.900 --> 00:50.440] 接下來我想說下午怎麼可以做PBT
[00:50.440 --> 00:52.800] 結果就突然接到一個學長的電話
[00:52.800 --> 00:54.300] 他說學弟我等一下要跟你交班
[00:54.300 --> 00



[00:07.580 --> 00:27.980] 字幕由 Amara.org 社群提供
[00:30.000 --> 00:30.740] 介紹一下我
[00:30.740 --> 00:32.960] 然後這就是我看起來最像醫生的時候
[00:32.960 --> 00:33.880] 現在超級不像
[00:33.880 --> 00:36.800] 然後就是我會這樣介紹我自己
[00:36.800 --> 00:37.540] 就是我覺得就是
[00:37.540 --> 00:38.880] 我是一個基督徒
[00:38.880 --> 00:39.860] 所以這邊應該有一些
[00:39.860 --> 00:42.680] 就是信仰愛團系的人就知道
[00:42.680 --> 00:44.400] 然後我也是MD
[00:44.400 --> 00:45.300] 就是Medical Doctor
[00:45.300 --> 00:46.560] 然後我喜歡科技
[00:46.560 --> 00:49.220] 然後我喜歡航空
[00:49.220 --> 00:52.380] 然後就是持續地在尋找人生的意義
[00:52.380 --> 00:55.820] 那我的經歷大概是這樣
[00:55.820 --> 00:59.340] 就是我高中畢業是桃園市的司儀大華高中
[00:59.340 --> 00:59.920] 不知道有什麼問題
[00:59.920 --> 01:01.000] 我經過就是一個社區高中
[01:01.000 --> 01:04.800] 然後我是姚明一三級畢業的
[01:04.800 --> 01:06.080] 所以這邊應該有一些學弟妹
[01:06.080 --> 01:10.200] 然後我上個月我是在台北榮總當
[01:10.200 --> 01:11.420] 部分科住院醫師
[01:11.420 --> 01:12.620] 現在流失行星了
[01:13.060 --> 01:14.800] 然後我是第260T的
[01:14.800 --> 01:16.840] 這個衛福部醫事室的替代醫藝男
[01:16.840 --> 01:17.140] 就這樣子
[01:18.020 --> 01:20.460] 

In [5]:
import re
from datetime import timedelta
from pathlib import Path

# Set BASE_DIR and folders
BASE_DIR = Path(r"C:/Users/galen/Downloads/whisper-stt-project")
transcripts_dir = BASE_DIR.joinpath("transcripts")  # input: raw transcript .txt files
parsed_dir = BASE_DIR.joinpath("parsed")            # output: regrouped transcripts
parsed_dir.mkdir(exist_ok=True)

# Matches one transcript line of the form "[HH:MM:SS.mmm → HH:MM:SS.mmm] text".
# Groups: 1 = start stamp, 2 = end stamp, 3 = everything after the bracket.
timestamp_pattern = re.compile(
    r"\[(\d{2}:\d{2}:\d{2}\.\d{3})\s*→\s*(\d{2}:\d{2}:\d{2}\.\d{3})\]\s*(.*)"
)

def parse_time(s):
    """Convert an ``HH:MM:SS.mmm`` string into a ``timedelta``."""
    hours, minutes, seconds_field = s.split(":")
    whole_seconds, millis = seconds_field.split(".")
    return timedelta(
        hours=int(hours),
        minutes=int(minutes),
        seconds=int(whole_seconds),
        milliseconds=int(millis),
    )

def process_transcript(text):
    """Regroup a timestamped transcript into one paragraph per 5-minute window.

    Lines that do not match ``timestamp_pattern`` are ignored. Each window
    that contains at least one segment is rendered as the start stamp of its
    first segment in square brackets, followed by the window's segment texts
    joined with spaces, followed by a blank line.

    Returns:
        The regrouped text, or ``""`` when no line matched.
    """
    parsed = []
    for raw_line in text.splitlines():
        hit = timestamp_pattern.match(raw_line)
        if hit is None:
            continue
        begin_str, _end_str, body = hit.groups()
        parsed.append((parse_time(begin_str), begin_str, body.strip()))
    if not parsed:
        return ""  # No matches found

    # Each entry is (marker, [segment texts]); a new entry starts whenever
    # the 300-second window index changes.
    windows = []
    active_window = None
    for offset, begin_str, body in parsed:
        window = int(offset.total_seconds() // 300)  # every 5 minutes (300 sec)
        if active_window is None or window != active_window:
            windows.append((f"[{begin_str}]", []))
            active_window = window
        windows[-1][1].append(body)

    # Render as: [timestamp]\n<text>\n\n
    rendered = []
    for marker, pieces in windows:
        paragraph = "".join(piece + " " for piece in pieces).strip()
        rendered.extend((marker, paragraph, ""))
    return "\n".join(rendered).strip()

# ---- Batch processing ----
# Regroup every raw transcript and write the result to parsed/ as
# "<stem>_parsed.txt". The name is built from the stem rather than with
# str.replace(), which would rewrite every ".txt" inside the file name and
# leave an upper-case ".TXT" suffix untouched. Sorting keeps the log order
# stable between runs.
for txtfile in sorted(transcripts_dir.glob("*.txt")):
    text = txtfile.read_text(encoding="utf-8")
    processed = process_transcript(text)
    out_path = parsed_dir / f"{txtfile.stem}_parsed.txt"
    out_path.write_text(processed, encoding="utf-8")
    print(f"Processed {txtfile.name} -> {out_path.name}")


Processed 20240927 新生出遊分享.txt -> 20240927 新生出遊分享_parsed.txt
Processed 20241124 ASD分享.txt -> 20241124 ASD分享_parsed.txt
Processed 20250131 在家分享.txt -> 20250131 在家分享_parsed.txt
Processed 20250519 MedEdBot 發表.txt -> 20250519 MedEdBot 發表_parsed.txt
Processed 20250521 社醫 醫師公會.txt -> 20250521 社醫 醫師公會_parsed.txt
Processed 錄製 (5).txt -> 錄製 (5)_parsed.txt


In [6]:
# Instruction text for the summarizer. generate_summary_with_gemini() strips it
# and places it in front of the transcript, separated by a blank line.
# The string is not raw, so each "\\[" below is a single backslash followed by
# "[" in the text that is actually sent.
SYSTEM_PROMPT = """
## System Prompt

You are tasked with summarizing a speech provided in its original language. Create a concise, structured summary using clear and informative markdown formatting. Follow the outline and format precisely. You may use markdown tables, bullet points, paragraphs, or a combination as appropriate.

## Summary Structure

### Title

Provide a concise, relevant title reflecting the key theme or message of the speech.
Please make sure the title starts with a # to be recognized as title.

# Title

### Speaker

* **Name**: \\[Speaker's Name]
* **Affiliation/Role**: \\[Speaker’s Affiliation or Role, if known]
* **Event**: \\[Event or occasion where the speech was given, if applicable]
* **Date**: \\[Date of the speech, if available]

### Overview

Provide a short paragraph summarizing the overall purpose and main points of the speech.

### Key Points

Summarize each major point clearly. You may use markdown tables, bullet points, or paragraphs as needed:

* **Key Point 1**: Brief description with supporting details.
* **Key Point 2**: Brief description with supporting details.
* Additional points as necessary.

Or alternatively, use a markdown table.

### Notable Quotes

Include one or two significant quotes from the speech, if available, highlighting central themes or key statements made by the speaker:

* *"Quote 1..."*
* *"Quote 2..."*

### Audience Reaction

Briefly describe audience reactions, if mentioned or apparent (e.g., applause, questions raised, notable silence).

### Conclusion

Summarize briefly how the speaker concluded their speech and highlight any key takeaway messages.

---

Ensure clarity, accuracy, and conciseness in the summary, preserving essential context and meaning.
Please summarize using the native language of the speech.

"""


In [7]:
import os
from pathlib import Path
from dotenv import load_dotenv
import google.generativeai as genai

# Load .env from BASE_DIR
BASE_DIR = Path(r"C:/Users/galen/Downloads/whisper-stt-project")
load_dotenv(dotenv_path=BASE_DIR.joinpath(".env"))

# The key is read from the environment, which load_dotenv() filled from .env.
api_key = os.getenv("GEMINI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("GEMINI_API_KEY not found in .env file!")

# Configure Gemini API
genai.configure(api_key=api_key)

# Input folder (regrouped transcripts) and output folder (Markdown summaries).
PARSED_DIR = BASE_DIR.joinpath("parsed")
MARKDOWN_DIR = BASE_DIR.joinpath("markdown")

def generate_summary_with_gemini(
    speech_text,
    system_prompt,
    model_name="gemini-2.5-flash-preview-05-20",
    temperature=0.5,
):
    """Ask a Gemini model to summarize one transcript.

    The stripped ``system_prompt`` and the stripped ``speech_text`` are joined
    with a blank line and sent as a single, non-streaming prompt.

    Args:
        speech_text: Transcript text to summarize.
        system_prompt: Instructions placed in front of the transcript.
        model_name: Model identifier passed to ``genai.GenerativeModel``.
            Exposed as a parameter so the model can be swapped without
            editing this function; the default is the model the notebook
            has always used.
        temperature: Sampling temperature for the generation config.

    Returns:
        The response text, or ``None`` when the request or reading the
        response raised (the error is printed, not re-raised, so a batch
        caller can skip the file and continue).
    """
    model = genai.GenerativeModel(model_name)
    full_prompt = system_prompt.strip() + "\n\n" + speech_text.strip()
    try:
        response = model.generate_content(
            full_prompt,
            generation_config=genai.types.GenerationConfig(
                temperature=temperature,
            ),
            stream=False,
        )
        # Accessing .text stays inside the try so a failure here is reported
        # the same way as a failed request.
        return response.text
    except Exception as e:
        print(f"[ERROR] Gemini API error: {e}")
        return None

def process_all_txt_files(parsed_dir, markdown_dir, system_prompt):
    """Summarize every ``*.txt`` in ``parsed_dir`` into ``markdown_dir``.

    For each transcript the text is read, stripped, passed to
    ``generate_summary_with_gemini`` together with ``system_prompt``, and the
    returned summary is written to ``<stem>.md``. Unreadable files, empty
    files, failed summaries and failed writes are reported with ``print`` and
    do not stop the batch.

    Args:
        parsed_dir: Folder holding the transcripts (str or Path).
        markdown_dir: Folder for the summaries; created if missing.
        system_prompt: Instructions forwarded to the summarizer.
    """
    source_root = Path(parsed_dir)
    target_root = Path(markdown_dir)
    target_root.mkdir(exist_ok=True)

    candidates = list(source_root.glob("*.txt"))
    print(f"[DEBUG] Found {len(candidates)} .txt files in {source_root}")

    for transcript_path in candidates:
        print(f"\n[DEBUG] Processing: {transcript_path.name}")

        try:
            speech_text = transcript_path.read_text(encoding="utf-8").strip()
        except Exception as e:
            print(f"[ERROR] Could not read {transcript_path}: {e}")
            continue

        if speech_text == "":
            print(f"[WARNING] {transcript_path.name} is empty, skipping.")
            continue

        summary_md = generate_summary_with_gemini(speech_text, system_prompt)
        if summary_md is None:
            print(f"[ERROR] Gemini API failed for {transcript_path.name}, skipping.")
            continue

        md_path = target_root / f"{transcript_path.stem}.md"
        try:
            with open(md_path, "w", encoding="utf-8") as sink:
                sink.write(summary_md)
        except Exception as e:
            print(f"[ERROR] Could not save {md_path}: {e}")
        else:
            print(f"[INFO] Saved summary to {md_path}")

# Run the batch: summarize everything in parsed/ into markdown/.
process_all_txt_files(
    parsed_dir=PARSED_DIR,
    markdown_dir=MARKDOWN_DIR,
    system_prompt=SYSTEM_PROMPT,
)


[DEBUG] Found 6 .txt files in C:\Users\galen\Downloads\whisper-stt-project\parsed

[DEBUG] Processing: 20240927 新生出遊分享_parsed.txt
[INFO] Saved summary to C:\Users\galen\Downloads\whisper-stt-project\markdown\20240927 新生出遊分享_parsed.md

[DEBUG] Processing: 20241124 ASD分享_parsed.txt
[INFO] Saved summary to C:\Users\galen\Downloads\whisper-stt-project\markdown\20241124 ASD分享_parsed.md

[DEBUG] Processing: 20250131 在家分享_parsed.txt
[INFO] Saved summary to C:\Users\galen\Downloads\whisper-stt-project\markdown\20250131 在家分享_parsed.md

[DEBUG] Processing: 20250519 MedEdBot 發表_parsed.txt
[INFO] Saved summary to C:\Users\galen\Downloads\whisper-stt-project\markdown\20250519 MedEdBot 發表_parsed.md

[DEBUG] Processing: 20250521 社醫 醫師公會_parsed.txt
[INFO] Saved summary to C:\Users\galen\Downloads\whisper-stt-project\markdown\20250521 社醫 醫師公會_parsed.md

[DEBUG] Processing: 錄製 (5)_parsed.txt
[INFO] Saved summary to C:\Users\galen\Downloads\whisper-stt-project\markdown\錄製 (5)_parsed.md


In [8]:
import os
from pathlib import Path
import shutil
import requests
from dotenv import load_dotenv

# Set your paths
BASE_DIR = Path(r"C:/Users/galen/Downloads/whisper-stt-project")
MARKDOWN_DIR = BASE_DIR.joinpath("markdown")   # summaries waiting to be uploaded
UPLOADED_DIR = BASE_DIR.joinpath("uploaded")   # summaries that have been uploaded
UPLOADED_DIR.mkdir(exist_ok=True)

# Load your HackMD API token
load_dotenv(dotenv_path=BASE_DIR.joinpath(".env"))
hackmd_token = os.getenv("HACKMD_TOKEN")
if hackmd_token is None or hackmd_token == "":
    raise ValueError("HACKMD_TOKEN not found in .env file!")

def upload_to_hackmd(md_content, filename, api_token, timeout=30):
    """Create a HackMD note from one Markdown summary.

    The note title is derived from ``filename`` (``.md`` suffix and
    ``_parsed`` removed, underscores turned into spaces). The first line of
    the content is forced to be ``# <title>``: an existing level-1 heading is
    replaced, otherwise the heading is inserted. The tag line
    ``#whisper-stt-project`` is appended unless it already is one of the last
    three lines.

    Args:
        md_content: Markdown text of the summary.
        filename: File name the summary came from; used for the title.
        api_token: HackMD API token sent as a Bearer token.
        timeout: Seconds to wait for HackMD before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        ``{"title": ..., "url": ...}`` on success, or ``None`` when the
        request could not be completed or HackMD answered with an error
        status (the reason is printed).
    """
    # Clean up title: remove '_parsed', '.md', and prettify
    if filename.endswith('.md'):
        filename = filename[:-3]
    raw_title = filename.replace('_parsed', '').strip()
    title = raw_title.replace('_', ' ').strip()  # Prettify for heading

    # Ignore any initial blank lines when looking for the heading
    body = md_content.lstrip()
    md_lines = body.splitlines()

    # Replace an existing level-1 heading with our title; otherwise insert one
    if md_lines and md_lines[0].strip().startswith("# "):
        md_lines[0] = f"# {title}"
        md_content = "\n".join(md_lines)
    else:
        md_content = f"# {title}\n\n" + body

    # Add tag hashtag at the end (if not already present as its own line)
    hashtag = "#whisper-stt-project"
    content_lines = md_content.rstrip().splitlines()
    if not any(line.strip() == hashtag for line in content_lines[-3:]):  # Look at last few lines
        md_content = md_content.rstrip() + "\n\n" + hashtag + "\n"

    url = "https://api.hackmd.io/v1/notes"
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json"
    }
    data = {
        "title": title,
        "content": md_content,
        "readPermission": "guest",
        "writePermission": "signed_in"
    }
    try:
        response = requests.post(url, headers=headers, json=data, timeout=timeout)
    except requests.RequestException as e:
        # A network failure is reported like any other failed upload so the
        # batch caller can move on to the next file.
        print(f"[ERROR] HackMD upload failed for {filename}: {e}")
        return None

    if response.ok:
        note_id = response.json().get("id")
        shared_url = f"https://hackmd.io/{note_id}"
        print(f"[INFO] Uploaded to HackMD: {shared_url}")
        return {"title": title, "url": shared_url}
    else:
        print(f"[ERROR] HackMD upload failed for {filename}: {response.status_code} {response.text}")
        return None

def batch_upload_markdown_and_move(markdown_dir, uploaded_dir, hackmd_token):
    """Upload every ``*.md`` in ``markdown_dir`` and archive the successes.

    Each file is read and handed to ``upload_to_hackmd``. When the upload
    returns a result, that result is collected and the file is moved into
    ``uploaded_dir``; a failed move is reported but the link is still kept.
    Files that cannot be read or uploaded stay where they are.

    Args:
        markdown_dir: Folder with the summaries to upload (str or Path).
        uploaded_dir: Archive folder for uploaded files; created if missing.
        hackmd_token: API token forwarded to ``upload_to_hackmd``.

    Returns:
        List of the results returned by the successful uploads.
    """
    source_root = Path(markdown_dir)
    archive_root = Path(uploaded_dir)
    archive_root.mkdir(exist_ok=True)

    pending = list(source_root.glob("*.md"))
    print(f"[DEBUG] Found {len(pending)} markdown files to upload.")

    shared_links = []
    for md_file in pending:
        print(f"[DEBUG] Processing: {md_file.name}")
        try:
            md_content = md_file.read_text(encoding="utf-8")
        except Exception as e:
            print(f"[ERROR] Could not read {md_file.name}: {e}")
            continue

        result = upload_to_hackmd(md_content, md_file.name, hackmd_token)
        if not result:
            continue  # upload failed; leave the file in place

        shared_links.append(result)
        dest_file = archive_root / md_file.name
        try:
            shutil.move(str(md_file), dest_file)
        except Exception as e:
            print(f"[ERROR] Failed to move {md_file.name}: {e}")
        else:
            print(f"[INFO] Moved {md_file.name} to {dest_file}")
    return shared_links

# Run the batch uploader; the returned links are used by the email cell below.
shared_links = batch_upload_markdown_and_move(
    markdown_dir=MARKDOWN_DIR,
    uploaded_dir=UPLOADED_DIR,
    hackmd_token=hackmd_token,
)


[DEBUG] Found 6 markdown files to upload.
[DEBUG] Processing: 20240927 新生出遊分享_parsed.md
[INFO] Uploaded to HackMD: https://hackmd.io/lYzqXq9tRomnISSAqQmo_w
[INFO] Moved 20240927 新生出遊分享_parsed.md to C:\Users\galen\Downloads\whisper-stt-project\uploaded\20240927 新生出遊分享_parsed.md
[DEBUG] Processing: 20241124 ASD分享_parsed.md
[INFO] Uploaded to HackMD: https://hackmd.io/PyPkxjj3SvugR7egSMSrYg
[INFO] Moved 20241124 ASD分享_parsed.md to C:\Users\galen\Downloads\whisper-stt-project\uploaded\20241124 ASD分享_parsed.md
[DEBUG] Processing: 20250131 在家分享_parsed.md
[INFO] Uploaded to HackMD: https://hackmd.io/-n8mYvkNQuOQaHfbMiaSyw
[INFO] Moved 20250131 在家分享_parsed.md to C:\Users\galen\Downloads\whisper-stt-project\uploaded\20250131 在家分享_parsed.md
[DEBUG] Processing: 20250519 MedEdBot 發表_parsed.md
[INFO] Uploaded to HackMD: https://hackmd.io/Jgfdb7bISU2jzzRvpCDXAA
[INFO] Moved 20250519 MedEdBot 發表_parsed.md to C:\Users\galen\Downloads\whisper-stt-project\uploaded\20250519 MedEdBot 發表_parsed.md
[DEBUG] 

In [9]:
from dotenv import load_dotenv
import os
from pathlib import Path
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.header import Header

# Ensure .env is loaded
BASE_DIR = Path(r"C:/Users/galen/Downloads/whisper-stt-project")
load_dotenv(dotenv_path=BASE_DIR / ".env")

email_user = os.environ.get("EMAIL_USER")
email_pass = os.environ.get("EMAIL_PASS")
email_to = os.environ.get("EMAIL_TO")  # Add this to .env or set manually

if not (email_user and email_pass and email_to):
    raise ValueError("EMAIL_USER, EMAIL_PASS, or EMAIL_TO not found in .env file!")

subject = "📝 Your Uploaded HackMD Speech Summaries"
body_lines = [
    "Hello,",
    "",
    "Your audio files have been automatically transcribed using Whisper AI,",
    "and the speech content was summarized using Gemini Flash 2.5.",
    "",
    "Here are the links to your uploaded speech summaries on HackMD:",
    ""
]

# One bullet per note returned by the HackMD upload cell (shared_links).
body_lines += [f"- {link['title']}: {link['url']}" for link in shared_links]

body_lines += [
    "",
    "All documents are shared and accessible to anyone with the link.",
    "",
    "If you have any questions or encounter any problems, feel free to reply to this email!",
    "",
    "Best regards,",
    "Whisper-STT-Project Bot"
]

body = "\n".join(body_lines)

if shared_links:
    # Build the email
    msg = MIMEMultipart()
    msg['From'] = email_user
    msg['To'] = email_to
    msg['Subject'] = Header(subject, 'utf-8')
    msg.attach(MIMEText(body, 'plain', 'utf-8'))

    # Send via SMTP (Gmail SMTP by default)
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
        server.login(email_user, email_pass)
        server.send_message(msg)
        print("[INFO] Email sent successfully.")
else:
    # Nothing was uploaded, so the message would announce links and list
    # none; skip sending instead.
    print("[INFO] No HackMD links to report; email not sent.")


[INFO] Email sent successfully.
