# Smart Meeting Assistant - WAV Transcript Processing
This notebook transcribes a meeting from a WAV file, performs speaker diarization, generates a summary, translates the content, and extracts action items.

In [7]:
import sys
import os

# Add the project root to sys.path so the `modules.*` imports below resolve.
# `__file__` is not defined in a notebook kernel, so the root is taken as the
# parent of the current working directory (this is what the previous
# dirname("__file__") expression evaluated to as well).
PROJECT_ROOT = os.path.abspath(os.pardir)
# Guard the append so re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import whisper
from pyannote.audio import Pipeline
from dotenv import load_dotenv
from modules.summarizer import generate_summary
from modules.translator import translate_text
from modules.ds_action_items import extract_action_items_with_deepseek

# Read variables from a local .env file (HF_TOKEN is looked up below).
load_dotenv()

# Load models
whisper_model = whisper.load_model("small")
hf_token = os.getenv("HF_TOKEN")
diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token)

# pyannote.audio can hand back None (after printing a hint) when the gated
# pipeline cannot be downloaded; fail here rather than with an opaque
# "'NoneType' object is not callable" when the pipeline is first used.
if diarization_pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization'. Check that HF_TOKEN is set "
        "and that the pipeline's user conditions have been accepted on Hugging Face."
    )

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\Oscar\.cache\torch\pyannote\models--pyannote--segmentation\snapshots\c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cpu. Bad things might happen unless you revert torch to 1.x.


In [8]:
def _best_overlapping_speaker(seg_start, seg_end, speaker_turns):
    """Return the diarization label of the turn overlapping a segment the longest.

    Args:
        seg_start: Segment start time (seconds).
        seg_end: Segment end time (seconds).
        speaker_turns: Iterable of dicts with "speaker", "start" and "end" keys.

    Returns:
        The "speaker" value of the turn with the largest strictly positive
        overlap, or None when no turn overlaps the segment. On ties the
        earliest turn in `speaker_turns` wins.
    """
    best_label = None
    best_overlap = 0.0
    for turn in speaker_turns:
        # Negative values mean the intervals are disjoint; they never beat 0.0.
        overlap = min(seg_end, turn["end"]) - max(seg_start, turn["start"])
        if overlap > best_overlap:
            best_label = turn["speaker"]
            best_overlap = overlap
    return best_label


def transcribe_with_diarization(wav_path, verbose=True):
    """Transcribe an audio file and prefix every segment with a speaker label.

    Uses the module-level `whisper_model` for transcription and
    `diarization_pipeline` for speaker turns, then assigns each transcribed
    segment to the speaker whose turn overlaps it the longest.

    Args:
        wav_path: Path to the audio file, passed to both models.
        verbose: When True (the default, matching the previous behaviour),
            print every transcribed segment and every diarization turn for
            debugging. The initial progress line is printed regardless.

    Returns:
        A list of strings of the form "[Speaker N] text", one per transcribed
        segment. N is assigned in order of first appearance. Segments that
        overlap no diarization turn share the raw label "Unknown", which is
        numbered like any other speaker.
    """
    print(f"🎧 Transcribing {wav_path}...")

    result = whisper_model.transcribe(wav_path)
    segments = result.get("segments", [])

    diarization = diarization_pipeline(wav_path)
    speaker_turns = [
        {"speaker": speaker, "start": turn.start, "end": turn.end}
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ]

    if verbose:
        print("\n--- Whisper Segments ---")
        for seg in segments:
            print(f"Whisper: {seg['start']:.2f}s - {seg['end']:.2f}s → {seg['text']}")

        print("\n--- PyAnnote Diarization ---")
        for turn in speaker_turns:
            print(f"PyAnnote: {turn['start']:.2f}s - {turn['end']:.2f}s → {turn['speaker']}")

    # Raw diarization label -> readable "Speaker N", numbered by first appearance.
    speaker_map = {}
    labeled_lines = []

    for seg in segments:
        raw_label = _best_overlapping_speaker(seg["start"], seg["end"], speaker_turns) or "Unknown"
        readable_speaker = speaker_map.setdefault(raw_label, f"Speaker {len(speaker_map) + 1}")
        labeled_lines.append(f"[{readable_speaker}] {seg['text'].strip()}")

    return labeled_lines


In [None]:
# Input recording, given relative to the notebook's working directory
wav_file = "assets/sample_meeting_5.wav"

# Run transcription and speaker labeling on the recording
transcript_lines = transcribe_with_diarization(wav_file)

# Display the labeled transcript, one segment per line
print("=== Transcript ===")
print(*transcript_lines, sep="\n")


🎧 Transcribing assets/sample_meeting_3.wav...


In [None]:
# Summarize the whole transcript (segments joined one per line)
summary = generate_summary("\n".join(transcript_lines))
print("=== Summary ===", summary, sep="\n")

=== Summary ===
Speaker 1 suggests to include emoji reactions and analytics tracking in the first version of the Q3 release. Speaker 2 proposes to use WebSockets for real-time updates. Speaker 3 proposes to send a draft of the spec by Friday for review.


In [None]:
# English -> French translation of the whole transcript (segments joined one per line)
translation = translate_text("\n".join(transcript_lines), src_lang="en", tgt_lang="fr")
print("=== Translation (French) ===", translation, sep="\n")



=== Translation (French) ===
[Speaker 1] Bienvenue à tout le monde. Finalisons les spécifications pour la version Q3. [Speaker 2] Pour la fonctionnalité de chat, je propose d'utiliser WebSockets pour les mises à jour en temps réel. [Speaker 3] Pour la fonctionnalité de chat, je propose d'utiliser WebSockets pour les mises à jour en temps réel. [Speaker 1] Je suggère d'inclure les réactions emoji dans la première version. [Speaker 1] Bonne idée. Quelqu'un peut-il aussi regarder le suivi analytique? [Speaker 2] Je vais le prendre.


In [None]:
# Pull action items out of the labeled transcript.
# NOTE(review): the summary and translation cells pass a newline-joined string,
# while this call passes the list of lines itself — confirm which form
# extract_action_items_with_deepseek expects.
action_items = extract_action_items_with_deepseek(transcript_lines)
print("=== Action Items ===", action_items, sep="\n")




=== Action Items ===
Speaker 1:
- Task: Finalize Q3 specs
- Task: Discuss emoji reactions
- Task: Consider analytics tracking

Speaker 2:
- Task: Propose WebSockets for chat feature
- Task: Implement chat feature using WebSockets
- Task: Ensure feature is completed by next sprint

Speaker 3:
- Task: Review chat feature proposal
- Task: Send out draft by Friday
- Task: Ensure draft is reviewed by Friday
