In [None]:
from audio_extractor.audio_processor import AudioExtractor, AudioChunker, VideoToTextPipeline

video_path = "/Users/youssefjanjar/Documents/formascience/volume/cours_1.mp4"
output_path = "/Users/youssefjanjar/Documents/formascience/volume/transcripts"



audio_extractor = AudioExtractor()
audio_extractor.extract_audio(video_path, output_path)

2025-07-27 15:48:19 - audio_extractor.audio_processor - INFO - Extracting audio from /Users/youssefjanjar/Documents/formascience/volume/cours_1.mp4
2025-07-27 15:48:23 - audio_extractor.audio_processor - INFO - Audio extracted successfully: /Users/youssefjanjar/Documents/formascience/volume/transcripts/cours_1_audio.wav


'/Users/youssefjanjar/Documents/formascience/volume/transcripts/cours_1_audio.wav'

In [None]:
from audio_extractor.audio_processor import AudioChunker
import os

audio_chunker = AudioChunker()
audio_path = "/Users/youssefjanjar/Documents/formascience/volume/audio/cours_1_audio.wav"
output_path = "/Users/youssefjanjar/Documents/formascience/volume/transcripts"

# Initialize chunker
chunker = AudioChunker()

# Split audio into 3-minute chunks
# This creates a folder named "audio_filename_chunks" automatically
chunk_paths = chunker.split_audio(
    audio_path, 
    chunk_duration_minutes=3.0  # 3-minute chunks
)

print(f"Created {len(chunk_paths)} chunks")
print(f"Chunks saved in: {os.path.dirname(chunk_paths[0]) if chunk_paths else 'N/A'}")

# Show first few chunk paths
for i, chunk in enumerate(chunk_paths[:3], 1):
    print(f"  Chunk {i}: {os.path.basename(chunk)}")

2025-07-27 15:59:51 - audio_extractor.audio_processor - INFO - Splitting audio into 3.0 minute chunks
2025-07-27 15:59:51 - audio_extractor.audio_processor - INFO - Output directory: /Users/youssefjanjar/Documents/formascience/volume/audio/cours_1_audio_chunks
2025-07-27 15:59:52 - audio_extractor.audio_processor - INFO - Created 25 audio chunks


Created 25 chunks
Chunks saved in: /Users/youssefjanjar/Documents/formascience/volume/audio/cours_1_audio_chunks
  Chunk 1: cours_1_audio_chunk_001.wav
  Chunk 2: cours_1_audio_chunk_002.wav
  Chunk 3: cours_1_audio_chunk_003.wav


# Transcription with timestamps

In [2]:
# Your specific use case - French academic transcription
from audio_extractor import WhisperTranscriber

# Academic French transcription prompt
prompt = """
Transcription académique d'un cours de médecine universitaire, en français.
Le locuteur est un professeur qui commente des diapositives.
Utilisez la terminologie médicale appropriée.
Indiquez les pauses importantes par des points.
Privilégiez la clarté.
"""

# Your audio file path
audio_path = "/Users/youssefjanjar/Documents/formascience/volume/audio/cours_1_audio_chunks/cours_1_audio_chunk_001.wav"

from audio_extractor import WhisperTranscriber

transcriber = WhisperTranscriber()

# French academic transcription with timestamp parsing
result = transcriber.transcribe_with_timestamps(
    audio_path=audio_path,
    language="fr",
    prompt=prompt,
    timestamp_granularities=["segment"]
)

print(f"✅ JSON saved to: {result['json_file']}")
print(f"✅ Parsed timestamps saved to: {result['parsed_file']}")

2025-07-27 16:52:05 - audio_extractor.transcription.whisper_client - INFO - Transcribing with timestamps: /Users/youssefjanjar/Documents/formascience/volume/audio/cours_1_audio_chunks/cours_1_audio_chunk_001.wav
2025-07-27 16:52:17 - audio_extractor.transcription.whisper_client - INFO - Transcription saved: /Users/youssefjanjar/Documents/formascience/volume/transcripts/json/cours_1_audio_chunk_001_transcription.json
2025-07-27 16:52:17 - audio_extractor.transcription.whisper_client - INFO - JSON transcription saved to: /Users/youssefjanjar/Documents/formascience/volume/transcripts/json/cours_1_audio_chunk_001_transcription.json
2025-07-27 16:52:17 - audio_extractor.transcription.whisper_client - INFO - Parsed timestamps saved to: /Users/youssefjanjar/Documents/formascience/volume/transcripts/parsed/cours_1_audio_chunk_001_timestamps.txt


✅ JSON saved to: /Users/youssefjanjar/Documents/formascience/volume/transcripts/json/cours_1_audio_chunk_001_transcription.json
✅ Parsed timestamps saved to: /Users/youssefjanjar/Documents/formascience/volume/transcripts/parsed/cours_1_audio_chunk_001_timestamps.txt


# PDF extraction


# Parsing the text into sections

In [None]:
"""
PDF ➜ deck ➜ slides ➜ nested bullet tree
– keeps banner filtering
– detects slide titles via biggest font size near the top
– uses REAL x‑position of the first bullet glyph for inline children
"""

import os, re, json, pdfplumber, unicodedata, string
from dotenv import load_dotenv
from typing import List, Dict, Tuple
from pydantic import BaseModel, Field
from openai import OpenAI

load_dotenv()       # OPENAI_API_KEY in .env

# ----------------------------------------------------------------------
# Config
# ----------------------------------------------------------------------
BULLET_CHARS = "•◦‣▪–"
BANNER_RX    = re.compile(r"^L\\d?SpS:")
INDENT_TOL   = 5                 # px – cluster threshold

# ----------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------
def slide_title(words, top_cutoff=150, use_biggest_font=1):
    if not words: 
        return ""
    
    # If flag is 1, find the biggest font string from all words
    if use_biggest_font == 1:
        max_size = max(w["size"] for w in words)
        biggest_font_words = sorted([w for w in words 
                                   if abs(w["size"] - max_size) < 0.5],
                                  key=lambda w: w["x0"])
        return " ".join(w["text"] for w in biggest_font_words)
    
    # Original logic for when flag is not 1
    top_words = [w for w in words if w["top"] < top_cutoff]
    if not top_words: 
        return ""
    max_size = max(w["size"] for w in top_words)
    title_words = sorted([w for w in top_words
                          if abs(w["size"] - max_size) < 0.5],
                         key=lambda w: w["x0"])
    return " ".join(w["text"] for w in title_words)

# ----------------------------------------------------------------------
# 1. PDF ➜ deck
# ----------------------------------------------------------------------
def extract_lines(pdf_path: str):
    deck, banner_hits = [], {}

    with pdfplumber.open(pdf_path) as pdf:
        for idx, page in enumerate(pdf.pages, 1):
            words = page.extract_words(use_text_flow=True, extra_attrs=["size"])
            rows: Dict[float, List[dict]] = {}
            for w in words:
                txt = w["text"].strip()
                if BANNER_RX.match(txt):
                    banner_hits[txt] = banner_hits.get(txt, 0) + 1
                    continue
                y = round(w["top"], 1)
                rows.setdefault(y, []).append(w)

            lines = []
            for y in sorted(rows):
                row_words = sorted(rows[y], key=lambda w: w["x0"])
                x0 = row_words[0]["x0"]
                txt = " ".join(w["text"] for w in row_words)
                txt = re.sub(rf"\s*{idx}\s*$", "", txt).strip()
                if txt:
                    lines.append((x0, txt, row_words))   # keep row_words

            deck.append({
                "page":  idx,
                "title": slide_title(words),
                "lines": lines                          # (x0, txt, row_words)
            })

    banner_texts = {t for t,c in banner_hits.items() if c/len(deck) >= 0.8}
    for sl in deck:
        sl["lines"] = [(x,t,rw) for x,t,rw in sl["lines"] if t not in banner_texts]

    return deck

# ----------------------------------------------------------------------
# 2. deck ➜ slides  (explode inline bullets)
# ----------------------------------------------------------------------
def explode_line(x0, line, row_words, page_idx):
    """
    If the row is “Parent • child • child”, use x‑pos of the first bullet
    glyph for all children so that INTRO and “Le génome humain” share the
    SAME indent.
    """
    line = re.sub(rf"\s*{page_idx}\s*$", "", line).strip()
    if "•" not in line:
        return [{"indent": x0, "text": line}]

    head, tail = line.split("•", 1)
    head = head.strip()
    out  = []

    # parent node (if any)
    if head:
        out.append({"indent": x0, "text": head})

    bullet_x = first_bullet_x(row_words) or x0
    children = [frag.strip() for frag in tail.split("•") if frag.strip()]
    out.extend({"indent": bullet_x, "text": c} for c in children)
    return out

def split_bullets(deck):
    slides = []
    for sl in deck:
        bullets = []
        for x, txt, row in sl["lines"]:
            bullets.extend(explode_line(x, txt, row, sl["page"]))
        slides.append({"page": sl["page"], "title": sl["title"], "bullets": bullets})
    return slides

# ----------------------------------------------------------------------
# 3. bullets ➜ nested tree
# ----------------------------------------------------------------------
def bullet_tree(slide: dict, tol=INDENT_TOL):
    bullets = slide["bullets"]
    if not bullets:
        return {"page": slide["page"], "title": slide["title"], "tree": []}

    indents = sorted({b["indent"] for b in bullets})
    # cluster indents
    groups, cur = [], [indents[0]]
    for x in indents[1:]:
        if x - cur[-1] <= tol: cur.append(x)
        else: groups.append(cur); cur=[x]
    groups.append(cur)
    depth = {x:i for i,g in enumerate(groups) for x in g}

    root, stack = [], []
    for b in bullets:
        lvl = depth[b["indent"]]
        node = {"text": b["text"], "children": []}
        while len(stack) > lvl: stack.pop()
        (root if not stack else stack[-1]["children"]).append(node)
        stack.append(node)

    return {"page": slide["page"], "title": slide["title"], "tree": root}

# ----------------------------------------------------------------------
# demo run
# ----------------------------------------------------------------------
PDF = "./volume/slides/cours_1.pdf"      # adjust path

deck   = extract_lines(PDF)
slides = split_bullets(deck)
trees  = [bullet_tree(sl) for sl in slides]

# print the first two slide trees for inspection
from pprint import pprint
pprint(trees, width=120, sort_dicts=False)


# Phase 1: Extract outline and slides

In [139]:
course_plan = trees[1]
trees.pop(1)  # Remove page 1 (course plan)
course_content = trees

pprint(course_plan, width=120, sort_dicts=False)

{'page': 2,
 'title': 'Plan du cours',
 'tree': [{'text': 'L1SpS: UE 2 Les molécules du vivant',
           'children': [{'text': 'Plan du cours',
                         'children': [{'text': 'Introduction',
                                       'children': [{'text': 'Notions fondamentales', 'children': []}]},
                                      {'text': 'Le génome humain',
                                       'children': [{'text': 'Projet de séquençage', 'children': []},
                                                    {'text': 'Architecture globale',
                                                     'children': [{'text': 'Génome mitochondrial', 'children': []},
                                                                  {'text': 'Génome nucléaire', 'children': []}]},
                                                    {'text': 'Description des principaux éléments constituants',
                                                     'children': [{'text': 'Gènes (codan

In [161]:
import importlib
import course

from course import Outline


# Reload the helper module to get the latest changes
importlib.reload(course)

from course import Outline, Course, Section, SlideMapping, ContentSection, Content

In [162]:



# Reload the helper module to get the latest changes
importlib.reload(course)

# -------------------- PHASE 1: OUTLINE CREATION --------------------
PHASE_1_SYSTEM_PROMPT = """
You are a course structure analyzer. Your task is to create a hierarchical outline from course slides and course plan.

## INPUT STRUCTURE:
You will receive TWO inputs:

### Assistant Message - Course Plan:
A hierarchical course plan extracted from slides, like:
```json
{
  "course_plan": {
    "title": "Plan du cours | Agenda du cours | ...",
    "tree": [
      {
        "text": "Cours sur la biologie",
        "children": [
          {
            "text": "Plan du cours", 
            "children": [
              {
                "text": "Section 1",
                "children": [
                  {"text": "Sous section 1.1", "children": []}
                ]
              },
              {
                "text": "Section 2",
                "children": [
                  {"text": "Sous section 2.1", "children": []},
                  {"text": "Sous section 2.2", "children": []}
                ]
              }
            ]
          }
        ]
      }
    ]
  }
}
```

### User Message - Course Content:
All slides including the plan slide:
```json
{
  "course_content": [
    {
      "page": 1,
      "title": "Introduction to Machine Learning", 
      "tree": "extracted content from slide...",
      "images": ["path1.png", "path2.jpg"]
    },
    {
      "page": 2,
      "title": "Plan du cours",
      "tree": [...], // The hierarchical plan structure
      "images": ["diagram.png"]
    }
  ]
}
```

## OUTPUT STRUCTURE:
Return ONLY valid JSON matching this exact schema:
```json
{
  "sections": [
    {
      "title": "Introduction",
      "content": [],  // ALWAYS EMPTY
      "subsections": [
        {
          "title": "Sous section 1.1",
          "content": [],  // ALWAYS EMPTY 
          "subsections": []
        }
      ]
    }
  ],
  "slide_mappings": [
    {
      "slide_number": 1,
      "section_path": ["Section 1"]
    },
    {
      "slide_number": 2, 
      "section_path": ["Section 1", "Sous section 1.1"]
    }
  ]
}
```

## RULES:
1. Use the course_plan from assistant message as your PRIMARY guide for structure
2. Create logical hierarchical sections based on the plan's tree structure
3. NEVER fill the "content" arrays - leave them empty []
4. Map every slide from course_content to exactly one section path
5. Slides that match plan sections should be grouped accordingly
6. Use section titles from the course_plan when available, adapt as needed
7. Ensure slide_mappings covers ALL slides in course_content (including the plan slide)
8. The plan slide itself should be mapped to a top-level or intro section
"""

client = OpenAI()

result_p1 = client.responses.parse(
    model="gpt-4.1-2025-04-14",
    input=[
        {"role": "system",    "content": PHASE_1_SYSTEM_PROMPT},
        {"role": "assistant", "content": json.dumps({"course_plan": course_plan}, ensure_ascii=False)},
        {"role": "user",      "content": json.dumps({"course_content": course_content}, ensure_ascii=False)}
    ],
    text_format=Content,
)

outline: Content = result_p1.output_parsed

outline.print_outline()



In [163]:
print(outline.print_outline())

Course Outline:
Introduction
  Notions fondamentales
Le génome humain
  Projet de séquençage
  Architecture globale
    Génome mitochondrial
    Génome nucléaire
  Description des principaux éléments constituants
    Gènes (codants et non-codants), pseudogènes et éléments répétés
Comparaison aux autres génomes
Les types de variations du génome humain et leurs conséquences
La variabilité du génome humain
Evolution des génomes: notions essentielles, mécanismes


In [None]:
len(outline.sections)

# Extracting only the sections and outlines


In [None]:
from course import Content

# -------- 2. Build the two JSON payloads ---------------
assistant_json = outline.model_dump_json()
slides_json    = json.dumps({"course_content": course_content}, ensure_ascii=False)
#  course_content is still the list of slide dicts you fed in Phase 1
#  (each dict has 'page', 'title', 'tree', … )

# -------- 3. Prompt for the writer model ---------------
PHASE_2_SYSTEM_PROMPT = """
You are a medical content writer. Your task is to generate detailed course content for each section.

## INPUT STRUCTURE:
- **Assistant message**: Outline of the course and the slide mappings
- **User message**: Original slides data parsed from PDF

The assistant message contains the hierarchy and slide mappings:
```json
{
  "sections": [
    {
      "title": "Chapter 1: Introduction", 
      "content": [],  // Empty 
      "subsections": [...]
    }
  ],
  "slide_mappings": [
    {"slide_number": 1, "section_path": ["Chapter 1: Introduction"]},
    {"slide_number": 2, "section_path": ["Chapter 1: Introduction", "ML Basics"]}
  ]
}
```

## OUTPUT STRUCTURE:
Return the SAME JSON structure but with content arrays filled:
```json
{
  "sections": [
    {
      "title": "Chapter 1: Introduction",
      "content": [
        "L'intelligence artificielle représente...",
        "Les algorithmes d'apprentissage automatique...",
        "Cette approche révolutionnaire permet..."
      ],
      "subsections": [
        {
          "title": "ML Basics",
          "content": [
            "Le machine learning se définit comme...",
            "Les principales catégories incluent..."
          ],
          "subsections": []
        }
      ]
    }
  ],
  "slide_mappings": [
    // KEEP EXACTLY THE SAME as assistant message
    {"slide_number": 1, "section_path": ["Chapter 1: Introduction"]},
    {"slide_number": 2, "section_path": ["Chapter 1: Introduction", "ML Basics"]}
  ]
}
```

## RULES:
1. For each section in slide_mappings, find corresponding slides from user message
2. Use the slide_mappings to find the content corresponding to each section/subsection
3. Write 5-10 French paragraphs per section (≤5 sentences each)
4. Copy section titles EXACTLY - do not change hierarchy
5. Keep slide_mappings IDENTICAL to assistant message
6. Fill ALL content arrays, of all sections and subsections, even if only 1-2 paragraphs
7. Summarize slide content accurately and comprehensively
8. Use clear, educational language appropriate for students
9. Maintain logical flow between paragraphs within each section
"""

result2 = client.responses.parse(
    model="gpt-4.1-2025-04-14",
    input=[
        {"role": "system",    "content": PHASE_2_SYSTEM_PROMPT},
        {"role": "assistant", "content": assistant_json},
        {"role": "user",      "content": slides_json},
    ],
    text_format=Content,     # guarantees parse‑able output
)

course_draft: Content = result2.output_parsed
print("✅ Phase 2 done – got", len(course_draft.sections), "top‑level sections")

In [173]:
type(course_draft)

course.Content

# Creating the content of the course



In [167]:
print(course_draft.print_content(max_chars_per_line=150))

Course Content:
Introduction
  Ce cours s'inscrit dans l'UE 2 « Les molécules du vivant » et propose d'explorer l'architecture du génome humain, en présentant son organisation,
  ses composants principaux et les méthodes utilisées dans ce domaine.
  L'étude du génome a permis de mieux comprendre la structure et le fonctionnement des organismes vivants, en mettant en lumière le rôle de l'ADN
  comme support de l'information génétique.
  La connaissance approfondie du génome humain est cruciale pour la compréhension du développement, de l'évolution, de la santé et de la maladie chez
  l'homme.
  L'approche du cours mêle des concepts évolutifs (arbre de la vie, adaptation, sélection) avec des bases de génétique moléculaire.
  Les implications médicales et biotechnologiques de la génomique humaine sont également abordées, notamment pour le diagnostic et la recherche.

  Notions fondamentales
    L’arbre de la vie illustre les liens évolutifs entre toutes les formes de vie, à partir du dern

In [174]:
import importlib
importlib.reload(course)    
from course import Course

cours_1 = Course(
    name="Les molécules du vivant",
    subject="Biologie",
    year=2025,
    professor="Professeur 1",

    content=course_draft
)


ValidationError: 1 validation error for Course
content
  Input should be a valid dictionary or instance of Content [type=model_type, input_value=Content(sections=[Content...path=['Introduction'])]), input_type=Content]
    For further information visit https://errors.pydantic.dev/2.11/v/model_type