## Challenge 6: Multi-Source Learning Content Ingestion & Structured Output Generation


In [22]:
!pip -q install --upgrade pip
!pip -q install huggingface_hub transformers
!pip -q install pdfplumber pypdf2 pymupdf pillow pytesseract
!pip -q install yt-dlp ffmpeg-python
!pip -q install git+https://github.com/openai/whisper.git
!pip -q install gradio networkx matplotlib pandas

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [23]:
import os, re, json, gc, textwrap, subprocess
from pathlib import Path

import pdfplumber
try:
    import fitz  # from PyMuPDF
    HAS_PYMUPDF = True
except Exception:
    fitz = None
    HAS_PYMUPDF = False

import pandas as pd
import numpy as np

import networkx as nx
import matplotlib.pyplot as plt

import whisper
import ffmpeg

from huggingface_hub import InferenceClient
from transformers import AutoTokenizer

In [25]:
# ‚¨áÔ∏è Paste your token between quotes (keep it private; don't commit to GitHub)
os.environ["HF_TOKEN"] = os.environ.get("HF_TOKEN", "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")

# Open-source models via HF API
OPEN_API_MODELS = {
    "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",  # default
    "Qwen2.5-7B-Instruct":   "Qwen/Qwen2.5-7B-Instruct",     # slower, better quality
    "TinyLlama-1.1B-Chat":   "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # very fast, lighter quality
}

DEFAULT_API_MODEL_HUMAN = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
CURRENT_API_MODEL = OPEN_API_MODELS[DEFAULT_API_MODEL_HUMAN]

HF_TOKEN = os.getenv("HF_TOKEN", None)
assert HF_TOKEN and HF_TOKEN.startswith("hf_"), "Please set HF_TOKEN with your Hugging Face token."

# Initialize client
HF_CLIENT = InferenceClient(model=CURRENT_API_MODEL, token=HF_TOKEN)

# Tokenizer for chunking only (no weights downloaded)
TOKENIZER_CACHE = {}
def get_tokenizer(model_id):
    if model_id not in TOKENIZER_CACHE:
        TOKENIZER_CACHE[model_id] = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    return TOKENIZER_CACHE[model_id]

TOKENIZER = get_tokenizer(CURRENT_API_MODEL)
print("‚úÖ HF API ready with:", CURRENT_API_MODEL)

KeyError: 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

In [None]:
def clean_text(s: str) -> str:
    if not s: return ""
    s = s.replace("\xa0"," ").replace("\t"," ")
    s = re.sub(r"[ ]{2,}", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()

def read_pdf(path: str) -> str:
    text = []
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                text.append(page.extract_text() or "")
    except Exception:
        pass
    txt = "\n".join(text).strip()

    # Fallback to PyMuPDF if very short (extraction failed)
    if (not txt or len(txt) < 200) and HAS_PYMUPDF:
        try:
            doc = fitz.open(path)
            txt = "\n".join([page.get_text() for page in doc])
        except Exception:
            pass
    return clean_text(txt or "")

def download_youtube_audio(url: str, out_path="temp_audio.m4a"):
    cmd = ["yt-dlp", "-f", "bestaudio/best", "-x", "--audio-format", "m4a", "-o", out_path, url]
    subprocess.run(cmd, check=True)
    return out_path

def transcribe_audio_whisper(audio_path: str, model_size="base"):
    model = whisper.load_model(model_size)
    result = model.transcribe(audio_path, fp16=False)
    return clean_text(result.get("text",""))

def youtube_to_text(url: str, whisper_size="base"):
    audio = download_youtube_audio(url)
    return transcribe_audio_whisper(audio, model_size=whisper_size)

In [None]:
def set_hf_api_model(human_choice: str):
    """Switch the API model (e.g., Qwen 1.5B / 7B / TinyLlama)."""
    global CURRENT_API_MODEL, HF_CLIENT, TOKENIZER
    CURRENT_API_MODEL = OPEN_API_MODELS[human_choice]
    HF_CLIENT = InferenceClient(model=CURRENT_API_MODEL, token=HF_TOKEN)
    TOKENIZER = get_tokenizer(CURRENT_API_MODEL)
    return f"‚úÖ Using: {human_choice} ({CURRENT_API_MODEL})"

def extract_chat_content(resp):
    """
    HF client returns an object with .choices[0].message.content (newer),
    or a dict-like structure in older versions. Handle both.
    """
    try:
        return resp.choices[0].message["content"]
    except Exception:
        try:
            return resp.choices[0].message.content
        except Exception:
            # Fallback: string-ish
            return str(resp)

def call_llm_api(prompt: str, max_new_tokens=384, temperature=0.15, system_prompt="You are a precise assistant."):
    """
    Preferred: use chat_completion (provider maps Qwen to 'conversational').
    Falls back to text_generation if needed.
    """
    # Try chat completion
    try:
        resp = HF_CLIENT.chat_completion(
            model=CURRENT_API_MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            max_tokens=max_new_tokens,
            temperature=temperature,
        )
        return extract_chat_content(resp)
    except Exception as e_chat:
        # Fallback: try text_generation for models that support it
        try:
            resp = HF_CLIENT.text_generation(
                prompt,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=True,
                repetition_penalty=1.1,
                return_full_text=False,
            )
            return resp
        except Exception as e_tg:
            raise RuntimeError(f"HF API failed (chat & text): {e_chat} | {e_tg}")

In [None]:
import ast

def chunk_by_tokens(text: str, tokenizer, max_tokens=800, overlap=80):
    ids = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    i = 0
    while i < len(ids):
        j = min(i + max_tokens, len(ids))
        chunk_ids = ids[i:j]
        chunk_text = tokenizer.decode(chunk_ids)
        chunks.append(chunk_text)
        i = j - overlap
        if i < 0: i = 0
    return chunks

def extract_json_from_text(txt: str):
    """
    1) Grab first {...} or [...] block
    2) json.loads ‚Üí ast.literal_eval
    """
    import json, re
    m = re.search(r'(\{.*\}|\[.*\])', txt, flags=re.S)
    candidate = m.group(1) if m else txt.strip()
    candidate = re.sub(r",\s*([}\]])", r"\1", candidate)
    try:
        return json.loads(candidate)
    except Exception:
        try:
            return ast.literal_eval(candidate)
        except Exception:
            return None

def llm_json(prompt: str, fallback=None, max_new_tokens=512, temperature=0.15):
    resp = call_llm_api(prompt, max_new_tokens=max_new_tokens, temperature=temperature)
    js = extract_json_from_text(resp)
    if js is None:
        return fallback
    return js

In [None]:
def summarize_chunks(text: str, tokenizer, max_tokens=800):
    chunks = chunk_by_tokens(text, tokenizer, max_tokens=max_tokens, overlap=80)
    bullets = []
    for i, ch in enumerate(chunks, 1):
        prompt = f"""Summarize the following content in 5 concise bullet points (no preamble).
Keep it factual and avoid repetition.

CONTENT (part {i}/{len(chunks)}):
{ch}

Bullets:"""
        out = call_llm_api(prompt, max_new_tokens=256)
        bullets.append(out.strip())
    joined = "\n".join(bullets)[:8000]
    meta_prompt = f"""Unify these bullets into a single concise summary with 7 bullets:

BULLETS:
{joined}

Final 7 bullets:"""
    final = call_llm_api(meta_prompt, max_new_tokens=256)
    return final.strip()

def extract_topics_hierarchy(text: str, top_n=20, children_per_topic=5):
    prompt = f"""Extract key learning topics and a hierarchy from the content below.
Return STRICT JSON with this schema:
{{
  "topics": [
    {{
      "name": "string",
      "subtopics": ["string", ...],
      "description": "one-sentence definition"
    }}, ...
  ]
}}
Rules:
- Include up to {top_n} topics
- Each topic may have up to {children_per_topic} subtopics
- Use concise phrases, no duplicates

CONTENT:
{text[:8000]}

JSON ONLY:"""
    fallback = {"topics": []}
    js = llm_json(prompt, fallback=fallback, max_new_tokens=512)
    # Validate & dedupe
    if not isinstance(js, dict) or "topics" not in js:
        js = fallback
    seen = set()
    clean_topics = []
    for t in js.get("topics", []):
        name = (t.get("name") or "").strip()
        if name and name.lower() not in seen:
            seen.add(name.lower())
            st = t.get("subtopics") or []
            st = [s for s in st if isinstance(s, str) and s.strip()][:children_per_topic]
            clean_topics.append({
                "name": name,
                "subtopics": st,
                "description": (t.get("description") or "").strip()
            })
    return {"topics": clean_topics[:top_n]}

def generate_flashcards(text: str, n_cards=20):
    prompt = f"""Create {n_cards} study flashcards from the content below.
Return STRICT JSON array only (no other text).
Each item: {{"question": "Q?", "answer": "A.", "topic": "short tag"}}.
Keep answers 1‚Äì3 sentences, factual, no overlap.

CONTENT:
{text[:8000]}

JSON ARRAY ONLY:"""
    fallback = []
    js = llm_json(prompt, fallback=fallback, max_new_tokens=768)
    # Validate
    cards = []
    if isinstance(js, list):
        for item in js[:n_cards]:
            if isinstance(item, dict):
                q = (item.get("question") or "").strip()
                a = (item.get("answer") or "").strip()
                tp = (item.get("topic") or "").strip() or "general"
                if q and a:
                    cards.append({"question": q, "answer": a, "topic": tp})
    if not cards:
        cards = [{"question": "What is the main idea?",
                  "answer": "Summarize the central concept and its practical implications.",
                  "topic": "overview"}]
    return cards[:n_cards]

In [None]:
def build_concept_graph(topics_json: dict, out_path="outputs/concept_graph.png"):
    G = nx.DiGraph()
    for t in topics_json.get("topics", []):
        topic = t["name"]
        G.add_node(topic)
        for sub in t.get("subtopics", []):
            if sub.strip():
                G.add_node(sub)
                G.add_edge(topic, sub)
    plt.figure(figsize=(10,8))
    pos = nx.spring_layout(G, seed=42, k=0.65)
    nx.draw(
        G, pos,
        with_labels=True,
        node_color='lightblue', node_size=1300,
        font_size=9, arrows=True, arrowstyle='-|>', width=1.2
    )
    plt.tight_layout()
    plt.savefig(out_path, dpi=200)
    plt.close()
    return out_path

In [None]:
def process_corpus(text: str, top_n_topics=18, children_per_topic=4, n_cards=20):
    text = clean_text(text)
    summary = summarize_chunks(text, TOKENIZER)
    topics = extract_topics_hierarchy(text, top_n=top_n_topics, children_per_topic=children_per_topic)
    cards = generate_flashcards(text, n_cards=n_cards)

    summary_json_path = "outputs/summary.json"
    topics_json_path = "outputs/topics.json"
    cards_csv_path = "outputs/flashcards.csv"
    graph_png_path = "outputs/concept_graph.png"

    with open(summary_json_path, "w") as f:
        json.dump({"summary": summary}, f, indent=2)
    with open(topics_json_path, "w") as f:
        json.dump(topics, f, indent=2)
    pd.DataFrame(cards).to_csv(cards_csv_path, index=False)
    build_concept_graph(topics, graph_png_path)

    return {
        "summary": summary,
        "topics": topics,
        "flashcards_csv": cards_csv_path,
        "concept_graph_png": graph_png_path,
        "summary_json": summary_json_path,
        "topics_json": topics_json_path
    }

In [None]:
# import gradio as gr

# def select_engine(api_model_choice):
#     msg = set_hf_api_model(api_model_choice)
#     return msg

# def process_input(pdf_file, youtube_url, raw_text, whisper_size, top_n, children, n_cards):
#     corpus = ""
#     try:
#         if pdf_file is not None:
#             corpus = read_pdf(pdf_file.name)
#         elif youtube_url and youtube_url.strip():
#             corpus = youtube_to_text(youtube_url.strip(), whisper_size=whisper_size)
#         elif raw_text and raw_text.strip():
#             corpus = clean_text(raw_text.strip())
#         else:
#             return "Please provide PDF, YouTube URL, or paste text.", "", "", None

#         if not corpus or len(corpus) < 50:
#             return "The input appears empty or too short after extraction.", "", "", None

#         result = process_corpus(
#             corpus,
#             top_n_topics=int(top_n),
#             children_per_topic=int(children),
#             n_cards=int(n_cards)
#         )
#         return (
#             result["summary"],
#             json.dumps(result["topics"], indent=2),
#             result["flashcards_csv"],
#             result["concept_graph_png"]
#         )
#     except Exception as e:
#         return f"Error: {e}", "", "", None

# with gr.Blocks() as demo:
#     gr.Markdown("## üìö Multi‚ÄëSource Learning Artifact Generator (Qwen via Hugging Face API)")
#     with gr.Row():
#         api_model = gr.Dropdown(choices=list(OPEN_API_MODELS.keys()),
#                                 value=DEFAULT_API_MODEL_HUMAN, label="HF API Model")
#         engine_btn = gr.Button("Use Selected Model")
#         engine_status = gr.Markdown(f"Using: {DEFAULT_API_MODEL_HUMAN} ({CURRENT_API_MODEL})")
#     engine_btn.click(select_engine, inputs=[api_model], outputs=[engine_status])

#     gr.Markdown("### Input Source")
#     with gr.Row():
#         pdf_in = gr.File(label="Upload PDF (optional)")
#         yt_in = gr.Textbox(label="YouTube URL (optional)")
#     txt_in = gr.Textbox(label="Or paste transcript/text", lines=8, placeholder="Paste raw text if not using PDF/YouTube")

#     with gr.Row():
#         whisper_size = gr.Dropdown(choices=["tiny","base","small"], value="base", label="Whisper model (for YouTube audio)")
#         top_n = gr.Slider(5, 30, value=18, step=1, label="Max Topics")
#         children = gr.Slider(2, 8, value=4, step=1, label="Subtopics per Topic")
#         n_cards = gr.Slider(5, 40, value=20, step=1, label="Flashcards Count")

#     run_btn = gr.Button("Process")
#     gr.Markdown("---")

#     summary_out = gr.Textbox(label="Summary (bullets)", lines=12)
#     topics_out = gr.Textbox(label="Topics Hierarchy (JSON)", lines=12)
#     cards_download = gr.File(label="Flashcards CSV")
#     graph_img = gr.Image(label="Concept Graph (PNG)")

#     run_btn.click(
#         fn=process_input,
#         inputs=[pdf_in, yt_in, txt_in, whisper_size, top_n, children, n_cards],
#         outputs=[summary_out, topics_out, cards_download, graph_img]
#     )

# demo.launch(debug=False)

In [None]:

sample = """Transformers use attention mechanisms to weigh relationships between tokens.
They enable parallel processing and have become the backbone for language models.
Key components include multi-head self-attention, positional encoding, and feed-forward layers."""
print(set_hf_api_model("TinyLlama/TinyLlama-1.1B-Chat-v1.0"))
res = process_corpus(sample, top_n_topics=8, children_per_topic=3, n_cards=8)
print(res["summary"][:400])
print(res["flashcards_csv"], res["concept_graph_png"])
