<a href="https://colab.research.google.com/github/creation-extro/ai-nlp/blob/main/day_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install moviepy whisper-timestamped spacy pydantic
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 85.5 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import json
import os
import sys
import tempfile

import spacy
import whisper_timestamped as whisper
from moviepy.editor import VideoFileClip
from pydantic import BaseModel, Field

# --- LLM OUTPUT STRUCTURE (Day 4 Requirement - Defines the output contract) ---
class QuizOption(BaseModel):
    # One answer choice of a multiple-choice question.
    # (Documented with comments rather than a class docstring so the
    # generated pydantic schema is left exactly as it was.)

    # Text shown for this answer choice.
    text: str = Field(description="The text of the answer option.")
    # Flag marking this choice as the correct answer.
    is_correct: bool = Field(description="True if this is the correct answer.")

class Question(BaseModel):
    # A single multiple-choice question together with its answer options.
    # (Documented with comments rather than a class docstring so the
    # generated pydantic schema is left exactly as it was.)

    # The question text.
    question: str = Field(description="The multiple-choice question text.")
    # The answer choices. NOTE(review): the description asks for 4 options
    # with exactly one correct, but the annotation is a plain
    # list[QuizOption], so neither the count nor the single-correct rule is
    # enforced by validation here -- confirm whether callers rely on that.
    options: list[QuizOption] = Field(description="A list of 4 possible answers, with exactly one marked as correct.")

# --- INITIALIZE NLP MODEL (Runs once when the backend starts) ---
# Default to "no model"; get_topic_triggers checks this before doing any work.
nlp = None
try:
    # Replace the placeholder only when the spaCy pipeline loads successfully.
    nlp = spacy.load("en_core_web_sm")
except Exception as e:
    # Report on stderr and carry on with nlp left as None instead of crashing.
    print(f"FATAL ERROR: spaCy model not loaded. Please run setup commands. Error: {e}", file=sys.stderr)

# ----------------------------------------------------------------------
# HELPER FUNCTION 1: DAY 2 - Transcription
# ----------------------------------------------------------------------
def get_transcript_data(video_path: str) -> dict | None:
    """Handles audio extraction and Whisper transcription."""

    if not os.path.exists(video_path):
        print(f"ERROR: Video file not found at path: {video_path}")
        return None

    temp_audio_path = "temp_audio_mahesh.mp3"

    # Extract audio
    try:
        video_clip = VideoFileClip(video_path)
        video_clip.audio.write_audiofile(temp_audio_path, logger=None)
        video_clip.close()
    except Exception as e:
        print(f"ERROR: Audio extraction failed. Error: {e}")
        return None

    # Transcribe
    result = None
    try:
        model = whisper.load_model("small")
        result = whisper.transcribe(model, temp_audio_path, language="en", verbose=False)
    except Exception as e:
        print(f"ERROR: Whisper transcription failed. Error: {e}")

    # Clean up the temporary audio file
    if os.path.exists(temp_audio_path):
        os.remove(temp_audio_path)

    return result

# ----------------------------------------------------------------------
# HELPER FUNCTION 2: DAY 3 - Topic Detection
# ----------------------------------------------------------------------
def get_topic_triggers(transcript_data: dict, max_concepts: int = 5) -> list:
    """Find transcript segments that mention key nouns and can trigger a quiz.

    Each segment's text is run through the module-level spaCy pipeline
    ``nlp``. Alphabetic, non-stop-word tokens tagged NOUN or PROPN are
    lower-cased and de-duplicated; segments with at least one such token
    produce a trigger entry.

    Args:
        transcript_data: Mapping with a ``'segments'`` list; each segment is
            read for its ``'text'`` and ``'start'`` entries.
        max_concepts: Maximum number of concepts kept per segment (expected
            to be non-negative). Defaults to 5.

    Returns:
        A list of dicts with keys ``"start_sec"`` (segment start truncated to
        an int), ``"topic_text"`` (the stripped segment text) and
        ``"key_concepts"`` (concepts joined with ``" | "``). Returns an empty
        list when ``nlp`` is ``None`` or no segment yields a concept.
    """
    if nlp is None:
        print("ERROR: Cannot run topic detection. spaCy model not initialized.")
        return []

    topic_segments = []

    for segment in transcript_data.get('segments', []):
        text = segment['text'].strip()
        if not text:
            continue

        start_time = segment['start']
        doc = nlp(text)

        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        # A set would make both the selection and the order of the kept
        # concepts vary between interpreter runs (string hash randomisation).
        current_concepts = dict.fromkeys(
            token.text.lower()
            for token in doc
            if token.pos_ in ('NOUN', 'PROPN') and not token.is_stop and token.is_alpha
        )
        if not current_concepts:
            continue

        concept_list = list(current_concepts)[:max_concepts]
        topic_segments.append({
            "start_sec": int(start_time),
            "topic_text": text,  # Provide the full text for LLM question generation
            "key_concepts": " | ".join(concept_list),
        })

    return topic_segments

# ----------------------------------------------------------------------
# HELPER FUNCTION 3: DAY 4 - LLM Output Mapping
# ----------------------------------------------------------------------
def map_llm_output_to_quiz_json(llm_output_text: str, trigger_time_sec: int) -> dict:
    """Turn one raw LLM JSON reply into the quiz dictionary used by the app.

    The text is decoded as JSON, validated through the ``Question`` model and
    flattened into a plain dict. Any failure along the way is printed and
    reported to the caller as an empty dict rather than an exception.

    Args:
        llm_output_text: JSON text expected to match the ``Question`` model.
        trigger_time_sec: Second at which the quiz should be shown; also used
            to build the quiz ``id``.

    Returns:
        A dict with ``"id"``, ``"trigger_time_sec"``, ``"question"`` and
        ``"options"`` keys, or ``{}`` when parsing or validation fails.
    """
    try:
        decoded_payload = json.loads(llm_output_text)
        checked_question = Question(**decoded_payload)

        quiz_entry = {
            "id": f"quiz-{trigger_time_sec}",
            "trigger_time_sec": trigger_time_sec,
        }
        quiz_entry["question"] = checked_question.question
        quiz_entry["options"] = [choice.model_dump() for choice in checked_question.options]
        return quiz_entry
    except Exception as e:
        # Deliberately broad: bad JSON, wrong shape and validation errors all
        # collapse to the same "no quiz" result.
        print(f"ERROR: LLM output mapping failed. Check JSON format/Pydantic validation: {e}")
        return {}


# ----------------------------------------------------------------------
# 🎯 MAIN ENTRY POINT FOR THE BACKEND (Day 5 Task Completion)
# ----------------------------------------------------------------------
def run_full_ml_pipeline(video_path: str, simulated_llm_output: str) -> list[dict]:
    """
    Single entry point meant to be called by the backend.

    Runs transcription, then topic-trigger detection, then maps the supplied
    LLM output onto the first detected topic. Only that first topic is turned
    into a quiz in this demo version.

    Args:
        video_path: Local path to the uploaded video file.
        simulated_llm_output: JSON text standing in for a real LLM response
                              (in production this would come from the LLM
                              module, once per topic segment).

    Returns:
        A list of structured quiz dictionaries, or an empty list on failure.
    """
    print(f"\n--- Day 5: Running ML Pipeline for Video: {video_path} ---")

    # Stage 1 (Day 2): transcription. Nothing else can run without it.
    transcript = get_transcript_data(video_path)
    if not transcript:
        print("Pipeline aborted at Transcription stage.")
        return []

    # Stage 2 (Day 3): locate the segments that can trigger a quiz.
    triggers = get_topic_triggers(transcript)
    if not triggers:
        print("Pipeline aborted: No topics detected.")
        return []

    # Stage 3 (Day 4): quiz generation, simulated. A real run would request
    # LLM output for every trigger (passing its 'topic_text'); the demo pairs
    # the supplied output with the first trigger only.
    opening_trigger = triggers[0]
    quiz = map_llm_output_to_quiz_json(simulated_llm_output, opening_trigger['start_sec'])

    # An empty dict means the mapping failed, so it is left out of the result.
    final_quizzes = [quiz] if quiz else []

    print(f"--- Pipeline Complete. {len(final_quizzes)} quiz(zes) ready. ---")

    return final_quizzes

# ----------------------------------------------------------------------
# --- DEMO EXECUTION (Simulating Tejaswi's Call) ---
# ----------------------------------------------------------------------
if __name__ == "__main__":
    # Demo driver: runs the whole pipeline once on a local video with a
    # canned LLM response and prints the result the way a caller would see it.

    # --- SIMULATED INPUTS (Update VIDEO_FILE_PATH for real testing) ---
    VIDEO_FILE_PATH_DEMO = 'arrays.mp4' # Replace with your test file path

    # This must be replaced with a loop calling the actual Gemini API in production
    SIMULATED_LLM_OUTPUT_DEMO = """
    {
        "question": "What is the primary benefit of using a small-sized model for transcription?",
        "options": [
            {"text": "Higher overall accuracy", "is_correct": false},
            {"text": "Faster inference and lower hardware requirements", "is_correct": true},
            {"text": "Support for more than 50 languages", "is_correct": false},
            {"text": "Better handling of complex acoustic environments", "is_correct": false}
        ]
    }
    """
    # ------------------------------------------------------------------

    final_result = run_full_ml_pipeline(
        video_path=VIDEO_FILE_PATH_DEMO,
        simulated_llm_output=SIMULATED_LLM_OUTPUT_DEMO
    )

    print("\n\n--- FINAL OUTPUT SENT TO FRONTEND (Kashish) ---")
    if final_result:
        print(json.dumps(final_result, indent=4))
        # Status markers restored to the intended emoji (they were mojibake).
        print(f"\n✅ SUCCESS: Pipeline returned {len(final_result)} quiz object(s).")
    else:
        print("\n❌ FAILURE: Pipeline returned an empty list. Check error messages above.")


--- Day 5: Running ML Pipeline for Video: arrays.mp4 ---


100%|████████████████████████████████████████| 461M/461M [00:03<00:00, 147MiB/s]
100%|██████████| 5981/5981 [01:10<00:00, 84.42frames/s]


--- Pipeline Complete. 1 quiz(zes) ready. ---


--- FINAL OUTPUT SENT TO FRONTEND (Kashish) ---
[
    {
        "id": "quiz-0",
        "trigger_time_sec": 0,
        "question": "What is the primary benefit of using a small-sized model for transcription?",
        "options": [
            {
                "text": "Higher overall accuracy",
                "is_correct": false
            },
            {
                "text": "Faster inference and lower hardware requirements",
                "is_correct": true
            },
            {
                "text": "Support for more than 50 languages",
                "is_correct": false
            },
            {
                "text": "Better handling of complex acoustic environments",
                "is_correct": false
            }
        ]
    }
]

✅ SUCCESS: Pipeline returned 1 quiz object(s).
