COURSE VIDEO GENERATOR - Google Colab Application
================================================================================

This application converts books/text into educational videos with AI narration
and provides an interactive chat interface for Q&A.

Models Used (Optimized for T4 GPU - ~16GB VRAM):
- LLM: Qwen/Qwen2.5-1.5B-Instruct (4-bit)
- Image Gen: stabilityai/sdxl-turbo
- TTS: parler-tts/parler_tts_mini_v0.1
- ASR: openai/whisper-tiny


Author: Denis Ngugi Gathondu

#          INSTALL DEPENDENCIES       
# =================================================================

In [None]:
# Run this cell first to install all required packages

!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q --upgrade transformers
!pip install -q accelerate bitsandbytes
!pip install -q diffusers accelerate
!pip install -q parler-tts
!pip install -q gradio>=4.0.0
!pip install -q moviepy pillow
!pip install -q PyPDF2 python-docx
!pip install -q sentencepiece protobuf
!pip install -q scipy soundfile

#        IMPORTS AND CONFIGURATION
# =================================================================

In [None]:
import os
import gc
import re
import json
import tempfile
import warnings
from dataclasses import dataclass, field
from enum import Enum, auto
from typing import Optional, List, Dict, Any, Tuple, Generator
from pathlib import Path

import torch
import numpy as np
from PIL import Image
import soundfile as sf
from google.colab import userdata

# Hugging Face Hub
from huggingface_hub import login

# Transformers and ML
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    BitsAndBytesConfig,
)

# Diffusion for image generation
from diffusers import StableDiffusionXLPipeline

# TTS
from parler_tts import ParlerTTSForConditionalGeneration

# Audio processing
import librosa

# Gradio for UI
import gradio as gr

# Video generation
from moviepy.editor import (
    ImageClip,
    AudioFileClip,
    concatenate_videoclips,
)

# Document processing
import PyPDF2
from docx import Document

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")


# Log in to Hugging Face with the token stored under the "HF_TOKEN" Colab secret.
# NOTE(review): userdata.get presumably fails if the secret is missing or not
# shared with this notebook — confirm the secret is configured before running.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)


# CONFIGURATION
# =================================================================

In [None]:
@dataclass
class AppConfig:
    """Application configuration with model names and settings.

    Every field has a default, so ``AppConfig()`` yields a complete
    configuration; individual values can be overridden by keyword.
    """

    # Model names (all non-gated, small models for T4 GPU)
    # These identifiers are handed to the ``from_pretrained`` loaders.
    LLM_MODEL: str = "Qwen/Qwen2.5-1.5B-Instruct"
    IMAGE_MODEL: str = "stabilityai/sdxl-turbo"
    TTS_MODEL: str = "parler-tts/parler_tts_mini_v0.1"
    ASR_MODEL: str = "openai/whisper-tiny"

    # Generation settings
    MAX_TOKENS: int = 2048  # Default cap on newly generated tokens per LLM call
    TEMPERATURE: float = 0.7  # Sampling temperature for LLM generation
    TOP_P: float = 0.9  # Nucleus-sampling threshold for LLM generation

    # Video settings
    VIDEO_WIDTH: int = 512  # Width requested from the image pipeline
    VIDEO_HEIGHT: int = 512  # Height requested from the image pipeline
    FPS: int = 24
    AUDIO_SAMPLE_RATE: int = 24000  # Sample rate for narration audio (samples per second)

    # Section settings
    MAX_SECTION_LENGTH: int = 1000  # Characters per section
    MIN_SECTION_LENGTH: int = 200  # Minimum for a valid section

# MODEL MANAGER
# ================================================================

In [None]:
class ModelState(Enum):
    """Possible states for a model.

    Tracks where a model slot is in its load/unload lifecycle.
    """

    UNLOADED = auto()  # No weights held; the default for a new container
    LOADED = auto()  # Loading finished and the model is ready for use
    LOADING = auto()  # A load call is currently in progress
    ERROR = auto()  # The most recent load attempt raised an exception


@dataclass
class ModelContainer:
    """Container for a loaded model and its components.

    Only ``name`` is required; every other field starts empty and is filled
    in when the model is loaded.
    """

    # Model identifier passed to ``from_pretrained``.
    name: str
    # The loaded model or pipeline object, or None while unloaded.
    model: Optional[Any] = None
    # Tokenizer that accompanies the model, when it has one.
    tokenizer: Optional[Any] = None
    # Processor that accompanies the model, when it has one.
    processor: Optional[Any] = None
    # Current lifecycle state of this slot.
    state: ModelState = ModelState.UNLOADED
    # Text of the exception from the last failed load, if any.
    error_message: Optional[str] = None


class ModelManager:
    """
    Memory-efficient model manager for T4 GPU.

    Manages loading/unloading of models to stay within VRAM limits.
    Provides phase-based loading for video generation vs chat.
    """
    def __init__(self, config: AppConfig):
        """Prepare empty model slots and the LLM quantization settings.

        No weights are loaded here: every slot starts out UNLOADED and is
        filled later by the matching ``load_*`` method. The selected device
        (and GPU details, when CUDA is available) is printed for the user.

        Args:
            config: Application settings supplying the model identifiers.
        """
        self.config = config
        has_cuda = torch.cuda.is_available()
        self.device = "cuda" if has_cuda else "cpu"

        # One container per model role, keyed by the short role name.
        model_ids = {
            "llm": config.LLM_MODEL,
            "image": config.IMAGE_MODEL,
            "tts": config.TTS_MODEL,
            "asr": config.ASR_MODEL,
        }
        self._models: Dict[str, ModelContainer] = {
            role: ModelContainer(name=model_id)
            for role, model_id in model_ids.items()
        }

        # 4-bit NF4 weights with double quantization; compute dtype is bfloat16.
        self._quant_config = BitsAndBytesConfig(
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            load_in_4bit=True,
        )

        print(f"üñ•Ô∏è Device: {self.device}")
        if not has_cuda:
            return

        print(f"üìä GPU: {torch.cuda.get_device_name(0)}")
        vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"üíæ VRAM: {vram_gb:.1f} GB")

    def _clear_memory(self):
        """Force garbage collection and GPU cache clearing."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    def get_memory_usage(self) -> Dict[str, float]:
        """Get current GPU memory usage in GB."""
        if not torch.cuda.is_available():
            return {"allocated": 0, "reserved": 0, "free": 0}

        allocated = torch.cuda.memory_allocated(0) / 1e9
        reserved = torch.cuda.memory_reserved(0) / 1e9
        total = torch.cuda.get_device_properties(0).total_memory / 1e9

        return {
            "allocated": round(allocated, 2),
            "reserved": round(reserved, 2),
            "free": round(total - reserved, 2),
        }

    # -------------------------------------------------------------------------
    # LLM Methods
    # -------------------------------------------------------------------------

    def load_llm(self) -> bool:
        """Load the LLM tokenizer and 4-bit quantized model into the "llm" slot.

        Does nothing when the slot is already LOADED. Any exception raised
        while loading is caught, printed and recorded on the slot.

        Returns:
            True when the model is (or already was) loaded, False on failure.
        """
        slot = self._models["llm"]
        if slot.state == ModelState.LOADED:
            return True

        try:
            slot.state = ModelState.LOADING
            print(f"üì• Loading LLM ({slot.name})...")

            model_id = slot.name
            slot.tokenizer = AutoTokenizer.from_pretrained(
                model_id, trust_remote_code=True
            )
            slot.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                device_map="auto",
                quantization_config=self._quant_config,
            )

            slot.state = ModelState.LOADED
            print(f"‚úÖ LLM loaded! Memory: {self.get_memory_usage()}")
            return True
        except Exception as exc:
            # Record the failure so callers can inspect it on the slot.
            slot.state = ModelState.ERROR
            slot.error_message = str(exc)
            print(f"‚ùå Failed to load LLM: {exc}")
            return False

    def unload_llm(self):
        """Drop the LLM and its tokenizer, then reclaim memory.

        The slot is marked UNLOADED whether or not anything was loaded.
        """
        slot = self._models["llm"]
        # Releasing the references lets the collector free the weights.
        slot.model = None
        slot.tokenizer = None
        slot.state = ModelState.UNLOADED
        self._clear_memory()
        print("üì§ LLM unloaded")

    # -------------------------------------------------------------------------
    # Image Generation Methods
    # -------------------------------------------------------------------------

    def load_image_model(self) -> bool:
        """Load the SDXL pipeline into the "image" slot.

        The pipeline is loaded in float16 from safetensors weights, moved to
        ``self.device``, and attention slicing is enabled on it. Does nothing
        when the slot is already LOADED. Load errors are caught, printed and
        recorded on the slot.

        Returns:
            True when the pipeline is (or already was) loaded, False on failure.
        """
        slot = self._models["image"]
        if slot.state == ModelState.LOADED:
            return True

        try:
            slot.state = ModelState.LOADING
            print(f"üì• Loading Image Generator ({slot.name})...")

            # The pipeline bundles its own sub-components (e.g. tokenizers).
            pipeline = StableDiffusionXLPipeline.from_pretrained(
                slot.name,
                use_safetensors=True,
                torch_dtype=torch.float16,
            )
            slot.model = pipeline.to(self.device)

            # Enable memory-efficient attention
            slot.model.enable_attention_slicing()

            slot.state = ModelState.LOADED
            print(f"‚úÖ Image model loaded! Memory: {self.get_memory_usage()}")
            return True
        except Exception as exc:
            slot.state = ModelState.ERROR
            slot.error_message = str(exc)
            print(f"‚ùå Failed to load image model: {exc}")
            return False

    def unload_image_model(self):
        """Drop the image pipeline and reclaim memory.

        The slot is marked UNLOADED whether or not anything was loaded.
        """
        slot = self._models["image"]
        slot.model = None
        slot.state = ModelState.UNLOADED
        self._clear_memory()
        print("üì§ Image model unloaded")

    # -------------------------------------------------------------------------
    # TTS Methods
    # -------------------------------------------------------------------------

    def load_tts(self) -> bool:
        """Load the Parler-TTS model and its tokenizer into the "tts" slot.

        Does nothing when the slot is already LOADED. Load errors are caught,
        printed and recorded on the slot.

        Returns:
            True when the model is (or already was) loaded, False on failure.
        """
        slot = self._models["tts"]
        if slot.state == ModelState.LOADED:
            return True

        try:
            slot.state = ModelState.LOADING
            print(f"üì• Loading TTS ({slot.name})...")

            model_id = slot.name
            tts_model = ParlerTTSForConditionalGeneration.from_pretrained(model_id)
            slot.model = tts_model.to(self.device)
            slot.tokenizer = AutoTokenizer.from_pretrained(model_id)

            slot.state = ModelState.LOADED
            print(f"‚úÖ TTS loaded! Memory: {self.get_memory_usage()}")
            return True
        except Exception as exc:
            slot.state = ModelState.ERROR
            slot.error_message = str(exc)
            print(f"‚ùå Failed to load TTS: {exc}")
            return False

    def unload_tts(self):
        """Drop the TTS model and its tokenizer, then reclaim memory.

        The slot is marked UNLOADED whether or not anything was loaded.
        """
        slot = self._models["tts"]
        slot.model = None
        slot.tokenizer = None
        slot.state = ModelState.UNLOADED
        self._clear_memory()
        print("üì§ TTS unloaded")

    # -------------------------------------------------------------------------
    # ASR Methods
    # -------------------------------------------------------------------------

    def load_asr(self) -> bool:
        """Load the speech-recognition processor and model into the "asr" slot.

        The model is loaded in float16 from safetensors weights and moved to
        ``self.device``. Does nothing when the slot is already LOADED. Load
        errors are caught, printed and recorded on the slot.

        Returns:
            True when the model is (or already was) loaded, False on failure.
        """
        slot = self._models["asr"]
        if slot.state == ModelState.LOADED:
            return True

        try:
            slot.state = ModelState.LOADING
            print(f"üì• Loading ASR ({slot.name})...")

            model_id = slot.name
            slot.processor = AutoProcessor.from_pretrained(model_id)
            asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_id,
                use_safetensors=True,
                torch_dtype=torch.float16,
            )
            slot.model = asr_model.to(self.device)

            slot.state = ModelState.LOADED
            print(f"‚úÖ ASR loaded! Memory: {self.get_memory_usage()}")
            return True
        except Exception as exc:
            slot.state = ModelState.ERROR
            slot.error_message = str(exc)
            print(f"‚ùå Failed to load ASR: {exc}")
            return False

    def unload_asr(self):
        """Drop the ASR model and its processor, then reclaim memory.

        The processor is released together with the model, mirroring how the
        LLM and TTS unloaders release their tokenizers, so no component of an
        unloaded model stays referenced. The slot is marked UNLOADED whether
        or not anything was loaded.
        """
        container = self._models["asr"]
        container.model = None
        container.processor = None
        container.state = ModelState.UNLOADED
        self._clear_memory()
        print("üì§ ASR unloaded")

    # -------------------------------------------------------------------------
    # Phase-Based Loading
    # -------------------------------------------------------------------------

    def load_for_video_generation(self) -> bool:
        """
        Load ALL models needed for video generation phase.
        Models: LLM, Image Gen, TTS
        """
        print("\n" + "=" * 50)
        print("üé¨ Loading Video Generation Phase Models")
        print("=" * 50)

        self.unload_asr()  # Not needed for video generation

        success = True
        success &= self.load_llm()
        success &= self.load_image_model()
        success &= self.load_tts()

        if success:
            print("\n‚úÖ All video generation models loaded!")
            print(f"üìä Final memory: {self.get_memory_usage()}")
        else:
            print("\n‚ùå Some models failed to load")

        return success

    def load_for_chat(self) -> bool:
        """
        Load models needed for chat phase.
        Models: LLM (keep), ASR (add)
        Unloads: Image Gen, TTS
        """
        print("\n" + "=" * 50)
        print("üí¨ Loading Chat Phase Models")
        print("=" * 50)

        # Unload heavy models not needed for chat
        self.unload_image_model()
        self.unload_tts()

        success = True
        success &= self.load_llm()  # Keep loaded
        success &= self.load_asr()  # Add for voice input

        if success:
            print("\n‚úÖ All chat models loaded!")
            print(f"üìä Final memory: {self.get_memory_usage()}")
        else:
            print("\n‚ùå Some models failed to load")

        return success

    def unload_all(self):
        """Unload all models."""
        self.unload_llm()
        self.unload_image_model()
        self.unload_tts()
        self.unload_asr()
        print("üßπ All models unloaded!")

    # -------------------------------------------------------------------------
    # Generation Methods
    # -------------------------------------------------------------------------

    def generate_text(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        max_new_tokens: Optional[int] = None
    ) -> str:
        """Produce a sampled LLM reply to a single user prompt.

        The prompt (preceded by the system prompt, when given) is rendered
        with the tokenizer's chat template, and only the newly generated
        tokens are decoded.

        Args:
            prompt: The user message.
            system_prompt: Optional system message placed before the prompt.
            max_new_tokens: Cap on generated tokens; falls back to
                ``config.MAX_TOKENS`` when None.

        Returns:
            The decoded reply with surrounding whitespace stripped.

        Raises:
            RuntimeError: If the LLM has not been loaded.
        """
        slot = self._models["llm"]
        if slot.state != ModelState.LOADED:
            raise RuntimeError("LLM not loaded")

        tokenizer = slot.tokenizer
        model = slot.model

        messages = [{"role": "user", "content": prompt}]
        if system_prompt:
            messages.insert(0, {"role": "system", "content": system_prompt})

        print("  >> LLM: Applying chat template...")
        rendered = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        inputs = tokenizer(rendered, return_tensors="pt").to(model.device)

        token_budget = (
            self.config.MAX_TOKENS if max_new_tokens is None else max_new_tokens
        )
        # The EOS token doubles as the padding token during generation.
        pad_id = tokenizer.eos_token_id

        print("  >> LLM: Generating response...")
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=token_budget,
                temperature=self.config.TEMPERATURE,
                top_p=self.config.TOP_P,
                do_sample=True,
                pad_token_id=pad_id,
            )

        print("  >> LLM: Decoding response...")
        # Skip the echoed prompt tokens and decode only the continuation.
        prompt_length = inputs.input_ids.shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )
        print("  >> LLM: Response generated.")

        return response.strip()

    def generate_image(self, prompt: str, negative_prompt: str = "") -> Image.Image:
        """Render one image for ``prompt`` with the loaded pipeline.

        The pipeline runs a single inference step with guidance disabled
        (``guidance_scale=0.0``) at the configured video height and width.

        Args:
            prompt: Text description of the desired image.
            negative_prompt: Accepted for interface compatibility; it is not
                forwarded to the pipeline call below.

        Returns:
            The first image produced by the pipeline.

        Raises:
            RuntimeError: If the image model has not been loaded.
        """
        slot = self._models["image"]
        if slot.state != ModelState.LOADED:
            raise RuntimeError("Image model not loaded")

        pipeline_args = {
            "prompt": prompt,
            "num_inference_steps": 1,
            "guidance_scale": 0.0,
            "height": self.config.VIDEO_HEIGHT,
            "width": self.config.VIDEO_WIDTH,
        }
        with torch.no_grad():
            result = slot.model(**pipeline_args)

        return result.images[0]

    def generate_audio(
        self, text: str, description: str = None
    ) -> Tuple[np.ndarray, int]:
        """Synthesize narration audio for ``text`` with the TTS model.

        Args:
            text: The words to be spoken.
            description: Natural-language description of the desired voice;
                a default instructor-style description is used when None.

        Returns:
            A ``(waveform, sample_rate)`` pair. The waveform is the generated
            audio as a squeezed NumPy array. The sample rate is the one
            declared by the loaded model's config, so the audio is written and
            timed at the rate it was actually generated at;
            ``config.AUDIO_SAMPLE_RATE`` is used only when the model does not
            declare a rate.

        Raises:
            RuntimeError: If the TTS model has not been loaded.
        """
        container = self._models["tts"]

        if container.state != ModelState.LOADED:
            raise RuntimeError("TTS model not loaded")

        if description is None:
            description = "A clear, professional instructor explaining educational content in a calm, engaging voice."

        device = container.model.device
        # The voice description conditions the model; the text is the prompt.
        inputs = container.tokenizer(description, return_tensors="pt").to(device)
        prompt = container.tokenizer(text, return_tensors="pt").to(device)

        with torch.no_grad():
            audio = container.model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                prompt_input_ids=prompt.input_ids,
                prompt_attention_mask=prompt.attention_mask,
            )

        audio_arr = audio.cpu().numpy().squeeze()

        # Reporting a rate other than the model's own makes the saved audio
        # play at the wrong speed and skews every duration derived from it.
        model_rate = getattr(container.model.config, "sampling_rate", None)
        sample_rate = int(model_rate or self.config.AUDIO_SAMPLE_RATE)
        return audio_arr, sample_rate

    def transcribe_audio(self, audio_path: str) -> str:
        """Transcribe a speech recording to text with the ASR model.

        The file is loaded and resampled to 16 kHz with librosa, converted to
        model features by the processor, and decoded with ``generate``.

        Args:
            audio_path: Path to an audio file readable by ``librosa.load``.

        Returns:
            The decoded text of the first (and only) item in the batch.

        Raises:
            RuntimeError: If the ASR model has not been loaded.
        """
        container = self._models["asr"]

        if container.state != ModelState.LOADED:
            raise RuntimeError("ASR model not loaded")

        audio, _ = librosa.load(audio_path, sr=16000)

        features = container.processor(
            audio,
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features

        # The ASR weights are loaded in float16, so the features must be cast
        # to the model's dtype as well as moved to its device; moving them
        # alone leaves a dtype mismatch inside the model.
        features = features.to(
            device=container.model.device, dtype=container.model.dtype
        )

        with torch.no_grad():
            generated_ids = container.model.generate(input_features=features)

        transcription = container.processor.batch_decode(
            generated_ids, skip_special_tokens=True
        )[0]

        return transcription

    # -------------------------------------------------------------------------
    # Properties for easy access
    # -------------------------------------------------------------------------

    @property
    def llm_loaded(self) -> bool:
        """Whether the "llm" slot is currently in the LOADED state."""
        slot = self._models["llm"]
        return slot.state == ModelState.LOADED

    @property
    def image_loaded(self) -> bool:
        """Whether the "image" slot is currently in the LOADED state."""
        slot = self._models["image"]
        return slot.state == ModelState.LOADED

    @property
    def tts_loaded(self) -> bool:
        """Whether the "tts" slot is currently in the LOADED state."""
        slot = self._models["tts"]
        return slot.state == ModelState.LOADED

    @property
    def asr_loaded(self) -> bool:
        """Whether the "asr" slot is currently in the LOADED state."""
        slot = self._models["asr"]
        return slot.state == ModelState.LOADED

# CONTENT PROCESSOR
# ================================================================

In [None]:
class ContentProcessor:
    """
    Processes uploaded documents and prepares them for video generation.
    Handles PDF, DOCX, TXT files.
    """

    def __init__(self, model_manager: ModelManager):
        """Store the model manager used for all LLM calls made by this processor.

        Args:
            model_manager: Manager whose ``generate_text`` is called by the
                section, script and image-prompt helpers.
        """
        self.manager = model_manager

    def extract_text(self, file_path: str) -> str:
        """Extract text from various file formats."""
        path = Path(file_path)
        suffix = path.suffix.lower()

        if suffix == ".pdf":
            return self._extract_from_pdf(file_path)
        elif suffix == ".docx":
            return self._extract_from_docx(file_path)
        elif suffix == ".txt":
            return self._extract_from_txt(file_path)
        else:
            raise ValueError(f"Unsupported file format: {suffix}")

    def _extract_from_pdf(self, file_path: str) -> str:
        """Read a PDF and return the text of all pages.

        Each page's extracted text becomes one chunk; chunks are joined with
        a blank line between them, in page order.
        """
        with open(file_path, "rb") as pdf_file:
            page_texts = [
                page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages
            ]
        return "\n\n".join(page_texts)

    def _extract_from_docx(self, file_path: str) -> str:
        """Read a DOCX file and return its non-blank paragraphs.

        Paragraphs whose text is empty or whitespace-only are skipped; the
        rest are joined with a blank line between them, in document order.
        """
        kept = []
        for paragraph in Document(file_path).paragraphs:
            if paragraph.text.strip():
                kept.append(paragraph.text)
        return "\n\n".join(kept)

    def _extract_from_txt(self, file_path: str) -> str:
        """Extract text from TXT file."""
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    def create_sections(
        self, text: str, max_length: int = 1000
    ) -> List[Dict[str, str]]:
        """
        Break text into logical sections for video generation.

        Returns list of dicts with 'title' and 'content' keys.
        """
        system_prompt = """You are an expert educator who creates clear, engaging course content.
Given text from educational material, your task is to:
1. Break it into logical sections (each section should be a coherent topic)
2. Create a clear, concise title for each section
3. Ensure each section is educational and self-contained

Format your response as JSON array:
[
  {"title": "Section Title", "content": "Section content here..."},
  ...
]"""

        prompt = f"""Please break the following educational content into logical sections.
Each section should be roughly {max_length} characters or less and cover a single coherent topic.

Content to process:
{text[:8000]}  # Limit to avoid token limits

Return ONLY the JSON array, no other text."""

        response = self.manager.generate_text(prompt, system_prompt)

        # Parse JSON from response
        try:
            # Find JSON array in response
            json_match = re.search(r"\[.*\]", response, re.DOTALL)
            if json_match:
                sections = json.loads(json_match.group())
                return sections
        except json.JSONDecodeError:
            pass

        # Fallback: simple paragraph-based splitting
        return self._fallback_sections(text, max_length)

    def _fallback_sections(self, text: str, max_length: int) -> List[Dict[str, str]]:
        """Fallback section creation if LLM parsing fails."""
        paragraphs = text.split("\n\n")
        sections = []

        current_content = ""
        section_num = 1

        for para in paragraphs:
            para = para.strip()
            if not para:
                continue

            if len(current_content) + len(para) > max_length:
                if current_content:
                    sections.append(
                        {
                            "title": f"Section {section_num}",
                            "content": current_content.strip(),
                        }
                    )
                    section_num += 1
                    current_content = para
                else:
                    sections.append(
                        {"title": f"Section {section_num}", "content": para}
                    )
                    section_num += 1
            else:
                current_content += "\n\n" + para

        if current_content.strip():
            sections.append(
                {"title": f"Section {section_num}", "content": current_content.strip()}
            )

        return sections

    def generate_script(self, section: Dict[str, str]) -> str:
        """Generate a narration script from a section."""
        system_prompt = """You are an expert educator and scriptwriter.
Create clear, engaging narration scripts for educational videos.
The script should:
- Be conversational and easy to understand
- Include clear explanations of concepts
- Use analogies and examples where helpful
- Be suitable for text-to-speech narration
- Be concise but comprehensive"""

        prompt = f"""Create a narration script for the following section.
The script should be suitable for an educational video.

Section Title: {section["title"]}
Section Content: {section["content"]}

Write a clear, engaging narration script:"""

        return self.manager.generate_text(prompt, system_prompt)

    def generate_image_prompt(self, section: Dict[str, str], script: str) -> str:
        """Generate an image prompt for a section."""
        system_prompt = """You create image prompts for educational videos.
Given a section of educational content, create a detailed image prompt that:
- Visually represents the key concepts
- Is suitable for AI image generation
- Is educational and appropriate
- Uses clear, descriptive language
- Is concise and focused, preferably under 60 words to avoid truncation."""

        prompt = f"""Create an image prompt for this educational section.
The image should help illustrate the concepts being discussed.

Section Title: {section["title"]}
Script: {script[:500]} # Limit script to avoid overwhelming the prompt

Create a detailed, concise image prompt (max 60 words):"""

        return self.manager.generate_text(prompt, system_prompt, max_new_tokens=60)

# VIDEO GENERATOR
# =================================================================

In [None]:
class VideoGenerator:
    """
    Generates educational videos from processed content.
    Coordinates audio and visual generation.
    """

    def __init__(self, model_manager: ModelManager, content_processor: ContentProcessor):
        """Store the collaborators used to build section videos.

        Args:
            model_manager: Used to generate images and narration audio.
            content_processor: Used to generate the image prompt for a section.
        """
        self.manager = model_manager
        self.processor = content_processor

    def generate_section_video(
        self, section: Dict[str, str], script: str, output_dir: str, section_num: int
    ) -> Dict[str, str]:
        """
        Produce the image and narration audio for one section.

        An image prompt is generated from the section and script, rendered to
        ``section_<n>_image.png``, and the script is synthesized to
        ``section_<n>_audio.wav``, both inside ``output_dir``.

        Returns a dict with the 'image' and 'audio' file paths and the audio
        'duration' in seconds (sample count divided by sample rate).
        """
        print(f"\nüé¨ Generating video for section {section_num}: {section['title']}")

        # Ask for an illustration prompt, then peel off wrapping quotes/spaces.
        print("  üìù Generating image prompt...")
        raw_prompt = self.processor.generate_image_prompt(section, script)
        image_prompt = raw_prompt.strip('"').strip("' ").strip()

        print("  üé® Generating image...")
        picture = self.manager.generate_image(
            prompt=image_prompt,
            negative_prompt="blurry, low quality, distorted, ugly, text, watermark",
        )
        image_path = os.path.join(output_dir, f"section_{section_num}_image.png")
        picture.save(image_path)

        print("  üîä Generating audio...")
        waveform, sample_rate = self.manager.generate_audio(script)
        audio_path = os.path.join(output_dir, f"section_{section_num}_audio.wav")
        sf.write(audio_path, waveform, sample_rate)

        # The still image is later shown for exactly this long.
        audio_duration = len(waveform) / sample_rate

        print(f"  ‚úÖ Section {section_num} complete! Duration: {audio_duration:.1f}s")

        return {
            "image": image_path,
            "audio": audio_path,
            "duration": audio_duration,
        }

    def compile_video(
        self, section_files: List[Dict[str, str]], output_path: str, fps: int = 24
    ) -> str:
        """
        Compile all sections into a final video.
        """
        print("\nüé• Compiling final video...")

        clips = []

        for i, files in enumerate(section_files, 1):
            print(f"  üìå Processing section {i}...")

            # Create image clip with audio duration
            img_clip = ImageClip(files["image"]).set_duration(files["duration"])

            # Add audio
            audio_clip = AudioFileClip(files["audio"])
            video_clip = img_clip.set_audio(audio_clip)

            clips.append(video_clip)

        # Concatenate all clips
        print("  üîó Concatenating clips...")
        final_video = concatenate_videoclips(clips, method="compose")

        # Write output
        print(f"  üíæ Saving video to {output_path}...")
        final_video.write_videofile(
            output_path,
            fps=fps,
            codec="libx264",
            audio_codec="aac",
            temp_audiofile="temp-audio.m4a",
            remove_temp=True,
            verbose=False,
            logger=None,
        )

        # Clean up
        final_video.close()
        for clip in clips:
            clip.close()

        print(f"  ‚úÖ Video saved: {output_path}")

        return output_path

# CHAT MANAGER
# =================================================================

In [None]:
class ChatManager:
    """
    Manages the interactive chat interface.
    Maintains conversation context with the course material.
    """

    def __init__(self, model_manager: ModelManager):
        """Start with no course material and an empty conversation.

        Args:
            model_manager: Used for transcription and text generation.
        """
        self.manager = model_manager
        # Alternating user/assistant messages, oldest first.
        self.conversation_history: List[Dict[str, str]] = []
        # Course text included in the system prompt (set via set_context).
        self.course_context: str = ""
        # Sections of the current course material (set via set_context).
        self.sections: List[Dict[str, str]] = []

    def set_context(self, text: str, sections: List[Dict[str, str]]):
        """Set the course material context for the chat."""
        self.course_context = text[:10000]  # Limit context size
        self.sections = sections
        self.conversation_history = []  # Reset history for new material

    def ask(
        self, question: str, use_voice: bool = False, audio_path: Optional[str] = None
    ) -> Generator[str, None, None]:
        """
        Ask a question about the course material.

        Yields response chunks for streaming.
        """
        # Transcribe audio if voice input
        if use_voice and audio_path:
            print("üé§ Transcribing voice input...")
            question = self.manager.transcribe_audio(audio_path)
            print(f"  üìù Transcribed: {question}")

        # Build system prompt with context
        system_prompt = f"""You are a helpful educational assistant. You have access to course material
and should help the user understand it better.

Course Material Summary:
{self.course_context[:3000]}

Your role is to:
1. Answer questions about the material clearly and thoroughly
2. Provide additional explanations and examples when helpful
3. Relate answers back to the course content when relevant
4. Be encouraging and supportive in your teaching style"""

        # Add conversation history
        messages = [{"role": "system", "content": system_prompt}]
        messages.extend(self.conversation_history)
        messages.append({"role": "user", "content": question})

        # Store user question
        self.conversation_history.append({"role": "user", "content": question})

        # Generate response
        response = self.manager.generate_text(question, system_prompt)

        # Store assistant response
        self.conversation_history.append({"role": "assistant", "content": response})

        # Yield response (could be modified for actual streaming)
        yield response

    def clear_history(self):
        """Clear conversation history."""
        self.conversation_history = []
        print("üßπ Conversation history cleared")


# MAIN APPLICATION
# =================================================================

In [None]:
class CourseVideoApp:
    """
    Main application class that ties everything together.

    Owns the model manager, content processor, video generator and chat
    manager, and keeps the state of the currently loaded document (raw text,
    derived sections, path of the last generated video) between UI callbacks.
    """

    def __init__(self):
        print("üöÄ Initializing Course Video Generator...")

        self.config = AppConfig()
        self.manager = ModelManager(self.config)
        self.processor = ContentProcessor(self.manager)
        self.video_gen = VideoGenerator(self.manager, self.processor)
        self.chat = ChatManager(self.manager)

        # State of the document currently being worked on
        self.current_text: str = ""
        self.current_sections: List[Dict[str, str]] = []
        self.current_video_path: Optional[str] = None

        print("‚úÖ Application initialized!")

    def process_document(self, file_path: str) -> Tuple[str, str]:
        """
        Process an uploaded document and return a summary.

        Args:
            file_path: Path of the uploaded file; falsy when nothing was
                uploaded.

        Returns:
            A ``(status message, preview)`` tuple, where the preview is the
            first 1000 characters of the extracted text.
        """
        # The upload button can be clicked without a file selected
        if not file_path:
            return "Please upload a document first!", ""

        print(f"\nüìÑ Processing document: {file_path}")

        # Ensure LLM is loaded for summary generation
        if not self.manager.llm_loaded:
            self.manager.load_llm()

        # Extract text
        text = self.processor.extract_text(file_path)
        self.current_text = text

        # Anything derived from a previously loaded document is now stale;
        # without this reset a video could be generated from the old sections.
        self.current_sections = []
        self.current_video_path = None

        # Generate summary (only the first 2000 characters are sent to the LLM)
        summary = self.manager.generate_text(
            f"Summarize the following educational content in 2-3 sentences:\n\n{text[:2000]}"
        )

        print(f"‚úÖ Document processed! Length: {len(text)} characters")

        return f"Document loaded successfully!\n\nSummary: {summary}", text[:1000]

    def create_sections(self, progress=gr.Progress()) -> Tuple[str, str]:
        """
        Create sections from the loaded document.

        Args:
            progress: Gradio progress tracker used to report status to the UI.

        Returns:
            A ``(status message, formatted sections)`` tuple. Each section is
            listed with its title and the first 200 characters of its content.
        """
        if not self.current_text:
            return "Please upload a document first!", ""

        progress(0.1, desc="Creating sections...")

        # Create sections
        self.current_sections = self.processor.create_sections(self.current_text)

        progress(0.5, desc="Generating section summaries...")

        # Format for display (collected in a list and joined once)
        parts = ["üìö Created Sections:\n\n"]
        for i, section in enumerate(self.current_sections, 1):
            parts.append(f"**Section {i}: {section['title']}**\n")
            parts.append(f"{section['content'][:200]}...\n\n")
        sections_text = "".join(parts)

        progress(1.0, desc="Done!")

        return f"Created {len(self.current_sections)} sections!", sections_text

    def generate_video(self, progress=gr.Progress()) -> str:
        """
        Generate the complete video from the current sections.

        Args:
            progress: Gradio progress tracker used to report status to the UI.

        Returns:
            Path of the compiled video as returned by the video generator.

        Raises:
            gr.Error: If no sections exist yet or if generation fails. The
                result feeds a video component, so problems are reported as
                UI errors rather than returned as text that would be
                mistaken for a file path.
        """
        if not self.current_sections:
            raise gr.Error("Please create sections first!")

        # Load video generation models
        progress(0.0, desc="Loading models...")
        self.manager.load_for_video_generation()

        # Create temp directory for output
        output_dir = tempfile.mkdtemp()
        section_files = []

        try:
            total_sections = len(self.current_sections)

            for i, section in enumerate(self.current_sections):
                progress(
                    (i + 0.5) / total_sections,
                    desc=f"Processing section {i + 1}/{total_sections}...",
                )

                # Generate script
                print(f"\nüìù Generating script for section {i + 1}...")
                script = self.processor.generate_script(section)

                # Generate video components
                progress(
                    (i + 0.8) / total_sections,
                    desc=f"Generating media for section {i + 1}/{total_sections}...",
                )

                files = self.video_gen.generate_section_video(
                    section,
                    script,
                    output_dir,
                    i + 1,
                )
                section_files.append(files)

            # Compile final video
            progress(0.95, desc="Compiling final video...")

            output_path = os.path.join(output_dir, "course_video.mp4")
            final_video_path = self.video_gen.compile_video(section_files, output_path)

            # Remember the video only once it has actually been compiled
            self.current_video_path = output_path

            progress(1.0, desc="Complete!")

            return final_video_path

        except Exception as e:
            raise gr.Error(f"Error generating video: {str(e)}") from e

    def switch_to_chat(self) -> str:
        """
        Switch to chat mode and hand the current material to the chat manager.

        Returns:
            A status message for the UI.
        """
        # The call parentheses matter: a bare attribute reference would load nothing.
        self.manager.load_for_chat()

        # Set context for chat
        if self.current_text and self.current_sections:
            self.chat.set_context(self.current_text, self.current_sections)

        return "‚úÖ Switched to chat mode! You can now ask questions about the material."

    def chat_response(self, message: str, history: List) -> str:
        """
        Generate a chat response for a typed question.

        Args:
            message: The user's question.
            history: Chat history supplied by the UI; not used here because
                the chat manager keeps its own conversation history.

        Returns:
            The last chunk yielded by the chat manager, or a hint to switch
            to chat mode when the LLM is not loaded.
        """
        if not self.manager.llm_loaded:
            return "Please switch to chat mode first!"

        response = ""
        for chunk in self.chat.ask(message):
            response = chunk

        return response

    def voice_chat_response(self, audio_path: str, history: List) -> str:
        """
        Generate a chat response from voice input.

        Args:
            audio_path: Path of the recorded question.
            history: Chat history supplied by the UI; not used here.

        Returns:
            The last chunk yielded by the chat manager, or a hint to switch
            to chat mode when the required models are not loaded.
        """
        # Answering a spoken question needs both transcription and the LLM
        if not (self.manager.asr_loaded and self.manager.llm_loaded):
            return "Please switch to chat mode first!"

        response = ""
        for chunk in self.chat.ask("", use_voice=True, audio_path=audio_path):
            response = chunk

        return response

# GRADIO UI
# =================================================================

In [None]:
def create_ui():
    """
    Create the Gradio UI.

    Instantiates a CourseVideoApp and builds a three-tab Blocks interface
    around it: a video generator tab, an interactive chat tab and a system
    information tab.

    Returns:
        The gr.Blocks object; the caller is responsible for launching it.
    """

    # One application instance is shared by every event handler defined below.
    app = CourseVideoApp()

    with gr.Blocks(
        title="Course Video Generator",
        theme=gr.themes.Soft(),
        css="""
        .header {text-align: center; margin-bottom: 20px;}
        .status {padding: 10px; border-radius: 5px; margin: 10px 0;}
        """,
    ) as demo:
        gr.Markdown(
            """
            # üéì Course Video Generator
            Transform your educational materials into engaging video courses with AI narration.
            """
        )

        with gr.Tabs():
            # =================================================================
            # TAB 1: Video Generator
            # =================================================================
            with gr.TabItem("üé¨ Video Generator"):
                with gr.Row():
                    # Left column: document upload and section preview
                    with gr.Column(scale=1):
                        gr.Markdown("### üìÑ Upload Material")

                        file_input = gr.File(
                            label="Upload Document",
                            file_types=[".pdf", ".docx", ".txt"],
                        )

                        upload_btn = gr.Button("üì§ Upload & Process", variant="primary")

                        upload_status = gr.Textbox(
                            label="Status", lines=2, interactive=False
                        )

                        gr.Markdown("### üìö Sections")
                        sections_output = gr.Textbox(
                            label="Generated Sections", lines=10, interactive=False
                        )

                        create_sections_btn = gr.Button("üìñ Create Sections")

                    # Right column: video generation and usage instructions
                    with gr.Column(scale=1):
                        gr.Markdown("### üé• Video Generation")

                        generate_btn = gr.Button(
                            "üé¨ Generate Video", variant="primary", size="lg"
                        )

                        video_output = gr.Video(label="Generated Video", height=400)

                        gr.Markdown(
                            """
                            ### ‚öôÔ∏è How it works
                            1. Upload a PDF, DOCX, or TXT file
                            2. Click "Create Sections" to break content into parts
                            3. Click "Generate Video" to create the course video
                            4. Switch to Chat tab to ask questions
                            """
                        )

                # Event handlers
                # Both the upload and the section handlers write to the same two
                # components: the status box and the sections box.
                upload_btn.click(
                    fn=app.process_document,
                    inputs=[file_input],
                    outputs=[upload_status, sections_output],
                )

                create_sections_btn.click(
                    fn=app.create_sections, outputs=[upload_status, sections_output]
                )

                # Whatever generate_video returns is handed to the video component.
                generate_btn.click(fn=app.generate_video, outputs=[video_output])

            # =================================================================
            # TAB 2: Interactive Chat
            # =================================================================
            with gr.TabItem("üí¨ Interactive Chat"):
                gr.Markdown(
                    """
                    ### Ask Questions About Your Course Material
                    After generating your video, switch to chat mode to ask questions!
                    """
                )

                switch_mode_btn = gr.Button(
                    "üîÑ Switch to Chat Mode", variant="secondary"
                )
                mode_status = gr.Textbox(label="Mode Status", interactive=False)

                with gr.Row():
                    # Wide column: chat transcript and text input
                    with gr.Column(scale=3):
                        chatbot = gr.Chatbot(
                            label="Course Assistant", height=500, show_copy_button=True
                        )

                        with gr.Row():
                            msg_input = gr.Textbox(
                                label="Your Question",
                                placeholder="Ask a question about the course material...",
                                scale=4,
                            )
                            submit_btn = gr.Button("Send", variant="primary", scale=1)

                    # Narrow column: voice input and history controls
                    with gr.Column(scale=1):
                        gr.Markdown("### üé§ Voice Input")

                        audio_input = gr.Audio(
                            sources=["microphone"],
                            type="filepath",
                            label="Record Question",
                        )

                        voice_btn = gr.Button("üé§ Ask by Voice")

                        gr.Markdown("### üõ†Ô∏è Options")
                        clear_btn = gr.Button("üßπ Clear History")

                # Event handlers
                def user_message(message, history):
                    """Clear the input box and append the question with a pending (None) answer."""
                    return "", history + [[message, None]]

                def bot_response(history):
                    """Fill in the answer slot of the most recent [question, answer] pair."""
                    if len(history) > 0:
                        message = history[-1][0]
                        # Earlier turns are passed along; the pending pair itself is excluded.
                        response = app.chat_response(message, history[:-1])
                        history[-1][1] = response
                    return history

                switch_mode_btn.click(fn=app.switch_to_chat, outputs=[mode_status])

                # Pressing Enter in the textbox and clicking "Send" run the same
                # two-step chain: record the question first, then generate the answer.
                msg_input.submit(
                    fn=user_message,
                    inputs=[msg_input, chatbot],
                    outputs=[msg_input, chatbot],
                ).then(fn=bot_response, inputs=[chatbot], outputs=[chatbot])

                submit_btn.click(
                    fn=user_message,
                    inputs=[msg_input, chatbot],
                    outputs=[msg_input, chatbot],
                ).then(fn=bot_response, inputs=[chatbot], outputs=[chatbot])

                def voice_message(audio, history):
                    """Answer a recorded question; the history is returned unchanged when there is no recording."""
                    if audio:
                        response = app.voice_chat_response(audio, history)
                        # The transcript shows a fixed placeholder rather than the transcribed text.
                        return history + [["üé§ (voice question)", response]]
                    return history

                voice_btn.click(
                    fn=voice_message, inputs=[audio_input, chatbot], outputs=[chatbot]
                )

                # The `or []` supplies an empty list for the chatbot whenever
                # clear_history() returns a falsy value.
                clear_btn.click(
                    fn=lambda: app.chat.clear_history() or [], outputs=[chatbot]
                )

            # =================================================================
            # TAB 3: System Info
            # =================================================================
            with gr.TabItem("‚ÑπÔ∏è System Info"):
                gr.Markdown("### üìä System Information")

                def get_system_info():
                    """Return a Markdown report of the compute device and which models are loaded."""
                    info = []
                    info.append(
                        f"**Device:** {'CUDA' if torch.cuda.is_available() else 'CPU'}"
                    )
                    if torch.cuda.is_available():
                        info.append(f"**GPU:** {torch.cuda.get_device_name(0)}")
                        # NOTE(review): get_memory_usage() is defined elsewhere; it is read
                        # here as a mapping with 'allocated' and 'free' entries labelled as
                        # GB - confirm against ModelManager.
                        mem = app.manager.get_memory_usage()
                        info.append(f"**VRAM Used:** {mem['allocated']} GB")
                        info.append(f"**VRAM Free:** {mem['free']} GB")

                    info.append("\n### Model Status")
                    info.append(
                        f"- LLM: {'‚úÖ Loaded' if app.manager.llm_loaded else '‚ùå Not loaded'}"
                    )
                    info.append(
                        f"- Image Gen: {'‚úÖ Loaded' if app.manager.image_loaded else '‚ùå Not loaded'}"
                    )
                    info.append(
                        f"- TTS: {'‚úÖ Loaded' if app.manager.tts_loaded else '‚ùå Not loaded'}"
                    )
                    info.append(
                        f"- ASR: {'‚úÖ Loaded' if app.manager.asr_loaded else '‚ùå Not loaded'}"
                    )

                    return "\n".join(info)

                # The report is computed once while the UI is built; the button recomputes it.
                system_info_output = gr.Markdown(get_system_info())
                refresh_btn = gr.Button("üîÑ Refresh")
                refresh_btn.click(fn=get_system_info, outputs=[system_info_output])

                gr.Markdown(
                    """
                    ### üìã Model Information

                    | Model | Type | Purpose |
                    |-------|------|---------|
                    | Qwen2.5-1.5B-Instruct | LLM | Text generation, Q&A |
                    | stabilityai/sdxl-turbo | Image | Educational visuals |
                    | Parler-TTS Mini | Audio | Voice narration |
                    | Whisper Tiny | ASR | Voice input |
                    """
                )

    return demo


# MAIN ENTRY POINT
# =================================================================

In [None]:
# Startup banner: rule, title, rule - emitted with a single print call.
print(
    "=" * 60,
    "üéì Course Video Generator - Google Colab Edition",
    "=" * 60,
    sep="\n",
)

# Build the interface, then serve it.
demo = create_ui()

demo.launch(
    show_error=True,
    share=True,  # Creates public link for Colab
    debug=True,
)
