# Extremist Content Detection Pipeline

This pipeline transcribes audio/video files using Whisper. Analysis of the transcripts for extremist material is planned as a later stage and is not part of the code below.

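Before running, install the Python dependencies: `pip install -r ai_pipeline/requirements.txt`. The `ffmpeg` executable must also be installed and available on your PATH.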

## 1. Setup and Imports

In [None]:
import whisper
import ffmpeg
from pathlib import Path
import os
from typing import Dict, Optional
import warnings
warnings.filterwarnings('ignore')

## 2. Load Whisper Model

In [None]:
# Load the Whisper model once and keep it in the module-level `model` variable,
# which transcribe_file() uses for every transcription.
# "base" is chosen for speed; 'small', 'medium' or 'large' can be used for better accuracy.
print("Loading Whisper model...")
model = whisper.load_model("base")
print("Model loaded successfully!")

## 3. Video/Audio Processing Functions

In [None]:
def convert_video_to_audio(video_path: str, output_audio_path: Optional[str] = None) -> str:
    """
    Extract the audio track of a video file into a WAV file using ffmpeg.

    The audio is written as 16-bit PCM (pcm_s16le), mono, at a 16 kHz sample
    rate. An existing file at the output path is overwritten.

    Args:
        video_path: Path to input video file
        output_audio_path: Optional path for output audio file. When omitted,
            the video path with its suffix replaced by ``.wav`` is used.

    Returns:
        Path to the extracted audio file

    Raises:
        ffmpeg.Error: If the ffmpeg process fails. Its stderr output is
            printed before the exception is re-raised.
    """
    source = Path(video_path)
    target = source.with_suffix('.wav') if output_audio_path is None else Path(output_audio_path)

    try:
        # Extract audio from video
        stream = ffmpeg.input(str(source))
        stream = ffmpeg.output(stream, str(target), acodec='pcm_s16le', ac=1, ar='16k')
        ffmpeg.run(stream, overwrite_output=True, capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        # stderr is raw process output and may contain bytes that are not
        # valid UTF-8 (e.g. file names or metadata). Decode leniently so a
        # UnicodeDecodeError raised here cannot mask the real ffmpeg failure.
        stderr_text = (e.stderr or b'').decode(errors='replace')
        print(f"Error converting video: {stderr_text}")
        raise

    print(f"Converted video to audio: {target}")
    return str(target)

## 4. Transcription Function

In [None]:
def transcribe_file(file_path: str) -> Dict:
    """
    Transcribe audio or video file using Whisper.

    If a video file is provided, its audio track is first extracted into a
    temporary directory with convert_video_to_audio(). The temporary audio is
    removed once transcription finishes (or fails), so no ``.wav`` file is
    left next to the video and an existing file with that name is never
    overwritten.

    Args:
        file_path: Path to audio or video file

    Returns:
        Dictionary containing transcription results:
        - text: Full transcription text
        - language: Detected language
        - segments: List of segments with timestamps and text

    Raises:
        FileNotFoundError: If file_path does not exist.
        ValueError: If the file extension is not a supported audio or video
            format.
    """
    # Local import: keeps this cell self-contained without touching the
    # notebook's import cell.
    import tempfile

    file_path = Path(file_path)

    # Check if file exists
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    # Video extensions need conversion; audio extensions are passed through.
    video_extensions = {'.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv', '.webm'}
    audio_extensions = {'.wav', '.mp3', '.m4a', '.flac', '.ogg'}

    suffix = file_path.suffix.lower()
    if suffix not in video_extensions and suffix not in audio_extensions:
        raise ValueError(f"Unsupported file format: {file_path.suffix}")

    scratch_dir = None
    try:
        if suffix in video_extensions:
            print("Video file detected. Converting to audio...")
            # Extract into a private scratch directory instead of next to the
            # source video, so a pre-existing "<name>.wav" is not clobbered.
            scratch_dir = tempfile.TemporaryDirectory()
            scratch_audio = Path(scratch_dir.name) / f"{file_path.stem}.wav"
            audio_path = convert_video_to_audio(str(file_path), str(scratch_audio))
        else:
            audio_path = str(file_path)

        # Transcribe
        print(f"Transcribing: {audio_path}")
        result = model.transcribe(audio_path)
    finally:
        # Remove the extracted audio even when conversion or transcription fails.
        if scratch_dir is not None:
            scratch_dir.cleanup()

    # Return structured output
    return {
        'text': result['text'],
        'language': result['language'],
        'segments': result['segments']
    }

## 5. Example Usage

In [None]:
# Example: transcribe a single file.
# Replace the placeholder below with the path to a real audio or video file.
file_path = "path/to/your/audio_or_video_file.mp4"

# Transcribe (uncomment once file_path points to an existing file)
# result = transcribe_file(file_path)

# Print the detected language, the full text and the number of segments
# print(f"Detected language: {result['language']}")
# print(f"\nFull transcription:\n{result['text']}")
# print(f"\nNumber of segments: {len(result['segments'])}")