# Extremist Content Detection Pipeline

This pipeline transcribes audio/video files using Whisper and will later analyze the content for extremist material.

To run: pip install -r ai_pipeline/requirements.txt

## 1. Setup and Imports

In [1]:
import whisper
import ffmpeg
from pathlib import Path
import os
from typing import Dict, Optional
import warnings
import numpy as np
import soundfile as sf
import librosa
warnings.filterwarnings('ignore')

## 2. Load Whisper Model

In [2]:
# Load Whisper model (base model for speed, can use 'small', 'medium', 'large' for better accuracy)
print("Loading Whisper model...")
model = whisper.load_model("base") 
print("Model loaded successfully!")

Loading Whisper model...
Model loaded successfully!


## 3. Video/Audio Processing Functions

In [3]:
def convert_video_to_audio(video_path: str, output_audio_path: str) -> str:
    """
    Convert video file to audio (wav format) using ffmpeg.
    Args:
        video_path: Path to input video file
        output_audio_path: Optional path for output audio file
    Returns:
        Path to the extracted audio file
    """
    video_path = Path(video_path)
    audio_path = Path(output_audio_path)
    # ensure audio directory exists
    audio_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        print(f"[INFO] Extracting audio from video: {video_path}")
        stream = ffmpeg.input(str(video_path))
        stream = ffmpeg.output(stream, str(output_audio_path), acodec='pcm_s16le', ac=1, ar='16k')
        ffmpeg.run(stream, overwrite_output=True, capture_stdout=True, capture_stderr=True)
        print(f"[INFO] Converted video to audio: {output_audio_path}")
        return str(output_audio_path)
    except Exception as e:
        print(f"[ERROR] Error converting video: {e}")
        raise

def split_audio_to_patches(audio_path: str, patch_duration_sec: int = 120, overlap_sec: int = 30):
    """
    Split audio into overlapping patches.
    Args:
        audio_path: Path to audio file
        patch_duration_sec: Duration of each patch in seconds (default 2 min)
        overlap_sec: Overlap between patches in seconds
    Returns:
        List of patch file paths
    """
    print(f"[INFO] Loading audio for patching: {audio_path}")
    y, sr = librosa.load(audio_path, sr=None)
    total_duration = librosa.get_duration(y=y, sr=sr)
    print(f"[INFO] Audio duration: {total_duration:.2f} seconds")
    patch_samples = int(patch_duration_sec * sr)
    overlap_samples = int(overlap_sec * sr)
    step = patch_samples - overlap_samples
    patches = []
    for start in range(0, len(y), step):
        end = min(start + patch_samples, len(y))
        patch_y = y[start:end]
        patch_idx = len(patches)
        patch_path = Path(audio_path).parent / f"patch_{patch_idx:03d}.wav"
        sf.write(str(patch_path), patch_y, sr)
        print(f"[INFO] Saved patch {patch_idx}: {patch_path} ({(end-start)/sr:.2f}s)")
        patches.append(str(patch_path))
        if end == len(y):
            break
    return patches

def remove_long_silence(audio_path: str, silence_thresh: float = 0.01, min_silence_len: float = 2.0):
    """
    Remove long silent breaks from audio.
    Args:
        audio_path: Path to audio file
        silence_thresh: Amplitude threshold for silence
        min_silence_len: Minimum silence length in seconds to remove
    Returns:
        Path to processed audio file
    """
    print(f"[INFO] Removing long silences from: {audio_path}")
    y, sr = librosa.load(audio_path, sr=None)
    intervals = librosa.effects.split(y, top_db=40)
    processed = []
    for start, end in intervals:
        segment = y[start:end]
        if (end-start)/sr > min_silence_len and np.max(np.abs(segment)) < silence_thresh:
            print(f"[INFO] Skipping long silence: {start/sr:.2f}-{end/sr:.2f}s")
            continue
        processed.append(segment)
    if processed:
        y_out = np.concatenate(processed)
    else:
        y_out = y
    out_path = str(Path(audio_path).with_name(Path(audio_path).stem + '_nosilence.wav'))
    sf.write(out_path, y_out, sr)
    print(f"[INFO] Saved audio without long silences: {out_path}")
    return out_path

## 4. Transcription Function

In [13]:
def transcribe_patches(patches, model):
    """
    Transcribe each audio patch with Whisper and format output for later processing.
    Args:
        patches: list of audio patch file paths
        model: loaded Whisper model
    Returns:
        List of dicts, one per patch, with language and word-level timing info.
    """
    all_results = []
    for i, patch_path in enumerate(patches):
        print(f"[INFO] Transcribing patch {i}: {patch_path}")
        result = model.transcribe(patch_path, word_timestamps=True, verbose=False)
        lang = result.get('language', 'unknown')
        words = []
        word_id = 0
        for segment in result.get('segments', []):
            for word in segment.get('words', []):
                words.append({
                    'id': word_id,
                    'word': word.get('word', '').strip(),
                    'start': word.get('start', segment.get('start', None)),
                    'end': word.get('end', segment.get('end', None)),
                    'probability': word.get('probability', None),
                    'phrase_text': segment.get('text', ''),
                    'phrase_start': segment.get('start', None),
                    'phrase_end': segment.get('end', None)
                })
                word_id += 1
        patch_result = {
            'language': lang,
            'words': words,
            'patch_index': i,
            'patch_text': result.get('text', '')
        }
        print(f"[INFO] Patch {i} language: {lang}, words: {len(words)}")
        all_results.append(patch_result)
    return all_results

## 5. Example Usage

In [14]:
# --- FULL PIPELINE DEMO ---
dataset_root = os.getenv("DATASET_ROOT")
dataset_path = Path(f"{dataset_root}/hack_dataset/")
audio_path_output = dataset_path / "hate_audios" / "test_audio_5.wav"
test_video_path = dataset_path / "hate_videos" / "hate_video_5.mp4"


video_path = test_video_path

# 1. Convert video to audio
raw_audio_path = convert_video_to_audio(video_path, output_audio_path=audio_path_output)

# 2. Remove long silences from audio
# processed_audio_path = remove_long_silence(raw_audio_path)
# no preprocessing to keep timestamps of the raw audio
processed_audio_path = raw_audio_path

# 3. Split audio into overlapping 2-min patches
patches = split_audio_to_patches(processed_audio_path, patch_duration_sec=120, overlap_sec=31)

# 4. Transcribe each patch with Whisper
all_results = transcribe_patches(patches, model)

# 5. Save all results to a JSON file
import json
output_json_path = dataset_path / "hate_audios" / "test_audio_5_transcription.json"
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)
print(f"[INFO] Saved all transcription results to {output_json_path}")

# 6. Combine all patch transcriptions (optional print)
for result in all_results:
    print(f"[PATCH] \n          Language: {result['language']},\n          Text: {result.get('full_text','')[:100]}..., \n          Words: {len(result['words'])}")

[INFO] Extracting audio from video: /Users/egor_demin/hack_dataset/hate_videos/hate_video_5.mp4
[INFO] Converted video to audio: /Users/egor_demin/hack_dataset/hate_audios/test_audio_5.wav
[INFO] Loading audio for patching: /Users/egor_demin/hack_dataset/hate_audios/test_audio_5.wav
[INFO] Audio duration: 39.89 seconds
[INFO] Saved patch 0: /Users/egor_demin/hack_dataset/hate_audios/patch_000.wav (39.89s)
[INFO] Transcribing patch 0: /Users/egor_demin/hack_dataset/hate_audios/patch_000.wav
[INFO] Converted video to audio: /Users/egor_demin/hack_dataset/hate_audios/test_audio_5.wav
[INFO] Loading audio for patching: /Users/egor_demin/hack_dataset/hate_audios/test_audio_5.wav
[INFO] Audio duration: 39.89 seconds
[INFO] Saved patch 0: /Users/egor_demin/hack_dataset/hate_audios/patch_000.wav (39.89s)
[INFO] Transcribing patch 0: /Users/egor_demin/hack_dataset/hate_audios/patch_000.wav
Detected language: English
Detected language: English


100%|██████████| 3989/3989 [00:03<00:00, 1054.13frames/s]

[INFO] Patch 0 language: en, words: 38
[INFO] Saved all transcription results to /Users/egor_demin/hack_dataset/hate_audios/test_audio_5_transcription.json
[PATCH] 
          Language: en,
          Text: ..., 
          Words: 38



