# Semantic Video Compilation Pipeline
This notebook summarizes the analytical sequence and parameterization used in the study.

The code cells are retained to show the pipeline structure and data flow.

In [None]:

# author: Diemithry Kloppenburg
# date: 2026-02-15


In [None]:
# %%
# ============================================================
# Configuration
# ============================================================
from pathlib import Path
import sys

ROOT = Path("..").resolve()
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

# Import step modules
from src.steps.step1a_normalize_videos import run as step1a_normalize
from src.steps.step1b_extract_frames import run as step1b_extract_frames
from src.steps.step2_encode_score import run as step2_encode_and_score
from src.steps.step3_aggregate import run as step3_aggregate
from src.steps.step4_reweight import run as step4_reweight
from src.steps.step5_make_strata import run as step5_make_strata
from src.steps.step6_select_compilations import run as step6_select_compilations

# Import utilities
from src.core import set_seeds, summary
from src.data import get_scenes

# Global seed
# Passed to set_seeds() at the top of each pipeline step cell and to step 5 as `seed`.
SEED = 1234

# Output root
# Handed to step 1a as `output_root`. Relative, so it resolves against the
# notebook's working directory.
OUTPUT_ROOT = Path("../outputs")

# ============================================================
# Step 1a: Video Normalization
# ============================================================
# Use relative path to inputs/videos directory
VIDEOS_DIR = Path("../inputs/videos")
SANITIZE_NAMES = True               # Clean filenames to ASCII
NORMALIZE_VIDEOS = True             # Re-encode to h264/aac
TARGET_RESOLUTION = None            # None=original, or 720, 1080, 1440, 2160
UNIFORM_RESOLUTION = False          # True=resize all to same resolution
FORCE_REIMPORT = False              # True=delete existing output and reimport

# ============================================================
# Step 1b: Frame Extraction
# ============================================================
FPS = 3                             # Frame extraction rate
OVERWRITE_FRAMES = True             # Passed to step 1b as `overwrite_frames`

# ============================================================
# Step 2: CLIP Encoding
# ============================================================
CLIP_MODEL_NAME = 'ViT-B/16'
SCENE_SOURCE = "../inputs/custom_scenes.csv"  # None=built-in, "path.csv"=from file, ["stmt1", ...]=custom list

# ============================================================
# Step 3: Aggregation (frame → video scores)
# ============================================================
AGGREGATION_METHOD = 'percentile'   # 'percentile', 'mean', 'median', 'max', 'min'
PERCENTILE = 95                     # Only used if method='percentile'

# ============================================================
# Step 4: Representation Reweighting (center + SVD-informed weights)
# ============================================================
REWEIGHT_CENTERING = 'col'          # 'none', 'col', 'row', 'both'
REWEIGHT_VARIANCE_THRESHOLD = 0.95  # Passed to step 4 as `variance_threshold`
REWEIGHT_K_CAP = 32                 # Passed to step 4 as `k_cap`
REWEIGHT_BETA = 0.5                 # Passed to step 4 as `beta`

# ============================================================
# Step 5: Strata Construction (prototype-based grouping)
# ============================================================
# NOTE(review): the Step 5 cell calls step5_make_strata with n_strata="auto",
# so N_STRATA is not read by any cell in this notebook. Confirm whether it is
# stale or should be wired in.
N_STRATA = 12                       # Number of prototype strata
K_EMBED = 16                        # SVD embedding dimension
STRATA_INIT = "medoid"              # "medoid" or "random" for first prototype
STRATA_NORMALIZE = True             # Normalize embedding rows

# ============================================================
# Step 6-7: Compilation Selection & Rendering
# ============================================================
N_COMPILATIONS = 10                 # Passed to step 6 as `n_compilations`
COMP_MIN_SECONDS = 55.0             # Passed to step 6 as `comp_min_seconds`
COMP_MAX_SECONDS = 65.0             # Passed to step 6 as `comp_max_seconds`

print("OK: Configuration loaded")

In [None]:
# %%
# ============================================================
# Step 1a: Normalize videos
# ============================================================
set_seeds(SEED)

# Collect the step 1a settings from the configuration cell in one place,
# then start the pipeline state from the normalization step.
normalize_options = dict(
    videos_dir=VIDEOS_DIR,
    sanitize_names=SANITIZE_NAMES,
    normalize=NORMALIZE_VIDEOS,
    target_resolution=TARGET_RESOLUTION,
    uniform_resolution=UNIFORM_RESOLUTION,
    force_reimport=FORCE_REIMPORT,
    output_root=OUTPUT_ROOT,
)
pipeline = step1a_normalize(**normalize_options)

summary(pipeline, step='import')


In [None]:
# %%
# ============================================================
# Step 1b: Extract and save frames
# ============================================================
set_seeds(SEED)

# Feed the pipeline state from step 1a through frame extraction, using the
# sampling rate and overwrite flag from the configuration cell.
frame_options = {
    'fps': FPS,
    'overwrite_frames': OVERWRITE_FRAMES,
}
pipeline = step1b_extract_frames(pipeline, **frame_options)

summary(pipeline, step='frames')


In [None]:
# %%
# ============================================================
# Step 2: Encode frames with CLIP
# ============================================================
set_seeds(SEED)

# Load the scene statements from the configured source, then hand them to
# step 2 together with the configured CLIP model name.
scenes = get_scenes(source=SCENE_SOURCE)
print(f"Loaded {len(scenes)} scene statements")

encoding_options = {
    'scene_statements': scenes,
    'model_name': CLIP_MODEL_NAME,
}
pipeline = step2_encode_and_score(pipeline, **encoding_options)

summary(pipeline, step='encoding')

In [None]:
# %%
# ============================================================
# Step 3: Aggregate frame scores to video level
# ============================================================
set_seeds(SEED)

# Run step 3 with the aggregation method and percentile from the
# configuration cell.
aggregation_options = {
    'method': AGGREGATION_METHOD,
    'percentile': PERCENTILE,
}
pipeline = step3_aggregate(pipeline, **aggregation_options)

print("OK: Aggregation complete (raw video scores)")


In [None]:
# %%
# ============================================================
# Step 4: Representation reweighting (centering + SVD-informed weights)
# ============================================================
set_seeds(SEED)

# Run step 4 with the reweighting settings from the configuration cell.
reweight_options = dict(
    centering=REWEIGHT_CENTERING,
    variance_threshold=REWEIGHT_VARIANCE_THRESHOLD,
    k_cap=REWEIGHT_K_CAP,
    beta=REWEIGHT_BETA,
)
pipeline = step4_reweight(pipeline, **reweight_options)

# The summary for the 'aggregation' step is printed here, after reweighting.
summary(pipeline, step='aggregation')


In [None]:
# %%
# ============================================================
# Visualize: Interpret SVD Components
# ============================================================
import numpy as np

# Get data
video_scores = pipeline['video_scores']
svd_scores = pipeline['svd_scores']
scene_statements = pipeline['scene_statements']
video_ids = pipeline['video_ids']

n_components = svd_scores.shape[1]
n_scenes = len(scene_statements)

# Reconstruct component loadings as the Pearson correlation between each scene
# column of the original scores and each SVD component. A single corrcoef call
# over the stacked columns yields every scene-vs-component pair at once; the
# top-right (n_scenes x n_components) block is the part we need.
# Constant columns have zero variance, so their correlation is undefined (NaN);
# the resulting divide/invalid warnings are expected and silenced.
with np.errstate(divide='ignore', invalid='ignore'):
    stacked_corr = np.corrcoef(video_scores[:, :n_scenes], svd_scores, rowvar=False)
loadings = stacked_corr[:n_scenes, n_scenes:]

# Print detailed component interpretations
print(f"\n{'='*80}")
print(f"SVD COMPONENT INTERPRETATIONS")
print(f"{n_components} components extracted from {n_scenes} scene statements")
print(f"{'='*80}\n")

for i in range(n_components):
    # Get top 5 scenes by absolute correlation. argsort places NaN last, which
    # would rank undefined correlations as the strongest, so they are mapped
    # to -1 (below any real |r|) for ranking purposes only.
    strength = np.nan_to_num(np.abs(loadings[:, i]), nan=-1.0)
    top_indices = np.argsort(strength)[-5:][::-1]

    print(f"COMPONENT {i+1}")
    print(f"{'-'*80}")

    for rank, idx in enumerate(top_indices, 1):
        scene = scene_statements[idx]
        corr = loadings[idx, i]
        if np.isnan(corr):
            sign = "n/a"
        else:
            sign = "positive" if corr > 0 else "negative"
        print(f"  {rank}. [{corr:+.3f}] ({sign:>8}) {scene}")

    print()

print(f"{'='*80}")
print("OK: Component interpretation complete\n")

In [None]:
# %%
# ============================================================
# Step 5: Build Strata (prototype-based grouping)
# ============================================================
set_seeds(SEED)

# Run step 5 with the embedding/initialisation settings from the configuration
# cell. The stratum count is requested as "auto" here, with the minimum stratum
# size and target maximum distance given inline.
# NOTE(review): N_STRATA from the configuration cell is not passed — confirm
# that "auto" is the intended mode.
strata_options = dict(
    n_strata="auto",
    k_embed=K_EMBED,
    init=STRATA_INIT,
    normalize=STRATA_NORMALIZE,
    min_stratum_size=3,
    target_max_dist=0.35,
    seed=SEED,
)
pipeline = step5_make_strata(pipeline, **strata_options)

summary(pipeline, step='strata')

In [None]:
# %%
# ============================================================
# Analyze Strata (prototype-based grouping)
# ============================================================
import numpy as np
import pandas as pd

# Get data
embedding_Z = pipeline['embedding_Z']
stratum_labels = pipeline['stratum_labels']
prototype_indices = pipeline['prototype_indices']
video_ids = pipeline['video_ids']
n_strata = pipeline['n_strata']

print(f"\n{'='*80}")
print(f"STRATA SUMMARY")
print(f"{n_strata} prototype strata covering {len(video_ids)} videos")
print(f"{'='*80}\n")

# Compute stratum sizes and statistics
for stratum_id in range(n_strata):
    mask = stratum_labels == stratum_id
    n_videos_in_stratum = mask.sum()

    # prototype_indices holds row indices into embedding_Z (it is used that way
    # in the coverage cell), with entry s being the prototype of stratum s.
    # Comparing a stratum id against those row indices is meaningless, so the
    # prototype video of each stratum is reported instead.
    if stratum_id < len(prototype_indices):
        prototype_video = video_ids[prototype_indices[stratum_id]]
        prototype_marker = f"[PROTOTYPE: {prototype_video}]"
    else:
        prototype_marker = ""

    print(f"Stratum {stratum_id}: {n_videos_in_stratum} videos {prototype_marker}")

    # An empty stratum has no centroid; taking its mean would yield NaN and a
    # RuntimeWarning, so say so explicitly.
    if n_videos_in_stratum == 0:
        print("  Centroid norm: n/a (empty stratum)\n")
        continue

    # Centroid
    stratum_centroid = embedding_Z[mask].mean(axis=0)
    centroid_norm = np.linalg.norm(stratum_centroid)
    print(f"  Centroid norm: {centroid_norm:.3f}\n")

print(f"{'='*80}")
print("OK: Strata analysis complete\n")


In [None]:
# %%
# ============================================================
# Prototype Distances & Coverage
# ============================================================
import numpy as np
import pandas as pd

# Get data
embedding_Z = pipeline['embedding_Z']
stratum_labels = pipeline['stratum_labels']
prototype_indices = pipeline['prototype_indices']
video_ids = pipeline['video_ids']
n_strata = pipeline['n_strata']

# Compute distances from each video to its prototype
from sklearn.metrics.pairwise import cosine_distances

# Only the video-to-prototype distances are needed; the full video-by-video
# matrix that used to be computed here was never read.
prototype_video_embeddings = embedding_Z[prototype_indices]
D_to_prototypes = cosine_distances(embedding_Z, prototype_video_embeddings)

coverage_stats = []
for stratum_id in range(n_strata):
    mask = stratum_labels == stratum_id
    # Column s of D_to_prototypes is the distance to the prototype of stratum s.
    distances = D_to_prototypes[mask, stratum_id]
    # max()/min() raise on an empty array, so an empty stratum is recorded
    # with NaN statistics instead of aborting the whole table.
    has_members = distances.size > 0
    coverage_stats.append({
        'stratum_id': stratum_id,
        'n_videos': mask.sum(),
        'mean_dist': distances.mean() if has_members else np.nan,
        'max_dist': distances.max() if has_members else np.nan,
        'min_dist': distances.min() if has_members else np.nan
    })

coverage_df = pd.DataFrame(coverage_stats)

print(f"\n{'='*80}")
print(f"PROTOTYPE COVERAGE ANALYSIS")
print(f"{'='*80}\n")
print(coverage_df.to_string(index=False))
print(f"\n{'='*80}")
# Unweighted mean of the per-stratum means (pandas skips NaN rows by default).
print(f"Mean distance to prototype: {coverage_df['mean_dist'].mean():.4f}")
print(f"Max distance to prototype: {coverage_df['max_dist'].max():.4f}")
print("OK: Coverage analysis complete\n")


In [None]:
# %%
# ============================================================
# Step 6: Select compilations from clusters
# ============================================================
set_seeds(SEED)

# Run step 6 with the compilation count and duration window from the
# configuration cell.
selection_options = {
    'n_compilations': N_COMPILATIONS,
    'comp_min_seconds': COMP_MIN_SECONDS,
    'comp_max_seconds': COMP_MAX_SECONDS,
}
pipeline = step6_select_compilations(pipeline, **selection_options)

summary(pipeline, step='compilation_selection')

In [None]:
# %%
# ============================================================
# Compilation Interchangeability Analysis
# ============================================================
import numpy as np
import matplotlib.pyplot as plt

# Get data
embedding_Z = pipeline['embedding_Z']
compilations = pipeline.get('compilations', [])

if not compilations:
    print("WARNING: No compilations are present in this outline.")
else:
    # One list of embedding row indices per compilation, read from each
    # member's 'idx' field.
    compilation_indices = [
        [member['idx'] for member in comp['members']]
        for comp in compilations
    ]

    n_compilations = len(compilation_indices)
    n_dimensions = embedding_Z.shape[1]

    # Flatten, then de-duplicate: the pool of videos used by any compilation.
    all_compilation_video_indices = [
        video_idx
        for member_indices in compilation_indices
        for video_idx in member_indices
    ]
    unique_video_indices = list(set(all_compilation_video_indices))
    n_unique_videos = len(unique_video_indices)

    print(f"\n{'='*80}")
    print(f"COMPILATION INTERCHANGEABILITY ANALYSIS")
    print(f"{n_compilations} compilations across {n_dimensions}-dimensional embedding space")
    print(f"{n_unique_videos} unique videos across all compilations")
    print(f"{'='*80}\n")

    print(f"GOAL: Each compilation should be interchangeable — a representative")
    print(f"      'slice' of the global library, not a special subset.\n")
    print(f"{'='*80}\n")

    # ========================================
    # 1. GLOBAL REFERENCE STATISTICS
    # ========================================
    # The reference pool is the set of unique compilation videos; its centroid
    # and mean squared distance to that centroid serve as the global baseline.
    X = embedding_Z[unique_video_indices]
    mu_global = X.mean(axis=0)
    squared_deviation = ((X - mu_global) ** 2).sum(axis=1)
    var_global = squared_deviation.mean()

    print(f"GLOBAL REFERENCE (from {n_unique_videos} unique videos):")
    print(f"  Global mean centroid: {n_dimensions}-dimensional")
    print(f"  Global variance:      {var_global:.3f}\n")
    print(f"{'='*80}\n")

    # ========================================
    # HELPER FUNCTIONS
    # ========================================
    def rbf_kernel(A, B, sigma=1.0):
        """Return the Gaussian (RBF) kernel matrix between the rows of A and B.

        Entry (i, j) is exp(-||A[i] - B[j]||^2 / (2 * sigma^2)).

        Args:
            A: 2-D array, one point per row.
            B: 2-D array with the same number of columns as A.
            sigma: Kernel bandwidth.

        Returns:
            Array of shape (len(A), len(B)) with values in (0, 1].
        """
        XX = np.sum(A**2, axis=1)[:, np.newaxis]
        YY = np.sum(B**2, axis=1)[np.newaxis, :]
        XY = A @ B.T
        # The expanded form ||a||^2 - 2ab + ||b||^2 can come out slightly
        # negative through rounding (notably for identical points), which would
        # give kernel values above 1. Clamp at zero.
        sq_dists = np.maximum(XX - 2 * XY + YY, 0.0)
        return np.exp(-sq_dists / (2 * sigma**2))

    def compute_mmd(S, X, sigma=1.0):
        """Return the RBF-kernel MMD between sample S and reference X.

        Uses the mean of each full kernel matrix (diagonal included) and clamps
        a non-positive squared estimate to zero before taking the square root.
        """
        within_sample = rbf_kernel(S, S, sigma).mean()
        within_reference = rbf_kernel(X, X, sigma).mean()
        across = rbf_kernel(S, X, sigma).mean()
        mmd_sq = within_sample - 2 * across + within_reference
        clamped = mmd_sq if mmd_sq > 0 else 0
        return np.sqrt(clamped)

    def compute_variance_capture(S, mu_global, var_global):
        """Return the spread of S around the global centroid, relative to var_global.

        The spread is the mean squared Euclidean distance of the rows of S from
        mu_global. Returns 0.0 when var_global is not positive.
        """
        squared_offsets = ((S - mu_global) ** 2).sum(axis=1)
        sample_spread = squared_offsets.mean()
        if var_global > 0:
            return sample_spread / var_global
        return 0.0

    def compute_mean_pairwise_distance(S):
        """Return the mean Euclidean distance over all unordered pairs of rows of S.

        Args:
            S: 2-D array-like, one point per row.

        Returns:
            The mean pairwise distance, or 0.0 when S has fewer than two rows.
        """
        S = np.asarray(S)
        n = len(S)
        if n <= 1:
            return 0.0
        # Index every pair (i, j) with i < j once and compute all differences
        # in a single vectorised step. This function runs once per Monte Carlo
        # draw, so a Python-level double loop dominates the runtime.
        first, second = np.triu_indices(n, k=1)
        return np.linalg.norm(S[first] - S[second], axis=1).mean()

    def compute_centroid_shift(S, mu_global):
        """Return the Euclidean distance between the centroid of S and mu_global."""
        offset = S.mean(axis=0) - mu_global
        return np.linalg.norm(offset)

    # Auto-select sigma for MMD (median heuristic)
    # Re-seed first, as every pipeline step cell does, so the Monte Carlo draws
    # below do not depend on how often or in which order this cell is run.
    # NOTE(review): presumably set_seeds seeds NumPy's global RNG, which
    # np.random.choice uses — confirm in src.core.
    set_seeds(SEED)
    sample_size = min(100, len(X))
    sample_indices = np.random.choice(len(X), size=sample_size, replace=False)
    sample_X = X[sample_indices]
    # Distances over all unordered pairs (i < j) of the sampled points.
    pair_i, pair_j = np.triu_indices(len(sample_X), k=1)
    pairwise_dists = np.linalg.norm(sample_X[pair_i] - sample_X[pair_j], axis=1)
    median_dist = np.median(pairwise_dists) if pairwise_dists.size else 0.0
    # A zero median (no pairs, or mostly coincident points) would make the
    # kernel divide by zero; fall back to the default bandwidth of 1.0.
    sigma_mmd = median_dist if median_dist > 0 else 1.0

    # ========================================
    # 2. OMNIBUS PERMUTATION TEST (PER METRIC)
    # ========================================
    sizes = [len(indices) for indices in compilation_indices]
    unique_sizes = sorted(set(sizes))

    B_NULL = 2000     # draws per distinct size for the null mean/std
    B_PERM = 3000     # draws of the full compilation set for the omnibus p-values
    EPS_STD = 1e-8    # added to the null std so z-scores never divide by zero

    def omnibus_permutation_test(metric_name, metric_func):
        """Test whether the compilations look like random size-matched subsets.

        For each distinct compilation size, the metric's null mean and std are
        estimated from B_NULL random subsets of the pool X. Each compilation's
        metric is converted to a z-score against the null of its size. The
        largest absolute z (T_max) and the root-mean-square z (T_rms) are then
        compared with B_PERM simulated sets of random subsets that have the
        same sizes as the compilations.

        Args:
            metric_name: Label used in the printed report and the result dict.
            metric_func: Callable mapping an array of embedding rows to a scalar.

        Returns:
            Dict with keys 'metric', 'p_max', 'p_rms', 'T_max_obs',
            'T_rms_obs', 'z_obs' and 'm_obs'.
        """
        pool_size = len(X)

        def random_subset_metric(n):
            # Score one random subset of n pool rows, drawn without replacement.
            chosen = np.random.choice(pool_size, size=n, replace=False)
            return metric_func(X[chosen])

        # 1) Null mean and std of the metric for each distinct size
        null_params = {}
        for n in unique_sizes:
            null_draws = np.array([random_subset_metric(n) for _ in range(B_NULL)])
            null_params[n] = (null_draws.mean(), null_draws.std() + EPS_STD)

        # 2) Observed metric and z-score for every compilation
        m_obs = np.array(
            [metric_func(embedding_Z[members]) for members in compilation_indices]
        )
        null_means = np.array(
            [null_params[len(members)][0] for members in compilation_indices]
        )
        null_stds = np.array(
            [null_params[len(members)][1] for members in compilation_indices]
        )
        z_obs = (m_obs - null_means) / null_stds

        T_max_obs = np.abs(z_obs).max()
        T_rms_obs = np.sqrt((z_obs**2).mean())

        # 3) Null distribution of T_max and T_rms
        T_max_null = np.empty(B_PERM)
        T_rms_null = np.empty(B_PERM)
        for b in range(B_PERM):
            z_sim = np.array([
                (random_subset_metric(n) - null_params[n][0]) / null_params[n][1]
                for n in sizes
            ])
            T_max_null[b] = np.abs(z_sim).max()
            T_rms_null[b] = np.sqrt((z_sim**2).mean())

        # Add-one p-values: the observed statistic counts as one of the draws.
        p_max = (1 + np.sum(T_max_null >= T_max_obs)) / (B_PERM + 1)
        p_rms = (1 + np.sum(T_rms_null >= T_rms_obs)) / (B_PERM + 1)

        # Report
        print(f"{metric_name}:")
        print(f"  Observed:   mean(m_c)={m_obs.mean():.4f}, std(m_c)={m_obs.std():.4f}")
        print(f"  z-scores:   max|z|={T_max_obs:.3f}, rms(z)={T_rms_obs:.3f}")
        print(f"  Omnibus p:  p_max={p_max:.4f}, p_rms={p_rms:.4f}")

        if p_max < 0.05:
            print("  WARNING: At α=0.05, reject the null of random size-matched samples (max test).")
        else:
            print("  OK: At α=0.05, fail to reject the null of random size-matched samples (max test).")

        print()

        return dict(
            metric=metric_name,
            p_max=p_max,
            p_rms=p_rms,
            T_max_obs=T_max_obs,
            T_rms_obs=T_rms_obs,
            z_obs=z_obs,
            m_obs=m_obs,
        )

    print("OMNIBUS PERMUTATION TESTS (size-matched nulls):\n")
    print(f"Null: random sets of size n_c sampled without replacement from the {n_unique_videos}-video global pool.")
    print(f"Permutations: B_NULL={B_NULL} (null parameters), B_PERM={B_PERM} (omnibus p-values).\n")

    # Each entry pairs a report label with a one-argument metric. The metrics
    # that need the global reference are bound to it through a lambda.
    metric_suite = [
        ("Variance Capture", lambda S: compute_variance_capture(S, mu_global, var_global)),
        ("Mean Pairwise Distance", compute_mean_pairwise_distance),
        ("Centroid Shift", lambda S: compute_centroid_shift(S, mu_global)),
        ("MMD to Global Pool", lambda S: compute_mmd(S, X, sigma=sigma_mmd)),
    ]

    # Run the tests in the listed order, one result dict per metric.
    results = [
        omnibus_permutation_test(label, metric)
        for label, metric in metric_suite
    ]

    # ========================================
    # 3. FINAL STATEMENT
    # ========================================
    print(f"{'='*80}")
    print(f"FINAL STATEMENT (α=0.05):")
    print(f"{'='*80}\n")

    # Metrics whose max-test rejects the null, using the same threshold as the
    # per-metric verdict printed below.
    rejected_metrics = [r['metric'] for r in results if not r['p_max'] >= 0.05]

    for r in results:
        verdict = "fail to reject" if r['p_max'] >= 0.05 else "reject"
        print(
            f"- {r['metric']}: {verdict} the null that compilations are size-matched random samples "
            f"from the {n_unique_videos}-video global pool (p_max={r['p_max']:.4f})."
        )

    # The closing sentence must agree with the verdicts above: claim "no
    # evidence" only when no test rejected.
    if rejected_metrics:
        print(
            "\nAt least one compilation deviates from the random-sampling baseline on: "
            f"{', '.join(rejected_metrics)}."
        )
    else:
        print("\nThese tests provide no evidence that any compilation set deviates from the random-sampling baseline on these metrics.")
    print(f"\n{'='*80}\n")


In [None]:
from pathlib import Path

# Best-effort figure export: a failure is reported as a warning rather than
# raised, but the figure is always closed so a failed run does not leak it.
fig = None
try:
    # 2. Create statistical summary figures (graphics only)
    print("2. Creating statistical summary...")
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Interchangeability Tests: Statistical Evidence', fontsize=14, fontweight='bold')

    metrics = [r['metric'] for r in results]
    p_max_vals = [r['p_max'] for r in results]

    # Plot 1: p-values (top-left)
    ax = axes[0, 0]
    x = np.arange(len(metrics))
    ax.bar(x, p_max_vals, alpha=0.8, color='steelblue')
    ax.axhline(y=0.05, color='red', linestyle='--', linewidth=2, label='α=0.05')
    ax.set_ylabel('p-value', fontsize=11)
    ax.set_title('Omnibus p-values', fontsize=11, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(metrics, rotation=45, ha='right', fontsize=9)
    # Keep at least the 0-0.3 window, but grow it so that large p-values are
    # not cut off at the top of the axes.
    ax.set_ylim([0, max(0.3, 1.05 * max(p_max_vals, default=0.0))])
    ax.grid(axis='y', alpha=0.3)
    # Without a legend the label on the α line is never displayed.
    ax.legend(loc='upper right', fontsize=9)

    # Plot 2: z-scores (top-right)
    ax = axes[0, 1]
    for i, r in enumerate(results):
        z_scores = r['z_obs']
        ax.scatter([i] * len(z_scores), z_scores, alpha=0.6, s=100)
    ax.axhline(y=0, color='black', linestyle='-', linewidth=1)
    ax.set_ylabel('z-score', fontsize=11)
    ax.set_title('Compilation Deviations', fontsize=11, fontweight='bold')
    ax.set_xticks(range(len(metrics)))
    ax.set_xticklabels(metrics, rotation=45, ha='right', fontsize=9)
    ax.grid(axis='y', alpha=0.3)

    # Plot 3: Test statistics (bottom-left)
    ax = axes[1, 0]
    t_max_vals = [r['T_max_obs'] for r in results]
    ax.bar(x, t_max_vals, alpha=0.8, color='forestgreen')
    ax.set_ylabel('T_max', fontsize=11)
    ax.set_title('Test Statistics', fontsize=11, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(metrics, rotation=45, ha='right', fontsize=9)
    ax.grid(axis='y', alpha=0.3)

    # Empty (bottom-right)
    ax = axes[1, 1]
    ax.axis('off')

    plt.tight_layout()
    # savefig does not create missing directories, so create the target first.
    output_path = Path('../outputs/presentation_frames/02_statistical_summary.png')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(output_path, dpi=150, bbox_inches='tight')
    print("OK: Statistical summary created (2x2 grid)\n")

except Exception as e:
    print(f"WARNING: Error: {e}\n")

finally:
    if fig is not None:
        plt.close(fig)
