# AHP + TOPSIS

In [58]:
import numpy as np
import pandas as pd

### 1. CRITERIA & CANDIDATES

In [51]:
criteria = [
    "RTV‑Bench (Acc)", "Temporal Type", 
    "Structured Out", "Inference Cost", "Hardware Req"
]

### Benefit: 1, Cost: -1

In [52]:
is_benefit = [1, -1, 1, -1, -1]

### Models

1. RTV‑Bench https://arxiv.org/abs/2505.02064
2. Duration: seconds (but GPT-4o is 1 frame)
3. Temp Type: Below
4. Structured: 1 (Prompt), 2 (Strict JSON Mode)
5. Inference Cost: $/million tokens (input + output)
6. HW: 1 (Low/API) to 10 (High/Multi-GPU Cluster)

| Rank          | Temporal Representation            | Model         | Notes                                                            |
| ------------- | ---------------------------------- | --------------------- | ---------------------------------------------------------|
| **1 (Best)**  | Absolute time encoding             | Qwen2.5‑VL            | Frame-level timestamp allows precise start/end detection |
| **2**         | Spatial‑Temporal Convolution (STC) | VideoLLaMA2           | Good for motion/action patterns, slight temporal smoothing|
| **3**         | Composite frame representation     | InternLM‑XComposer2.5 | Dense frames preserve granularity, computationally heavy |
| **4**         | Frame sequence representation      | LLaVA‑Video           | Preserves order but may downsample                       |
| **5**         | Compact vision tokens              | VideoLLaMA3           | Efficient for long videos, poor for very short actions |
| **6**         | Proprietary transformer encoding   | Gemini 2.0 Flash      | Unknown internal representation; likely good general temporal understanding, may miss precise short actions |
| **7 (Worst)** | N/A                                | GPT‑4o                | Cannot handle video → cannot detect temporal actions     |


In [66]:
models = {
    # Proprietary Models (API-centric)
    "OpenAI GPT-4o":         [50.0, 7, 2, 20, 1], 
    "Gemini 2.5":      [42.0, 6, 2, 14, 1],
    
    # Open-Source SOTA
    "Qwen2.5-VL":            [40.4, 1, 1, 0.0, 6], # 7B: 16GB VRAM (RTX 4090/A4000)
    "InternLM-XComposer2.5": [47.3, 3, 1, 0.0, 7], # 7B but dense frames: 24GB VRAM (RTX 4090/A5000)
    
    # Specialized/Physical Reasoning
    # "NVIDIA Cosmos Reason 1":[-, 60, 3, 2, 1.2, 8], 
    # "NVIDIA Cosmos Reason 2":[-, 60, 3, 2, 1.5, 9], 
    
    # Efficient Candidates
    "LLaVA-Video":           [34.9, 4, 1, 0.0, 6], # 7B: 16GB VRAM
    "VideoLLaMA2":           [39.6, 2, 1, 0.0, 6], # 7B: 16GB VRAM
    "VideoLLaMA3":           [36.4, 5, 1, 0.0, 5], # 8B but optimized tokens: 12GB VRAM (RTX 3090)
}

In [67]:
def calculate_ahp_weights():
    # 6x6 Comparison Matrix (n=6)
    # Order: [RTV, Duration, TempType, StructOut, Cost, HW]
    n = len(criteria)
    A = np.ones((n, n))  # Initialize with all 1s
    np.fill_diagonal(A, 1)

    # Pairwise Judgments
    A[0, 2] = 2; A[0, 4] = 3;
    A[2, 1] = 4; A[2, 3] = 2;

    # Fill reciprocals
    for i in range(n):
        for j in range(i+1, n):
            A[j, i] = 1 / A[i, j]

    # Priority Vector (Weights)
    weights = (A / A.sum(axis=0)).mean(axis=1)

    # Consistency Check
    eig_val = np.linalg.eig(A)[0].max().real
    ci = (eig_val - n) / (n - 1)
    ri = 1.12
    cr = ci / ri

    return weights, cr

def run_topsis(data, weights, is_benefit):
    # Vector Normalization
    norm_data = data / np.sqrt((data**2).sum(axis=0))
    weighted_norm = norm_data * weights
    
    # Determine Ideal Best (PIS) and Ideal Worst (NIS)
    pis = [weighted_norm[:, j].max() if is_benefit[j] == 1 else weighted_norm[:, j].min() for j in range(len(weights))]
    nis = [weighted_norm[:, j].min() if is_benefit[j] == 1 else weighted_norm[:, j].max() for j in range(len(weights))]
    
    # Euclidean Distances
    s_plus = np.sqrt(((weighted_norm - pis)**2).sum(axis=1))
    s_minus = np.sqrt(((weighted_norm - nis)**2).sum(axis=1))
    
    # Final TOPSIS Score
    return s_minus / (s_plus + s_minus)

In [68]:
# EXECUTION
weights, cr = calculate_ahp_weights()
model_names = list(models.keys())
data_matrix = np.array(list(models.values()))
scores = run_topsis(data_matrix, weights, is_benefit)

In [69]:
# Output for LaTeX
print(f"--- AHP RESULTS ---")
print(f"Consistency Ratio (CR): {cr:.4f} (Acceptable if < 0.1)")
print("Weights per Criteria:")
for c, w in zip(criteria, weights):
    print(f"  {c}: {w:.2%}")

--- AHP RESULTS ---
Consistency Ratio (CR): 0.0544 (Acceptable if < 0.1)
Weights per Criteria:
  RTV‑Bench (Acc): 28.39%
  Temporal Type: 19.04%
  Structured Out: 19.77%
  Inference Cost: 17.23%
  Hardware Req: 15.57%


In [70]:
print(f"\n--- FINAL TOPSIS RANKING ---")
ranked_results = sorted(zip(model_names, scores), key=lambda x: x[1], reverse=True)
for i, (name, score) in enumerate(ranked_results, 1):
    print(f"{i}. {name:22} | Score: {score:.4f}")


--- FINAL TOPSIS RANKING ---
1. Qwen2.5-VL             | Score: 0.6742
2. VideoLLaMA2            | Score: 0.6570
3. InternLM-XComposer2.5  | Score: 0.6276
4. LLaVA-Video            | Score: 0.5978
5. VideoLLaMA3            | Score: 0.5889
6. Gemini 2.5             | Score: 0.4382
7. OpenAI GPT-4o          | Score: 0.3600


## Sensitivity Analysis

In [71]:
def sensitivity_analysis_weights(base_weights, data_matrix, model_names, is_benefit, variation_pct=0.2):
    results = []
    base_scores = run_topsis(data_matrix, base_weights, is_benefit)
    base_ranking = [model_names[i] for i in np.argsort(base_scores)[::-1]]
    
    for criterion_idx in range(len(base_weights)):
        for multiplier in [1 - variation_pct, 1 + variation_pct]:
            # Create modified weights
            modified_weights = base_weights.copy()
            modified_weights[criterion_idx] *= multiplier
            # Re-normalize to sum to 1
            modified_weights /= modified_weights.sum()
            
            # Run TOPSIS with modified weights
            new_scores = run_topsis(data_matrix, modified_weights, is_benefit)
            new_ranking = [model_names[i] for i in np.argsort(new_scores)[::-1]]
            
            # Calculate rank correlation (Spearman)
            from scipy.stats import spearmanr
            rank_corr = spearmanr([base_ranking.index(m) for m in model_names],
                                  [new_ranking.index(m) for m in model_names])[0]
            
            results.append({
                'Criterion': criteria[criterion_idx],
                'Change': f"{multiplier:.1%}",
                'Top_3': new_ranking[:3],
                'Spearman_ρ': rank_corr,
                'Top_Model_Changed': new_ranking[0] != base_ranking[0]
            })
    
    return pd.DataFrame(results)

In [72]:
sensitivity_df = sensitivity_analysis_weights(weights, data_matrix, model_names, is_benefit, 0.2)
print(sensitivity_df)

         Criterion  Change                                             Top_3  \
0  RTV‑Bench (Acc)   80.0%  [Qwen2.5-VL, VideoLLaMA2, InternLM-XComposer2.5]   
1  RTV‑Bench (Acc)  120.0%  [Qwen2.5-VL, VideoLLaMA2, InternLM-XComposer2.5]   
2    Temporal Type   80.0%  [Qwen2.5-VL, VideoLLaMA2, InternLM-XComposer2.5]   
3    Temporal Type  120.0%  [Qwen2.5-VL, VideoLLaMA2, InternLM-XComposer2.5]   
4   Structured Out   80.0%  [Qwen2.5-VL, VideoLLaMA2, InternLM-XComposer2.5]   
5   Structured Out  120.0%  [Qwen2.5-VL, VideoLLaMA2, InternLM-XComposer2.5]   
6   Inference Cost   80.0%  [Qwen2.5-VL, VideoLLaMA2, InternLM-XComposer2.5]   
7   Inference Cost  120.0%  [Qwen2.5-VL, VideoLLaMA2, InternLM-XComposer2.5]   
8     Hardware Req   80.0%  [Qwen2.5-VL, VideoLLaMA2, InternLM-XComposer2.5]   
9     Hardware Req  120.0%  [Qwen2.5-VL, VideoLLaMA2, InternLM-XComposer2.5]   

   Spearman_ρ  Top_Model_Changed  
0    1.000000              False  
1    1.000000              False  
2    0.964286 