# LLM Compression Analysis Notebook

This notebook provides an interactive environment for analyzing compression experiment results.

In [None]:
# Setup
import sys
import os
sys.path.append('..')

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Import project modules
from src.models.predictive_masking import PredictiveMaskingCompressor
from src.models.latent_space_quantization import LatentSpaceQuantizationCompressor
from src.evaluation.metrics import CompressionMetrics
from src.visualization.plots import CompressionVisualizer

# Set style: select the 'seaborn-v0_8-darkgrid' matplotlib style sheet.
# NOTE(review): the 'seaborn-v0_8-*' style names look specific to newer
# matplotlib releases — confirm against the matplotlib version in use.
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

## Load Experiment Results

In [None]:
# Load the most recent results
results_dir = Path('../results/results')
result_files = list(results_dir.glob('results_*.json'))

# Always bind `results`, so a missing results directory does not surface later
# as a NameError; an empty mapping means "nothing to analyse".
results = {}

if result_files:
    # Paths are compared lexicographically, so the last one after sorting is
    # taken as the latest (presumably the file names embed a sortable
    # timestamp — confirm against the experiment runner).
    latest_results = sorted(result_files)[-1]
    print(f"Loading results from: {latest_results}")

    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(latest_results, 'r', encoding='utf-8') as f:
        results = json.load(f)
else:
    print("No results found. Please run experiments first.")

## Summary Statistics

In [None]:
# Create summary dataframe
# Fixed column list: keeps the column order stable and guarantees the columns
# exist even when no rows are collected, so column lookups on df_summary
# still work on an empty result set.
SUMMARY_COLUMNS = [
    'Model', 'Method', 'Masking Probability', 'Compression Ratio',
    'Word Accuracy', 'Semantic Similarity', 'ROUGE-1 F1', 'BERT Score F1',
]

summary_data = []

for model_method, model_results in results.items():
    # Top-level entries that are not mappings cannot hold per-probability metrics.
    if not isinstance(model_results, dict):
        continue

    for prob, metrics in model_results.items():
        # Only keys made of digits and dots are treated as masking probabilities.
        if not (isinstance(prob, str) and prob.replace('.', '').isdigit()):
            continue
        try:
            masking_prob = float(prob)
        except ValueError:
            # Keys such as "1.2.3" pass the digit filter but are not numbers.
            continue
        if not isinstance(metrics, dict):
            continue

        summary_data.append({
            # Text before the first underscore of the result key is used as the model name.
            'Model': model_method.split('_')[0],
            'Method': 'Predictive Masking' if 'predictive' in model_method else 'LSQ',
            'Masking Probability': masking_prob,
            # Missing metrics default to 0.
            'Compression Ratio': metrics.get('compression_ratio', 0),
            'Word Accuracy': metrics.get('word_accuracy', 0),
            'Semantic Similarity': metrics.get('semantic_similarity', 0),
            'ROUGE-1 F1': metrics.get('rouge1_fmeasure', 0),
            'BERT Score F1': metrics.get('bert_score_f1', 0)
        })

df_summary = pd.DataFrame(summary_data, columns=SUMMARY_COLUMNS)
df_summary.head(10)

## Best Configurations Analysis

In [None]:
# Find best configurations for each model
# Minimum semantic similarity for a configuration to count as acceptable quality.
MIN_SEMANTIC_SIMILARITY = 0.8

best_configs = []

for model in df_summary['Model'].unique():
    for method in df_summary['Method'].unique():
        model_data = df_summary[(df_summary['Model'] == model) & (df_summary['Method'] == method)]

        # Keep only rows above the quality threshold; a (model, method) pair
        # with no such rows contributes nothing.
        quality_data = model_data[model_data['Semantic Similarity'] > MIN_SEMANTIC_SIMILARITY]
        if quality_data.empty:
            continue

        # Among acceptable rows, pick the one with the highest compression ratio.
        best_idx = quality_data['Compression Ratio'].idxmax()
        best_configs.append(quality_data.loc[best_idx])

# Passing the columns explicitly keeps 'Compression Ratio' present even when
# no configuration qualified, so the sort below cannot raise KeyError.
best_df = pd.DataFrame(best_configs, columns=df_summary.columns)
print(f"Best Configurations (Semantic Similarity > {MIN_SEMANTIC_SIMILARITY}):")
best_df.sort_values('Compression Ratio', ascending=False)

## Interactive Compression Demo

In [None]:
# Interactive compression demo
def compress_and_display(text, model_name='bert-base-uncased', masking_prob=0.5):
    """Run one compress/decompress round trip on ``text`` and print a report.

    Args:
        text: Input text to compress.
        model_name: Name passed to ``PredictiveMaskingCompressor``.
        masking_prob: Value forwarded to ``compress`` as ``masking_probability``.

    Returns:
        Tuple ``(compressed, reconstructed, metrics)``: the output of
        ``compress``, the output of ``decompress``, and the mapping returned by
        ``CompressionMetrics.calculate_all_metrics``.
    """
    # Round trip through a freshly constructed compressor.
    masker = PredictiveMaskingCompressor(model_name)
    compressed = masker.compress(text, masking_probability=masking_prob)
    reconstructed = masker.decompress(compressed)

    # Score the reconstruction against the original text.
    metrics = CompressionMetrics().calculate_all_metrics(text, reconstructed, compressed)

    # Run configuration.
    print(f"Model: {model_name}")
    print(f"Masking Probability: {masking_prob}")

    # Original and reconstructed text, each with its character count.
    print(f"\nOriginal Text ({len(text)} chars):")
    print(text)
    print(f"\nReconstructed Text ({len(reconstructed)} chars):")
    print(reconstructed)

    # Headline metrics.
    print(f"\nMetrics:")
    print(f"  - Compression Ratio: {metrics['compression_ratio']:.2f}")
    print(f"  - Word Accuracy: {metrics['word_accuracy']:.3f}")
    print(f"  - Semantic Similarity: {metrics['semantic_similarity']:.3f}")
    print(f"  - ROUGE-1 F1: {metrics['rouge1_fmeasure']:.3f}")

    return compressed, reconstructed, metrics

# Example usage
# Short sample paragraph used as input for the demo.
sample_text = """The compression of natural language text using neural networks represents 
a fascinating intersection of information theory and deep learning. By leveraging the 
predictive capabilities of transformer models, we can achieve significant compression 
ratios while maintaining semantic coherence."""

# Run the demo with masking_prob=0.5 and the function's default model; the
# three returned values are kept at notebook scope.
compressed, reconstructed, metrics = compress_and_display(sample_text, masking_prob=0.5)

## Compression Trade-off Analysis

In [None]:
# Analyze compression vs quality trade-off
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Plot 1: Compression Ratio vs Semantic Similarity
# One curve per model, traced in order of increasing masking probability so
# the line shape does not depend on the row order of df_summary.
for model in df_summary['Model'].unique():
    model_data = df_summary[df_summary['Model'] == model]
    pm_data = (model_data[model_data['Method'] == 'Predictive Masking']
               .sort_values('Masking Probability'))

    if len(pm_data) > 0:
        axes[0].plot(pm_data['Compression Ratio'],
                     pm_data['Semantic Similarity'],
                     marker='o', label=model, linewidth=2)

axes[0].set_xlabel('Compression Ratio')
axes[0].set_ylabel('Semantic Similarity')
axes[0].set_title('Compression vs Semantic Preservation')
# Only add a legend when at least one labelled curve was drawn.
if axes[0].get_legend_handles_labels()[0]:
    axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Plot 2: Masking Probability vs Multiple Metrics
# Single-model view for bert-base-uncased, using two y-axes.
model_data = df_summary[df_summary['Model'] == 'bert-base-uncased']
pm_data = model_data[model_data['Method'] == 'Predictive Masking'].sort_values('Masking Probability')

ax2 = axes[1]
ax2_twin = ax2.twinx()  # second y-axis sharing the masking-probability x-axis

ax2.plot(pm_data['Masking Probability'], pm_data['Compression Ratio'],
         'b-', marker='o', label='Compression Ratio')
ax2_twin.plot(pm_data['Masking Probability'], pm_data['Semantic Similarity'],
              'r-', marker='s', label='Semantic Similarity')

ax2.set_xlabel('Masking Probability')
ax2.set_ylabel('Compression Ratio', color='b')
ax2_twin.set_ylabel('Semantic Similarity', color='r')
ax2.set_title('BERT: Masking Probability Effects')
ax2.tick_params(axis='y', labelcolor='b')
ax2_twin.tick_params(axis='y', labelcolor='r')

# Add legends
# The two curves live on different axes, so their handles are merged into a
# single legend on the primary axis.
lines1, labels1 = ax2.get_legend_handles_labels()
lines2, labels2 = ax2_twin.get_legend_handles_labels()
ax2.legend(lines1 + lines2, labels1 + labels2, loc='center left')

plt.tight_layout()
plt.show()

## Statistical Analysis

In [None]:
# Statistical comparison of models
# NOTE(review): `stats` is not used in this cell; the import is kept in case
# other cells rely on it.
from scipy import stats

# Compare models at 50% masking probability
mask_prob = 0.5
# Match the probability with a tolerance rather than exact float equality, so
# values that differ only by floating-point representation are still selected.
at_mask_prob = np.isclose(df_summary['Masking Probability'].astype(float), mask_prob)
comparison_data = df_summary[(df_summary['Method'] == 'Predictive Masking') & at_mask_prob]

print(f"Model Comparison at {mask_prob:.0%} Masking Probability:\n")
print(comparison_data[['Model', 'Compression Ratio', 'Word Accuracy',
                       'Semantic Similarity', 'ROUGE-1 F1']].to_string(index=False))

# Calculate correlations
# Pairwise correlations between the numeric metric columns, over all rows.
print("\n\nCorrelation Analysis:")
correlation_cols = ['Masking Probability', 'Compression Ratio',
                    'Word Accuracy', 'Semantic Similarity', 'ROUGE-1 F1']
correlations = df_summary[correlation_cols].corr()

# Create correlation heatmap
# The colour map is centred on zero so positive and negative correlations get
# opposite colours.
plt.figure(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Metric Correlations')
plt.tight_layout()
plt.show()

## Export Results for Paper

In [None]:
# Create LaTeX table for paper
# One row per (model, masking probability) pair for the Predictive Masking
# method; pairs with no matching result are skipped.
latex_columns = ['Model', 'Masking', 'Compression', 'Word Acc.', 'Semantic', 'ROUGE-1']
latex_data = []

pm_rows = df_summary[df_summary['Method'] == 'Predictive Masking']

for model in ['bert-base-uncased', 'roberta-base', 'distilbert-base-uncased']:
    for prob in [0.3, 0.5, 0.7]:
        # Match the probability with a tolerance rather than exact float equality.
        row_data = pm_rows[(pm_rows['Model'] == model) &
                           np.isclose(pm_rows['Masking Probability'].astype(float), prob)]

        if len(row_data) > 0:
            # If several rows match, the first one is used.
            row = row_data.iloc[0]
            latex_data.append({
                # Short display name: text before the first hyphen, upper-cased.
                'Model': model.split('-')[0].upper(),
                'Masking': f"{prob:.0%}",
                'Compression': f"{row['Compression Ratio']:.2f}",
                'Word Acc.': f"{row['Word Accuracy']:.3f}",
                'Semantic': f"{row['Semantic Similarity']:.3f}",
                'ROUGE-1': f"{row['ROUGE-1 F1']:.3f}"
            })

# Explicit columns keep the header row even when no data matched.
latex_df = pd.DataFrame(latex_data, columns=latex_columns)

# escape=True is required: the 'Masking' cells contain '%', which starts a
# LaTeX comment when left unescaped and would cut off the rest of each row.
print("LaTeX Table:")
print(latex_df.to_latex(index=False, escape=True))

# Save to file
latex_df.to_latex('../results/results_table.tex', index=False, escape=True)
print("\nTable saved to results/results_table.tex")

## Conclusion

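This analysis examines the trade-off between compression ratio and reconstruction quality across different models and masking probabilities. The summary table, best-configuration table, trade-off plots, correlation heatmap, and exported LaTeX table above can be used to summarize the key findings for the MSc thesis.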