# Model Comparison for German/English Tech Docs

**Ziel**: Verschiedene Embedding-Modelle direkt vergleichen

**Test**: Current vs. Multilingual Models mit identischen Test-Daten

## Setup

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import time
from typing import Dict, List, Tuple

def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``, or ``0.0`` when either vector
        has zero norm, where the ratio is undefined.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid 0/0 -> NaN (plus a RuntimeWarning) for all-zero vectors.
        return 0.0
    return np.dot(a, b) / denominator

# Visual confirmation that the setup cell ran to the end.
print("✅ Setup complete")

## Test Data - Identical to previous notebook

In [None]:
# German tech texts (typical chunk contents).
# german_docs[i] is the German counterpart of english_docs[i]; the
# cross-language test pairs the two lists by index.
german_docs = [
    "Die Vektorsuche in ChromaDB verwendet Cosinus-Ähnlichkeit für semantische Suchen.",
    "Chunking-Strategien sollten bei technischen Dokumentationen header-bewusst sein.",
    "Der Similarity-Threshold von 0.7 ist oft zu restriktiv für multilinguale Inhalte.",
    "RAG-Systeme benötigen optimierte Embeddings für bessere Retrieval-Performance.",
]

# English tech texts (semantically related, same order as german_docs).
english_docs = [
    "Vector search in ChromaDB uses cosine similarity for semantic searches.",
    "Chunking strategies should be header-aware for technical documentation.",
    "The similarity threshold of 0.7 is often too restrictive for multilingual content.",
    "RAG systems require optimized embeddings for better retrieval performance.",
]

# Test queries (different language combinations), keyed by a short label.
test_queries = {
    "german_technical": "Wie optimiert man Vektorsuche?",
    "english_technical": "How to optimize vector search?",
    "mixed_query": "ChromaDB similarity threshold",
    "german_rag": "RAG System Performance verbessern",
    "english_rag": "improve RAG system performance",
}

print(f"📝 Test Data: {len(german_docs)} DE docs, {len(english_docs)} EN docs, {len(test_queries)} queries")

## Model Definitions

**Modelle zum Vergleich:**
1. **Current**: `all-MiniLM-L6-v2` (aktuell im RAG-System)
2. **Multilingual Fast**: `paraphrase-multilingual-MiniLM-L12-v2` 
3. **Multilingual Quality**: `distiluse-base-multilingual-cased-v1`
4. **Quality Baseline**: `all-mpnet-base-v2`

In [None]:
# Model configurations.
# Each entry maps a short key to the model's hub name ("name"), a
# human-readable "description", and the "expected" outcome of the comparison.
models_config = {
    "current": {
        "name": "all-MiniLM-L6-v2",
        "description": "Current RAG model (monolingual focus)",
        "expected": "Poor cross-language performance",
    },
    "multilingual_fast": {
        "name": "paraphrase-multilingual-MiniLM-L12-v2",
        "description": "Multilingual, 50+ languages, fast",
        "expected": "Good cross-language, balanced speed/quality",
    },
    "multilingual_quality": {
        "name": "distiluse-base-multilingual-cased-v1",
        "description": "Multilingual, 15 languages, quality focused",
        "expected": "Best cross-language performance",
    },
    "quality_baseline": {
        "name": "all-mpnet-base-v2",
        "description": "High quality English (for comparison)",
        "expected": "Best English quality, poor cross-language",
    },
}

print("📋 Models to test:")
for key, config in models_config.items():
    print(f"  {key}: {config['name']}")
    print(f"    → {config['description']}")

## Load Models (Progressive - to avoid memory issues)

In [None]:
# Load models progressively to manage memory.
# Registries filled by load_model(): loaded model objects and their stats,
# both keyed by the short model key.
models = {}
model_stats = {}


def load_model(model_key: str, config: dict) -> bool:
    """Load a single model and record basic stats about it.

    On success the model is stored in ``models[model_key]`` and a stats dict
    (name, load time, max sequence length, embedding dimension, description)
    in ``model_stats[model_key]``.

    Args:
        model_key: Short key under which the model and its stats are stored.
        config: Dict with at least ``"name"`` (passed to
            ``SentenceTransformer``) and ``"description"``.

    Returns:
        True if the model was loaded, False if loading raised. The error is
        printed rather than re-raised so the remaining models can still load.
    """
    # Local import: keeps the function independent of the module-level import.
    import time

    print(f"\n🔄 Loading {model_key}: {config['name']}")

    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments.
        start_time = time.perf_counter()
        model = SentenceTransformer(config['name'])
        load_time = time.perf_counter() - start_time

        # Basic model info
        stats = {
            "name": config['name'],
            "load_time": load_time,
            "max_seq_length": model.max_seq_length,
            "embedding_dimension": model.get_sentence_embedding_dimension(),
            "description": config['description'],
        }
    except Exception as e:
        # Deliberately broad: any failure is reported and skipped.
        print(f"  ❌ Failed to load: {e}")
        return False

    # Register only after everything succeeded so both registries stay in sync.
    models[model_key] = model
    model_stats[model_key] = stats

    print(f"  ✅ Loaded: {stats['embedding_dimension']}D, max_len={stats['max_seq_length']}, load_time={load_time:.2f}s")
    return True

# Load each model (comment out the models you don't want to test).
print("📥 Models will be automatically downloaded on first use...")
load_model("current", models_config["current"])
load_model("multilingual_fast", models_config["multilingual_fast"])
# load_model("multilingual_quality", models_config["multilingual_quality"])
# load_model("quality_baseline", models_config["quality_baseline"])

# Failed loads return False without registering, so this counts successes only.
print(f"\n📊 Successfully loaded {len(models)} models")

## Model Comparison Test

**Test 1: Cross-Language Similarity Comparison**

In [None]:
def test_cross_language_performance(model, model_name):
    """Measure how similar a model rates the paired German/English docs.

    Encodes the module-level ``german_docs`` and ``english_docs``, compares
    ``german_docs[i]`` with ``english_docs[i]`` via ``cosine_similarity``,
    and prints a per-pair report, a summary and a coarse quality verdict.

    Args:
        model: Object with an ``encode(list_of_texts)`` method returning one
            embedding per text.
        model_name: Display name used in the printed header.

    Returns:
        Dict with ``avg_similarity``, ``min_similarity``, ``max_similarity``,
        ``encoding_time`` (seconds for encoding both lists) and
        ``individual_similarities`` (one score per pair, in document order).
    """
    print(f"\n🔍 Testing Cross-Language Performance: {model_name}")
    print("-" * 60)

    # Encode documents; perf_counter is monotonic and suited to durations.
    start_time = time.perf_counter()
    de_embeddings = model.encode(german_docs)
    en_embeddings = model.encode(english_docs)
    encoding_time = time.perf_counter() - start_time

    # Calculate cross-language similarities (expected pairs share an index).
    similarities = []
    for i, de_doc in enumerate(german_docs):
        sim = cosine_similarity(de_embeddings[i], en_embeddings[i])
        similarities.append(sim)

        print(f"  Pair {i + 1}: {sim:.3f}")
        print(f"    DE: {de_doc[:50]}...")
        print(f"    EN: {english_docs[i][:50]}...")

    avg_similarity = np.mean(similarities)
    min_similarity = np.min(similarities)
    max_similarity = np.max(similarities)
    doc_count = len(german_docs) + len(english_docs)

    print("\n📊 Summary:")
    print(f"  Average Cross-Lang Similarity: {avg_similarity:.3f}")
    print(f"  Range: {min_similarity:.3f} - {max_similarity:.3f}")
    print(f"  Encoding Time: {encoding_time:.3f}s for {doc_count} docs")

    # Quality assessment based on the average pair similarity.
    if avg_similarity >= 0.7:
        print("  🎯 EXCELLENT cross-language performance!")
    elif avg_similarity >= 0.5:
        print("  ✅ GOOD cross-language performance")
    elif avg_similarity >= 0.3:
        print("  ⚠️  OKAY cross-language performance")
    else:
        print("  ❌ POOR cross-language performance")

    return {
        "avg_similarity": avg_similarity,
        "min_similarity": min_similarity,
        "max_similarity": max_similarity,
        "encoding_time": encoding_time,
        "individual_similarities": similarities,
    }

# Test all loaded models
# cross_lang_results[model_key] holds the dict returned by
# test_cross_language_performance() for that model; only models present in
# `models` (i.e. successfully loaded) are tested.
cross_lang_results = {}
for model_key, model in models.items():
    cross_lang_results[model_key] = test_cross_language_performance(model, model_stats[model_key]['name'])

## Query Retrieval Comparison

**Test 2: How do different models perform on actual queries?**

In [None]:
def test_query_retrieval_comparison(models_dict, query_text, query_name):
    """Compare how each model ranks the test documents for one query.

    For every model the query and all documents (module-level
    ``german_docs`` followed by ``english_docs``) are encoded, scored with
    ``cosine_similarity`` and sorted by descending score. The top three hits
    and the number of documents at or above 0.7 / 0.5 / 0.3 are printed.

    Args:
        models_dict: Mapping of model key to an object with an
            ``encode(list_of_texts)`` method.
        query_text: The query string to embed.
        query_name: Label for the query, used in the printed header.

    Returns:
        Dict keyed by model key. Each value contains ``best_score``,
        ``above_07``, ``above_05``, ``above_03`` and ``top_results`` (the
        three best ``(score, doc, language_flag, doc_id)`` tuples).
    """
    print(f"\n🔍 Query Comparison: {query_name}")
    print(f"Query: '{query_text}'")
    print("=" * 70)

    # Combine all docs for retrieval; German docs come first, which is what
    # the language flag below relies on.
    all_docs = german_docs + english_docs
    german_count = len(german_docs)

    query_results = {}

    for model_key, model in models_dict.items():
        # Fall back to the key when no stats were recorded for this model.
        model_name = model_stats.get(model_key, {}).get("name", model_key)
        print(f"\n📱 Model: {model_name}")
        print("-" * 40)

        # Encode query and docs
        query_embedding = model.encode([query_text])[0]
        doc_embeddings = model.encode(all_docs)

        # (score, doc, language flag, 1-based doc id) for every document.
        similarities = [
            (
                cosine_similarity(query_embedding, doc_emb),
                doc,
                "🇩🇪" if i < german_count else "🇬🇧",
                i + 1,
            )
            for i, (doc, doc_emb) in enumerate(zip(all_docs, doc_embeddings))
        ]

        # Sort by relevance (highest score first).
        similarities.sort(key=lambda item: item[0], reverse=True)

        # Show top 3 results
        print("Top 3 Results:")
        for rank, (score, doc, lang, _doc_id) in enumerate(similarities[:3], 1):
            print(f"  {rank}. [{score:.3f}] {lang} {doc[:50]}...")

        # Threshold analysis: how many docs would survive each cut-off.
        scores = [item[0] for item in similarities]
        above_07 = sum(1 for s in scores if s >= 0.7)
        above_05 = sum(1 for s in scores if s >= 0.5)
        above_03 = sum(1 for s in scores if s >= 0.3)

        print(f"  Threshold Analysis: 0.7→{above_07} | 0.5→{above_05} | 0.3→{above_03}")

        query_results[model_key] = {
            "best_score": similarities[0][0],
            "above_07": above_07,
            "above_05": above_05,
            "above_03": above_03,
            "top_results": similarities[:3],
        }

    return query_results

# Test all queries with all models
# Result layout: query_comparison_results[query_name][model_key] is the
# per-model result dict returned by test_query_retrieval_comparison().
query_comparison_results = {}
for query_name, query_text in test_queries.items():
    query_comparison_results[query_name] = test_query_retrieval_comparison(models, query_text, query_name)

## Final Comparison Summary

In [None]:
# Create comparison table (no pandas needed)
def _build_comparison_rows():
    """Build one summary row per loaded model from the earlier test results.

    Column meanings:
        Cross-Lang Avg   average similarity of the DE/EN document pairs
        Query Avg Score  mean of the best score over all test queries
        Results @ 0.7    documents scoring >= 0.7, summed over all queries
        Encoding Time    time to encode the DE and EN docs in the pair test

    Values are pre-formatted as strings so the fixed-width table below lines
    up; "n/a" marks a model for which a test produced no result.
    """
    rows = []
    for model_key, stats in model_stats.items():
        cross = cross_lang_results.get(model_key)
        per_query = [
            results[model_key]
            for results in query_comparison_results.values()
            if model_key in results
        ]
        best_scores = [result["best_score"] for result in per_query]

        rows.append({
            "Model": model_key,
            "Name": stats["name"],
            "Dimensions": str(stats["embedding_dimension"]),
            "Cross-Lang Avg": f"{cross['avg_similarity']:.3f}" if cross else "n/a",
            "Query Avg Score": f"{np.mean(best_scores):.3f}" if best_scores else "n/a",
            "Results @ 0.7": str(sum(result["above_07"] for result in per_query)),
            "Encoding Time": f"{cross['encoding_time']:.3f}s" if cross else "n/a",
        })
    return rows


comparison_data = _build_comparison_rows()

print("\n" + "="*80)
print(f"{'Model':<15} {'Name':<40} {'Dims':<6} {'CrossLang':<10} {'QueryAvg':<10} {'@0.7':<6} {'Time':<8}")
print("-"*80)

for data in comparison_data:
    print(f"{data['Model']:<15} {data['Name'][:40]:<40} {data['Dimensions']:<6} {data['Cross-Lang Avg']:<10} {data['Query Avg Score']:<10} {data['Results @ 0.7']:<6} {data['Encoding Time']:<8}")

print("="*80)

## Next Steps

**Based on the results above:**

1. **Best Model Identified** ✅
2. **Optimal Threshold Found** ✅
3. **Performance Expectations** ✅

**Ready for Integration:**
- Update RAG_MODEL_NAME in .env
- Adjust similarity thresholds in code
- Test with real RAG pipeline

---
🎯 **This gives you concrete data to make the switch!**