In [15]:
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')

import PyPDF2
import fitz
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

PDF_PATH = "deepseek_R1.pdf"
CHUNK_SIZE = 500
OVERLAP = 50
TOP_K = 3

MODELS = {
    'sentence-bert-small': 'all-MiniLM-L6-v2',
    'sentence-bert-large': 'all-mpnet-base-v2',
    'e5-small': 'intfloat/e5-small-v2',
    'e5-base': 'intfloat/e5-base-v2',
    'bge-small': 'BAAI/bge-small-en-v1.5',
    'bge-base': 'BAAI/bge-base-en-v1.5',
}

TEST_QUESTIONS = [
    "Who are the authors of this paper?",
    "What is reasoning in case of an LLM",
    "What is the abstract?",
    "What are the conclusions?",
]

def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()
    except:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text += page.extract_text()
    return text

def chunk_text(text, chunk_size=500, overlap=50):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        if end < len(text):
            last_period = chunk.rfind('.')
            if last_period > start + chunk_size // 2:
                end = start + last_period + 1
                chunk = text[start:end]
        chunks.append(chunk.strip())
        start = end - overlap
        if start >= len(text):
            break
    return chunks

class EmbeddingModel:
    def __init__(self, model_name):
        self.model_name = model_name
        self.model = None
        self.load_time = 0
        
    def load_model(self):
        start_time = time.time()
        if any(x in self.model_name for x in ['all-MiniLM', 'all-mpnet', 'e5-', 'bge-']):
            self.model = SentenceTransformer(self.model_name)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModel.from_pretrained(self.model_name)
        self.load_time = time.time() - start_time
        
    def encode(self, texts):
        if isinstance(self.model, SentenceTransformer):
            return self.model.encode(texts)
        else:
            inputs = self.tokenizer(texts, padding=True, truncation=True, 
                                  return_tensors='pt', max_length=512)
            with torch.no_grad():
                outputs = self.model(**inputs)
            return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

def search_chunks(query_embedding, chunk_embeddings, chunks, top_k=3):
    similarities = cosine_similarity([query_embedding], chunk_embeddings)[0]
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    
    results = []
    for i, idx in enumerate(top_indices):
        results.append({
            'rank': i + 1,
            'chunk': chunks[idx],
            'score': similarities[idx],
            'chunk_id': idx
        })
    return results

def run_comparison():
    print("Loading PDF...")
    text = extract_text_from_pdf(PDF_PATH)
    chunks = chunk_text(text, CHUNK_SIZE, OVERLAP)
    print(f"Created {len(chunks)} chunks")
    
    all_results = {}
    model_stats = []
    
    for model_key, model_name in MODELS.items():
        print(f"\nTesting {model_key}...")
        
        try:
            model = EmbeddingModel(model_name)
            model.load_model()
            
            print(f"Encoding chunks...")
            chunk_embeddings = model.encode(chunks)
            
            model_results = []
            total_search_time = 0
            
            for question in TEST_QUESTIONS:
                start_time = time.time()
                query_embedding = model.encode([question])[0]
                search_results = search_chunks(query_embedding, chunk_embeddings, chunks, TOP_K)
                search_time = time.time() - start_time
                total_search_time += search_time
                
                model_results.append({
                    'question': question,
                    'results': search_results,
                    'avg_score': np.mean([r['score'] for r in search_results])
                })
            
            all_results[model_key] = model_results
            
            model_stats.append({
                'model': model_key,
                'load_time': model.load_time,
                'avg_search_time': total_search_time / len(TEST_QUESTIONS),
                'avg_relevance': np.mean([r['avg_score'] for r in model_results]),
                'embedding_dim': chunk_embeddings.shape[1]
            })
            
        except Exception as e:
            print(f"Failed: {e}")
            continue
    
    return all_results, pd.DataFrame(model_stats)

def print_results(all_results, stats_df):
    print("\n" + "="*100)
    print("MODEL PERFORMANCE COMPARISON")
    print("="*100)
    
    print("\nMODEL STATISTICS:")
    print(stats_df.round(3).to_string(index=False))
    
    print("\n" + "="*100)
    print("DETAILED RESULTS BY QUESTION")
    print("="*100)
    
    for question in TEST_QUESTIONS:
        print(f"\nQUESTION: {question}")
        print("-" * 80)
        
        for model_key in all_results.keys():
            model_results = all_results[model_key]
            question_result = next((r for r in model_results if r['question'] == question), None)
            
            if question_result:
                print(f"\n{model_key.upper()} (Score: {question_result['avg_score']:.3f}):")
                for result in question_result['results']:
                    chunk_preview = result['chunk'][:100] + "..." if len(result['chunk']) > 100 else result['chunk']
                    print(f"  {result['rank']}. [{result['score']:.3f}] {chunk_preview}")
    
    print("\n" + "="*100)
    print("RECOMMENDATIONS")
    print("="*100)
    
    best_relevance = stats_df.loc[stats_df['avg_relevance'].idxmax()]
    fastest_search = stats_df.loc[stats_df['avg_search_time'].idxmin()]
    fastest_load = stats_df.loc[stats_df['load_time'].idxmin()]
    
    print(f"Best Relevance: {best_relevance['model']} (Score: {best_relevance['avg_relevance']:.3f})")
    print(f"Fastest Search: {fastest_search['model']} ({fastest_search['avg_search_time']*1000:.1f}ms)")
    print(f"Fastest Load: {fastest_load['model']} ({fastest_load['load_time']:.1f}s)")


results, stats = run_comparison()
print_results(results, stats)

Loading PDF...
Created 127 chunks

Testing sentence-bert-small...
Encoding chunks...

Testing sentence-bert-large...
Encoding chunks...

Testing e5-small...
Encoding chunks...

Testing e5-base...
Encoding chunks...

Testing bge-small...
Encoding chunks...

Testing bge-base...
Encoding chunks...

MODEL PERFORMANCE COMPARISON

MODEL STATISTICS:
              model  load_time  avg_search_time  avg_relevance  embedding_dim
sentence-bert-small      3.910            0.044          0.331            384
sentence-bert-large      4.791            0.049          0.363            768
           e5-small      4.298            0.031          0.832            384
            e5-base      4.784            0.032          0.793            768
          bge-small      4.235            0.031          0.649            384
           bge-base      5.198            0.030          0.647            768

DETAILED RESULTS BY QUESTION

QUESTION: Who are the authors of this paper?
---------------------------------

[31mERROR: Could not find a version that satisfies the requirement faiss (from versions: none)[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[31mERROR: No matching distribution found for faiss[0m[31m
[0m