In [9]:
import asyncio
from sentence_transformers import SentenceTransformer
from anthropic import AsyncAnthropic
import torch
from retriever import ChainOfThoughtRetriever
from preprocessing import AsyncDocumentProcessor
import os
from pathlib import Path
import numpy as np

In [None]:
async def initialize_search_system(processed_documents, api_key):
    # Set up the embedding model
    embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    
    # Determine the best available device
    if torch.cuda.is_available():
        device = 'cuda'
        embedding_model.to('cuda')
    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        device = 'mps'
        embedding_model.to('mps')
    else:
        device = 'cpu'
        embedding_model.to('cpu')
    
    # Initialize the Anthropic client
    anthropic_client = AsyncAnthropic(api_key=api_key)
    
    # Create the retriever
    retriever = ChainOfThoughtRetriever(
        documents=processed_documents,
        embedding_model=embedding_model,
        anthropic_client=anthropic_client,
        device=device,  # Pass the device explicitly
        max_iterations=1,
        results_per_step=5
    )
        # In your main code, after initializing the retriever
    # print(f"FAISS index dimension: {retriever.combined_faiss_index.d}")
    print(f"Embedding model dimension: {retriever.embedding_model.get_sentence_embedding_dimension()}")
    
    return retriever

In [6]:
processor = AsyncDocumentProcessor()
print(f"Preprocessing model name: {processor.embedding_model_name}")
output_dir = Path("processed_documents")

# Load indices from disk
await processor.load_indices(str(output_dir))

processed_documents = processor.documents # Your preprocessed documents
# After loading your documents
for doc_path, doc_data in processed_documents.items():
    for chunk in doc_data['chunks']:
        embedding = chunk['embedding']
        print(f"Document: {doc_path}")
        print(f"Chunk embedding shape: {np.array(embedding).shape}")
        break  # Just check the first chunk
    break  # Just check the first document

api_key = os.getenv("ANTHROPIC_API_KEY")

# Initialize the retriever
retriever = await initialize_search_system(
    processed_documents=processed_documents,
    api_key=api_key
)

# Perform a search
results, reasoning_steps = await retriever.search(
    "what are the use cases",
    return_steps=True
)

INFO:preprocessing:Using Apple Silicon with Metal Performance Shaders
INFO:preprocessing:Selected device for computation: mps
INFO:preprocessing:Using device: mps
INFO:preprocessing:Initializing with 7 processes
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:preprocessing:Successfully loaded indices for /Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/test_text.txt
INFO:preprocessing:Successfully loaded indices for /Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/1.pdf
INFO:preprocessing:Successfully loaded indices for /Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/2.pdf
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Preprocessing model name: sentence-transformers/all-mpnet-base-v2
Document: /Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/test_text.txt
Chunk embedding shape: ()
Embedding model dimension: 768
Getting results for query: what are the use cases


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.24it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.33it/s]
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Getting results for query: "what are some potential use cases for a local file search engine that uses natural language processing and semantic embeddings?"

CONFIDENCE: 4


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.80it/s]
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Getting results for query: "What are some key use cases and applications for a local file search engine that leverages natural language processing and semantic embeddings across different domains (personal, enterprise, research)? What are the potential challenges or limitations of this approach?"

CONFIDENCE: 4


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.48s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.85it/s]
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Getting results for query: "What are some key use cases and applications for a local file search engine that leverages natural language processing and semantic embeddings across different domains (personal, enterprise, research)? What are the potential challenges or limitations of this approach? How would the system handle different file types and formats, and what specific NLP techniques or models would be used?"

CONFIDENCE: 4


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.20it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.47it/s]
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Getting results for query: "What are the key use cases for a local file search engine leveraging natural language processing and semantic embeddings across personal, enterprise, and research domains? What are the potential challenges and limitations of this approach? How would it handle different file formats? What specific NLP models or techniques could be employed?"

CONFIDENCE: 4


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.97it/s]
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


In [7]:
print("Results:", results)

Results: [
Search Result #20
Score: 0.032
Source: 1.pdf
Text: Challenges and Risks
•Computational Limitations: G..., 
Search Result #12
Score: 0.032
Source: 1.pdf
Text: operations.
Architecture/Process:
1. Data Ingestio..., 
Search Result #4
Score: 0.032
Source: 1.pdf
Text: effective retrieval.
•Create a simple user interfa..., 
Search Result #22
Score: 0.032
Source: 1.pdf
Text: Data Requirements:
•Access to a diverse set of loc..., 
Search Result #3
Score: 0.031
Source: 1.pdf
Text: environments. Our project aims to address this cha...]


In [8]:
count = 0
for step in reasoning_steps:
    print(f"Step {count}")
    print(f"query: {step.query}")
    print("results")
    for i in range(len(step.results)):
        print(step.results[i])
    print(step.reasoning)
    print(f"combined_socre: {step.combined_scores}\n")
    count += 1

Step 0
query: what are the use cases
results

Search Result #20
Score: 0.032
Source: 1.pdf
Text: Challenges and Risks
•Computational Limitations: G...

Search Result #12
Score: 0.032
Source: 1.pdf
Text: operations.
Architecture/Process:
1. Data Ingestio...

Search Result #4
Score: 0.032
Source: 1.pdf
Text: effective retrieval.
•Create a simple user interfa...

Search Result #22
Score: 0.032
Source: 1.pdf
Text: Data Requirements:
•Access to a diverse set of loc...

Search Result #3
Score: 0.031
Source: 1.pdf
Text: environments. Our project aims to address this cha...

Reasoning Analysis

Confidence Score: 0.50

Relevance Findings:

Identified Gaps:
  • - Only one specific use case is provided (retrieving class notes). Additional use cases in different domains or scenarios would be helpful.
  • - Details on how the search engine would handle different file types or formats are missing.
  • - Information on the user interface and how users would interact with the search engine is lacking.

In [1]:
from temp import SearchResult, ReasoningStep, SearchIteration
import time

result = SearchResult(1, "Example text", "Some context", 0.95, "/path/to/doc.txt")
print(result)
# Output: SearchResult(id=1, score=0.950, source='doc.txt', context: Some context..., text='Example text')
example_step = ReasoningStep(
    relevance_findings={"result_1": 0.95, "result_2": 0.85},
    gaps_identified=["Lack of context in result_1", "Ambiguous phrasing in result_2"],
    redundant_content=[("result_1", "result_2")],
    suggested_refinement="Merge findings for conciseness.",
    reasoning_explanation="This step evaluates the overlap between results to refine the retrieval strategy for better accuracy.",
    confidence_score=0.92
)

print(example_step)
# Shows structured representation with all components

# iteration = SearchIteration("query", [result], reasoning, {"1": 0.9}, time.time())
# print(iteration)

  from .autonotebook import tqdm as notebook_tqdm



Search Result #1
Score: 0.950
Source: doc.txt
Text: Example text
Context: Some context

Reasoning Analysis

Confidence Score: 0.92

Relevance Findings:
  • result_1: 0.95
  • result_2: 0.85

Identified Gaps:
  • Lack of context in result_1
  • Ambiguous phrasing in result_2

Redundant Content:
  • Results result_1 and result_2 overlap

Suggested Refinement: Merge findings for conciseness.

Reasoning:
  This step evaluates the overlap between results to refine the retrieval strategy for better accuracy...


In [6]:
# Create a small test document
test_doc_content = """
Artificial Intelligence Overview

AI is a broad field of computer science focused on creating intelligent machines.
Machine learning is a subset of AI that uses data to improve performance.
Deep learning is a type of machine learning using neural networks.
"""

# Create test directory and document
test_dir = Path("test_documents")
test_dir.mkdir(exist_ok=True)
test_file = test_dir / "test_article.txt"
with open(test_file, "w") as f:
    f.write(test_doc_content)

In [16]:
from preprocessing import AsyncDocumentProcessor, BatchConfig
from temp import ChainOfThoughtRetriever, SearchResult

In [9]:
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [24]:
# Initialize processor for document processing
processor = AsyncDocumentProcessor(
    embedding_model_name="sentence-transformers/all-mpnet-base-v2",
    anthropic_api_key=api_key,
    device='cpu',  # Using CPU for testing
    batch_config=BatchConfig(
        embeddings=32,
        context=10,
        faiss=1000,
        documents=5,
        process=4
    ),
    chunk_size=10,
    chunk_overlap=2
)
test_dir = Path("processed_documents")
# Process test document
logger.info("Processing test document...")
processed_docs = await processor.load_indices(str(test_dir))

INFO:preprocessing:Selected device for computation: cpu
INFO:preprocessing:Using device: cpu
INFO:preprocessing:Initializing with 7 processes
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:__main__:Processing test document...
INFO:preprocessing:Successfully loaded indices for /Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/test_text.txt
INFO:preprocessing:Successfully loaded indices for /Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/1.pdf
INFO:preprocessing:Successfully loaded indices for /Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/2.pdf


In [27]:
print(f"Processed documents: {processed_docs}")
processed_docs = processor.documents
print(f"Processed documents: {processed_docs}")

Processed documents: None
Processed documents: {'/Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/test_text.txt': {'metadata': {'file_path': '/Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/test_text.txt', 'file_name': 'test_text.txt', 'file_type': '.txt', 'created_time': '2024-11-24T11:57:27.521049', 'modified_time': '2024-11-24T11:57:27.521049', 'size_bytes': 475, 'num_chunks': 1, 'processing_time': 5.821021, 'batch_sizes': {'embeddings': 32, 'context': 10, 'faiss': 1000, 'documents': 5, 'process': 2}}, 'chunks': [{'chunk_id': 0, 'text': '"""The quick brown fox jumps over the lazy dog. This sentence contains every letter of the English alphabet, making it a popular pangram. It\'s often used for typing practice, font displays, and testing equipment. While simple, this sentence serves as a great tool for showcasing how all the letters are used in different contexts.\n    It’s a fun and quirky way to test a variety of systems and applications that require the use of all character

In [28]:
logger.info("Initializing retriever...")
retriever = ChainOfThoughtRetriever(
    documents=processed_docs,
    embedding_model=processor.embedding_model,
    anthropic_client=processor.client,
    device='cpu'
)
# 1. Test _initialize_indices
logger.info("\nTesting _initialize_indices...")
# This was called during initialization, let's verify the structures
assert len(retriever.all_chunks) > 0, "Chunks were not initialized"
assert len(retriever.doc_indices) > 0, "Document indices were not initialized"
assert len(retriever.bm25_indices) > 0, "BM25 indices were not initialized"
logger.info("Index initialization successful")

INFO:__main__:Initializing retriever...
INFO:__main__:
Testing _initialize_indices...
INFO:__main__:Index initialization successful


In [31]:
logger.info("\nTesting _get_dense_results...")
dense_results = await retriever._get_dense_results(
    query="what are the use cases?",
    k=3
)
logger.info(f"Found {len(dense_results)} dense results")

INFO:__main__:
Testing _get_dense_results...
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.12it/s]
INFO:__main__:Found 3 dense results


In [37]:
for result in dense_results:
    print(result)


Search Result #22
Score: 0.032
Source: 1.pdf
Text: Data Requirements:
•Access to a diverse set of loc...

Search Result #20
Score: 0.032
Source: 1.pdf
Text: Challenges and Risks
•Computational Limitations: G...

Search Result #12
Score: -1.401
Source: 1.pdf
Text: operations.
Architecture/Process:
1. Data Ingestio...


In [None]:
dense_scores = {r.chunk_id: (i + 1, r) for i, r in enumerate(dense_results)}
sparse_scores = {r.chunk_id: (i + 1, r) for i, r in enumerate(sparse_results)}

# Compute reciprocal rank fuAMssion scores
fusion_scores = {}
for chunk_id in set(dense_scores.keys()) | set(sparse_scores.keys()):
    print(dense_scores.get(chunk_id, (len(dense_results) + 1, None)))
    dense_rank = dense_scores.get(chunk_id, (len(dense_results) + 1, None))[0]
    sparse_rank = sparse_scores.get(chunk_id, (len(sparse_results) + 1, None))[0]
    print(f"Chunk ID: {chunk_id}, Dense Rank: {dense_rank}, Sparse Rank: {sparse_rank}")
    # RRF formula with k=60 (default constant)
    fusion_scores[chunk_id] = 1 / (60 + dense_rank) + 1 / (60 + sparse_rank)
    print(fusion_scores[chunk_id])



(2, 
Search Result #20
Score: 0.032
Source: 1.pdf
Text: Challenges and Risks
•Computational Limitations: G...)
Chunk ID: 20, Dense Rank: 2, Sparse Rank: 4
0.031754032258064516
(4, None)
Chunk ID: 4, Dense Rank: 4, Sparse Rank: 1
0.032018442622950824
(1, 
Search Result #22
Score: 0.032
Source: 1.pdf
Text: Data Requirements:
•Access to a diverse set of loc...)
Chunk ID: 22, Dense Rank: 1, Sparse Rank: 4
0.032018442622950824
(4, None)
Chunk ID: 21, Dense Rank: 4, Sparse Rank: 3
0.03149801587301587
(3, 
Search Result #12
Score: -1.401
Source: 1.pdf
Text: operations.
Architecture/Process:
1. Data Ingestio...)
Chunk ID: 12, Dense Rank: 3, Sparse Rank: 4
0.03149801587301587
(4, None)
Chunk ID: 30, Dense Rank: 4, Sparse Rank: 2
0.031754032258064516


In [49]:
print(fusion_scores)

{20: 0.031754032258064516, 4: 0.032018442622950824, 22: 0.032018442622950824, 21: 0.03149801587301587, 12: 0.03149801587301587, 30: 0.031754032258064516}


In [45]:
merged_results = retriever._merge_results(
    dense_results=dense_results,
    sparse_results=sparse_results,
    k=3
)

In [46]:
# ogger.info("\nTesting _check_redundancy...")
redundant_pairs = retriever._check_redundancy(merged_results)
logger.info(f"Found {len(redundant_pairs)} redundant pairs")

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
INFO:__main__:Found 0 redundant pairs


In [38]:
logger.info("\nTesting _get_sparse_results...")
sparse_results = retriever._get_sparse_results(
    query="What is machine learning?",
    k=3
)
logger.info(f"Found {len(sparse_results)} sparse results")
for result in sparse_results:
    print(result)
    print(result.text)

INFO:__main__:
Testing _get_sparse_results...
INFO:__main__:Found 3 sparse results



Search Result #4
Score: 6.764
Source: 1.pdf
Text: effective retrieval.
•Create a simple user interfa...
effective retrieval.
•Create a simple user interface facilitating interaction with the search system.
•Evaluate and optimize the search engine’s performance using rigorous metrics to ensure high accuracy
and efficiency.
Use Cases
1. Contextual Retrieval of Class Notes
Query: ”What were the main topics covered in last week’s Machine Learning lecture?”
•Expected Results: Class notes or lecture slides from the most recent Machine Learning lecture.

Search Result #30
Score: 3.533
Source: 2.pdf
Text: TCP/IP configurations. Building upon this backgrou...
TCP/IP configurations. Building upon this background, I did some creative projects, including
constructing my customized Linux traceroute and creating a static library that uses a reliable pro-
tocol over UDP Sockets.
Furthermore, I explored the vast fields of computer science. I got a glimpse into data science
when I took artificial inte

In [47]:
reasoning_step = await retriever._get_reasoned_analysis(
            query="What is AI?",
            results=merged_results,
            previous_steps=[]
        )
print(reasoning_step)

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.04it/s]
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"



Reasoning Analysis

Confidence Score: 0.50

Relevance Findings:

Identified Gaps:
  • [Definition and explanation of what AI (artificial intelligence) is]
  • [Examples or use cases of AI]
  • [Overview of different types/approaches to AI]
  • [History and evolution of AI]
  • [Current state and future potential of AI]

Redundant Content:

Suggested Refinement: "What is artificial intelligence (AI)? Definition, examples, types, history and future."

CONFIDENCE: 0.9

Reasoning:
  The provided search results are completely irrelevant to answering the query "What is AI?". They app...


In [48]:
combined_scores = retriever._combine_scores(
            dense_results=dense_results,
            sparse_results=sparse_results
        )
print(combined_scores)

{'20': 0.29752258064516124, '4': 0.003624828439237144, '22': 0.6, '21': 0.080889347737263, '12': -8.753511169433592, '30': 0.2}


In [50]:
results, steps = await retriever.search(
            query="tell me about usecases",
            return_steps=True
        )

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.44s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.22it/s]
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.38it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.65it/s]
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.26it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.39it/s]
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


In [51]:
print(results, steps)

[
Search Result #3
Score: 0.032
Source: 1.pdf
Text: environments. Our project aims to address this cha..., 
Search Result #0
Score: 0.032
Source: test_text.txt
Text: """The quick brown fox jumps over the lazy dog. Th..., 
Search Result #11
Score: 0.031
Source: 1.pdf
Text: LLM(s) and Techniques: The project will utilize st..., 
Search Result #4
Score: 0.031
Source: 1.pdf
Text: effective retrieval.
•Create a simple user interfa..., 
Search Result #23
Score: 0.031
Source: 1.pdf
Text: •Final Report:
–Introduction outlining project goa...] [
Search Iteration

Timestamp: 2024-11-29 16:18:25
Query: tell me about usecases

Results Summary:
Total Results: 5
Top Results:

Search Result #3
Score: 0.032
Source: 1.pdf
Text: environments. Our project aims to address this cha...

Search Result #0
Score: 0.032
Source: test_text.txt
Text: """The quick brown fox jumps over the lazy dog. Th...

Search Result #11
Score: 0.031
Source: 1.pdf
Text: LLM(s) and Techniques: The project will utilize st...

Combi

In [35]:
logger.info("\nTesting _merge_results...")
merged_results = retriever._merge_results(
    dense_results=dense_results,
    sparse_results=sparse_results,
    k=3
)
logger.info(f"Merged into {len(merged_results)} results")
for result in merged_results:
    print(result)
    print(result.text)

INFO:__main__:
Testing _merge_results...
INFO:__main__:Merged into 3 results



Search Result #4
Score: 0.032
Source: 1.pdf
Text: effective retrieval.
•Create a simple user interfa...
effective retrieval.
•Create a simple user interface facilitating interaction with the search system.
•Evaluate and optimize the search engine’s performance using rigorous metrics to ensure high accuracy
and efficiency.
Use Cases
1. Contextual Retrieval of Class Notes
Query: ”What were the main topics covered in last week’s Machine Learning lecture?”
•Expected Results: Class notes or lecture slides from the most recent Machine Learning lecture.

Search Result #22
Score: 0.032
Source: 1.pdf
Text: Data Requirements:
•Access to a diverse set of loc...
Data Requirements:
•Access to a diverse set of local files for testing (e.g., documents, reports).
•Pre-trained LLMs accessible locally without relying on external APIs.
Expected Deliverables
•Code:
–Modular and well-documented scripts for data preprocessing, embedding generation, indexing,
query processing, and search.
–A user interface 

In [76]:
import time
import aiohttp
import asyncio
from typing import List

class OllamaTester:
    def __init__(self, base_url: str = "http://localhost:11434"):
        self.base_url = base_url

    async def test_model(self, model_name: str, prompt: str, max_tokens: int = 50, temperature: float = 0.7):
        """Test the speed of a specific model."""
        url = f"{self.base_url}/api/completions"
        payload = {
            "model": model_name,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": False
        }
        async with aiohttp.ClientSession() as session:
            try:
                start_time = time.time()
                async with session.post(url, json=payload, timeout=30) as response:
                    if response.status != 200:
                        print(f"Error for model {model_name}: {response.status} - {await response.text()}")
                        return None
                    result = await response.json()
                    end_time = time.time()
                    elapsed_time = end_time - start_time
                    print(result)
                    return elapsed_time, result.get('response', '')
            except Exception as e:
                print(f"Error testing model {model_name}: {str(e)}")
                return None

    async def test_models(self, models: List[str], prompt: str, max_tokens: int = 50, temperature: float = 0.7):
        """Test multiple models and print their speeds."""
        results = {}
        for model_name in models:
            print(f"Testing model: {model_name}...")
            elapsed_time, response = await self.test_model(model_name, prompt, max_tokens, temperature)
            if elapsed_time is not None:
                results[model_name] = {
                    "time": elapsed_time,
                    "response_preview": response[:100]  # Preview first 100 characters
                }
                print(f"{model_name} completed in {elapsed_time:.2f} seconds.")
            else:
                results[model_name] = {"time": None, "response_preview": "Error"}
        return results

# Define the models and prompt to test
models_to_test = ["qwen:0.5b", "llama3.2:1b", "llama3.2:latest", "mistral:7b", "phi3:mini"]
test_prompt = "Explain the concept of chain of thought reasoning in AI."



In [None]:
# "qwen:0.5b", 

In [77]:

tester = OllamaTester()
results = await tester.test_models(models_to_test, test_prompt)

print("\nTest Results:")
for model, result in results.items():
    print(f"{model} - Time: {result['time']:.2f} seconds, Response Preview: {result['response_preview']}")


Testing model: qwen:0.5b...
Error for model qwen:0.5b: 404 - 404 page not found


TypeError: cannot unpack non-iterable NoneType object

In [63]:
llm_client = LocalLLMClient()
prompt = "What is the use of AI?"

In [64]:
context = await llm_client.generate_response(
            prompt=prompt,
            max_tokens=100,
            temperature=0.3
        )

In [65]:
context

'Artificial intelligence (AI) has a wide range of applications and uses, which can be broadly categorized into several areas. Here are some examples:\n\n1. **Virtual Assistants**: AI-powered virtual assistants like Siri, Google Assistant, and Alexa enable users to perform tasks, set reminders, send messages, and more with just their voice commands.\n2. **Image and Video Recognition**: AI algorithms can recognize objects, scenes, and emotions in images and videos, which has numerous applications in areas like surveillance, self-driving cars, and entertainment.\n3. **Natural Language Processing (NLP)**: AI-powered NLP enables computers to understand, interpret, and generate human language, making it a crucial component in chatbots, sentiment analysis, and text translation.\n4. **Predictive Analytics**: AI can analyze large datasets to predict future trends, behaviors, and outcomes, which is widely used in business, finance, and healthcare.\n5. **Robotics and Automation**: AI-powered robo

In [83]:
import time
import statistics
import json
from typing import List, Dict
import asyncio
from ollama import Client
import pandas as pd
from datetime import datetime

class OllamaBenchmark:
    def __init__(self):
        # Initialize the Ollama client - we'll use a single client instance
        self.client = Client(host='http://localhost:11434')
        
        # Models to test - these need to be available in Ollama
        self.models = [
            'qwen:0.5b',
            'llama3.2:1b',
            'phi3:mini'
        ]
        
        # Test prompts of varying complexity for benchmarking
        self.test_prompts = [
            "What is 2+2?",  # Simple arithmetic
            "Explain how photosynthesis works in three sentences.",  # Medium complexity
            "Write a short story about a robot learning to paint.",  # Creative/complex
        ]
    
    def load_model(self, model_name: str) -> float:
        """
        Measure time taken to load a model. Using synchronous client methods
        since the async methods are currently having compatibility issues.
        """
        start_time = time.time()
        try:
            # Pull the model if not already present
            # Note: Using the synchronous pull method instead of async
            self.client.pull(model=model_name)
            end_time = time.time()
            return end_time - start_time
        except Exception as e:
            print(f"Error loading model {model_name}: {str(e)}")
            return None

    def run_inference(self, model_name: str, prompt: str) -> Dict:
        """
        Run a single inference and measure metrics using synchronous methods.
        Returns timing and response length information.
        """
        start_time = time.time()
        try:
            # Generate response using synchronous generate method
            response = self.client.generate(
                model=model_name,
                prompt=prompt,
                stream=False  # Important: Keep this false for accurate timing
            )
            end_time = time.time()
            
            # Extract the actual response text from the response object
            response_text = response['response'] if isinstance(response, dict) else str(response)
            
            return {
                'time': end_time - start_time,
                'response_length': len(response_text)
            }
        except Exception as e:
            print(f"Error during inference with {model_name}: {str(e)}")
            return None

    def benchmark_model(self, model_name: str, num_runs: int = 3) -> Dict:
        """
        Run complete benchmark suite for a single model, including load time
        and multiple inference runs with different prompts.
        """
        results = {
            'model_name': model_name,
            'load_time': None,
            'inference_times': [],
            'tokens_per_second': [],
            'avg_response_length': [],
            'errors': []
        }
        
        # First, measure load time
        print(f"\nLoading model {model_name}...")
        load_time = self.load_model(model_name)
        results['load_time'] = load_time
        
        if load_time is None:
            results['errors'].append("Failed to load model")
            return results
        
        # Run inference benchmarks for each prompt
        for i, prompt in enumerate(self.test_prompts):
            print(f"Running prompt {i+1}/{len(self.test_prompts)}...")
            prompt_results = []
            response_lengths = []
            
            for run in range(num_runs):
                # Add a small delay between runs to prevent overloading
                if run > 0:
                    time.sleep(1)
                
                inference_result = self.run_inference(model_name, prompt)
                if inference_result:
                    prompt_results.append(inference_result['time'])
                    response_lengths.append(inference_result['response_length'])
                else:
                    results['errors'].append(f"Failed inference on prompt {i+1}, run {run+1}")
            
            if prompt_results:
                avg_time = statistics.mean(prompt_results)
                results['inference_times'].append(avg_time)
                avg_length = statistics.mean(response_lengths)
                results['avg_response_length'].append(avg_length)
                # Estimate tokens per second (assuming ~4 chars per token)
                tokens_per_second = (avg_length / 4) / avg_time
                results['tokens_per_second'].append(tokens_per_second)
        
        return results

    def run_benchmarks(self, num_runs: int = 3) -> None:
        """
        Run benchmarks for all models and save detailed results to files.
        """
        all_results = []
        
        for model in self.models:
            print(f"\nBenchmarking {model}...")
            results = self.benchmark_model(model, num_runs)
            all_results.append(results)
            
        # Create summary DataFrame
        df = pd.DataFrame(all_results)
        
        # Calculate aggregate metrics
        df['avg_inference_time'] = df.apply(
            lambda x: statistics.mean(x['inference_times']) if x['inference_times'] else None, 
            axis=1
        )
        df['avg_tokens_per_second'] = df.apply(
            lambda x: statistics.mean(x['tokens_per_second']) if x['tokens_per_second'] else None, 
            axis=1
        )
        
        # Save results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Save detailed results as JSON
        with open(f'benchmark_results_{timestamp}.json', 'w') as f:
            json.dump(all_results, f, indent=2)
        
        # Save summary as CSV
        summary_df = df[['model_name', 'load_time', 'avg_inference_time', 'avg_tokens_per_second']]
        summary_df.to_csv(f'benchmark_summary_{timestamp}.csv', index=False)
        
        # Print summary
        print("\nBenchmark Results Summary:")
        print(summary_df.to_string(index=False))
        
        # Print any errors that occurred
        for result in all_results:
            if result['errors']:
                print(f"\nErrors for {result['model_name']}:")
                for error in result['errors']:
                    print(f"- {error}")

In [84]:
benchmark = OllamaBenchmark()
await benchmark.run_benchmarks(num_runs=3)



Benchmarking qwen:0.5b...

Loading model qwen:0.5b...


INFO:httpx:HTTP Request: POST http://localhost:11434/api/pull "HTTP/1.1 200 OK"


Running prompt 1/3...


INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Running prompt 2/3...


INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Running prompt 3/3...


INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"



Benchmarking llama3.2:1b...

Loading model llama3.2:1b...


INFO:httpx:HTTP Request: POST http://localhost:11434/api/pull "HTTP/1.1 200 OK"


Running prompt 1/3...


INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Running prompt 2/3...


INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Running prompt 3/3...


INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"



Benchmarking phi3:mini...

Loading model phi3:mini...


INFO:httpx:HTTP Request: POST http://localhost:11434/api/pull "HTTP/1.1 200 OK"


Running prompt 1/3...


INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Running prompt 2/3...


INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Running prompt 3/3...


INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"



Benchmark Results Summary:
 model_name  load_time  avg_inference_time  avg_tokens_per_second
  qwen:0.5b   0.645327            1.251593             210.353411
llama3.2:1b   0.588126            7.590198             112.837565
  phi3:mini   0.711140           24.214501              58.294165


TypeError: object NoneType can't be used in 'await' expression

In [None]:

llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False
)

In [5]:


llm = Llama(
      model_path="./model/Meta-Llama-3-8B.Q2_K.gguf",
      # n_gpu_layers=-1, # Uncomment to use GPU acceleration
      # seed=1337, # Uncomment to set a specific seed
      # n_ctx=2048, # Uncomment to increase the context window
)
output = llm(
      "Q: Name the planets in the solar system? A: ", # Prompt
      max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
      stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
      echo=True # Echo the prompt back in the output
) # Generate a completion, can also call create_completion
print(output)

llama_load_model_from_file: using device Metal (Apple M1) - 5455 MiB free
llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from ./model/Meta-Llama-3-8B.Q2_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = .
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 128256
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                          llama.block_count u32              = 32
llama_model_loader: - kv   6:                  llama.feed_forward_length u32              = 14336
llama_m

{'id': 'cmpl-1da11021-1701-419b-a90a-a61e1a631558', 'object': 'text_completion', 'created': 1732940472, 'model': './model/Meta-Llama-3-8B.Q2_K.gguf', 'choices': [{'text': 'Q: Name the planets in the solar system? A: 12. ', 'index': 0, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 13, 'completion_tokens': 4, 'total_tokens': 17}}


In [8]:
output = llm(
      "Q: Name the planets in the solar system? (list in a single line) ", # Prompt
      max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
      stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
      echo=True # Echo the prompt back in the output
) # Generate a completion, can also call create_completion
print(output)

Llama.generate: 10 prefix-match hit, remaining 9 prompt tokens to eval
llama_perf_context_print:        load time =    6692.17 ms
llama_perf_context_print: prompt eval time =       0.00 ms /    10 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    31 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =  145672.95 ms /    41 tokens


{'id': 'cmpl-8400da84-ed10-4e5b-ba16-8251c9227522', 'object': 'text_completion', 'created': 1732940720, 'model': './model/Meta-Llama-3-8B.Q2_K.gguf', 'choices': [{'text': 'Q: Name the planets in the solar system? (list in a single line) 1) Mercury 2) Venus 3) Earth 4) Mars 5) Jupiter 6) Saturn 7) Uranus 8) Neptune', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 19, 'completion_tokens': 32, 'total_tokens': 51}}


In [1]:
import requests

def test_llm_server(url, prompt, max_tokens=50, temperature=0.7):
    """Sends a prompt to the LLM server and prints the response."""
    try:
        # Define the payload for the POST request
        payload = {
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature
        }

        # Send the POST request
        response = requests.post(url, json=payload)

        # Check for errors
        if response.status_code == 200:
            # Print the response from the server
            data = response.json()
            print("Response from LLM:")
            print(data.get("response", "No response found in the output."))
        else:
            print(f"Error: Server returned status code {response.status_code}")
            print("Details:", response.text)

    except Exception as e:
        print("An error occurred:", str(e))


# Server URL and prompt
server_url = "http://localhost:8000//v1/completions"  # Replace with your endpoint if different
test_prompt = "What is the capital of France?"

# Call the function to test the server
test_llm_server(server_url, test_prompt)

KeyboardInterrupt: 

In [None]:
from llama_cpp import Llama
llm = Llama(
      model_path="model/Meta-Llama-3-8B.Q2_K.gguf",
      chat_format="llama-2"
)
result = llm.create_chat_completion(
      messages = [
          {"role": "system", "content": "You are an assistant who adds context to chunk based on surronunding context."},
          {
              "role": "user",
              "content": "previous text : There is a significant gap in the availability of local search engines that can interpret natural\nlanguage queries and retrieve files based on semantic relevance rather than mere keyword occurrence. Ex-\nisting solutions do not effectively leverage advancements in Large Language Models (LLMs) for local data\n\ncurrent text : environments. Our project aims to address this challenge by developing a novel local search engine that\nharnesses the power of LLMs to enable efficient, context-aware retrieval of files, enhancing productivity and\ndata accessibility.\nProject Objectives\n\u2022Design and develop an innovative local file search engine that interprets natural language queries using\nLLMs.\n\u2022Implement a semantic indexing mechanism that transforms local files into meaningful embeddings for\neffective retrieval. \n next text: "
          }
      ]
)

llama_load_model_from_file: using device Metal (Apple M1) - 5454 MiB free
llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from model/Meta-Llama-3-8B.Q2_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = .
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 128256
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                          llama.block_count u32              = 32
llama_model_loader: - kv   6:                  llama.feed_forward_length u32              = 14336
llama_mod

KeyboardInterrupt: 

In [None]:
""