In [1]:
import asyncio
from sentence_transformers import SentenceTransformer
from anthropic import AsyncAnthropic
import torch
from retriever import ChainOfThoughtRetriever
from preprocessing import AsyncDocumentProcessor
import os
from pathlib import Path
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
async def initialize_search_system(processed_documents, api_key):
    # Set up the embedding model
    embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    
    # Determine the best available device
    if torch.cuda.is_available():
        device = 'cuda'
        embedding_model.to('cuda')
    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        device = 'mps'
        embedding_model.to('mps')
    else:
        device = 'cpu'
        embedding_model.to('cpu')
    
    # Initialize the Anthropic client
    anthropic_client = AsyncAnthropic(api_key=api_key)
    
    # Create the retriever
    retriever = ChainOfThoughtRetriever(
        documents=processed_documents,
        embedding_model=embedding_model,
        anthropic_client=anthropic_client,
        device=device,  # Pass the device explicitly
        max_iterations=1,
        results_per_step=5
    )
        # In your main code, after initializing the retriever
    # print(f"FAISS index dimension: {retriever.combined_faiss_index.d}")
    print(f"Embedding model dimension: {retriever.embedding_model.get_sentence_embedding_dimension()}")
    
    return retriever

In [6]:
processor = AsyncDocumentProcessor()
print(f"Preprocessing model name: {processor.embedding_model_name}")
output_dir = Path("processed_documents")

# Load indices from disk
await processor.load_indices(str(output_dir))

processed_documents = processor.documents # Your preprocessed documents
# After loading your documents
for doc_path, doc_data in processed_documents.items():
    for chunk in doc_data['chunks']:
        embedding = chunk['embedding']
        print(f"Document: {doc_path}")
        print(f"Chunk embedding shape: {np.array(embedding).shape}")
        break  # Just check the first chunk
    break  # Just check the first document

api_key = os.getenv("ANTHROPIC_API_KEY")

# Initialize the retriever
retriever = await initialize_search_system(
    processed_documents=processed_documents,
    api_key=api_key
)

# Perform a search
results, reasoning_steps = await retriever.search(
    "what are the use cases",
    return_steps=True
)

INFO:preprocessing:Using Apple Silicon with Metal Performance Shaders
INFO:preprocessing:Selected device for computation: mps
INFO:preprocessing:Using device: mps
INFO:preprocessing:Initializing with 7 processes
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:preprocessing:Successfully loaded indices for /Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/test_text.txt
INFO:preprocessing:Successfully loaded indices for /Users/battalavamshi/Desktop/TAMU/LLMs/Project/temp_uploads/Lecture 1.pdf
INFO:preprocessing:Successfully loaded indices for /Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/1.pdf
INFO:preprocessing:Successfully loaded indices for /Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/2.pdf
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load 

Preprocessing model name: sentence-transformers/all-mpnet-base-v2
Document: /Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/test_text.txt
Chunk embedding shape: ()


INFO:retriever:Performing initial retrieval with query: what are the use cases


Embedding model dimension: 768


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.28it/s]
INFO:retriever:Using fusion k value: 100
Batches: 100%|██████████| 1/1 [00:00<00:00, 53.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 60.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.69it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 56.31it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 59.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.47it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 59.39it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 55.36it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.83it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 60.62it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 55.88it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.42it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 57.04it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 56.98it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.12it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.20it/s]
INFO:httpx:HTTP Request:

In [None]:
print("Results:", results)

Results: [
Search Result #9
Score: 0.010
Source: 1.pdf
Text: Efficient Local File Search Engine Using Large Lan..., 
Search Result #12
Score: 0.010
Source: 1.pdf
Text: effective retrieval.
•Create a simple user interfa..., 
Search Result #3
Score: 0.010
Source: Lecture 1.pdf
Text: oSchedule meetings for you 
oSearch the Internet 
..., 
Search Result #20
Score: 0.010
Source: 1.pdf
Text: operations.
Architecture/Process:
1. Data Ingestio..., 
Search Result #22
Score: 0.010
Source: 1.pdf
Text: 6. User Interface and Results Presentation: Develo...]


: 

In [8]:
count = 0
for step in reasoning_steps:
    print(f"Step {count}")
    print(f"query: {step.query}")
    print("results")
    for i in range(len(step.results)):
        print(step.results[i])
    print(step.reasoning)
    print(f"combined_socre: {step.combined_scores}\n")
    count += 1

Step 0
query: what are the use cases
results

Search Result #20
Score: 0.032
Source: 1.pdf
Text: Challenges and Risks
•Computational Limitations: G...

Search Result #12
Score: 0.032
Source: 1.pdf
Text: operations.
Architecture/Process:
1. Data Ingestio...

Search Result #4
Score: 0.032
Source: 1.pdf
Text: effective retrieval.
•Create a simple user interfa...

Search Result #22
Score: 0.032
Source: 1.pdf
Text: Data Requirements:
•Access to a diverse set of loc...

Search Result #3
Score: 0.031
Source: 1.pdf
Text: environments. Our project aims to address this cha...

Reasoning Analysis

Confidence Score: 0.50

Relevance Findings:

Identified Gaps:
  • - Only one specific use case is provided (retrieving class notes). Additional use cases in different domains or scenarios would be helpful.
  • - Details on how the search engine would handle different file types or formats are missing.
  • - Information on the user interface and how users would interact with the search engine is lacking.

In [1]:
from temp import SearchResult, ReasoningStep, SearchIteration
import time

result = SearchResult(1, "Example text", "Some context", 0.95, "/path/to/doc.txt")
print(result)
# Output: SearchResult(id=1, score=0.950, source='doc.txt', context: Some context..., text='Example text')
example_step = ReasoningStep(
    relevance_findings={"result_1": 0.95, "result_2": 0.85},
    gaps_identified=["Lack of context in result_1", "Ambiguous phrasing in result_2"],
    redundant_content=[("result_1", "result_2")],
    suggested_refinement="Merge findings for conciseness.",
    reasoning_explanation="This step evaluates the overlap between results to refine the retrieval strategy for better accuracy.",
    confidence_score=0.92
)

print(example_step)
# Shows structured representation with all components

# iteration = SearchIteration("query", [result], reasoning, {"1": 0.9}, time.time())
# print(iteration)

  from .autonotebook import tqdm as notebook_tqdm



Search Result #1
Score: 0.950
Source: doc.txt
Text: Example text
Context: Some context

Reasoning Analysis

Confidence Score: 0.92

Relevance Findings:
  • result_1: 0.95
  • result_2: 0.85

Identified Gaps:
  • Lack of context in result_1
  • Ambiguous phrasing in result_2

Redundant Content:
  • Results result_1 and result_2 overlap

Suggested Refinement: Merge findings for conciseness.

Reasoning:
  This step evaluates the overlap between results to refine the retrieval strategy for better accuracy...


In [6]:
# Create a small test document
test_doc_content = """
Artificial Intelligence Overview

AI is a broad field of computer science focused on creating intelligent machines.
Machine learning is a subset of AI that uses data to improve performance.
Deep learning is a type of machine learning using neural networks.
"""

# Create test directory and document
test_dir = Path("test_documents")
test_dir.mkdir(exist_ok=True)
test_file = test_dir / "test_article.txt"
with open(test_file, "w") as f:
    f.write(test_doc_content)

In [16]:
from preprocessing import AsyncDocumentProcessor, BatchConfig
from temp import ChainOfThoughtRetriever, SearchResult

In [9]:
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [24]:
# Initialize processor for document processing
processor = AsyncDocumentProcessor(
    embedding_model_name="sentence-transformers/all-mpnet-base-v2",
    anthropic_api_key=api_key,
    device='cpu',  # Using CPU for testing
    batch_config=BatchConfig(
        embeddings=32,
        context=10,
        faiss=1000,
        documents=5,
        process=4
    ),
    chunk_size=10,
    chunk_overlap=2
)
test_dir = Path("processed_documents")
# Process test document
logger.info("Processing test document...")
processed_docs = await processor.load_indices(str(test_dir))

INFO:preprocessing:Selected device for computation: cpu
INFO:preprocessing:Using device: cpu
INFO:preprocessing:Initializing with 7 processes
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:__main__:Processing test document...
INFO:preprocessing:Successfully loaded indices for /Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/test_text.txt
INFO:preprocessing:Successfully loaded indices for /Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/1.pdf
INFO:preprocessing:Successfully loaded indices for /Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/2.pdf


In [27]:
print(f"Processed documents: {processed_docs}")
processed_docs = processor.documents
print(f"Processed documents: {processed_docs}")

Processed documents: None
Processed documents: {'/Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/test_text.txt': {'metadata': {'file_path': '/Users/battalavamshi/Desktop/TAMU/LLMs/Project/pdfs/test_text.txt', 'file_name': 'test_text.txt', 'file_type': '.txt', 'created_time': '2024-11-24T11:57:27.521049', 'modified_time': '2024-11-24T11:57:27.521049', 'size_bytes': 475, 'num_chunks': 1, 'processing_time': 5.821021, 'batch_sizes': {'embeddings': 32, 'context': 10, 'faiss': 1000, 'documents': 5, 'process': 2}}, 'chunks': [{'chunk_id': 0, 'text': '"""The quick brown fox jumps over the lazy dog. This sentence contains every letter of the English alphabet, making it a popular pangram. It\'s often used for typing practice, font displays, and testing equipment. While simple, this sentence serves as a great tool for showcasing how all the letters are used in different contexts.\n    It’s a fun and quirky way to test a variety of systems and applications that require the use of all character

In [28]:
logger.info("Initializing retriever...")
retriever = ChainOfThoughtRetriever(
    documents=processed_docs,
    embedding_model=processor.embedding_model,
    anthropic_client=processor.client,
    device='cpu'
)
# 1. Test _initialize_indices
logger.info("\nTesting _initialize_indices...")
# This was called during initialization, let's verify the structures
assert len(retriever.all_chunks) > 0, "Chunks were not initialized"
assert len(retriever.doc_indices) > 0, "Document indices were not initialized"
assert len(retriever.bm25_indices) > 0, "BM25 indices were not initialized"
logger.info("Index initialization successful")

INFO:__main__:Initializing retriever...
INFO:__main__:
Testing _initialize_indices...
INFO:__main__:Index initialization successful


In [31]:
logger.info("\nTesting _get_dense_results...")
dense_results = await retriever._get_dense_results(
    query="what are the use cases?",
    k=3
)
logger.info(f"Found {len(dense_results)} dense results")

INFO:__main__:
Testing _get_dense_results...
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.12it/s]
INFO:__main__:Found 3 dense results


In [37]:
for result in dense_results:
    print(result)


Search Result #22
Score: 0.032
Source: 1.pdf
Text: Data Requirements:
•Access to a diverse set of loc...

Search Result #20
Score: 0.032
Source: 1.pdf
Text: Challenges and Risks
•Computational Limitations: G...

Search Result #12
Score: -1.401
Source: 1.pdf
Text: operations.
Architecture/Process:
1. Data Ingestio...


In [6]:
from typing import Set
from nltk import word_tokenize, pos_tag, RegexpParser


def _extract_key_concepts( query: str) -> Set[str]:
    """
    Extract important concepts and terms from a query using NLP techniques.
    
    This function uses multiple approaches to identify key concepts:
    1. Named entity recognition
    2. Noun phrase extraction
    3. Important keyword identification
    """
    key_concepts = set()
    
    # Tokenize and tag parts of speech
    tokens = word_tokenize(query.lower())
    pos_tags = pos_tag(tokens)
    
    # Extract noun phrases using chunking
    grammar = """
        NP: {<DT>?<JJ>*<NN.*>+}     # Chunk determiners, adjectives, and nouns
        CP: {<JJR|JJS><IN><NN.*>+}  # Comparative phrases
    """
    chunk_parser = RegexpParser(grammar)
    tree = chunk_parser.parse(pos_tags)
    
    # Extract concepts from noun phrases
    for subtree in tree.subtrees(filter=lambda t: t.label() in {'NP', 'CP'}):
        concept = ' '.join(word for word, tag in subtree.leaves())
        if len(concept.split()) > 1:  # Only keep multi-word concepts
            key_concepts.add(concept)
    
    # Add single important terms (nouns, verbs, adjectives)
    important_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'VB', 'JJ'}
    for word, tag in pos_tags:
        if tag[:2] in important_tags and len(word) > 3:
            key_concepts.add(word)
    
    return key_concepts


In [11]:
from sklearn.metrics.pairwise import cosine_similarity


def _construct_enhanced_query(
    original_query: str,
    key_concepts: Set[str],
    max_concepts: int = 3
) -> str:
    """
    Construct an enhanced search query that combines the original query with key concepts.
    
    The function creates a more comprehensive query by:
    1. Keeping the original query intent
    2. Adding the most relevant discovered concepts
    3. Maintaining a natural language structure
    """
    # Remove concepts that are already in the original query
    original_lower = original_query.lower()
    new_concepts = {
        concept for concept in key_concepts
        if concept.lower() not in original_lower
    }
    
    # Ensure the embedding model is available
    embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

    # Score concepts by relevance to original query
    concept_scores = {}
    for concept in new_concepts:
        # Calculate semantic similarity using embeddings
        concept_embedding = embedding_model.encode([concept])[0]
        query_embedding = embedding_model.encode([original_query])[0]
        similarity = cosine_similarity(
            concept_embedding.reshape(1, -1),
            query_embedding.reshape(1, -1)
        )[0][0]
        
        # Score also considers concept specificity
        specificity = len(concept.split())  # Multi-word concepts are usually more specific
        concept_scores[concept] = similarity * (1 + 0.1 * specificity)
    
    # Select top concepts
    top_concepts = sorted(
        concept_scores.items(),
        key=lambda x: x[1],
        reverse=True
    )[:max_concepts]
    
    # Construct enhanced query
    if original_query.lower().startswith(('what', 'who', 'where', 'when', 'why', 'how')):
        # For question queries, append concepts naturally
        enhanced_query = f"{original_query} considering {', '.join(c[0] for c in top_concepts)}"
    else:
        # For keyword queries, combine with AND logic
        enhanced_query = f"{original_query} AND ({' OR '.join(c[0] for c in top_concepts)})"
    
    return enhanced_query

In [13]:
concepts = _extract_key_concepts("what are the use cases?")
print(concepts)
print(_construct_enhanced_query("what are the use cases?", concepts))

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


{'cases', 'the use cases'}
what are the use cases? considering 


In [9]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/battalavamshi/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [16]:
from datetime import datetime, timedelta
import re
from typing import Optional, Tuple
import spacy
from dateparser import parse as date_parse

class TemporalQueryParser:
    """
    Parses natural language queries to extract temporal expressions and constraints.
    
    This class handles various ways users might express time in their queries:
    - Relative times ("last week", "past 3 days")
    - Specific dates ("since January 1st")
    - Time periods ("between March and April")
    - Informal expressions ("recent", "latest", "new")
    """
    
    def __init__(self):
        # Load spaCy model for natural language processing
        self.nlp = spacy.load("en_core_web_sm")
        
        # Define temporal pattern matching rules
        self.relative_patterns = {
            # Last/past period patterns
            r'last\s+(\d+)\s+(day|week|month|year)s?': self._handle_last_n_period,
            r'past\s+(\d+)\s+(day|week|month|year)s?': self._handle_last_n_period,
            r'previous\s+(\d+)\s+(day|week|month|year)s?': self._handle_last_n_period,
            
            # Single period patterns
            r'last (day|week|month|year)': self._handle_last_period,
            r'yesterday': self._handle_yesterday,
            r'today': self._handle_today,
            
            # Since patterns
            r'since\s+(.+?)(?=\s+and|\s+or|$)': self._handle_since,
            
            # Between patterns
            r'between\s+(.+?)\s+and\s+(.+?)(?=\s+and|\s+or|$)': self._handle_between
        }
        
        # Patterns for informal temporal expressions
        self.informal_patterns = {
            'recent': timedelta(days=7),    # Consider "recent" as last 7 days
            'latest': timedelta(days=3),    # "latest" as last 3 days
            'new': timedelta(days=1),       # "new" as last 24 hours
            'current': timedelta(days=1)    # "current" as last 24 hours
        }

    def parse_temporal_query(self, query: str) -> Tuple[str, Optional[datetime], Optional[datetime]]:
        """
        Extract temporal constraints from a natural language query and return the cleaned query
        along with start and end dates.
        
        Args:
            query: Natural language query with potential temporal expressions
            
        Returns:
            Tuple of (cleaned query, start_date, end_date)
        """
        # Start with current time as reference
        now = datetime.now()
        start_date = None
        end_date = now
        
        # Process query with spaCy for better linguistic understanding
        doc = self.nlp(query)
        
        # First, check for explicit patterns
        for pattern, handler in self.relative_patterns.items():
            matches = re.finditer(pattern, query, re.IGNORECASE)
            for match in matches:
                # Extract temporal information using the appropriate handler
                temp_start, temp_end = handler(match)
                if temp_start:
                    # If we find multiple temporal expressions, use the most restrictive
                    start_date = temp_start if not start_date else max(start_date, temp_start)
                    end_date = temp_end if temp_end else now
                # Remove the temporal expression from query
                query = query.replace(match.group(0), '').strip()

        # Check for informal temporal expressions
        for term, delta in self.informal_patterns.items():
            if term in query.lower():
                start_date = now - delta
                query = re.sub(r'\b' + term + r'\b', '', query, flags=re.IGNORECASE).strip()

        # Additional check for date entities using spaCy
        for ent in doc.ents:
            if ent.label_ in ('DATE', 'TIME'):
                parsed_date = date_parse(ent.text)
                if parsed_date:
                    # Use as start date if it's a single date reference
                    start_date = parsed_date
                    query = query.replace(ent.text, '').strip()

        return query.strip(), start_date, end_date

    def _handle_last_n_period(self, match) -> Tuple[Optional[datetime], Optional[datetime]]:
        """Handle patterns like 'last 3 days', 'past 2 weeks'"""
        now = datetime.now()
        number = int(match.group(1))
        period = match.group(2).lower()
        
        if period == 'day':
            delta = timedelta(days=number)
        elif period == 'week':
            delta = timedelta(weeks=number)
        elif period == 'month':
            delta = timedelta(days=number * 30)  # Approximate
        else:  # year
            delta = timedelta(days=number * 365)  # Approximate
            
        return now - delta, now

    def _handle_last_period(self, match) -> Tuple[Optional[datetime], Optional[datetime]]:
        """Handle patterns like 'last week', 'last month'"""
        now = datetime.now()
        period = match.group(1).lower()
        
        if period == 'day':
            delta = timedelta(days=1)
        elif period == 'week':
            delta = timedelta(weeks=1)
        elif period == 'month':
            delta = timedelta(days=30)  # Approximate
        else:  # year
            delta = timedelta(days=365)  # Approximate
            
        return now - delta, now

    def _handle_yesterday(self, match) -> Tuple[Optional[datetime], Optional[datetime]]:
        """Handle 'yesterday' references"""
        now = datetime.now()
        start = now - timedelta(days=1)
        start = start.replace(hour=0, minute=0, second=0, microsecond=0)
        end = start + timedelta(days=1)
        return start, end

    def _handle_today(self, match) -> Tuple[Optional[datetime], Optional[datetime]]:
        """Handle 'today' references"""
        now = datetime.now()
        start = now.replace(hour=0, minute=0, second=0, microsecond=0)
        return start, now

    def _handle_since(self, match) -> Tuple[Optional[datetime], Optional[datetime]]:
        """Handle 'since' expressions"""
        date_str = match.group(1)
        parsed_date = date_parse(date_str)
        return parsed_date, None

    def _handle_between(self, match) -> Tuple[Optional[datetime], Optional[datetime]]:
        """Handle 'between' expressions"""
        start_str = match.group(1)
        end_str = match.group(2)
        start_date = date_parse(start_str)
        end_date = date_parse(end_str)
        return start_date, end_date

In [19]:
from datetime import datetime, timedelta
import re
from typing import Optional, Tuple
from dateparser import parse as date_parse

class TemporalQueryParser:
    """
    Parses natural language queries to extract temporal expressions and constraints.
    Uses regex patterns and dateparser for robust temporal understanding.
    """
    
    def __init__(self):
        # Define temporal pattern matching rules with clear, descriptive names
        self.relative_patterns = {
            # Patterns for specific time periods with numbers
            'numbered_period': r'(?:last|past|previous)\s+(\d+)\s+(day|week|month|year)s?',
            
            # Patterns for single time periods
            'single_period': r'last (day|week|month|year)',
            
            # Patterns for specific time references
            'specific_day': r'yesterday|today|tomorrow',
            
            # Patterns for time ranges
            'since_pattern': r'since\s+(.+?)(?=\s+and|\s+or|$)',
            'between_pattern': r'between\s+(.+?)\s+and\s+(.+?)(?=\s+and|\s+or|$)',
            
            # Patterns for relative time expressions
            'relative_time': r'(\d+)\s+(day|week|month|year)s?\s+ago'
        }
        
        # Map informal temporal expressions to timedeltas
        self.informal_patterns = {
            'recent': timedelta(days=7),    # "recent" → last 7 days
            'latest': timedelta(days=3),    # "latest" → last 3 days
            'new': timedelta(days=1),       # "new" → last 24 hours
            'current': timedelta(days=1)    # "current" → last 24 hours
        }

    def parse_temporal_query(self, query: str) -> Tuple[str, Optional[datetime], Optional[datetime]]:
        """
        Extracts temporal constraints from a natural language query.
        
        Args:
            query: Natural language query that might contain temporal expressions
            
        Returns:
            Tuple containing:
            - Cleaned query with temporal expressions removed
            - Start date (None if not specified)
            - End date (defaults to current time if not specified)
        """
        now = datetime.now()
        start_date = None
        end_date = now
        working_query = query.lower()  # Work with lowercase for easier matching

        # Handle numbered periods (e.g., "last 3 days")
        numbered_matches = re.search(self.relative_patterns['numbered_period'], working_query)
        if numbered_matches:
            number = int(numbered_matches.group(1))
            period = numbered_matches.group(2)
            start_date = self._calculate_period_start(number, period)
            working_query = re.sub(self.relative_patterns['numbered_period'], '', working_query)

        # Handle single periods (e.g., "last week")
        single_match = re.search(self.relative_patterns['single_period'], working_query)
        if single_match and not start_date:  # Only if no start date set
            period = single_match.group(1)
            start_date = self._calculate_period_start(1, period)
            working_query = re.sub(self.relative_patterns['single_period'], '', working_query)

        # Handle specific day references
        day_match = re.search(self.relative_patterns['specific_day'], working_query)
        if day_match and not start_date:
            day_ref = day_match.group(0)
            start_date, end_date = self._handle_specific_day(day_ref)
            working_query = re.sub(day_ref, '', working_query)

        # Handle informal temporal expressions
        for term, delta in self.informal_patterns.items():
            if term in working_query:
                start_date = now - delta
                working_query = re.sub(r'\b' + term + r'\b', '', working_query)

        # Clean up the query
        cleaned_query = ' '.join(working_query.split())
        
        return cleaned_query, start_date, end_date

    def _calculate_period_start(self, number: int, period: str) -> datetime:
        """
        Calculates the start date based on a number and time period.
        """
        now = datetime.now()
        if period == 'day':
            return now - timedelta(days=number)
        elif period == 'week':
            return now - timedelta(weeks=number)
        elif period == 'month':
            return now - timedelta(days=number * 30)  # Approximate
        else:  # year
            return now - timedelta(days=number * 365)  # Approximate

    def _handle_specific_day(self, day_ref: str) -> Tuple[datetime, datetime]:
        """
        Handles specific day references like 'yesterday', 'today'.
        Returns start and end timestamps for the day.
        """
        now = datetime.now()
        if day_ref == 'yesterday':
            start = now - timedelta(days=1)
        elif day_ref == 'today':
            start = now
        else:  # tomorrow
            start = now + timedelta(days=1)
            
        # Set to beginning of the day
        start = start.replace(hour=0, minute=0, second=0, microsecond=0)
        end = start + timedelta(days=1)
        return start, end

In [20]:

parser = TemporalQueryParser()
queries = [
    "show me documents from last week about project updates",
    "what changed in the system in the past 3 days",
    "find meeting notes since March 1st",
    "get recent updates about the database",
    "show changes between January and February",
    "find yesterday's deployment logs"
]

for query in queries:
    cleaned_query, start_date, end_date = parser.parse_temporal_query(query)
    print(f"\nOriginal: {query}")
    print(f"Cleaned: {cleaned_query}")
    print(f"Time range: {start_date} to {end_date}")


Original: show me documents from last week about project updates
Cleaned: show me documents from about project updates
Time range: 2024-11-23 17:58:20.725466 to 2024-11-30 17:58:20.724718

Original: what changed in the system in the past 3 days
Cleaned: what changed in the system in the
Time range: 2024-11-27 17:58:20.725922 to 2024-11-30 17:58:20.725909

Original: find meeting notes since March 1st
Cleaned: find meeting notes since march 1st
Time range: None to 2024-11-30 17:58:20.725955

Original: get recent updates about the database
Cleaned: get updates about the database
Time range: 2024-11-23 17:58:20.726018 to 2024-11-30 17:58:20.726018

Original: show changes between January and February
Cleaned: show changes between january and february
Time range: None to 2024-11-30 17:58:20.726112

Original: find yesterday's deployment logs
Cleaned: find 's deployment logs
Time range: 2024-11-29 00:00:00 to 2024-11-30 00:00:00


In [5]:
from temp import PretrainedQueryClassifier
# Initialize the classifier with the pre-trained model
classifier = PretrainedQueryClassifier()

# Analyze a query
result = classifier.analyze_query("how does machine learning work")
print(f"Query Type: {result.query_type}")
print(f"Confidence: {result.confidence}")
print(f"Weights: {result.weights}")

# Get detailed similarity scores
scores = classifier.get_similarity_scores("what is capital of France")
for query_type, score in scores.items():
    print(f"{query_type}: {score:.3f}")

Query Type: QueryType.REASONING
Confidence: 0.2551538050174713
Weights: {'dense': 0.5510307610034942, 'sparse': 0.44896923899650576}
QueryType.FACTUAL: 0.279
QueryType.REASONING: 0.181
QueryType.COMPARISON: 0.185
QueryType.EXPLORATORY: 0.179
QueryType.PROCEDURAL: 0.177


In [17]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import spacy
import logging
from enum import Enum

logger = logging.getLogger(__name__)

class QueryType(Enum):
    FACTUAL = "factual"         # Looking for specific facts or definitions
    REASONING = "reasoning"      # Seeking explanations or understanding
    COMPARISON = "comparison"    # Wanting to compare or contrast
    EXPLORATORY = "exploratory" # Open-ended information seeking
    PROCEDURAL = "procedural"   # How-to and step-by-step instructions

@dataclass
class QueryAnalysis:
    """Stores the analysis results for a query"""
    query_type: QueryType
    weights: Dict[str, float]
    confidence: float = 1.0
    features: Dict[str, float] = None

class IntentBasedClassifier:
    """
    Classifies queries based on linguistic patterns and semantic markers
    that indicate search intent.
    """
    
    def __init__(self):
        # Load spaCy's English language model for linguistic analysis
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            # If model isn't installed, download it
            import subprocess
            subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
            self.nlp = spacy.load("en_core_web_sm")
        
        # Intent markers for different query types
        self.intent_patterns = {
            QueryType.FACTUAL: {
                'question_words': ['what', 'when', 'where', 'who', 'which'],
                'verbs': ['is', 'are', 'was', 'were', 'does'],
                'patterns': ['define', 'meaning of', 'definition of']
            },
            QueryType.REASONING: {
                'question_words': ['why', 'how'],
                'verbs': ['explain', 'causes', 'affects', 'influences', 'works'],
                'patterns': ['reason for', 'because', 'explain', 'understand']
            },
            QueryType.COMPARISON: {
                'markers': ['compare', 'versus', 'vs', 'difference', 'better', 'worse'],
                'patterns': ['compared to', 'differences between', 'pros and cons'],
                'conjunctions': ['and', 'or', 'vs']
            },
            QueryType.EXPLORATORY: {
                'verbs': ['tell', 'describe', 'elaborate', 'discuss'],
                'patterns': ['tell me about', 'what are', 'information about', 'learn about'],
                'markers': ['overview', 'introduction', 'basics']
            },
            QueryType.PROCEDURAL: {
                'markers': ['how to', 'steps', 'guide', 'tutorial', 'instructions'],
                'verbs': ['make', 'create', 'build', 'implement', 'setup', 'configure'],
                'patterns': ['way to', 'process of', 'method of']
            }
        }
        
        # Weights for different retrieval methods based on query type
        self.retrieval_weights = {
            QueryType.FACTUAL: {
                'dense': 0.35,   # Favor keyword matching for facts
                'sparse': 0.65
            },
            QueryType.REASONING: {
                'dense': 0.75,   # Heavily favor semantic for explanations
                'sparse': 0.25
            },
            QueryType.COMPARISON: {
                'dense': 0.60,   # Balance for comparing entities
                'sparse': 0.40
            },
            QueryType.EXPLORATORY: {
                'dense': 0.80,   # Strong semantic preference for exploration
                'sparse': 0.20
            },
            QueryType.PROCEDURAL: {
                'dense': 0.55,   # Slight semantic preference for steps
                'sparse': 0.45
            }
        }

    def _extract_linguistic_features(self, doc) -> Dict[str, bool]:
        """Extract linguistic features from the spaCy document."""
        return {
            'has_wh_question': any(token.tag_ == 'WDT' or token.tag_ == 'WP' or token.tag_ == 'WRB' for token in doc),
            'has_comparison': any(token.dep_ == 'amod' for token in doc),
            'has_action_verb': any(token.pos_ == 'VERB' and token.dep_ != 'aux' for token in doc),
            'has_conjunction': any(token.dep_ == 'cc' for token in doc),
            'is_command': doc[0].pos_ == 'VERB'
        }

    def _calculate_type_scores(
        self,
        query: str,
        doc,
        features: Dict[str, bool]
    ) -> Dict[QueryType, float]:
        """
        Calculate scores for each query type based on linguistic features
        and intent patterns.
        """
        scores = {qt: 0.0 for qt in QueryType}
        query_lower = query.lower()
        
        # Check each query type's patterns
        for query_type, patterns in self.intent_patterns.items():
            score = 0.0
            
            # Check question words and verbs
            for key in ['question_words', 'verbs', 'markers']:
                if key in patterns:
                    score += sum(word in query_lower.split() 
                               for word in patterns[key]) * 0.3
            
            # Check multi-word patterns
            if 'patterns' in patterns:
                score += sum(pattern in query_lower 
                           for pattern in patterns['patterns']) * 0.5
            
            # Add linguistic feature scores
            if query_type == QueryType.FACTUAL and features['has_wh_question']:
                score += 0.4
            elif query_type == QueryType.REASONING and 'why' in query_lower:
                score += 0.6
            elif query_type == QueryType.COMPARISON and features['has_comparison']:
                score += 0.4
            elif query_type == QueryType.PROCEDURAL and features['is_command']:
                score += 0.4
            
            scores[query_type] = min(score, 1.0)  # Normalize to 0-1
        
        return scores

    def analyze_query(self, query: str) -> QueryAnalysis:
        """
        Analyze a query to determine its type and appropriate retrieval weights.
        
        Args:
            query: The search query to analyze
            
        Returns:
            QueryAnalysis object containing type, weights, and confidence
        """
        try:
            # Process query with spaCy
            doc = self.nlp(query)
            
            # Extract linguistic features
            features = self._extract_linguistic_features(doc)
            
            # Calculate scores for each query type
            type_scores = self._calculate_type_scores(query, doc, features)
            
            # Get the highest scoring type
            predicted_type = max(type_scores.items(), key=lambda x: x[1])
            query_type = predicted_type[0]
            confidence = predicted_type[1]
            
            # Get base weights for this type
            weights = self.retrieval_weights[query_type].copy()
            
            # Adjust weights if confidence is low
            if confidence < 0.5:
                # Move weights closer to balanced (0.5/0.5)
                for key in weights:
                    weights[key] = 0.5 + (weights[key] - 0.5) * confidence
            
            return QueryAnalysis(
                query_type=query_type,
                weights=weights,
                confidence=confidence,
                features=type_scores
            )
            
        except Exception as e:
            logger.error(f"Error analyzing query: {str(e)}")
            # Return safe defaults
            return QueryAnalysis(
                query_type=QueryType.EXPLORATORY,
                weights={'dense': 0.5, 'sparse': 0.5},
                confidence=0.0
            )


In [20]:
# Initialize classifier
classifier = IntentBasedClassifier()

# Test queries
test_queries = [
    "what is machine learning",
    "what's the capital of france?",
    "why do neural networks work well for image recognition",
    "compare supervised and unsupervised learning",
    "tell me about deep learning",
    "how to implement a neural network in Python",
    "what causes overfitting in machine learning models",
    "steps to preprocess data for machine learning",
    "difference between CNN and RNN",
    "explain backpropagation algorithm",
    "guide to training deep learning models"
]

print("\nQuery Classification Analysis:")
print("-" * 50)

for query in test_queries:
    analysis = classifier.analyze_query(query)
    
    print(f"\nQuery: {query}")
    print(f"Type: {analysis.query_type.value}")
    print(f"Confidence: {analysis.confidence:.3f}")
    print(f"Weights: {analysis.weights}")
    
    if analysis.features:
        print("\nScores for each type:")
        for qtype, score in analysis.features.items():
            print(f"  {qtype.value}: {score:.3f}")


Query Classification Analysis:
--------------------------------------------------

Query: what is machine learning
Type: factual
Confidence: 1.000
Weights: {'dense': 0.35, 'sparse': 0.65}

Scores for each type:
  factual: 1.000
  reasoning: 0.000
  comparison: 0.000
  exploratory: 0.000
  procedural: 0.000

Query: what's the capital of france?
Type: factual
Confidence: 0.400
Weights: {'dense': 0.44, 'sparse': 0.56}

Scores for each type:
  factual: 0.400
  reasoning: 0.000
  comparison: 0.000
  exploratory: 0.000
  procedural: 0.000

Query: why do neural networks work well for image recognition
Type: reasoning
Confidence: 0.900
Weights: {'dense': 0.75, 'sparse': 0.25}

Scores for each type:
  factual: 0.400
  reasoning: 0.900
  comparison: 0.400
  exploratory: 0.000
  procedural: 0.000

Query: compare supervised and unsupervised learning
Type: comparison
Confidence: 0.700
Weights: {'dense': 0.6, 'sparse': 0.4}

Scores for each type:
  factual: 0.000
  reasoning: 0.000
  comparison: 0.

In [19]:
tokenizer = EfficientTokenizer()

def preprocess_text_worker(text: str) -> List[str]:
    """Worker function for text preprocessing"""
    try:
        return tokenizer.tokenize(text)
    except Exception as e:
        logger.error(f"Error in preprocessing worker: {str(e)}")
        return []

In [6]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [7]:
from rank_bm25 import BM25Okapi
import numpy as np
from typing import Dict, List



class CustomBM25:
    """
    Enhanced BM25 implementation with improved technical term handling.
    """
    
    def __init__(self, corpus):
        self.tokenized_corpus = [preprocess_text_worker(doc) for doc in corpus]
        self.bm25 = BM25Okapi(self.tokenized_corpus)
        
        # Build vocabulary with frequency information
        self.vocabulary = {}
        for doc in self.tokenized_corpus:
            for token in doc:
                self.vocabulary[token] = self.vocabulary.get(token, 0) + 1
    
    def get_scores(self, query: str) -> np.ndarray:
        query_tokens = preprocess_text_worker(query)
        return self.bm25.get_scores(query_tokens)
    
    def get_matching_terms(self, query: str) -> Dict[str, int]:
        """Get matching terms and their corpus frequencies for debugging"""
        query_tokens = preprocess_text_worker(query)
        return {
            token: self.vocabulary.get(token, 0)
            for token in query_tokens
            if token in self.vocabulary
        }
        
    def get_document_terms(self, doc_idx: int) -> List[str]:
        """Get the tokenized terms for a specific document"""
        return self.tokenized_corpus[doc_idx] if 0 <= doc_idx < len(self.tokenized_corpus) else []


In [21]:
# from preprocessing import CustomBM25, preprocess_text_worker
import re
from tokenizer import Tokenizer

tokenizer = Tokenizer()
def preprocess_batch_worker(texts: List[str]) -> List[List[str]]:
    """Worker function for batch text preprocessing"""
    try:
        return tokenizer.batch_tokenize(texts)
    except Exception as e:
        logger.error(f"Error in batch preprocessing worker: {str(e)}")
        return [[] for _ in texts]

def test_preprocessing():
    test_cases = [
        "Testing NV-embed v1.2.3 with PyTorch",
        "Using word2vec and BERT for NLP",
        "Requirements: Python>=3.6, TensorFlow<=2.0",
        "Contact support@example.com or visit https://example.com",
        "Using C++ and C# for ML/AI",
        "pre-trained.models and word_embeddings work well",
        "Testing camelCase and PascalCase",
        "U.S.A. Ph.D. research on GPT-3.5",
    ]
    
    results = preprocess_batch_worker(test_cases)
    for result in results:
        print(result)
        

test_preprocessing()

# test_bm25()

['test', 'nv', 'embed', 'v1.2.3', 'with', 'pytorch']
['use', 'word2vec', 'and', 'bert', 'for', 'nlp']
['requirement', 'python>=3.6', 'tensorflow<=2.0']
['contact', 'support@example.com', 'or', 'visit', 'https://example.com']
['use', 'c++', 'and', 'for', 'ml', 'ai']
['pre-trained.model', 'and', 'word_embedding', 'work', 'well']
['testing', 'camelcase', 'and', 'pascalcase']
['u.s.a.', 'ph.d.', 'research', 'on', 'gpt-3.5']
