In [4]:
from pymilvus import connections, Collection, utility, __version__
import voyageai
import os

def connect_milvus(host='localhost', port='19530'):
    """Setup Milvus connection and collection"""
    # Check if there's an existing connection and disconnect
    if connections.has_connection("default"):
        connections.disconnect("default")
    
    # Try to connect with a timeout
    connections.connect(
        alias="default",
        host=host, 
        port=port,
        timeout=10.0  # Add timeout in seconds
    )
    
    # Verify connection
    if not connections.has_connection("default"):
        raise ConnectionError("Failed to establish connection to Milvus")

    collection_name = "vector_store"
    
    # Check if collection exists
    if not utility.has_collection(collection_name):
        raise ValueError(f"Collection '{collection_name}' does not exist")
        
    # Get existing collection
    collection = Collection(name=collection_name)
    
    # Load the indices explicitly
    collection.load()
    
    return collection

In [3]:
def hybrid_search(collection, query_vector, text_expr=None, metadata_expr=None, timestamp_expr=None, limit=50):
    search_params = {"metric_type": "L2", "params": {"nprobe": 16}}
    
    # Build expression for hybrid filtering
    expr = None
    if text_expr:
        expr = text_expr
    if metadata_expr:
        expr = f"({expr}) && ({metadata_expr})" if expr else metadata_expr
    if timestamp_expr:
        expr = f"({expr}) && ({timestamp_expr})" if expr else timestamp_expr
    
    # results = collection.query(
    #     expr=expr,
    #     output_fields=["text", "metadata", "timestamp"],
    #     limit=limit
    # )
    results = collection.search(
        #data=[],
        data=[query_vector],
        anns_field="embeddings",
        param=search_params,
        limit=limit,

        expr=expr,
        output_fields=["text", "metadata", "timestamp"]
    )
    
    return results

In [11]:
from functools import lru_cache

@lru_cache(maxsize=128)  # You can adjust the maxsize as needed
def embed_query(query):
    vo = voyageai.Client(api_key=os.getenv("VOYAGE_API_KEY"))
    vector = vo.embed([query], model="voyage-3-large", input_type="query").embeddings[0]
    return vector

In [55]:
import pandas as pd

def search_documents(query, limit=3):
    """Search documents using vector similarity"""
    collection = connect_milvus()

    vector = embed_query(query)
    results = hybrid_search(
        collection,
        query_vector=vector,
        limit=limit
    )
    print (results[0][0])
    # Convert the Hits object to a list of dictionaries
    formatted_results = []
    for hit in results[0]:
        # Assuming each hit has attributes like id, distance, and fields
        formatted_results.append({
            "id": hit.id,
            "distance": hit.distance,         
            **{key: hit.entity.get(key) for key in ["text", "metadata", "timestamp"]}            
        })
    
    # Create a DataFrame from the formatted results
    df = pd.DataFrame(formatted_results)
    
    return df

search_documents("proefdieren voorkeursbeleid")

id: 455787340486357583, distance: 0.8478602766990662, entity: {'text': 'van de lab oratoria zodat er alleen proefdiervrij onderzoek plaatsvindt ? Is een actief wervings - en voorkeursbeleid mogelijk voor bedrijven die geen dieren gebruike n? Graag een toe lichting. Indiener A.L. Bakker i _URL_ -draagvlak -in-amsterdamse -gemeenteraad - voor -inzet -op-proefdiervrije -innovatie ii _URL_ -room/20210910IPR11926/meps -demand -eu-action - plan -to-end -the-use-of-animals -in-research -and -testing iii _URL_ -inzake -het-economisch -herstel -en- investeringsplan -ontwik keling -proefdiervrije -onderzoeksmethoden -stimuleren', 'metadata': {'council': 'amsterdam', 'document_type': 'Schriftelijke-vragen', 'year': 2022, 'month': 1, 'topic': None, 'filename': '8+sv+A_L_+Bakker+Experimenten+op+dieren+in+het+Medical+Business+Park', 'chunk_position': {'start': -1, 'end': 552}, 'source_file': 'C:\\politieke-datasets\\amsterdam\\Schriftelijke-vragen\\2022\\01\\8+sv+A_L_+Bakker+Experimenten+op+dieren+i

Unnamed: 0,id,distance,text,metadata,timestamp
0,455787340486357583,0.84786,van de lab oratoria zodat er alleen proefdierv...,"{'council': 'amsterdam', 'document_type': 'Sch...",1738620407
1,455787340486357615,0.951109,combinat ie met kunstmatige intelligentie (AI)...,"{'council': 'amsterdam', 'document_type': 'Sch...",1738620409
2,455787340486209699,0.953733,wethouder Sport en Bewegen. 181.2.6 Een waardi...,"{'council': 'amsterdam', 'document_type': 'Ing...",1738618375


In [56]:
def track_policy_evolution(query, start_year=None, end_year=None, limit=50):
    """
    Track how a specific policy topic has evolved over time.
    
    Args:
        query (str): The policy topic to search for
        start_year (int, optional): Filter results from this year onwards
        end_year (int, optional): Filter results up to this year
        limit (int, optional): Maximum number of results to return
        
    Returns:
        pd.DataFrame: Results sorted chronologically with relevant metadata
    """
    collection = connect_milvus()
    vector = embed_query(query)
    
    # Build timestamp expression if years are specified
    timestamp_expr = None
    if start_year or end_year:
        conditions = []
        if start_year:
            # Convert year to timestamp (assuming start of year)
            start_ts = int(pd.Timestamp(f"{start_year}-01-01").timestamp())
            conditions.append(f"timestamp >= {start_ts}")
        if end_year:
            # Convert year to timestamp (assuming end of year)
            end_ts = int(pd.Timestamp(f"{end_year}-12-31").timestamp())
            conditions.append(f"timestamp <= {end_ts}")
        timestamp_expr = " && ".join(conditions)

    results = hybrid_search(
        collection,
        query_vector=vector,
        timestamp_expr=timestamp_expr,
        limit=limit
    )

    # Format results into a DataFrame
    formatted_results = []
    for hit in results[0]:
        result = {
            "id": hit.id,
            "similarity_score": 1 - hit.distance,  # Convert distance to similarity score
            "text": hit.entity.get("text"),
            "timestamp": pd.Timestamp(hit.entity.get("timestamp"), unit='s'),
            "council": hit.entity.get("metadata", {}).get("council"),
            "document_type": hit.entity.get("metadata", {}).get("document_type"),
            "year": hit.entity.get("metadata", {}).get("year"),
            "month": hit.entity.get("metadata", {}).get("month"),
            "topic": hit.entity.get("metadata", {}).get("topic"),
            "filename": hit.entity.get("metadata", {}).get("filename")
        }
        formatted_results.append(result)
    
    # Create DataFrame and sort chronologically
    df = pd.DataFrame(formatted_results)
    df = df.sort_values('timestamp')
    
    return df

# Example usage:
df = track_policy_evolution("bicycle infrastructure", start_year=2020, end_year=2023)
# display(df)

KeyError: 'timestamp'