In [4]:
from datetime import datetime
import chromadb
client = chromadb.PersistentClient(path='database2')
# Initialize Chroma DB client
# my_collection = client.create_collection(name='my_collection1')
my_collection1 = client.create_collection(
        name="collection2",
        metadata={"hnsw:space": "cosine"} # l2 is the default
    )


In [5]:
# Define your documents, metadata, and IDs
documents = [
    "Analysis of market trends and consumer behavior in the tech industry.",
    "Case study on sustainable development practices in urban planning.",
    "Research findings on the impact of climate change on marine ecosystems.",
    "Comparative analysis of healthcare systems in developed and developing countries.",
    "Report on innovations in renewable energy technologies.",
    "Proposal for enhancing cybersecurity measures in financial institutions.",
    "Study on the effectiveness of remote learning tools during the COVID-19 pandemic.",
    "Analysis of social media trends and their impact on digital marketing strategies.",
    "Case study on the adoption of artificial intelligence in healthcare diagnostics.",
    "Review of sustainable agriculture practices and their economic implications.",
    "Proposal for improving public transportation infrastructure in urban areas."
]
metadata = [
    {
        "source": "Tech Industry Report",
        "author": "David Lee",
        "date_created": 1597171200,  # Example timestamp
        "tags": "Tech"
    },
    {
        "source": "Urban Planning Journal",
        "author": "Emma Johnson",
        "date_created": 1599244800,  # Example timestamp
        "tags": "Sustainable Development"
    },
    {
        "source": "Climate Change Research",
        "author": "Michael Smith",
        "date_created": 1634726400,  # Example timestamp
        "tags": "Climate Change"
    },
    {
        "source": "Healthcare Systems Review",
        "author": "Sophia Brown",
        "date_created": 1636944000,  # Example timestamp
        "tags": "Healthcare"
    },
    {
        "source": "Renewable Energy Innovations",
        "author": "James White",
        "date_created": 1638345600,  # Example timestamp
        "tags": "Tech"
    },
    {
        "source": "Cybersecurity Proposal",
        "author": "Olivia Taylor",
        "date_created": 1641782400,  # Example timestamp
        "tags": "Cybersecurity"
    },
    {
        "source": "Remote Learning Study",
        "author": "William Miller",
        "date_created": 1645180800,  # Example timestamp
        "tags": "Remote Learning"
    },
    {
        "source": "Social Media Analysis",
        "author": "Ava Martinez",
        "date_created": 1648204800,  # Example timestamp
        "tags": "Social Media"
    },
    {
        "source": "AI in Healthcare Diagnostics",
        "author": "Noah Garcia",
        "date_created": 1651305600,  # Example timestamp
        "tags": "Artificial Intelligence"
    },
    {
        "source": "Sustainable Agriculture Review",
        "author": "Isabella Lopez",
        "date_created": 1652620800,  # Example timestamp
        "tags": "Sustainable Agriculture"
    },
    {
        "source": "Urban Transportation Proposal",
        "author": "Ethan Wilson",
        "date_created": 1654867200,  # Example timestamp
        "tags": "Urban Infrastructure"
    }
]

ids = [
    "id5", "id6", "id7", "id8", "id9",
    "id10", "id11", "id12", "id13", "id14", "id15"
]

# Assuming `my_collection` is already defined somewhere in your code
my_collection1.add(documents=documents, metadatas=metadata, ids=ids)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:07<00:00, 10.6MiB/s]


In [38]:
def get_max_date_created(my_collection):
    # Get the results with metadata included
    results = my_collection.get(include=["metadatas"])

    # Extract metadata
    meta = []
    for d in results['metadatas']:
        meta.append(d)

    # Extract date_created values
    date_created_list = [item['date_created'] for item in meta]

    # Return the maximum date_created value
    return max(date_created_list)




# Chroma use pre filtering

In [83]:
current_time = datetime.now()
results = my_collection1.query(
        query_texts="Tell me about covid 19",
        # n_results=2,  # Return only the latest document
        where={"date_created": {"$eq": get_max_date_created(my_collection1)}}  # Filter based on the latest timestamp
    )
print(results)
current_time1 = datetime.now()

execution_time = current_time1 - current_time
print("Execution Time:", execution_time)


{'ids': [['id15']], 'distances': [[0.9880140723406068]], 'metadatas': [[{'author': 'Ethan Wilson', 'date_created': 1654867200, 'source': 'Urban Transportation Proposal', 'tags': 'Urban Infrastructure'}]], 'embeddings': None, 'documents': [['Proposal for improving public transportation infrastructure in urban areas.']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}
Execution Time: 0:00:00.133028


# Applying post filtering manually

In [99]:


def post_filtering(collection_name, query_texts, top_k=3, sort_order='desc', cosine_distance_threshold=None):
    my_collection = client.get_collection(name=collection_name)

    # Perform the query to retrieve documents
    result = my_collection.query(
        query_texts=query_texts,
        n_results=top_k,
    )

    metadatas = result['metadatas'][0]

    # Filter results based on cosine_distance_threshold if provided
    if cosine_distance_threshold is not None and 'distances' in result:
        filtered_metadatas = []
        for doc, distance in zip(metadatas, result['distances'][0]):
            if distance < cosine_distance_threshold:
                filtered_metadatas.append(doc)
        metadatas = filtered_metadatas

    # Sort metadatas based on date_created if sorting is requested
    if sort_order:
        if sort_order == 'desc':
            metadatas = sorted(metadatas, key=lambda x: x['date_created'], reverse=True)
        elif sort_order == 'asc':
            metadatas = sorted(metadatas, key=lambda x: x['date_created'], reverse=False)
        else:
            raise ValueError("Invalid sort_order. Choose 'asc' for ascending or 'desc' for descending.")

    # Print sorted documents and their metadata
    print("Sorted Documents:" if sort_order else "Documents:")
    for idx, doc in enumerate(metadatas, start=1):
        print(f"Document {idx}:")
        print(f"Date Created: {doc['date_created']}")
        print(f"Source: {doc['source']}")
        print(f"Author: {doc['author']}")
        print(f"Tags: {doc['tags']}")
        if 'documents' in result:
            print(f"Document Content: {result['documents'][0][idx-1]}")
        if 'distances' in result:
            print(f"Distance: {result['distances'][0][idx-1]}")
        print()

    return metadatas


# Example usage:
collection_name = "collection2"
top_k_results = 5
sort_order = 'desc'  # Default sorting order is descending
cosine_distance_threshold = 0.9  # threshold value

current_time = datetime.now()
print("Execution Time:", current_time)

sorted_documents = post_filtering(collection_name, "tell me about covid 19", top_k_results, sort_order, cosine_distance_threshold)
current_time1 = datetime.now()
execution_time = current_time1 - current_time
print("Execution Time:", execution_time)


Execution Time: 2024-07-09 09:03:13.515138
Sorted Documents:
Document 1:
Date Created: 1651305600
Source: AI in Healthcare Diagnostics
Author: Noah Garcia
Tags: Artificial Intelligence
Document Content: Study on the effectiveness of remote learning tools during the COVID-19 pandemic.
Distance: 0.5436493481434879

Document 2:
Date Created: 1645180800
Source: Remote Learning Study
Author: William Miller
Tags: Remote Learning
Document Content: Comparative analysis of healthcare systems in developed and developing countries.
Distance: 0.7934484383447122

Document 3:
Date Created: 1641782400
Source: Cybersecurity Proposal
Author: Olivia Taylor
Tags: Cybersecurity
Document Content: Case study on the adoption of artificial intelligence in healthcare diagnostics.
Distance: 0.8607029298704181

Document 4:
Date Created: 1636944000
Source: Healthcare Systems Review
Author: Sophia Brown
Tags: Healthcare
Document Content: Proposal for enhancing cybersecurity measures in financial institutions.
Dist