In [18]:
!pip install chromadb
!pip install scikit-learn



In [19]:
from sklearn.ensemble import IsolationForest
import numpy as np
from chromadb.config import Settings
from chromadb import Client

In [20]:
# Step 1: Initialize ChromaDB client
client = Client(Settings())
collection = client.get_or_create_collection(name="embedding_collection")

In [21]:
# Step 2: Function to add embeddings to the database
def add_embeddings(embedding_list, metadata_list):
    # Updated add method usage, including 'ids' argument
    collection.add(
        ids=[f"embed_{idx}" for idx in range(len(embedding_list))],  # Use generated strings as IDs
        documents=[f"embed_{idx}" for idx in range(len(embedding_list))], # Use documents to store embedding IDs as strings
        embeddings=embedding_list,  # Provide all embeddings as a list
        metadatas=metadata_list  # Provide all metadatas as a list
    )

In [31]:
# Step 3: Function to retrieve all embeddings from the database
def get_all_embeddings():
    # Include 'where' and 'where_document' to specify retrieval criteria if needed
    # Get the total number of embeddings in the collection
    total_embeddings = collection.count()

    if total_embeddings > 0:
        results = collection.get(
            ids=[f"embed_{idx}" for idx in range(total_embeddings)],  # Get all embeddings
            # or use other relevant criteria for where and where_document
        )
        # Check if embeddings are retrieved before accessing them
        if results and results.get("embeddings"):
            embeddings = np.array([result["embedding"] for result in results["embeddings"]])
            metadata = results["metadatas"]  # Assuming this key is correct
            # Reshape embeddings to 2D if necessary
            # Removed redundant reshaping logic as embeddings should already be 2D
            return embeddings, metadata
        else:
            print("No embeddings found in the collection.")
            # Return a single dummy embedding to avoid the error
            return np.array([np.zeros(128)]), [{"info": "dummy_data"}]
    else:
        print("Collection is empty. No embeddings to retrieve.")
        # Return a single dummy embedding to avoid the error
        return np.array([np.zeros(128)]), [{"info": "dummy_data"}]

In [32]:
# Step 4: Outlier detection using Isolation Forest
def detect_outliers():
    embeddings, metadata = get_all_embeddings()

    # Initialize Isolation Forest
    iso_forest = IsolationForest(contamination=0.1, random_state=42)
    iso_forest.fit(embeddings)

    # Predict anomalies (-1 for outliers, 1 for inliers)
    predictions = iso_forest.predict(embeddings)

    # Print results
    for i, pred in enumerate(predictions):
        if pred == -1:
            print(f"Outlier detected: {metadata[i]}")

In [33]:
# Step 5: Example usage
if __name__ == "__main__":
    # Example embeddings (replace with real embeddings)
    embedding_list = [
        np.random.rand(128),  # Normal data
        np.random.rand(128),  # Normal data
        np.random.rand(128) * 10  # Outlier data
    ]

    # Metadata for the embeddings
    metadata_list = [
        {"info": "normal_data_1"},
        {"info": "normal_data_2"},
        {"info": "outlier_data"}
    ]

    # Add embeddings to the database
    add_embeddings(embedding_list, metadata_list)

    # Detect outliers
    detect_outliers()




No embeddings found in the collection.
