In [None]:
import sqlite3
import json
import chromadb
import logging
from typing import List
import numpy as np
import numpy.typing as npt
import random


class CustomEmbeddingFunction:
    def _normalize(self, vector: npt.NDArray) -> npt.NDArray:
        """Normalizes a vector to unit length using L2 norm."""
        norm = np.linalg.norm(vector)
        if norm == 0:
            return vector
        return vector / norm

    def create_embeddings(self, input_docs):
        """Creates normalized embeddings from input documents."""
        embeddings = []
        for doc in input_docs:
            emb_idx, sparse_embedding = doc
            embedding = np.zeros(10000, dtype='float32')
            for index, value in sparse_embedding:
                embedding[index] = value
            normalized_embedding = self._normalize(embedding)
            embeddings.append(normalized_embedding.tolist())
        return embeddings


def load_link_embeddings(filename: str):
    """Load the link_embeddings.json and create an inverse mapping."""
    try:
        with open(filename, 'r') as f:
            link_embeddings = json.load(f)
        # Create inverse mapping
        inverse_link_embeddings = {str(v): k for k, v in link_embeddings.items()}
        return inverse_link_embeddings
    except Exception as e:
        logging.error(f"Error loading link embeddings from '{filename}': {e}")
        return {}

def read_embeddings(db_filename: str):
    """Read embeddings from the SQLite database."""
    try:
        conn = sqlite3.connect(db_filename)
        cursor = conn.cursor()
        cursor.execute('SELECT emb_idx, "values" FROM embeddings LIMIT 40000 OFFSET 1200')
        rows = cursor.fetchall()
        conn.close()
        return rows
    except Exception as e:
        logging.error(f"Error reading embeddings from '{db_filename}': {e}")
        return []

def main():
    logging.basicConfig(level=logging.INFO)
    
    db_filename = 'database/final_embedding.db'
    link_embeddings_filename = 'wiki_knowledge/embeddings/link_embeddings.json'
    collection_name = "wikichroma"
    dimension = 10000

    # Load the inverse link embeddings
    inverse_link_embeddings = load_link_embeddings(link_embeddings_filename)

    # Read the embeddings from the database
    rows = read_embeddings(db_filename)

    # Create the custom embedding function instance
    embedding_function = CustomEmbeddingFunction()

    # Create a ChromaDB client and collection
    chroma_client = chromadb.PersistentClient(path="wikichroma")
    collection = chroma_client.get_or_create_collection(name=collection_name)
    if collection is None:
        logging.error("Failed to retrieve or create collection.")
        return

    print(f"The number of elements in the collection before adding: {collection.count()}")

    # Process and prepare vectors for insertion
    documents = []
    ids = []
    embeddings = []

    for emb_idx, sparse_embedding_json in rows:
        name = inverse_link_embeddings.get(str(emb_idx), None)
        if not name:
            continue

        sparse_embedding = json.loads(sparse_embedding_json)
        if not sparse_embedding:
            continue

        documents.append(name)
        ids.append(str(emb_idx))
        embeddings.append((emb_idx, sparse_embedding))

    # Normalize embeddings using the custom embedding function
    normalized_embeddings = embedding_function.create_embeddings(embeddings)

    # Add vectors to the collection
    try:
        collection.add(ids=ids, embeddings=normalized_embeddings, documents=documents)
        print(f"Added {len(normalized_embeddings)} embeddings to the ChromaDB collection.")
    except Exception as e:
        logging.error(f"Error adding documents to the collection: {e}")

    # Verify and print the number of elements in the collection
    try:
        print(f"The number of elements in the collection after adding: {collection.count()}")
    except Exception as e:
        logging.error(f"Error counting elements in the collection: {e}")
    return collection, documents

collection, documents = main()



def get_random_document(collection, total_docs: int):
    """Retrieve a random document from the collection."""
    random_offset = random.randint(0, total_docs - 1)
    result = collection.get(limit=1, offset=random_offset, include=["embeddings", "documents"])
    return result["documents"][0], result["embeddings"][0], result["ids"][0]

def query_collection(collection, query_vector):
    """Query the ChromaDB collection for the nearest neighbors."""
    results = collection.query(query_embeddings=[query_vector])
    return results

doc = get_random_document(collection, 18135)

# Query the collection based on the selected random document
print(f"The document being queried is: {doc[0]}")
results = query_collection(collection, doc[1])
print("Query Results:")
for result, dist in zip(results['documents'], results['distances']):
    print(result, dist)

In [None]:
print(f"The number of elements in the collection are {collection.count()}")

In [None]:
print(collection.peek(limit = 10))

In [None]:
import sqlite3
import json
import chromadb
import logging
from typing import List
import numpy as np
import numpy.typing as npt
import random

def get_random_document(collection, total_docs: int):
    """Retrieve a random document from the collection."""
    random_offset = random.randint(0, total_docs - 1)
    result = collection.get(limit=1, offset=random_offset, include=["embeddings", "documents"])
    return result["documents"][0], result["embeddings"][0], result["ids"][0]

def query_collection(collection, query_vector):
    """Query the ChromaDB collection for the nearest neighbors."""
    results = collection.query(query_embeddings=[query_vector])
    return results


collections = chroma_client.list_collections()
print(f"My collections in this chromadb are {collections}")
print("We have a client")
collection = chroma_client.get_collection(name=collection_name)
print("We have a collection")
total_docs = collection.count()
print(f"The total documents is {total_docs}")


In [None]:
#doc = get_random_document(collection, 10)
print(f"We have a doc {doc[0]}")

# Query the collection based on the selected random document
print(f"The document being queried is: {doc[1]}")
results = query_collection(collection, doc[1])
print("Query Results:")
for result, dist in zip(results['documents'], results['distances']):
    print(result, dist)

In [3]:
import sqlite3
import pandas as pd
import json
import numpy as np
import numpy.typing as npt

class CustomEmbeddingFunction:
    def _normalize(self, vector: npt.NDArray) -> npt.NDArray:
        """Normalizes a vector to unit length using L2 norm."""
        norm = np.linalg.norm(vector)
        if norm == 0:
            return vector
        return vector / norm

    def create_embeddings(self, input_docs):
        """Creates normalized embeddings from input documents."""
        embeddings = []
        for doc in input_docs:
            emb_idx, sparse_embedding = doc
            embedding = np.zeros(10000, dtype='float32')
            for index, value in sparse_embedding:
                embedding[index] = value
            normalized_embedding = self._normalize(embedding)
            embeddings.append(normalized_embedding.tolist())
        return embeddings
        
# Function to query the level1alt table and return a pandas DataFrame
def query_level1_table(db_filename):
    """Query the level1alt table and return its contents as a pandas DataFrame."""
    conn = sqlite3.connect(db_filename)
    
    # Use pandas to read the level1alt table into a DataFrame
    #df = pd.read_sql_query('SELECT * FROM level1final LIMIT 100', conn)
    df = pd.read_sql_query('SELECT * FROM embeddings WHERE emb_idx = 1824508 LIMIT 5', conn)
    # Close the connection
    conn.close()
    
    return df

# Function to get the total number of rows in the level1alt table
def get_row_count(db_filename):
    conn = sqlite3.connect(db_filename)
    
    # Execute a query to count the rows
    cursor = conn.cursor()
    cursor.execute('SELECT COUNT(*) FROM embeddings WHERE LENGTH("values") > 5')
    row_count = cursor.fetchone()[0]
    
    # Close the connection
    conn.close()
    
    return row_count

# Example usage
db_filename = 'database/final_embedding.db'
df_level1 = query_level1_table(db_filename)
# Get and display the total number of rows
#total_rows = get_row_count(db_filename)
#print(f"Total number of rows in the table: {total_rows}")
# Display the head of the DataFrame
#df_level1.head(50)
processor = CustomEmbeddingFunction()
raw = json.loads(df_level1.loc[0, 'values'])
emb = processor.create_embeddings([["1824508", raw]])
print(emb)


[[0.08424245566129684, 0.20713256299495697, 0.0892864242196083, 0.042892202734947205, 0.029188834130764008, 0.10435673594474792, 0.10059372335672379, 0.02465839684009552, 0.036610569804906845, 0.08315368741750717, 0.049502648413181305, 0.01834988407790661, 0.05801945924758911, 0.03850271552801132, 0.009801800362765789, 0.11212022602558136, 0.03215401619672775, 0.04723188653588295, 0.0322667695581913, 0.03329507261514664, 0.01911211758852005, 0.03361256793141365, 0.01680714450776577, 0.05885401740670204, 0.034350715577602386, 0.03298800438642502, 0.02420511282980442, 0.051931172609329224, 0.01856803335249424, 0.045118413865566254, 0.015358840115368366, 0.03034316934645176, 0.0387173630297184, 0.016738252714276314, 0.01746784895658493, 0.026925580576062202, 0.017745116725564003, 0.021575365215539932, 0.055516473948955536, 0.026671152561903, 0.0006042206077836454, 0.007607612293213606, 0.032286129891872406, 0.023221950978040695, 0.024654436856508255, 0.019580746069550514, 0.01144707109779

In [5]:
import sqlite3
import json
import chromadb
import logging
from typing import List
import numpy as np
import numpy.typing as npt
import random

collection_name = "wc_final_3"
chroma_client = chromadb.PersistentClient(path="wikichroma")
collection = chroma_client.get_collection(name=collection_name)

In [None]:
def query_collection(collection, query_vector):
    """Query the ChromaDB collection for the nearest neighbors."""
    results = collection.query(query_embeddings=[query_vector])
    return results
    
results = query_collection(collection, emb[0])
print("Query Results:")
for result, dist in zip(results['documents'], results['distances']):
    print(result, dist)