In [38]:
import os
import time
from dotenv import load_dotenv
load_dotenv()

import pandas as pd
# dbs 
import chromadb
from chromaDB import ChromaDB
# OpenAI
from openai import OpenAI

# Internal 
from utils import num_tokens_from_string, chunks, create_embeddings

In [31]:
db = ChromaDB()
client = chromadb.PersistentClient(path="./data/test_chromadb")
collection_name = 'test_collection'
collection = client.get_or_create_collection(collection_name)
time.sleep(2)
print(collection.count())

0


In [32]:
collection.peek()

{'ids': [],
 'embeddings': [],
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None}

In [33]:
# get data
df = pd.read_pickle('./data/wikipedia-simple-text-embedding-ada-002-100K.pkl')
df = df.head(10)
df = df.drop(columns=['sparse_values', 'metadata'])
df = df.rename(columns={'values': 'embeddings', 'blob': 'metadatas'})
# make embeddings a list
df['embeddings'] = df['embeddings'].apply(lambda x: x.tolist())
# make metadatas a list contianing a dict
df['metadatas'] = df['metadatas'].apply(lambda x: [x])
for col in df.columns:
    print(col, type(df[col][0]))
    

id <class 'str'>
embeddings <class 'list'>
metadatas <class 'list'>


In [34]:

for i, row in df.iterrows():
    collection.upsert(
        embeddings=row['embeddings'],
        metadatas=row['metadatas'],
        ids=[row['id']]
    )

In [35]:
collection.peek()

{'ids': ['1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', '1-7', '1-8', '1-9'],
 'embeddings': [[-0.011254455894231796,
   -0.01698738895356655,
   -0.015781110152602196,
   -0.03248247504234314,
   -0.01371675729751587,
   -0.004977453965693712,
   -0.0030732126906514168,
   -0.011832723394036293,
   -0.01687546633183956,
   -0.0009839877020567656,
   0.015122009441256523,
   0.018815461546182632,
   -0.038053739815950394,
   0.01835533417761326,
   -0.022981474176049232,
   -1.799799792934209e-05,
   0.02748325653374195,
   -0.009992215782403946,
   0.0031089656986296177,
   0.013455604203045368,
   0.00831337459385395,
   -0.010713496245443821,
   -0.01698738895356655,
   -0.0023363877553492785,
   0.005885272286832333,
   0.012448298744857311,
   0.00794029887765646,
   -0.02415044605731964,
   0.0243991632014513,
   -0.019897380843758583,
   -0.032581962645053864,
   0.008307156153023243,
   -0.020941993221640587,
   -0.010495868511497974,
   -0.021165838465094566,
   -0.017634054

In [39]:
# make a query
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

def query_embedding(query: str) -> list:
    """Returns a list of embeddings for a given query"""
    embed = client.embeddings.create(
    input=query,
    model="text-embedding-ada-002",
    )
    res = embed.data[0].embedding
    return res

In [41]:
query = "How did the roman empire fall?"
query_embed = query_embedding(query)
query_result = collection.query(
    query_embeddings=query_embed,
    n_results=3
)

query_result

{'ids': [['1-7', '1-6', '1-8']],
 'distances': [[0.4969365636387445, 0.5212051852484866, 0.5330498527533882]],
 'metadatas': [[{'chunk': 7,
    'source': 'https://simple.wikipedia.org/wiki/April',
    'text': "April 7, 1994 - The Rwandan Genocide begins.\n April 9, 1865 - American Civil War: Confederate forces under Robert E. Lee surrender to Union forces.\n April 9, 1940 - World War II: Denmark and Norway are invaded by Nazi Germany.\n April 9, 1989 - April 9 tragedy: In Tbilisi, Georgia, a peaceful demonstration for independence is broken up by the Soviet Army, killing 20 people. The country gains independence on this date exactly two years later.\n April 10, 1815 - Mount Tambora in Indonesia erupts in a huge eruption, affecting the world's climate for at least a year.\n April 10, 2010 - A plane crash near Smolensk, Russia, kills several people who were important in Poland, including President Lech Kaczynski.\n April 11, 1814 - Napoleon Bonaparte is exiled to the island of Elba.\n Ap

In [51]:
# structure the output
for i in range(len(query_result['ids'][0])):
    print(f"{i+1} - {query_result['ids'][0][i]} - {(query_result['distances'][0][i]):.2f}")
    print(f"{query_result['metadatas'][0][i]['title']}")
    print(f"{query_result['metadatas'][0][i]['text'][:100]}...\n")

1 - 1-7 - 0.50
April
April 7, 1994 - The Rwandan Genocide begins.
 April 9, 1865 - American Civil War: Confederate forces...

2 - 1-6 - 0.52
April
April 1, 1918 - The Royal Air Force is founded.
 April 1, 1976 - Apple Inc. is founded.
 April 1, 19...

3 - 1-8 - 0.53
April
April 15, 1912 - The ship RMS Titanic sinks near Newfoundland after hitting an iceberg, resulting in...


# LangChain and Chroma

In [52]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

In [78]:
chromadb_path = "./data/wiki-chromadb"
collection_name = 'wiki-pages'

# local chromadb instance
local_chroma = ChromaDB()

# langchain [openai] embeddings instance
openai_key = os.getenv('OPENAI_API_KEY')
embedding_function = OpenAIEmbeddings(openai_api_key=openai_key)

# langchain [chroma] instance load from disc
db = Chroma(persist_directory=chromadb_path, embedding_function=embedding_function)

db._collection.count()

0

In [80]:
query = "April"
docs = db.similarity_search(query)
docs

[]

### Passing Chroma client into LangChain

In [74]:
persistent_client = chromadb.PersistentClient(chromadb_path)
collection = persistent_client.get_collection(collection_name)

langchain_chroma = Chroma(
    client=persistent_client,
    collection_name=collection_name,
    embedding_function=embedding_function,
)

print("There are", langchain_chroma._collection.count(), "in the collection")

There are 100000 in the collection


In [79]:
# query 
query = "April"
docs = langchain_chroma.similarity_search(query)
docs

ValidationError: 1 validation error for Document
page_content
  none is not an allowed value (type=type_error.none.not_allowed)