In [1]:
import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings
import json
import os

In [2]:
# Load the OpenAI API key from a file into an environment variable called OPENAI_API_KEY
%run .load_openai_api_key.py

# Get the API key from the environment
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Set a name of the future collection
COLLECTION_NAME = "dataflow-meta-information-embeddings"

In [3]:
# Persistent Chroma client rooted at '.chroma.db', with anonymized telemetry disabled
chroma_settings = Settings(anonymized_telemetry=False)
client = chromadb.PersistentClient(path='.chroma.db', settings=chroma_settings)

# Embedding function backed by the OpenAI 'text-embedding-3-small' model
embedding_function = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name='text-embedding-3-small',
)

In [4]:
# Fetch the collection by name, creating it if it does not exist yet.
# NOTE(review): the message below says "created" even when an existing
# collection was fetched rather than newly created.
collection = client.get_or_create_collection(
    embedding_function=embedding_function,
    name=COLLECTION_NAME,
)
if collection:
    print(f"Collection {COLLECTION_NAME} created successfully")

Collection dataflow-meta-information-embeddings created successfully


In [5]:
# Load the (question, answer) records that will be embedded.
# The file is read as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding; `json` is already imported in the first cell.
with open('flat_info_for_embedding.json', encoding='utf-8') as f:
    flat_info_for_embedding = json.load(f)

In [15]:
from uuid import uuid4 as uuid

# Each record is a (question, answer) pair: the question text becomes the
# document to embed, and the answer is carried along as metadata.
documents = [str(question) for question, _ in flat_info_for_embedding]
metadatas = [{"answer": answer} for _, answer in flat_info_for_embedding]

# One fresh random identifier per record
ids = [str(uuid()) for _ in flat_info_for_embedding]

In [16]:
# Start from an empty collection on every run.
# get_or_create_collection guarantees the collection exists before it is deleted,
# so the reset needs no exception handling and works whether or not a previous
# run left data behind.
# NOTE(review): catching a specific exception type for "collection missing" is
# fragile because that type appears to differ between chromadb releases - confirm
# against the installed version before reintroducing a try/except here.
client.get_or_create_collection(name=COLLECTION_NAME, embedding_function=embedding_function)
client.delete_collection(COLLECTION_NAME)
collection = client.create_collection(name=COLLECTION_NAME, embedding_function=embedding_function)
print(collection)

Collection(id=98f5d05b-d9ab-4d0f-a2ad-1105bc0f6e60, name=dataflow-meta-information-embeddings)


In [17]:
# Bundle the parallel lists, then add all records to the collection in one call
to_vectorize = dict(ids=ids, documents=documents, metadatas=metadatas)
collection.add(
    ids=to_vectorize['ids'],
    documents=to_vectorize['documents'],
    metadatas=to_vectorize['metadatas'],
)

In [28]:
query_text = 'What is EXP_DESTINATION?'

# Ask the collection for the single closest stored document
result_sets = collection.query(n_results=1, query_texts=[query_text])

# Results are grouped per query text; index 0 selects the only query sent
matched_documents = result_sets['documents'][0]
print(f"The most similar questions to '{query_text}' is: \n\t{matched_documents}")

matched_answer = result_sets['metadatas'][0][0]['answer']
print(f"The answer to the question is: \n\t{matched_answer}")

The most similar questions to 'What is EXP_DESTINATION?' is: 
	['EXP_DESTINATION']
The answer to the question is: 
	The name that corresponds to the dimension code: 'EXP_DESTINATION' is Destination of expenditure.


In [29]:
query_text = 'What is the code for the destination of the expression?'

# Ask the collection for the single closest stored document
result_sets = collection.query(n_results=1, query_texts=[query_text])

# Results are grouped per query text; index 0 selects the only query sent
matched_documents = result_sets['documents'][0]
print(f"The most similar questions to '{query_text}' is: \n\t{matched_documents}")

matched_answer = result_sets['metadatas'][0][0]['answer']
print(f"The answer to the question is: \n\t{matched_answer}")

The most similar questions to 'What is the code for the destination of the expression?' is: 
	["What name that corresponds to the dimension code: 'EXP_DESTINATION'?"]
The answer to the question is: 
	The name that corresponds to the dimension code: 'EXP_DESTINATION' is Destination of expenditure.


In [30]:
query_text = 'What is the code for the lower secondary education?'

# Ask the collection for the single closest stored document
result_sets = collection.query(n_results=1, query_texts=[query_text])

# Results are grouped per query text; index 0 selects the only query sent
matched_documents = result_sets['documents'][0]
print(f"The most similar questions to '{query_text}' is: \n\t{matched_documents}")

matched_answer = result_sets['metadatas'][0][0]['answer']
print(f"The answer to the question is: \n\t{matched_answer}")

The most similar questions to 'What is the code for the lower secondary education?' is: 
	["What is the code for 'Lower secondary education' within the code list ID 'EDUCATION_LEV'?"]
The answer to the question is: 
	The English name of the code 'ISCED11_2' within the code list ID 'EDUCATION_LEV' is 'Lower secondary education'.
