In [1]:
import ast
import json
import re
from pathlib import Path
from pprint import pprint

import chromadb
import torch
from langchain_chroma import Chroma
from langchain_community.document_loaders import JSONLoader
from langchain_huggingface import HuggingFaceEmbeddings


In [2]:
# Sentence-transformers checkpoint used to embed every field of the dataset.
model_name = "sentence-transformers/all-mpnet-base-v2"

# Pick the compute device at runtime instead of hard-coding 'mps', so the
# notebook also runs on machines without Apple-silicon GPU support.
# 'mps' stays the first choice, which keeps the original behaviour where it
# is available.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model_kwargs = {'device': device}
# Embeddings are intentionally left un-normalised (same as before).
encode_kwargs = {'normalize_embeddings': False}

# Shared embedding function handed to every Chroma collection below.
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange


In [57]:
# NOTE(review): this `vector_store` binding is replaced by the per-field dict
# assigned further down in this same cell, so this "example" collection handle
# is never used afterwards — consider deleting this statement.
vector_store = Chroma(
    persist_directory="./chroma_langchain_db",
    embedding_function=embeddings,
    collection_name="example",
)


def extract_permalinks(input_string):
    """Flatten the ``'value'`` entries of a serialised dict / list of dicts.

    The input is parsed first as a Python literal and, if that fails, as JSON
    (so text containing ``true`` / ``false`` / ``null`` is still understood).

    Args:
        input_string: Text holding either a single mapping or a list of
            mappings, each optionally carrying a ``'value'`` key.

    Returns:
        str: For a list, the ``'value'`` entries joined with ``', '``; for a
        single mapping, its ``'value'`` entry. Non-string values are converted
        with ``str()``; ``None`` values and items without a ``'value'`` key are
        skipped. An empty string is returned when the input cannot be parsed
        or has no usable ``'value'``.
    """
    try:
        data = ast.literal_eval(input_string)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (e.g. JSON booleans/null) — try JSON instead.
        try:
            data = json.loads(input_string)
        except (ValueError, TypeError, RecursionError):
            return ""

    # Treat a single mapping as a one-element list so both shapes share the
    # same extraction and stringification rules.
    if isinstance(data, dict):
        entries = [data]
    elif isinstance(data, list):
        entries = data
    else:
        return ""

    values = [
        entry['value'] if isinstance(entry['value'], str) else str(entry['value'])
        for entry in entries
        if isinstance(entry, dict) and entry.get('value') is not None
    ]
    return ', '.join(values)

# One Chroma collection per dataset field, keyed by field name. Each
# collection is named after its field and persisted in a sub-directory of the
# same name, so the three can never drift apart.
vector_store = {
    field: Chroma(
        collection_name=field,
        embedding_function=embeddings,
        persist_directory=f"./chroma_langchain_db/{field}",
    )
    for field in (
        "short_description",
        "num_employees_enum",
        "location_identifiers",
        "last_funding_total",
        "description",
        "category_groups",
        "last_funding_type",
        "categories",
        "last_funding_at",
    )
}

# For every field, load the list of documents selected by
# `.messages[].<field>` from the merged JSON dump. Despite the name, each
# value is the *result* of `.load()` (a list of documents), not a loader.
# `text_content=False` lets non-string JSON values through the loader.
loaders = {
    field: JSONLoader(
        file_path='./data/merged.json',
        jq_schema=f'.messages[].{field}',
        text_content=False,
    ).load()
    for field in (
        "short_description",
        "num_employees_enum",
        "location_identifiers",
        "last_funding_total",
        "description",
        "category_groups",
        "last_funding_type",
        "categories",
        "last_funding_at",
    )
}


# Replace the structured fields' page_content with the flat text produced by
# extract_permalinks. This mutates the documents in place.
# NOTE(review): running this loop a second time would feed already-flattened
# text back into extract_permalinks — confirm whether re-runs are expected.
for i in range(len(loaders["location_identifiers"])):
    # The funding total is coerced to text before it is parsed.
    funding_doc = loaders["last_funding_total"][i]
    funding_doc.page_content = str(funding_doc.page_content)

    for field in ("location_identifiers", "last_funding_total", "category_groups", "categories"):
        doc = loaders[field][i]
        doc.page_content = extract_permalinks(doc.page_content)


In [58]:
print(type(loaders["last_funding_total"][0].page_content))
length = len(loaders["short_description"])

# NOTE(review): presumably chosen to stay under Chroma's per-call batch
# limit — confirm before raising it.
batch_size = 5000

# Walk the documents in fixed-size windows. The range runs up to `length`
# (not `length - batch_size`) so the final, possibly partial, batch is
# ingested as well instead of being silently dropped.
# NOTE(review): no ids are passed to add_documents, so re-running this cell
# appears to insert the same documents again — confirm.
for start in range(0, length, batch_size):
    for key, store in vector_store.items():
        print(key)
        batch = loaders[key][start:start + batch_size]
        # Coerce per document: a list slice has no `page_content` attribute,
        # so the conversion has to be applied to each element.
        for doc in batch:
            if not isinstance(doc.page_content, str):
                doc.page_content = str(doc.page_content)
        # Skip empty windows (a field with fewer documents than
        # short_description).
        if batch:
            store.add_documents(documents=batch)

<class 'str'>
short_description
<class 'list'>
num_employees_enum
<class 'list'>
location_identifiers
<class 'list'>
last_funding_total
<class 'list'>
description
<class 'list'>
category_groups
<class 'list'>
last_funding_type
<class 'list'>
categories
<class 'list'>
last_funding_at
<class 'list'>
short_description
<class 'list'>
num_employees_enum
<class 'list'>
location_identifiers
<class 'list'>
last_funding_total
<class 'list'>
description
<class 'list'>
category_groups
<class 'list'>
last_funding_type
<class 'list'>
categories
<class 'list'>
last_funding_at
<class 'list'>
short_description
<class 'list'>
num_employees_enum
<class 'list'>
location_identifiers
<class 'list'>
last_funding_total
<class 'list'>
description
<class 'list'>
category_groups
<class 'list'>
last_funding_type
<class 'list'>
categories
<class 'list'>
last_funding_at
<class 'list'>
short_description
<class 'list'>
num_employees_enum
<class 'list'>
location_identifiers
<class 'list'>
last_funding_total
<class 'l

In [59]:

# Fetch the 5 closest "description" documents for the query, each paired with
# its similarity score.
query_vector = vector_store["description"].similarity_search_with_score(
    query="AI saas",
    k=5,
)

In [60]:
# Display the (Document, score) tuples returned by the similarity search.
# NOTE(review): scores look like distances (smaller = closer) — confirm.
query_vector

[(Document(metadata={'seq_num': 15554, 'source': '/Users/adityamakkar/Desktop/CS/HTN2024/spark/vectordb/data/merged.json'}, page_content='SaaS'),
  0.367587149143219),
 (Document(metadata={'seq_num': 3, 'source': '/Users/adityamakkar/Desktop/CS/HTN2024/spark/vectordb/data/merged.json'}, page_content='Sakana AI is an research company focused on creating innovative foundation models inspired by nature. The organization emphasizes evolution and collective intelligence in its AI development processes.'),
  0.70896315574646),
 (Document(metadata={'seq_num': 2901, 'source': '/Users/adityamakkar/Desktop/CS/HTN2024/spark/vectordb/data/merged.json'}, page_content="SAVVI AI helps businesses turn their data into better decisions, powered by Machine Learning. SAVVI's tool is the fastest way for organizations to get results-driven Machine Learning use cases into production, delivering time to value quickly."),
  0.7334390878677368),
 (Document(metadata={'seq_num': 1249, 'source': '/Users/adityamakk