In [2]:
import chromadb
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import JSONLoader
import json
from pathlib import Path
from pprint import pprint
import re
import ast
from uuid import uuid4


In [3]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'mps'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange


In [4]:

def extract_permalinks(input_string):
    try:
        data = ast.literal_eval(input_string)
        if type(data) == list:
          permalinks = [d['value'] for d in data if 'value' in d]
          result = ', '.join(permalinks)
          return result
        else:
            result = data['value']
            if type(result) != str:
                result = str(result)
            return result
    except:
        return ""



loaders = {

    "num_employees_enum": JSONLoader(
        file_path='./data/merged.json',
        jq_schema='.messages[].num_employees_enum',
        text_content=False).load(),
    "location_identifiers": JSONLoader(
        file_path='./data/merged.json',
        jq_schema='.messages[].location_identifiers',
        text_content=False).load(),
    "last_funding_total": JSONLoader(
        file_path='./data/merged.json',
        jq_schema='.messages[].last_funding_total',
        text_content=False).load(),
    "description": JSONLoader(
        file_path='./data/merged.json',
        jq_schema='.messages[].description',
        text_content=False).load(),
    "last_funding_type": JSONLoader(
        file_path='./data/merged.json',
        jq_schema='.messages[].last_funding_type',
        text_content=False).load(),
    "categories": JSONLoader(
        file_path='./data/merged.json',
        jq_schema='.messages[].categories',
        text_content=False).load(),
    "last_funding_at": JSONLoader(
        file_path='./data/merged.json',
        jq_schema='.messages[].last_funding_at',
        text_content=False).load(),
}


for i in range(len(loaders["location_identifiers"])):

    loaders["last_funding_total"][i].page_content = str(loaders["last_funding_total"][i].page_content)
    loaders["location_identifiers"][i].page_content = extract_permalinks(loaders["location_identifiers"][i].page_content)
    loaders["last_funding_total"][i].page_content = extract_permalinks(loaders["last_funding_total"][i].page_content)
    loaders["categories"][i].page_content = extract_permalinks(loaders["categories"][i].page_content)


uuids = [str(uuid4()) for _ in range(len(loaders["location_identifiers"]))]


In [11]:
a = "239ac115-7e4e-48c6-bad9-39a456965d64"

hnsw = {
    "hnsw:space": "cosine",
    "hnsw:construction_ef": 10000,
    "hnsw:M": 16,
    "hnsw:search_ef": 10000,
    "hnsw:num_threads": 8,
}
vector_store = {
    "num_employees_enum": Chroma(
                            collection_name="num_employees_enum" + a,
                            embedding_function=embeddings,
                            persist_directory="./chroma_langchain_db/num_employees_enum" + a,
                            collection_metadata=hnsw
                            ),
    "location_identifiers": Chroma(
                            collection_name="location_identifiers" + a,
                            embedding_function=embeddings,
                            persist_directory="./chroma_langchain_db/location_identifiers" + a,
                            collection_metadata=hnsw
                            ),
    "last_funding_total": Chroma(
                            collection_name="last_funding_total" + a,
                            embedding_function=embeddings,
                            persist_directory="./chroma_langchain_db/last_funding_total" + a,
                            collection_metadata=hnsw
                            ),
    "description": Chroma(
                            collection_name="description" + a,
                            embedding_function=embeddings,
                            persist_directory="./chroma_langchain_db/description" + a,
                            collection_metadata=hnsw
                            ),
    "last_funding_type": Chroma(
                            collection_name="last_funding_type" + a,
                            embedding_function=embeddings,
                            persist_directory="./chroma_langchain_db/last_funding_type" + a,
                            collection_metadata=hnsw
                            ),
    "categories": Chroma(
                            collection_name="categories" + a,
                            embedding_function=embeddings,
                            persist_directory="./chroma_langchain_db/categories" + a,
                            collection_metadata=hnsw
                            ),

    "last_funding_at": Chroma(
                            collection_name="last_funding_at" + a,
                            embedding_function=embeddings,
                            persist_directory="./chroma_langchain_db/last_funding_at" + a,
                            collection_metadata=hnsw
                            ),

                }


In [56]:
with open('./data/merged.json') as json_file:
    companies = json.load(json_file)['messages']


In [55]:
data

[{'uuid': 'd27f66f7-a4a2-4252-da8a-387927f9f0d4',
  'name': 'Groq',
  'type': 'organization',
  'imageUrl': 'https://images.crunchbase.com/image/upload/c_pad,h_25,w_25,f_auto,b_white,q_auto:eco,dpr_1/v1417424257/bgq56bti8dt2ne8omvu7le3jn0kzou7feco18wuo.png',
  'link': 'https://www.crunchbase.com/organization/groq',
  'website': {'value': 'http://groq.com'},
  'identifier': {'permalink': 'groq',
   'image_id': 'le3jn0kzou7feco18wuo',
   'uuid': 'd27f66f7-a4a2-4252-da8a-387927f9f0d4',
   'entity_def_id': 'organization',
   'value': 'Groq'},
  'founder_identifiers': [{'permalink': 'jonathan-ross-80d0',
    'image_id': 'r206wbaqjith9xnmzjpj',
    'uuid': 'ef1a6af0-a180-49e5-82c4-5110ba5280d0',
    'entity_def_id': 'person',
    'value': 'Jonathan Ross'}],
  'linkedin': {'value': 'https://www.linkedin.com/company/groq'},
  'short_description': 'Groq radically simplifies compute to accelerate workloads in artificial intelligence, machine learning, and high-performance computing.',
  'faceboo

In [12]:
print(len(vector_store["num_employees_enum"].get()['documents']))

24844


In [9]:
print(type(loaders["last_funding_total"][0].page_content))
length = len(loaders["description"])
print(length)

import math
# for key in vector_store.keys():
#     print(len(vector_store[key].get()['documents']))

batch_size = 5000


for i in range(1, math.floor(length/batch_size) + 1):
  for key in vector_store.keys():
    print(key)
    end_index = min(i * batch_size + batch_size, length)
    vector_store[key].add_documents(documents=loaders[key][i*batch_size:end_index], ids=uuids[i*batch_size:end_index])


<class 'str'>
24844
num_employees_enum
location_identifiers
last_funding_total
description
last_funding_type
categories
last_funding_at
num_employees_enum
location_identifiers
last_funding_total
description


KeyboardInterrupt: 

In [17]:

query_vector = {}

for key in vector_store.keys():

    try:
        query_vector[key] = vector_store[key].similarity_search_with_score(query="AI saas", k=10000)
        print('done with', key)
    except:
        print("could not do this", key)

# results = vector_store.similarity_search(query="hello")

done with num_employees_enum
done with location_identifiers
done with last_funding_total
done with description
done with last_funding_type
done with categories
done with last_funding_at


In [83]:
import numpy as np
rrf_constant = 60
legnth = len(vector_store['categories'].get()['ids'])
base_value = 0


weights = {
          "location_identifiers": 10,
            "num_employees_enum": 1,
            "last_funding_total": 1,
            "description": 1,
            "last_funding_type": 1,
            "categories": 1,
            "last_funding_at": 1,
           }

value_array ={i: base_value * np.ones((7,)) for i in vector_store['categories'].get()['ids']}
def reciprocal_rank_fusion(query_vector: dict, n:int ) -> dict:
    results = {}
    id_list = vector_store['categories'].get()['ids']
    for _,key in enumerate(query_vector.keys()):
        current_query = query_vector[key]

        for i in range(len(current_query)):

            seqID = current_query[i][0].metadata['seq_num'] - 1 # zero index
            _id = id_list[seqID]
            cos_sim = current_query[i][1]
            value_array[_id][_] = cos_sim

    final_values = [(id, np.sum(value_array[id])) for id in id_list]
    final_values.sort(key=lambda x: x[1], reverse=True)
    final_values = final_values[:n]
    final_companies = [companies[id_list.index(i[0])] for i in final_values]


    return final_companies

In [85]:
query_vector['location_identifiers']


[(Document(metadata={'seq_num': 13386, 'source': '/Users/adityamakkar/Desktop/CS/HTN2024/spark/vectordb/data/merged.json'}, page_content='Iasi, Iasi, Romania, Europe'),
  0.603450357913971),
 (Document(metadata={'seq_num': 14482, 'source': '/Users/adityamakkar/Desktop/CS/HTN2024/spark/vectordb/data/merged.json'}, page_content='Iasi, Iasi, Romania, Europe'),
  0.603450357913971),
 (Document(metadata={'seq_num': 22440, 'source': '/Users/adityamakkar/Desktop/CS/HTN2024/spark/vectordb/data/merged.json'}, page_content='Kuressaare, Saaremaa, Estonia, Europe'),
  0.6829075813293457),
 (Document(metadata={'seq_num': 1938, 'source': '/Users/adityamakkar/Desktop/CS/HTN2024/spark/vectordb/data/merged.json'}, page_content='Siauliai, Siauliu Apskritis, Lithuania, Europe'),
  0.6872613430023193),
 (Document(metadata={'seq_num': 21102, 'source': '/Users/adityamakkar/Desktop/CS/HTN2024/spark/vectordb/data/merged.json'}, page_content='Loria, Veneto, Italy, Europe'),
  0.7043982744216919),
 (Document(me

In [84]:
reciprocal_rank_fusion(query_vector, 10)[0]

{'uuid': '9fcfe049-fc12-a096-b64e-39b08baf148a',
 'name': 'Brave',
 'type': 'organization',
 'imageUrl': 'https://images.crunchbase.com/image/upload/c_pad,h_25,w_25,f_auto,b_white,q_auto:eco,dpr_1/v1417424257/bgq56bti8dt2ne8omvu7g6p1w47fg2wy0ormm3lc.png',
 'link': 'https://www.crunchbase.com/organization/brave-software',
 'website': {'value': 'https://www.brave.com/'},
 'identifier': {'permalink': 'brave-software',
  'image_id': 'g6p1w47fg2wy0ormm3lc',
  'uuid': '9fcfe049-fc12-a096-b64e-39b08baf148a',
  'entity_def_id': 'organization',
  'value': 'Brave'},
 'founder_identifiers': [{'permalink': 'brendan-eich',
   'image_id': 'xox2j5rja272askiled9',
   'uuid': 'c0a45f01-30e6-a8f7-97b8-4749659aad3c',
   'entity_def_id': 'person',
   'value': 'Brendan Eich'},
  {'permalink': 'brian-r-bondy',
   'image_id': 'yykkjbbogfxifehkaxvs',
   'uuid': '253c23da-77db-45b5-e7b5-6c171fee4733',
   'entity_def_id': 'person',
   'value': 'Brian Bondy'}],
 'linkedin': {'value': 'https://www.linkedin.com/co

In [31]:

value_array1 = {i: np.zeros((7,)) for i in vector_store['description'].get()['ids']}

In [45]:
min_val = 1000000
max_val = 0
for i in vector_store['categories'].get()['metadatas']:
  min_val = min(min_val, i['seq_num'])
  max_val = max(max_val, i['seq_num'])

print(min_val, max_val)

1 24844
