In [2]:
# import
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

In [3]:
from jq_mimic import JSONLoader
import json
from pathlib import Path
from pprint import pprint

file_path = './results/generic_name_count.json'
output_file_path = './results/formatted_generic_name_count.json'  # Name of the output file

# The input holds one JSON document per line (JSON Lines), not a single JSON
# document. Read it as UTF-8 explicitly so non-ASCII terms do not depend on
# the platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Skip blank lines (e.g. extra newlines at the end of the file): json.loads
# raises JSONDecodeError on an empty string.
parsed_data = [json.loads(line) for line in lines if line.strip()]

# Write the parsed records to a new file as a single indented JSON array.
with open(output_file_path, 'w', encoding='utf-8') as file:
    json.dump(parsed_data, file, indent=4)

print(f"JSON data has been formatted and saved to '{output_file_path}'.")

# Read the formatted file back in, confirming that what was written parses.
data = json.loads(Path(output_file_path).read_text(encoding='utf-8'))

JSON data has been formatted and saved to './results/formatted_generic_name_count.json'.


In [4]:
# Define the metadata extraction function.
def metadata_func(record, metadata):
    """Copy the record's ``count`` value, as text, into the metadata mapping.

    Args:
        record: Mapping for one JSON record; only its "count" entry is read.
        metadata: Metadata mapping passed in by the caller; updated in place.

    Returns:
        The same ``metadata`` object, now carrying a "count" key.
    """
    # .get() yields None when the record has no "count" entry, in which case
    # the stored value is the string "None".
    count_text = str(record.get("count"))
    metadata["count"] = count_text
    return metadata

# Turn the reformatted JSON file into documents. The rest of this notebook
# reads each document's page_content as the term and metadata['count'] as its
# count, so presumably content_key selects the text field and metadata_func
# fills in the metadata -- confirm against jq_mimic.JSONLoader.
loader = JSONLoader(
    metadata_func=metadata_func,
    content_key="term",
    file_path=output_file_path,
)
docs = loader.load()

In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Vectorstores
from langchain.vectorstores import Chroma

# Constructed with no arguments, so any credentials must come from the
# environment (presumably OPENAI_API_KEY -- confirm for your setup).
embedding = OpenAIEmbeddings()

persist_directory = 'vector_db'
# To force a rebuild, delete the persisted store first:
# !rm -rf ./vector_db

_store_path = Path(persist_directory)
if _store_path.is_dir() and any(_store_path.iterdir()):
    # Reuse the persisted store. Calling from_documents against an existing
    # persist_directory adds the same documents again, so each re-run of this
    # cell would duplicate every entry and re-embed the whole corpus.
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding,
    )
else:
    # First run: embed every document and write the store to disk.
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        persist_directory=persist_directory,
    )
    vectordb.persist()

In [23]:
query = "colonovideoscope"
retrieved_docs = vectordb.similarity_search_with_score(query, k=20)

import ast
import re

# Map each retrieved term to its stored count. Only element 0 of each hit
# (the document) is used; presumably element 1 is the similarity score, which
# is ignored here. A term that appears in several hits keeps the count of the
# last one.
issue_dict = {
    hit[0].page_content: hit[0].metadata['count']
    for hit in retrieved_docs
}

# Print the dictionary
print(issue_dict)

{'VIDEO COLONOSCOPE': '2067', 'COLONOVIDEOSCOPE': '9174', 'GASTROINTESTINAL VIDEOSCOPE': '9826', 'DUODENOVIDEOSCOPE': '6142', 'HDVIDEO COLONOSCOPE 3.8C 13.2T 1700L FWJ': '1186', 'VIDEO COLONOSCOPE 3.8C 2.8C 13.2T FWJ': '1064', 'VIDEO DUODENOSCOPE': '2338', 'ULTRASOUND GASTROVIDEOSCOPE': '3162', 'BRONCHOVIDEOSCOPE': '4768', 'ENDOSCOPIC INSTRUMENT': '6680', 'VIDEO COLONOCOPE - I10 STANDARD': '4304', 'FLEX DEFLECTABLE VIDEOSCOPE': '3588', 'VIDEO GASTROSCOPE - I10 STANDARD': '3462', 'LINEAR CUTTERS - ENDOSCOPIC': '2045', 'ENDOSCOPE AND/OR ACCESSORIES': '4339', 'ENDOSCOPIC STAPLER': '1429', 'HD VIDEO GASTROSCOPE 2.8C 9.8T 1050L': '1391', 'ARTHROSCOPE': '6818', 'ENDOSCOPIC MULTIPLE CLIP APPLIER': '1884', 'ENDOSCOPIC LINEAR CUTTER': '1149'}
