In [1]:
import json
import gzip
import qdrant_client

from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core import StorageContext
from llama_index.core import Document

In [2]:
# Global llama-index defaults: the "llama3" model served through Ollama for
# generation, and a HuggingFace embedding model for vectorisation.
Settings.llm = Ollama(model="llama3")
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# Semantic splitter, driven by the same embedding model that is set globally.
splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=Settings.embed_model,
)

# Plain sentence-based splitter kept alongside it as a baseline.
base_splitter = SentenceSplitter(chunk_size=512)

In [4]:
# Connect to a Qdrant instance listening on localhost:6333.
# Other ways to construct the client (not used here):
#   location=":memory:"         -> fast, light-weight experiments without a
#                                  deployed Qdrant (needs qdrant-client >= 1.1.1)
#   url="http://<host>:<port>"  -> address an instance by URL
#   api_key="<qdrant-api-key>"  -> API key for Qdrant Cloud
client = qdrant_client.QdrantClient(host="localhost", port=6333)

# Vectors are stored in the "pubmed_demo" collection; the storage context
# hands that vector store to the index built later on.
vector_store = QdrantVectorStore(collection_name="pubmed_demo", client=client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [5]:
# Path to the gzipped JSON input, relative to the notebook's working directory.
jsonfile = "../data/pubmed_cis_json/pubmed24n1073_cis.json.gz"

# Accumulator for the Document objects created from the JSON records.
documents = []

In [6]:
# Decompress the file and parse its full contents as JSON in one go.
with gzip.open(jsonfile) as gz_file:
    data = json.loads(gz_file.read())

In [7]:

# Build one Document per record in `data`, using the abstract as the text and
# carrying the PMID, journal and publication date along as metadata.
#
# The list is rebuilt from scratch in this cell instead of being appended to a
# list created in an earlier cell: with the append-based version, re-running
# this cell silently duplicated every document (and therefore every vector
# written to the index afterwards).
documents = [
    Document(
        text=record["abstract"],
        metadata={
            "pmid": pmid,
            "journal": record["journal"],
            "pubdate": record["pubdate"],
        },
    )
    for pmid, record in data.items()
]

In [8]:
# Chunk, embed and store the documents in the Qdrant-backed vector store.
#
# The semantic splitter is passed explicitly via `transformations`. Without
# it, from_documents falls back to the default node parser and the
# SemanticSplitterNodeParser configured earlier in the notebook is never
# applied to the abstracts.
# NOTE(review): this writes into the collection behind `storage_context`
# each time the cell runs; confirm whether re-runs against a persistent
# Qdrant server should reuse the existing collection instead.
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    transformations=[splitter],
)

In [9]:
# Create a query engine on top of the index, using its default settings
# (no arguments are overridden here).
query_engine = index.as_query_engine()

In [10]:
# Question posed to the query engine; the result is kept in `response`.
question = "how to identify cis-regulatory elements in the genome?"
response = query_engine.query(question)

In [11]:
# print() renders the object through str(), so no explicit conversion is needed.
print(response)

To identify cis-regulatory elements in the genome, open chromatin analysis of primary tissues or single-cell analysis of accessible chromatin (scATAC-seq) can be used. Single-cell analysis, such as SnapATAC, can overcome sample heterogeneity and map regulatory elements controlling cell-type specific gene expression patterns.
