# Load life sciences domain content into Qdrant

- This notebook is a deliberately simple starting point.
- The source PDFs are read from a public S3 bucket with read-only access.

In [1]:
# List the objects in the source S3 bucket using the AWS CLI.
!aws s3 ls s3://kg-rememberall/

2025-01-14 09:04:45    1567312 2311.08526v1.pdf
2025-01-14 09:04:47     329278 2406.12925v2.pdf
2025-01-13 13:53:19    3165662 2406.13106v3.pdf
2025-01-14 09:04:43     505260 2409.12656v1.pdf
2025-01-14 09:04:44     553008 2410.05046v1.pdf
2025-01-14 09:04:47    3813091 2501.03172v1.pdf
2025-01-13 13:53:20     708813 pmid24378760.pdf
2025-01-13 13:53:21    3138069 pmid27438146.pdf
2025-01-13 13:53:23    1771239 pmid27453043.pdf
2025-01-13 13:53:12     845943 pmid30762338_si.pdf
2025-01-13 13:53:13    2357171 pmid30862715.pdf
2025-01-13 13:53:15    2663787 pmid33077733.pdf
2025-01-13 13:53:16     891075 pmid35559673.pdf
2025-01-13 16:31:13    4102905 s41587-021-01145-6.pdf
2025-01-13 16:31:11    4550676 s41597-023-01960-3.pdf
2025-01-13 13:53:17    1514184 s41698-024-00583-0.pdf


In [2]:
# Install/upgrade the notebook's dependencies into the kernel's environment
# (-q: quiet, -U: upgrade to the latest available release).
# NOTE(review): no versions are pinned, so a later re-run may pull different releases.
%pip install -qU langchain-text-splitters langchain-qdrant langchain-community langchain-openai qdrant-client PyMuPDF boto3


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import boto3
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore

# Environment variable setup
# Fail fast with one readable error when a required credential is missing.
# Writing os.getenv(NAME) back into os.environ[NAME] is a no-op when the variable
# is set and raises an opaque TypeError (str expected, not NoneType) when it is not.
REQUIRED_ENV_VARS = (
    "OPENAI_API_KEY",
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
)
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(missing_env_vars)
    )

QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_API_URL = os.getenv("QDRANT_API_URL")  # Ensure this is correct

# AWS region used by the S3 client below
os.environ["AWS_DEFAULT_REGION"] = 'us-west-2'

# Parameters
COLLECTION_NAME = "life_sciences_pdfs"
CHUNK_SIZE = 1000      # chunk_size passed to the text splitter
CHUNK_OVERLAP = 200    # chunk_overlap passed to the text splitter

# Initialize S3 client
s3_client = boto3.client('s3', config=boto3.session.Config(signature_version='s3v4'))
bucket_name = 'kg-rememberall'

# Initialize the embedding model
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [4]:
# Sanity check: show the Qdrant endpoint that was read from the environment.
# NOTE(review): the printed URL identifies the cluster; consider clearing this
# cell's output before sharing or committing the notebook.
print(QDRANT_API_URL)

https://dcc50e13-4537-4069-9b9f-26da8f65900c.us-east4-0.gcp.cloud.qdrant.io


In [5]:
# Connect to Qdrant and wrap the existing collection in a LangChain vector store.
qdrant_connection_options = {
    "url": QDRANT_API_URL,
    "api_key": QDRANT_API_KEY,
    "prefer_grpc": True,  # gRPC transport for faster communication
    "timeout": 30,        # bound each request so a dead endpoint cannot hang the cell
}
client = QdrantClient(**qdrant_connection_options)

# Vector store handle used by the query cells below
qdrant = QdrantVectorStore(
    client=client,
    collection_name=COLLECTION_NAME,
    embedding=embedding_model,
)

print("Qdrant client initialized for querying.")

Qdrant client initialized for querying.


In [6]:
# `client` and `qdrant` are already created by the preceding cell. This cell used
# to rebuild both with identical arguments, which only opened a second connection
# and discarded the first. Reuse the existing objects and confirm that the target
# collection can actually be fetched before running any queries.
client.get_collection(collection_name=COLLECTION_NAME)  # raises if it cannot be fetched

print("Qdrant client initialized for querying.")

Qdrant client initialized for querying.


In [8]:
# Free-text question to look up in the vector store
query_text = "Using LLMs for drug discovery"

# Fetch the 5 chunks closest to the query
results = qdrant.similarity_search(query_text, k=5)

# Print each hit's text, numbered from 1
for rank, hit in enumerate(results, start=1):
    print(f"Result {rank}:\n{hit.page_content}\n")


Result 1:
ﬁles across hundreds of cell lines4–7, machine learning (ML) models have
emerged as a promising approach towards predicting drug response8–13. ML
models for drug response prediction typically integrate omic data from
cancercelllineswithdrugproﬁlestopredictdrugsensitivity,asmeasuredby
IC50 or AUC13,14.
Several studies have so far addressed open questions on how to train
ML models for drug response prediction. Notably, Shariﬁ-Noghabi et al.14
carried out a systematic study on the comparative performance of several
ML models when trained and tested on the most popular cell line datasetsto
predict different measures of drug response. In agreement with previously
reported striking discordances between two large pharmacogenomic
datasets15, namely CGP6 and CCLE5, cross-domain generalization issues
that question the application of ML models in clinically relevant tasks have
been reported14. The use of IC50 as a proxy of therapeutic efﬁcacy has also

Result 2:
was covered by only two 

## Only run when initializing an empty vector store collection

In [None]:
# Ingest every PDF in the S3 bucket into the Qdrant collection.
#
# Explicit opt-in switch: set to True only when (re)populating the collection.
# Previously the paginator lines were commented out, so this cell died with a
# NameError on `pages` under "Run All"; the flag makes that intent explicit.
RUN_INGESTION = False

if RUN_INGESTION:
    # List all files in the S3 bucket using pagination
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name)
else:
    pages = []
    print("RUN_INGESTION is False; skipping ingestion.")

# The splitter does not depend on the file being processed, so build it once.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)

# Keys that failed at any stage, reported in the final summary
failed_keys = []

# Process each file in the S3 bucket
for page in pages:
    for obj in page.get('Contents', []):
        file_key = obj['Key']

        # Skip non-PDF files
        if not file_key.lower().endswith('.pdf'):
            print(f"Skipping non-PDF file: {file_key}")
            continue

        temp_file_path = f"/tmp/{os.path.basename(file_key)}"
        try:
            # Download the file from S3 to a temporary location
            try:
                s3_client.download_file(bucket_name, file_key, temp_file_path)
                print(f"Downloaded {file_key} to {temp_file_path}.")
            except Exception as e:
                print(f"Failed to download {file_key}: {e}")
                failed_keys.append(file_key)
                continue

            # Load the PDF document
            try:
                loader = PyMuPDFLoader(temp_file_path)
                docs = loader.load()
                print(f"Loaded {file_key} with {len(docs)} pages.")
            except Exception as e:
                print(f"Failed to load {file_key}: {e}")
                failed_keys.append(file_key)
                continue

            # Split the document into chunks
            splits = text_splitter.split_documents(docs)
            print(f"Split {file_key} into {len(splits)} chunks.")

            # Store the embeddings in Qdrant
            try:
                qdrant = QdrantVectorStore.from_documents(
                    documents=splits,
                    embedding=embedding_model,
                    collection_name=COLLECTION_NAME,
                    url=QDRANT_API_URL,
                    api_key=QDRANT_API_KEY,
                    prefer_grpc=True
                )
                print(f"Successfully stored embeddings for {file_key} in Qdrant.")
            except Exception as e:
                print(f"Failed to store embeddings for {file_key}: {e}")
                failed_keys.append(file_key)
        finally:
            # Clean up the temporary file on every exit path (including errors);
            # it may not exist if the download itself failed.
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)
                print(f"Deleted temporary file {temp_file_path}.")

# Only claim success when ingestion actually ran and nothing failed.
if RUN_INGESTION:
    if failed_keys:
        print(f"Finished with {len(failed_keys)} failed file(s): {failed_keys}")
    else:
        print("All files have been processed and stored in Qdrant.")