 <h4>Biomedical Document Ingestion &amp; Vector Indexing into Neo4j</h4>

This notebook processes a collection of biomedical PDFs (2020–2025) related to Type 2 Diabetes, splits them into overlapping, semantically meaningful chunks (600 characters with a 120-character overlap), and stores them in Neo4j with vector embeddings for efficient similarity search.

In [None]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Neo4jVector

# === Config ===
# Folder holding the PDFs to ingest in this cell.
PDF_DIR = "/content/drive/MyDrive/Diabetes_KG_Project/2020_1"

# BioBERT-based sentence-embedding model, loaded on the CUDA device.
# normalize_embeddings=True asks the encoder to return normalized vectors.
embedding_model = HuggingFaceEmbeddings(
    model_name="pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb",
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},
)

# Splitter: chunks of 600 with an overlap of 120 for precision.
# NOTE: with the default length function these sizes are measured in
# characters, not tokens.
splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=120)

# List the PDFs in the folder, in sorted order. The extension check is
# case-insensitive so files named "*.PDF" are not silently skipped.
pdf_files = sorted(
    os.path.join(PDF_DIR, f)
    for f in os.listdir(PDF_DIR)
    if f.lower().endswith(".pdf")
)

all_docs = []

print(f"📂 Chargement de {len(pdf_files)} PDFs depuis {PDF_DIR}")

# 1. Load and split each PDF.
# A file that cannot be read or split is reported and skipped so that one bad
# PDF does not abort the whole ingestion run.
for pdf in pdf_files:
    try:
        loader = PyPDFLoader(pdf)
        pages = loader.load()
        docs = splitter.split_documents(pages)
        all_docs.extend(docs)
    except Exception as e:
        print(f"⚠️ Erreur avec {pdf}: {e}")

print(f"📝 Nombre total de chunks : {len(all_docs)}")

# 2. Insert into Neo4j.
# url, username and password are not defined in this cell; they must already
# exist in the notebook session.
vector_index = Neo4jVector.from_documents(
    all_docs,
    embedding_model,
    url=url,
    username=username,
    password=password,
    node_label="Document",
    text_node_property="text",
    embedding_node_property="embedding",
)

print(f"✅ Insertion terminée : {len(all_docs)} chunks insérés dans Neo4j")


In [None]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Neo4jVector

# === Config ===
# Folder holding the PDFs to ingest in this cell.
PDF_DIR = "/content/drive/MyDrive/Diabetes_KG_Project/2024_2"

# BioBERT-based sentence-embedding model, loaded on the CUDA device.
# normalize_embeddings=True asks the encoder to return normalized vectors.
embedding_model = HuggingFaceEmbeddings(
    model_name="pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb",
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},
)

# Splitter: chunks of 600 with an overlap of 120 for precision.
# NOTE: with the default length function these sizes are measured in
# characters, not tokens.
splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=120)

# List the PDFs in the folder, in sorted order. The extension check is
# case-insensitive so files named "*.PDF" are not silently skipped.
pdf_files = sorted(
    os.path.join(PDF_DIR, f)
    for f in os.listdir(PDF_DIR)
    if f.lower().endswith(".pdf")
)

all_docs = []

print(f"📂 Chargement de {len(pdf_files)} PDFs depuis {PDF_DIR}")

# 1. Load and split each PDF; a failing file is reported and skipped.
for pdf in pdf_files:
    try:
        loader = PyPDFLoader(pdf)
        pages = loader.load()
        docs = splitter.split_documents(pages)
        # Drop only the final chunk, and only when the PDF produced more than
        # one chunk. NOTE(review): the reason is not stated in this notebook —
        # presumably to discard trailing end-of-document text; confirm.
        if len(docs) > 1:
            docs = docs[:-1]
        all_docs.extend(docs)
    except Exception as e:
        print(f"⚠️ Erreur avec {pdf}: {e}")

print(f"📝 Nombre total de chunks (après suppression des derniers) : {len(all_docs)}")

# 2. Insert into Neo4j in batches of `batch_size` chunks.
# The store is created once, from the first batch; the remaining batches are
# appended to that same store instead of building a new Neo4jVector (and a new
# database connection) for every batch.
# url, username and password are not defined in this cell; they must already
# exist in the notebook session.
batch_size = 200
for i in range(0, len(all_docs), batch_size):
    batch = all_docs[i:i + batch_size]
    if i == 0:
        vector_index = Neo4jVector.from_documents(
            batch,
            embedding_model,
            url=url,
            username=username,
            password=password,
            node_label="Document",
            text_node_property="text",
            embedding_node_property="embedding",
        )
    else:
        vector_index.add_documents(batch)
    print(f"✅ Batch {i//batch_size+1} inséré")
