## Setup

In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the dataset folder; it is reused for the input and the output location
data_dir = "cureus"
out_dir = data_dir
## Input data directory
inputdirectory = Path("./data_input") / data_dir
## This is where the output csv files will be written
outputdirectory = Path("./data_output") / out_dir

## Load Documents

In [None]:
## Alternative loaders, kept for reference:
## Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split every document into overlapping chunks; sizes are measured with len()
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
## Preview one chunk as a sanity check. The index is clamped so that a small
## corpus (fewer than four chunks) no longer raises IndexError, and nothing is
## printed when the input directory produced no chunks at all.
if pages:
    print(pages[min(3, len(pages) - 1)].page_content)


## Create a dataframe of all the chunks

In [None]:
## documents2Dataframe is a project helper (helpers/df_helpers) that turns the
## split chunks into a dataframe; the saved output of this cell shows the
## columns text, source and chunk_id.
from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
## Sanity check: (rows, columns) of the chunk dataframe
print(df.shape)
df.head()

(23, 3)


Unnamed: 0,text,source,chunk_id
0,Abstract India’s health indicators have improv...,data_input\cureus\cureus-0015-00000040274.txt,985d2fd8af2a4bc5b58611806bf7796a
1,"Categories: Public Health, Epidemiology/Public...",data_input\cureus\cureus-0015-00000040274.txt,7a28fc7c6e0d446692f8f8072b365984
2,Introduction And Background India’s health ind...,data_input\cureus\cureus-0015-00000040274.txt,02d26156892e46f483659adf8ac35fee
3,"An extensive literature search was performed, ...",data_input\cureus\cureus-0015-00000040274.txt,500564eb54f547dab31383db65c31525
4,Review Overview of the public and private heal...,data_input\cureus\cureus-0015-00000040274.txt,dc0bb2442b5d46639fd7ffe3be2b64d7


## Extract Concepts

In [None]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, the edge dataframe is regenerated with the LLM, and both dataframes are written to the output directory in CSV format so we don't have to calculate them again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Otherwise, the edge dataframe is read back from the output directory.

In [None]:
## To regenerate the graph with LLM, set this to True
regenerate = False

if not regenerate:
    ## Reuse the edge list written by an earlier run
    dfg1 = pd.read_csv(outputdirectory / "graph.csv", sep="|")
else:
    ## Run the LLM extraction (df2Graph) and convert its result to a dataframe (graph2Df)
    concepts_list = df2Graph(df, model='zephyr:latest')
    dfg1 = graph2Df(concepts_list)
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)

    ## Persist the edges and the chunks so the next run can skip the LLM
    dfg1.to_csv(outputdirectory / "graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory / "chunks.csv", sep="|", index=False)

## Blank strings are treated as missing; an edge needs both nodes and a label.
## Increasing the weight of the relation to 4.
## Contextual-proximity edges get their weight from co-occurrence counts later.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

(149, 5)


Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,india's health indicators,peer nations,continue to lag behind,ae0fd26675d645e787964255667e90f4,4
2,health workers density,doctors and nurses/midwives,"for 10,00 persons",ae0fd26675d645e787964255667e90f4,4
4,skilled health workforce,india,reinforces the central role human resources ha...,ae0fd26675d645e787964255667e90f4,4
5,skewed inter-state,urban-rural,and public-private sector divide,ae0fd26675d645e787964255667e90f4,4
7,health budget,federal,offers an unprecedented opportunity to do this,ae0fd26675d645e787964255667e90f4,4


## Calculating contextual proximity

In [None]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Build co-occurrence edges between nodes that appear in the same chunk.

    Every value in ``node_1`` / ``node_2`` is treated as a node mention tagged
    with its ``chunk_id``. The mentions are self-joined on ``chunk_id`` so that
    each ordered pair of distinct nodes sharing a chunk becomes a candidate edge.

    Parameters
    ----------
    df : pd.DataFrame
        Edge list with at least the columns ``chunk_id``, ``node_1`` and
        ``node_2``. It is not modified.

    Returns
    -------
    pd.DataFrame
        Columns ``node_1``, ``node_2``, ``chunk_id`` (comma-joined ids, one per
        joined row), ``count`` (number of joined rows for the pair) and
        ``edge`` (always ``"contextual proximity"``). Pairs whose count is
        exactly 1 are dropped. Both directions of a pair are present, and the
        index is not reset after filtering.
    """
    ## Melt the dataframe into a list of nodes: one row per (chunk_id, node)
    mentions = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    # Self join with chunk id as the key will create a link between terms
    # occurring in the same text chunk.
    pairs = pd.merge(mentions, mentions, on="chunk_id", suffixes=("_1", "_2"))

    # Drop self loops (a node paired with itself)
    pairs = pairs[pairs["node_1"] != pairs["node_2"]].reset_index(drop=True)

    ## Group and count edges.
    edges = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    edges.columns = ["node_1", "node_2", "chunk_id", "count"]

    # Blank node names are treated as missing and removed
    edges = edges.replace("", np.nan).dropna(subset=["node_1", "node_2"])

    # Drop edges with 1 count. The label is added with .assign() so a new frame
    # is produced instead of writing into a filtered slice, which is what
    # triggered pandas' chained-assignment warning.
    return edges[edges["count"] != 1].assign(edge="contextual proximity")


## Build the co-occurrence edges from the LLM-extracted edge list
dfg2 = contextual_proximity(dfg1)
## Show the last few rows as a quick check
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
2827,world-class health facilities,nhm strategies,"0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...",10,contextual proximity
2828,world-class health facilities,rural areas,"0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...",2,contextual proximity
2829,world-class health facilities,social norms,"0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...",2,contextual proximity
2830,world-class health facilities,urban areas,"0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...",2,contextual proximity
2831,world-class health facilities,urban slums,"0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...",2,contextual proximity


### Merge both the dataframes

In [None]:
## Stack the LLM-extracted edges on top of the contextual-proximity edges, then
## collapse duplicate (node_1, node_2) pairs: chunk ids and edge labels are
## comma-joined and the counts are summed.
edge_aggregations = {"chunk_id": ",".join, "edge": ",".join, "count": "sum"}
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg(edge_aggregations)
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,56 articles,extensive literature search,"d7a3e5085c7f4de4bc28fb0bd9cb0a94,d7a3e5085c7f4...",contextual proximity,2
1,[54],increasing violence against healthcare personnel,"640835e2521045a395ab6465cc1ba4ca,640835e252104...",contextual proximity,2
2,[55],increasing violence against healthcare personnel,"640835e2521045a395ab6465cc1ba4ca,640835e252104...",contextual proximity,2
3,a bad situation,increasing violence against healthcare personnel,"640835e2521045a395ab6465cc1ba4ca,640835e252104...",contextual proximity,2
4,a worrisome new trend,increasing violence against healthcare personnel,"640835e2521045a395ab6465cc1ba4ca,640835e252104...",contextual proximity,2
...,...,...,...,...,...
753,world-class health facilities,nhm strategies,"0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...",contextual proximity,10
754,world-class health facilities,rural areas,"0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...",contextual proximity,2
755,world-class health facilities,social norms,"0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...",contextual proximity,2
756,world-class health facilities,urban areas,"0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...",contextual proximity,2


In [None]:
from pyneoinstance import Neo4jInstance

## Connection settings are read from the environment so that real credentials
## never have to be saved in the notebook. The fallbacks are the local
## development values this cell previously hard-coded.
uri = os.environ.get("NEO4J_URI", 'bolt://localhost:7687')
user = os.environ.get("NEO4J_USER", 'neo4j')
password = os.environ.get("NEO4J_PASSWORD", 'password')

graph = Neo4jInstance(uri, user, password)
## Connectivity check: list the server components (name, versions, edition)
data = graph.execute_read_query("call dbms.components")
data

Unnamed: 0,name,versions,edition
0,Neo4j Kernel,[5.12.0],enterprise


In [None]:
## Cypher that upserts the topic graph: for every row it merges a :Topic node
## for node_1 and for node_2 and a :related relationship from the first to the
## second. Only the node names are used; the edge and count columns of dfg are
## not written to the database by this query.
Topics_Create_Query = """
  WITH $rows AS rows
  UNWIND rows AS row
    merge (source:Topic {name:row.node_1})
    merge (target:Topic {name:row.node_2}) 
    merge (source)-[:related]->(target)
    """

## NOTE(review): presumably execute_write_query_with_data binds the rows of dfg
## to the $rows parameter - confirm against the pyneoinstance documentation.
Topics_Load_Result = graph.execute_write_query_with_data(data=dfg, query=Topics_Create_Query)




  return bound(*args, **kwds)


### Neo4j Graph Visualization
Here is a graph visualization of the topics and how they relate to each other.
![Placeholder for image](./assets/neo4j-browser-visualization.png)