## Setup

In [2]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

# ## Name of the corpus folder; the same name is used under both data_input and data_output
data_dir = "cureus"
# ## Input data directory
inputdirectory = Path("./data_input") / data_dir
# ## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path("./data_output") / out_dir

In [3]:
# Echo the resolved output path to confirm where the csv files will be written.
outputdirectory

PosixPath('data_output/cureus')

## Load Documents

In [4]:
## Dir PDF Loader
# Loads every PDF found under `inputdirectory`.
loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# Alternative loaders, kept for reference: a single PDF, or a generic directory loader.
#loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
#loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

# length_function=len means chunk_size and chunk_overlap are measured in characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
# Preview one chunk as a sanity check. The index is clamped so that a small corpus
# (fewer than four chunks) no longer aborts the notebook with an IndexError, and an
# empty corpus simply prints nothing.
if pages:
    print(pages[min(3, len(pages) - 1)].page_content)


Number of chunks =  99
1 Introduction  
It is generally accepted that education, as a key means of building human capital, matters for both  
individual and economy- wide prosperity. On the individual side, work stemming from Schultz ( 1961),  
Becker ( 1964) and Mincer (1974) has shown that education is important for improving earnings and  
productivity. In micro estimates, the individual returns to educati on have been shown to be large. The 
return to an extra year of schooling is around 9% on average globally, and this has been relatively stable 
over the decades ( Psacharopoulos and Patrinos , 2018). But wider economic externalities and benefits to 
society ar e not captured in these analyses. In particular, they do not capture the potential spillover  effects 
of an individual’s education on other individuals working in the same firm, industry, region or country.  
Marshall (1890 ) was among the first to recognise the social interactions among workers that can 
create learning o

## Create a dataframe of all the chunks

In [5]:
from helpers.df_helpers import documents2Dataframe
# Convert the list of chunk documents into a dataframe using the project helper.
# The displayed output shows one row per chunk with text, source, page and chunk_id columns.
df = documents2Dataframe(pages)
# Sanity check: dimensions first, then the first few rows.
print(df.shape)
df.head()

(99, 4)


Unnamed: 0,text,source,page,chunk_id
0,Discussion Paper\nNo.176 4 \nApril 2021 \nEduc...,data_input/cureus/ED614082.pdf,0,1a4c59c4427740c6bdd868007a275fa1
1,Ab\nstract \nThis paper summarises the liter...,data_input/cureus/ED614082.pdf,1,c2acb286207d4570b91cde412b41ada8
2,Performance is financed by the Economic and So...,data_input/cureus/ED614082.pdf,1,f3f6ff71ac644899b0da300dce4eb9bc
3,1 Introduction \nIt is generally accepted tha...,data_input/cureus/ED614082.pdf,2,9912f24f28ff4aaba8e7784a93d3a7cf
4,such positive externalities that provides an e...,data_input/cureus/ED614082.pdf,2,a586d5fd04164069a1325f0bbfe332a2


## Extract Concepts

In [6]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to `True`, the dataframes are regenerated and both are written to the output directory in CSV format, so we don't have to calculate them again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Otherwise, the dataframe of edges is read back from the output directory.

In [9]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    # Extract concept pairs from every chunk with the project helper (presumably one
    # LLM call per chunk -- confirm in helpers/df_helpers), then tabulate them as edges.
    concepts_list = df2Graph(df, model='zephyr:latest')
    dfg1 = graph2Df(concepts_list)
    # mkdir(exist_ok=True) replaces the separate exists-check + makedirs pair: one call,
    # no window between the check and the creation, and missing parents are created too.
    outputdirectory.mkdir(parents=True, exist_ok=True)

    # Persist both frames so later runs can skip the extraction step.
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

# Build the cleaned frame as a new object instead of mutating in place, so re-running
# this cell always starts from the freshly generated/loaded edges.
dfg1 = (
    dfg1.replace("", np.nan)
    # An edge is only usable when both endpoints and the relation text are present.
    .dropna(subset=["node_1", "node_2", "edge"])
    ## Increasing the weight of the relation to 4.
    ## We will assign the weight of 1 when later the contextual proximity will be calculated.
    .assign(count=4)
)

[
   {
       "node_1": "Discussion Paper",
       "node_2": "Education",
       "edge": "Discussion Papers often focus on analyzing the relationship between education and economic growth."
   },
   {
       "node_1": "Discussion Paper",
       "node_2": "Growth",
       "edge": "Discussion Papers also explore how economic growth can be promoted through various educational policies and initiatives."
   },
   {
       "node_1": "Discussion Paper",
       "node_2": "Economic Growth",
       "edge": "In Discussion Paper No.176, the author examines the impact of education on economic growth in a specific context."
   },
   {
       "node_1": "No.",
       "node_2": "Discussion Paper",
       "edge": "Each Discussion Paper is assigned a unique identification number for easy reference and retrieval."
   },
   {
       "node_1": "No.",
       "node_2": "Month",
       "edge": "The year of publication is also mentioned along with the month in which the paper was published."
   },
   {
       "

## Calculating contextual proximity

In [8]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Derive "contextual proximity" edges between nodes that share a text chunk.

    Every node mentioned in a chunk (as either ``node_1`` or ``node_2``) is paired
    with every other node mentioned in the same chunk. Pairs are produced in both
    directions, so (a, b) and (b, a) appear as separate rows.

    Parameters
    ----------
    df : pd.DataFrame
        Edge list with at least the columns ``chunk_id``, ``node_1`` and ``node_2``.
        Any other columns are ignored. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        Columns ``node_1``, ``node_2``, ``chunk_id`` (comma-joined ids of the chunks
        behind each pairing), ``count`` (number of pairings) and ``edge`` (always the
        string "contextual proximity"). Pairs seen only once and pairs with a blank
        node name are left out.
    """
    ## Melt the dataframe into a list of nodes
    node_occurrences = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    node_pairs = pd.merge(
        node_occurrences, node_occurrences, on="chunk_id", suffixes=("_1", "_2")
    )
    # drop self loops
    node_pairs = node_pairs[node_pairs["node_1"] != node_pairs["node_2"]].reset_index(
        drop=True
    )
    ## Group and count edges.
    pair_counts = (
        node_pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    # Flatten the two-level column index produced by the multi-function aggregation.
    pair_counts.columns = ["node_1", "node_2", "chunk_id", "count"]
    # Blank node names are treated as missing and removed.
    pair_counts = pair_counts.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with 1 count. `.assign` adds the label on a fresh frame rather than
    # writing into a filtered slice, which avoids pandas' SettingWithCopyWarning.
    return pair_counts[pair_counts["count"] != 1].assign(edge="contextual proximity")


# Derive the co-occurrence edges from the extracted edge list. The cell that
# defines `dfg1` must have been run first, otherwise this raises NameError.
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

NameError: name 'dfg1' is not defined

### Merge both the dataframes

In [8]:
# Stack the extracted edges on top of the contextual-proximity edges, then collapse
# duplicate (node_1, node_2) pairs: chunk ids and edge labels are comma-joined and
# the counts are added together.
dfg = (
    pd.concat([dfg1, dfg2])
    .groupby(["node_1", "node_2"])
    .agg(
        chunk_id=("chunk_id", ",".join),
        edge=("edge", ",".join),
        count=("count", "sum"),
    )
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,56 articles,extensive literature search,"d7a3e5085c7f4de4bc28fb0bd9cb0a94,d7a3e5085c7f4...",contextual proximity,2
1,[54],increasing violence against healthcare personnel,"640835e2521045a395ab6465cc1ba4ca,640835e252104...",contextual proximity,2
2,[55],increasing violence against healthcare personnel,"640835e2521045a395ab6465cc1ba4ca,640835e252104...",contextual proximity,2
3,a bad situation,increasing violence against healthcare personnel,"640835e2521045a395ab6465cc1ba4ca,640835e252104...",contextual proximity,2
4,a worrisome new trend,increasing violence against healthcare personnel,"640835e2521045a395ab6465cc1ba4ca,640835e252104...",contextual proximity,2
...,...,...,...,...,...
753,world-class health facilities,nhm strategies,"0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...",contextual proximity,10
754,world-class health facilities,rural areas,"0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...",contextual proximity,2
755,world-class health facilities,social norms,"0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...",contextual proximity,2
756,world-class health facilities,urban areas,"0857ab4513ad4383aed095bcf24506fa,0857ab4513ad4...",contextual proximity,2


## Calculate the NetworkX Graph

In [9]:
# Distinct node names in first-seen order, scanning all of node_1 and then all of node_2.
nodes = dfg.melt(value_vars=["node_1", "node_2"])["value"].unique()
nodes.shape

(215,)

In [10]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
# Node names are stringified so every graph key is a plain str.
G.add_nodes_from(str(node_name) for node_name in nodes)

## Add edges to the graph
# The joined relation text becomes the edge title; the summed count is scaled by 1/4
# to give the edge weight.
edge_columns = dfg[["node_1", "node_2", "edge", "count"]]
for source, target, relation, pair_count in edge_columns.itertuples(index=False, name=None):
    G.add_edge(
        str(source),
        str(target),
        title=relation,
        weight=pair_count / 4,
    )

### Calculate communities for coloring the nodes

In [11]:
# girvan_newman returns an iterator; each next() yields one partition of G
# (a tuple of node sets), one level of the algorithm at a time.
communities_generator = nx.community.girvan_newman(G)
# First level of the partitioning; not referenced again in the visible cells.
top_level_communities = next(communities_generator)
# Second level: this is the partition used for colouring the nodes.
next_level_communities = next(communities_generator)
# Sort the node names inside each community, then sort the communities themselves,
# so the printed result has a stable order.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  17
[['56 articles', 'analysis', "corresponding authors' experiential knowledge", 'extensive literature search', 'peer-reviewed journals'], ['[54]', '[55]', 'a bad situation', 'a worrisome new trend', 'adequately compensated', 'can reverse the situation', 'defensive medicine practices', 'increasing violence against healthcare personnel', 'intense focus on specialization', 'low physician-to-patient ratio', 'overwhelmed physicians', 'primary care physicians', 'private marketplace', 'protect themselves by ordering unnecessary tests and procedures', 'results in delays in attending patients', 'set in', 'tempted to take on more patients than they can reasonably serve', 'thoughtful approach to government planning', 'underpaid physicians', 'unethical practices by pharmaceutical companies', 'will not be able to solve this'], ['accredit health facilities', 'enforcement of existing rules', 'health insurance scheme for central government employees', 'health system standardi

### Create a dataframe for community colors

In [12]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Give every community one colour and one 1-based group number.

    A palette with as many colours as there are communities is generated from the
    module-level `palette` name and shuffled with the `random` module, so the
    colour-to-community assignment differs between runs unless the RNG is seeded.

    Returns a dataframe with one row per node and the columns
    ``node``, ``color`` (hex string) and ``group``.
    """
    ## Define a color palette
    shuffled_hex = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shuffled_hex)
    # Colours are handed out from the end of the shuffled list towards the front.
    records = [
        {"node": member, "color": hex_color, "group": group_number}
        for group_number, (community, hex_color) in enumerate(
            zip(communities, reversed(shuffled_hex)), start=1
        )
        for member in community
    ]
    return pd.DataFrame(records)


# Build the per-node colour/group table for the detected communities and display it.
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,56 articles,#db5f57,1
1,analysis,#db5f57,1
2,corresponding authors' experiential knowledge,#db5f57,1
3,extensive literature search,#db5f57,1
4,peer-reviewed journals,#db5f57,1
...,...,...,...
210,rural medical assistants (rmas),#575edb,15
211,limited uptake,#57bcdb,16
212,national health protection mission,#57bcdb,16
213,private health sector systems,#57dbcc,17


### Add colors to the graph

In [13]:
# Copy each node's community group and colour onto the graph, and size the node by its degree.
for record in colors.itertuples(index=False):
    G.nodes[record.node].update(
        group=record.group,
        color=record.color,
        size=G.degree[record.node],
    )

In [14]:
from pyvis.network import Network

# Despite the name, this is the path of the HTML file that will be written.
graph_output_directory = "./docs/index.html"
# Make sure the target folder exists before the export below writes into it;
# on a fresh checkout without ./docs the write would otherwise fail.
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

# Import nodes, edges and their attributes (group, color, size, title, weight) from G.
net.from_nx(G)
# Alternative physics layouts, kept for reference:
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
# Expose only the physics controls in the generated page.
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index.html
