## Setup

In [2]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Input data directory
data_dir = "cureus"
inputdirectory = Path(f"./data_input/{data_dir}")
## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

In [7]:
loader = PyPDFLoader("/Users/andrea/Documents/git/KG/knowledge_graph/data_input/nutrients-12-02985.pdf")

In [8]:
## Dir PDF Loader
#loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
loader = PyPDFLoader("/Users/andrea/Documents/git/KG/knowledge_graph/data_input/nutrients-12-02985.pdf")
#loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
print(pages[3].page_content)


Number of chunks =  129
Nutrients 2020 ,12, 2985 2 of 37
to harbor PCa is important for early diagnosis and better prognosis [ 1]. PSA screening has signiﬁcantly
increased the number of men treated for PCa, and it has been estimated that in ten years, many of them
would remain asymptomatic, without experiencing PCa-speciﬁc mortality [ 2]. According to a recent
meta-analysis, PSA screening was shown to produce a small reduction in 10-year disease-speciﬁc
mortality, without any beneﬁts towards overall mortality [ 3]. As a result, most clinical practice
guidelines do not advocate for a prostate biopsy based on elevated PSA concentrations alone [ 1,2,4],
whereas based on the available evidence, the use of opportunistic routine PSA screening is also not
recommended for all [ 1,2,5]. Nevertheless, elevated PSA concentrations usually require a second
sample for veriﬁcation purposes, whereas persistent elevated PSA concentrations, are usually followed
by prostate biopsy and diagnostic imaging 

In [9]:
from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(129, 4)


Unnamed: 0,text,source,page,chunk_id
0,nutrients\nReview\nDietary Factors and Supplem...,/Users/andrea/Documents/git/KG/knowledge_graph...,0,0f86fc1b98b64455abdbe4255db938bf
1,"6Division of Transplantation, Immunology and M...",/Users/andrea/Documents/git/KG/knowledge_graph...,0,eab662ea2f6b4848ba0f0969779954b7
2,prevention level. Despite the plethora of tria...,/Users/andrea/Documents/git/KG/knowledge_graph...,0,8cf0dea016fd4833850a1a9b04bec10c
3,"Nutrients 2020 ,12, 2985 2 of 37\nto harbor PC...",/Users/andrea/Documents/git/KG/knowledge_graph...,1,5681a50693e243ddb6d862737c2d1a44
4,"reduce the relative risk for PCa by 9% [ 8], w...",/Users/andrea/Documents/git/KG/knowledge_graph...,1,bca094c78b634d92b615a210a6420226


## Extract Concepts

In [14]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If regenerate is set to True then the dataframes are regenerated and Both the dataframes are written in the csv format so we dont have to calculate them again. 

        dfne = dataframe of edges

        df = dataframe of chunks


Else the dataframes are read from the output directory

In [3]:
## To regenerate the graph with LLM, set this to True
regenerate = False

if regenerate:
    concepts_list = df2Graph(df, model='zephyr:latest')
    dfg1 = graph2Df(concepts_list)
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)
    
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg1.shape)
dfg1.head()

(521, 8)


Unnamed: 0,node_1,node_2,edge,chunk_id,node_3,node_4,weight,count
0,nutrients,psa concentrations,Influencing,0f86fc1b98b64455abdbe4255db938bf,,,,4
1,dietary factors,psa concentrations,Influencing,0f86fc1b98b64455abdbe4255db938bf,,,,4
2,prostate cancer,increased cancer risk,Related to,0f86fc1b98b64455abdbe4255db938bf,,,,4
3,review,randomized controlled trials,Based on,0f86fc1b98b64455abdbe4255db938bf,,,,4
4,maria g. grammatikopoulou,stefanos t. papageorgiou,Co-authors,0f86fc1b98b64455abdbe4255db938bf,,,,4


## Calculating contextual proximity

In [4]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )
    dfg_long.drop(columns=["variable"], inplace=True)
    # Self join with chunk id as the key will create a link between terms occuring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2.replace("", np.nan, inplace=True)
    dfg2.dropna(subset=["node_1", "node_2"], inplace=True)
    # Drop edges with 1 count
    dfg2 = dfg2[dfg2["count"] != 1]
    dfg2["edge"] = "contextual proximity"
    return dfg2


dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
7049,zn,mc,"7b1493c61dcc40e282c1933582edc1c2,7b1493c61dcc4...",2,contextual proximity
7052,zn,pca,"7b1493c61dcc40e282c1933582edc1c2,7b1493c61dcc4...",4,contextual proximity
7060,α-receptor blocker (αrb),psa concentrations,"82cfa82e61d745048bcb94c0cd7e367d,82cfa82e61d74...",3,contextual proximity
7061,α-receptor blocker (αrb),saw palmetto,"82cfa82e61d745048bcb94c0cd7e367d,82cfa82e61d74...",2,contextual proximity
7066,γ-linolenic acid supplementation,psa concentrations,"565c34bc45d44907b525cc09410c464c,565c34bc45d44...",4,contextual proximity


### Merge both the dataframes

In [5]:
dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,*,dietary factors,"0f86fc1b98b64455abdbe4255db938bf,0f86fc1b98b64...",contextual proximity,2
1,*,dimitrios p . bogdanos,"0f86fc1b98b64455abdbe4255db938bf,0f86fc1b98b64...",contextual proximity,2
2,*,"division of transplantation, immunology and mu...","0f86fc1b98b64455abdbe4255db938bf,0f86fc1b98b64...",contextual proximity,2
3,*,increased cancer risk,"0f86fc1b98b64455abdbe4255db938bf,0f86fc1b98b64...",contextual proximity,2
4,*,konstantinos gkiouras,"0f86fc1b98b64455abdbe4255db938bf,0f86fc1b98b64...",contextual proximity,2
...,...,...,...,...,...
2477,zn,mc,"7b1493c61dcc40e282c1933582edc1c2,7b1493c61dcc4...",contextual proximity,2
2478,zn,pca,"7b1493c61dcc40e282c1933582edc1c2,7b1493c61dcc4...",Studies have shown that low levels of zinc (Zn...,8
2479,α-receptor blocker (αrb),psa concentrations,"82cfa82e61d745048bcb94c0cd7e367d,82cfa82e61d74...","Based on yet another Chinese meta-analysis, α-...",7
2480,α-receptor blocker (αrb),saw palmetto,"82cfa82e61d745048bcb94c0cd7e367d,82cfa82e61d74...",contextual proximity,2


## Calculate the NetworkX Graph

In [30]:
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()

In [31]:
psa

Unnamed: 0,node_1,node_2,edge,chunk_id,node_3,node_4,weight,count
14,pca,psa,PCa and PSA are both related to prostate healt...,8cf0dea016fd4833850a1a9b04bec10c,,,,4
15,men,psa,Identifying men most likely to have elevated l...,8cf0dea016fd4833850a1a9b04bec10c,,,,4
41,psa,body weight,"In the given context, it is mentioned that wit...",60222cf731ee4261aa55673b1bcfbc87,,,,4
42,psa,pca diagnosis,The text states that the reduction in PSA with...,60222cf731ee4261aa55673b1bcfbc87,,,,4
65,tomato extract,psa,Consumption of tomato extract (30 mg of lycope...,aade5cfe0ef34b1e850beb9b58a88fa7,,,,4
71,tomato extract (30 mg of lycopene),psa,Consumption of tomato extract (30 mg of lycope...,aade5cfe0ef34b1e850beb9b58a88fa7,,,,4
72,lycopene (15 mg),psa,Supplementation with lycopene (15 mg) for appr...,aade5cfe0ef34b1e850beb9b58a88fa7,,,,4
73,lycopene (30 mg),psa,Supplementation with lycopene (30 mg) for appr...,aade5cfe0ef34b1e850beb9b58a88fa7,,,,4
77,lycopene,psa,decreased PSA concentrations observed in inter...,ebcab93107c24944afbde28707c461a7,,,,4
135,psa,psa,Increased in both groups over the intervention...,1c78cba035994466bf189143eb39ed63,,,,4


In [6]:
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(547,)

In [7]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node)
    )

## Add edges to the graph
for index, row in dfg1.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        weight=row['count']/4
    )

### Calculate communities for coloring the nodes

In [8]:
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  114
[['*', 'dimitrios p . bogdanos', 'konstantinos gkiouras'], ['1000 iu vitamin d', '12, 2985', '2 x 2 pomx caps', '2000 iu vitamin d', '4000 iu vitamin d', '5-alpha-reductase inhibitors', '60 mg sulforaphane ons', '8 oz liquid pomx', '[90]', '[91]', 'adenocarcinoma', 'agaricus blazei murill', 'age', 'ala', 'androgen axis', 'androgen deprivation therapy', 'animal studies', 'antioxidant ons', 'antioxidants', 'antwi', 'as', 'as for disease management', 'below 0.2 ng/ml', 'benign prostate hyperplasia', 'benign prostatic hyperplasia', 'better prognosis', 'biopsy-proven pca', 'bmi', 'body mass index', 'body mass index (bmi)', 'body weight', 'bosland', 'bph', 'calcitriol', 'caloric restriction', 'cancer inhibitory potential', 'cernitin', 'cicero', 'cipolla', 'combined antioxidant supplementation', 'combined epa and dha supplementation', 'combined supplementation', 'combined tamsulosin and saw palmetto therapy', 'concord grape juice', 'consumed bread again as a mediu

### Create a dataframe for community colors

In [9]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,*,#9057db,1
1,dimitrios p . bogdanos,#9057db,1
2,konstantinos gkiouras,#9057db,1
3,1000 iu vitamin d,#dbb957,2
4,"12, 2985",#dbb957,2
...,...,...,...
542,specific cancers,#57dbce,112
543,soy pro beverages with high micronutrient content,#dadb57,113
544,urban,#dadb57,113
545,tomato oleoresin extract,#ce57db,114


### Add colors to the graph

In [10]:
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

In [11]:
from pyvis.network import Network

graph_output_directory = "./docs/index2.html"

net = Network(
    notebook=True,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=True,
)

net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
#net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
#net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index2.html


In [16]:
net.write_html('./docs/index.html',open_browser=True)

In [22]:
from pyvis.network import Network

graph_output_directory = "./docs/index.html"

net = Network(
    notebook=True,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)