In [7]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

In [8]:
## Set up input and output directories

In [9]:
## Name of the folder (under ./resources) that holds the raw input documents
data_dir = "raw_input_data"
inputdirectory = Path("./resources") / data_dir
## The output csv files are written below ./data_output
out_dir = data_dir
outputdirectory = Path("./data_output/")
inputdirectory

PosixPath('resources/raw_input_data')

In [10]:
## Load Documents

In [11]:
## Alternative loaders (kept for reference):
##   Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
##   File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")

## Load every document found in the input directory
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the documents into overlapping chunks, measured in characters (length_function=len)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))

## Fail with a clear message instead of an IndexError on pages[0]
## when the input directory yields no text at all.
if not pages:
    raise ValueError(
        f"No text chunks were produced from {inputdirectory}; "
        "check that the directory exists and contains readable documents."
    )

## Preview the first chunk as a sanity check
print(pages[0].page_content)

100%|████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 131.15it/s]

Number of chunks =  3
At first, her government, under the administration of the _dogmatists_, was an absolute _despotism_. But, as the legislative continued to show traces of the ancient barbaric rule, her empire gradually broke up, and intestine wars introduced the reign of _anarchy;_ while the _sceptics_, like nomadic tribes, who hate a permanent habitation and settled mode of living, attacked from time to time those who had organized themselves into civil communities. But their number was, very happily, small; and thus they could not entirely put a stop to the exertions of those who persisted in raising new edifices, although on no settled or uniform plan. In recent times the hope dawned upon us of seeing those disputes settled, and the legitimacy of her claims established by a kind of _physiology_ of the human understanding—that of the celebrated Locke. But it was found that—although it was affirmed that this so-called queen could not refer her descent to any higher source than tha




In [12]:
from helpers.df_helpers import documents2Dataframe
## Turn the list of text chunks into a DataFrame using the project helper.
## The displayed output shows the columns text, source and chunk_id.
df = documents2Dataframe(pages)
print(df.shape)
df.head()

/Users/dominikpichler/Documents/Pet_Projects/Athene
(3, 3)


Unnamed: 0,text,source,chunk_id
0,"At first, her government, under the administra...",resources/raw_input_data/The_critique_of_pure_...,c896433f291c4fdbad27f380a6d77bba
1,"At present, as all methods, according to the g...",resources/raw_input_data/The_critique_of_pure_...,52fb4de12ed44a38a8b940b511d83842
2,For it is in reality vain to profess _indiffer...,resources/raw_input_data/The_critique_of_pure_...,73068922347147f494ef37744ed6e1e4


In [13]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

In [14]:
## To regenerate the graph with LLM, set this to True.
## When False, the edge list is read back from the csv written by a previous run.
regenerate = True

if regenerate:
    concepts_list = df2Graph(df, model='mistral:latest')
    dfg1 = graph2Df(concepts_list)

    ## Create the output directory if it is missing (no-op when it already exists)
    outputdirectory.mkdir(parents=True, exist_ok=True)

    dfg1.to_csv(outputdirectory / "graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory / "chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory / "graph.csv", sep="|")

## Treat empty strings as missing values and drop incomplete (node_1, node_2, edge) rows.
## Every remaining relation gets a weight ("count") of 4;
## contextual-proximity relations are counted separately later on.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

 [
   {
       "node_1": "absolutdespotism",
       "node_2": "her government",
       "edge": "her government was an absolute despotism"
   },
   {
       "node_1": "her government",
       "node_2": "_dogmatists_",
       "edge": "her government, under the administration of the dogmatists, was an absolute despotism"
   },
   {
       "node_1": "ancient barbaric rule",
       "node_2": "legislative",
       "edge": "as the legislative continued to show traces of the ancient barbaric rule"
   },
   {
       "node_1": "intestine wars",
       "node_2": "reign of anarchy",
       "edge": "intestine wars introduced the reign of anarchy"
   },
   {
       "node_1": "_anarchy_",
       "node_2": "her empire",
       "edge": "her empire gradually broke up, and intestine wars introduced the reign of anarchy"
   },
   {
       "node_1": "_sceptics_",
       "node_2": "those who persisted in raising new edifices",
       "edge": "but their number was small; and thus they could not entirely put 

Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,absolutdespotism,her government,her government was an absolute despotism,c896433f291c4fdbad27f380a6d77bba,4
1,her government,_dogmatists_,"her government, under the administration of th...",c896433f291c4fdbad27f380a6d77bba,4
2,ancient barbaric rule,legislative,as the legislative continued to show traces of...,c896433f291c4fdbad27f380a6d77bba,4
3,intestine wars,reign of anarchy,intestine wars introduced the reign of anarchy,c896433f291c4fdbad27f380a6d77bba,4
4,_anarchy_,her empire,"her empire gradually broke up, and intestine w...",c896433f291c4fdbad27f380a6d77bba,4


In [15]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link nodes that are mentioned in the same text chunk.

    Args:
        df: Edge list with at least the columns ``node_1``, ``node_2`` and
            ``chunk_id``. ``chunk_id`` values must be strings because they
            are concatenated with ``",".join``. The frame is not modified.

    Returns:
        A new DataFrame with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined ids of the chunks the pair co-occurs in), ``count``
        (number of co-occurrences) and ``edge`` (always the literal
        ``"contextual proximity"``). Each pair appears in both orientations,
        and pairs that co-occur only once are dropped.
    """
    ## Melt the dataframe into a long list of (chunk_id, node) mentions
    mentions = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    pairs = pd.merge(mentions, mentions, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    pairs = pairs[pairs["node_1"] != pairs["node_2"]].reset_index(drop=True)
    ## Group and count edges.
    edges = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    edges.columns = ["node_1", "node_2", "chunk_id", "count"]
    # Empty node names count as missing and are removed
    edges = edges.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with 1 count; copy so the column assignment below does not
    # write into a slice of the grouped frame (SettingWithCopyWarning).
    edges = edges[edges["count"] != 1].copy()
    edges["edge"] = "contextual proximity"
    return edges


## Derive the contextual-proximity edges from the extracted edge list and preview the last rows
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
503,so-called queen,her government,"c896433f291c4fdbad27f380a6d77bba,c896433f291c4...",2,contextual proximity
508,so-called queen,metaphysics,"c896433f291c4fdbad27f380a6d77bba,c896433f291c4...",2,contextual proximity
517,those who persisted in raising new edifices,_sceptics_,"c896433f291c4fdbad27f380a6d77bba,c896433f291c4...",3,contextual proximity
524,those who persisted in raising new edifices,her government,"c896433f291c4fdbad27f380a6d77bba,c896433f291c4...",2,contextual proximity
529,those who persisted in raising new edifices,metaphysics,"c896433f291c4fdbad27f380a6d77bba,c896433f291c4...",2,contextual proximity


In [16]:
## Stack the extracted edges and the proximity edges, then collapse duplicate
## (node_1, node_2) pairs: chunk ids and edge labels are comma-joined, counts are summed.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg(
        chunk_id=("chunk_id", ",".join),
        edge=("edge", ",".join),
        count=("count", "sum"),
    )
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,_anarchy_,_sceptics_,"c896433f291c4fdbad27f380a6d77bba,c896433f291c4...",contextual proximity,3
1,_anarchy_,her empire,c896433f291c4fdbad27f380a6d77bba,"her empire gradually broke up, and intestine w...",4
2,_anarchy_,her government,"c896433f291c4fdbad27f380a6d77bba,c896433f291c4...",contextual proximity,2
3,_anarchy_,metaphysics,"c896433f291c4fdbad27f380a6d77bba,c896433f291c4...",contextual proximity,2
4,_dogmatism_,_sceptics_,"c896433f291c4fdbad27f380a6d77bba,c896433f291c4...",contextual proximity,3
...,...,...,...,...,...
181,so-called queen,her government,"c896433f291c4fdbad27f380a6d77bba,c896433f291c4...",contextual proximity,2
182,so-called queen,metaphysics,"c896433f291c4fdbad27f380a6d77bba,c896433f291c4...",contextual proximity,2
183,those who persisted in raising new edifices,_sceptics_,"c896433f291c4fdbad27f380a6d77bba,c896433f291c4...",contextual proximity,3
184,those who persisted in raising new edifices,her government,"c896433f291c4fdbad27f380a6d77bba,c896433f291c4...",contextual proximity,2


In [17]:
## Every distinct node name: node_1 values first, then node_2 values
nodes = dfg.melt(value_vars=["node_1", "node_2"])["value"].unique()
nodes.shape

(31,)

In [21]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
G.add_nodes_from(str(node) for node in nodes)

## Add edges to the graph.
## G is undirected, while dfg can hold the same pair in both orientations
## (the proximity self-join emits (a, b) and (b, a)). A plain add_edge would let
## whichever row is processed last overwrite the title and weight of the other,
## so an extracted relation could be replaced by its weaker reverse
## "contextual proximity" row. Keep the heavier of the two instead.
for node_1, node_2, title, count in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    source, target = str(node_1), str(node_2)
    weight = count / 2
    if G.has_edge(source, target) and G[source][target]["weight"] >= weight:
        continue
    G.add_edge(source, target, title=title, weight=weight)

In [22]:
## Girvan-Newman yields successively finer partitions; take the second level.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
## Sort members inside each community, then sort the communities themselves
communities = sorted(sorted(members) for members in next_level_communities)
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  4
[['_anarchy_', '_physiology_ of the human understanding', '_sceptics_', 'absolutdespotism', 'ancient barbaric rule', 'civil communities', 'established', 'her', 'her empire', 'her government', 'hope dawned upon us', 'intestine wars', 'legislative', 'legitimacy of her claims', 'metaphysics', 'nomadic tribes', 'recent times', 'reign of anarchy', 'so-called queen', 'those who persisted in raising new edifices'], ['_dogmatism_'], ['_dogmatists_'], ['_indifference_', '_indifferentists_', '_judgement_', 'age', 'critical investigation of pure reason', 'humanity', 'illusory knowledge, baseless assumptions and pretensions', 'metaphysical declarations and propositions', 'reason']]


In [23]:
import seaborn as sns
palette = "hls"


def colors2Community(communities) -> pd.DataFrame:
    """Assign one colour and one 1-based group number to every community.

    One hex colour per community is drawn from the module-level ``palette``
    and shuffled with the global ``random`` state, so the colour assignment
    differs between runs unless ``random`` is seeded.

    Returns a DataFrame with one row per node and the columns
    ``node``, ``color`` and ``group``.
    """
    shades = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shades)
    # Colours are consumed from the end of the shuffled list, one per community
    records = [
        {"node": member, "color": shade, "group": group_id}
        for group_id, (members, shade) in enumerate(
            zip(communities, reversed(shades)), start=1
        )
        for member in members
    ]
    return pd.DataFrame(records)


colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,_anarchy_,#db5f57,1
1,_physiology_ of the human understanding,#db5f57,1
2,_sceptics_,#db5f57,1
3,absolutdespotism,#db5f57,1
4,ancient barbaric rule,#db5f57,1
5,civil communities,#db5f57,1
6,established,#db5f57,1
7,her,#db5f57,1
8,her empire,#db5f57,1
9,her government,#db5f57,1


In [24]:
## Attach community group, colour and a degree-based size to every graph node
for node_name, node_color, node_group in zip(
    colors["node"], colors["color"], colors["group"]
):
    G.nodes[node_name].update(
        group=node_group,
        color=node_color,
        size=G.degree[node_name],
    )

In [26]:
pip install pyvis

Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting jsonpickle>=1.4.1 (from pyvis)
  Downloading jsonpickle-3.0.2-py3-none-any.whl.metadata (7.5 kB)
Downloading jsonpickle-3.0.2-py3-none-any.whl (40 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.7/40.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jsonpickle, pyvis
Successfully installed jsonpickle-3.0.2 pyvis-0.3.2
Note: you may need to restart the kernel to use updated packages.


In [28]:
from pyvis.network import Network

## Path of the HTML file the interactive graph is written to
graph_output_directory = "./docs/index.html"

## Make sure the target folder exists before the HTML file is written
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Copy nodes, edges and their attributes from the networkx graph
net.from_nx(G)
## Layout: force-atlas-2 physics (other layouts kept below for experimentation)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index.html
