In [2]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from chonkie import SDPMChunker
from dotenv import load_dotenv

load_dotenv()


True

In [4]:
from pyprojroot import here
import re
with open(here() / "data" / "curve-sim.md", "r+") as f:
    text = f.read()


# Remove all <!-- image --> from the text
text = re.sub(r"<!-- image -->", "", text)
# Remove all `\n`*3 or more from the text
text = re.sub(r"\n{3,}", "\n\n", text)

In [5]:
# Hyper-personalized knowledge graph construction.
import llamabot as lmb

sysprompt = lmb.system("""
my background training is in the following areas:

- machine learning
- biological engineering
- biochemistry

This paper was given to me by my client for review.
It pertains to their work doing chemical modelling.

Help me understand the paper
by first extracting entities from the paper.
Return the results in a structured format.
""")


In [49]:

from poorman_graphrag.entities import Entity, Entities
import llamabot as lmb

entity_extractor = lmb.StructuredBot(
    system_prompt=sysprompt,
    model_name="gemini/gemini-2.0-flash-exp",
    pydantic_model=Entities,
    # stream_target="none",
)


In [50]:
# entities_full_text = entity_extractor(text)

In [51]:
chunker = SDPMChunker(min_sentences=10, threshold=0.4)
chunks = chunker(text)
len(chunks)

26

In [53]:
# Just a test
entity_extractor(chunks[0].text)


{
  "entities": [
    {
      "description": "First author of the paper",
      "entity_type": "author",
      "name": "Paul Faya",
      "quote": "Paul Faya a"
    },
    {
      "description": "Second author of the paper",
      "entity_type": "author",
      "name": "Adam P. Rauk",
      "quote": "Adam P. Rauk a"
    },
    {
      "description": "Third author of the paper",
      "entity_type": "author",
      "name": "Kristi L. Griffiths",
      "quote": "Kristi L. Griffiths a"
    },
    {
      "description": "Fourth author of the paper",
      "entity_type": "author",
      "name": "Bhavin Parekh",
      "quote": "Bhavin Parekh b"
    },
    {
      "description": "Institution of the first three authors",
      "entity_type": "institution",
      "name": "Eli Lilly and Company",
      "quote": "Statistics -Discovery/Development, Eli Lilly and Company, Indianapolis, Indiana, USA"
    },
     {
      "description": "Institution of the fourth author",
      "entity_type": "institu

Entities(entities=[Entity(entity_type='author', name='Paul Faya', description='First author of the paper', quote='Paul Faya a'), Entity(entity_type='author', name='Adam P. Rauk', description='Second author of the paper', quote='Adam P. Rauk a'), Entity(entity_type='author', name='Kristi L. Griffiths', description='Third author of the paper', quote='Kristi L. Griffiths a'), Entity(entity_type='author', name='Bhavin Parekh', description='Fourth author of the paper', quote='Bhavin Parekh b'), Entity(entity_type='institution', name='Eli Lilly and Company', description='Institution of the first three authors', quote='Statistics -Discovery/Development, Eli Lilly and Company, Indianapolis, Indiana, USA'), Entity(entity_type='institution', name='Eli Lilly and Company', description='Institution of the fourth author', quote='Bioproduct R&D -Bioassay, Eli Lilly and Company'), Entity(entity_type='paper', name='parallelism testing in bioassay', description='General topic of the paper', quote='A cur

In [9]:
from tqdm.auto import tqdm

entities_from_chunks = []
for chunk in tqdm(chunks, total=len(chunks)):
    entities = entity_extractor(chunk.text)
    entities_from_chunks.append(entities)


  0%|          | 0/26 [00:00<?, ?it/s]

{
  "entities": [
    {
      "description": "First author of the paper",
      "entity_type": "author",
      "name": "Paul Faya",
      "quote": "Paul Faya a"
    },
    {
      "description": "Second author of the paper",
      "entity_type": "author",
      "name": "Adam P. Rauk",
      "quote": "Adam P. Rauk a"
    },
    {
      "description": "Third author of the paper",
      "entity_type": "author",
      "name": "Kristi L. Griffiths",
      "quote": "Kristi L. Griffiths a"
    },
    {
      "description": "Fourth author of the paper",
      "entity_type": "author",
      "name": "Bhavin Parekh",
      "quote": "Bhavin Parekh b"
    },
    {
      "description": "Institution of the first three authors",
      "entity_type": "institution",
      "name": "Eli Lilly and Company",
      "quote": "Statistics -Discovery/Development, Eli Lilly and Company, Indianapolis, Indiana, USA"
    },
     {
      "description": "Institution of the fourth author",
      "entity_type": "institu

In [60]:
all_entities = Entities(entities=[ent.model_dump() for entities in entities_from_chunks for ent in entities.entities])
len(all_entities)

386

In [64]:
all_entities[39]

Entity(entity_type='statistical_test', name='equivalence approaches', description='A statistical test for comparing curves.', quote='We compare the curve similarity method (frequentist and Bayesian) to the traditional difference testing and equivalence approaches as well as to a recent frequentist test offered by Novick and Yang (2019).')

In [65]:
all_entities.deduplicate().to_jsonl(here() / "data" / "entities.jsonl")

In [68]:
all_entities = Entities.from_jsonl(here() / "data" / "entities.jsonl")
len(all_entities)

280

In [46]:
len(all_entities), len(merged_entities), len(merged_post_fuzzy_entities)

(386, 292, 280)

In [47]:
merged_entities.to_jsonl(here() / "data" / "entities.jsonl")

In [44]:
from poorman_graphrag.entities import Relationships

relationship_extractor = lmb.StructuredBot(
    system_prompt=lmb.system(
        "You are an expert at identifying relationships between entities in scientific papers. "
        "You will be provided with a list of entities and their descriptions. "
        "You will also be provided with a list of relationship types. "
        "For each pair of entities, determine if and how they are related based on the provided relationship types. "
        "Return all relationships you can identify between the entities."
    ),
    model_name="gpt-4o",
    pydantic_model=Relationships,
    stream_target="none",
)


In [72]:
from concurrent.futures import ThreadPoolExecutor
from tqdm.auto import tqdm
import json

def extract_relationships_for_chunk(chunk):
    return relationship_extractor(
        lmb.user(
            "Here are the entities and their descriptions:",
            json.dumps(deduplicated_entities.model_dump_json()),
            "Here is a chunk of text to work with:",
            chunk.text
        )
    )

# Extract relationships in parallel
with ThreadPoolExecutor() as executor:
    batch_relationships = list(tqdm(
        executor.map(extract_relationships_for_chunk, chunks),
        total=len(chunks),
        desc="Extracting relationships"
    ))

# Combine all relationships
relationships = Relationships(
    relationships=[
        rel for batch in batch_relationships
        for rel in batch.relationships
    ]
)


Extracting relationships:   0%|          | 0/26 [00:00<?, ?it/s]

In [76]:
print(relationships)

relationships=[Relationship(source_name='A curve similarity approach to parallelism testing in bioassay', target_name='Paul Faya', relation_type='related_to'), Relationship(source_name='A curve similarity approach to parallelism testing in bioassay', target_name='Adam P. Rauk', relation_type='related_to'), Relationship(source_name='A curve similarity approach to parallelism testing in bioassay', target_name='Kristi L. Griffiths', relation_type='related_to'), Relationship(source_name='A curve similarity approach to parallelism testing in bioassay', target_name='Bhavin Parekh', relation_type='related_to'), Relationship(source_name='Paul Faya', target_name='Eli Lilly and Company', relation_type='related_to'), Relationship(source_name='Adam P. Rauk', target_name='Eli Lilly and Company', relation_type='related_to'), Relationship(source_name='Kristi L. Griffiths', target_name='Eli Lilly and Company', relation_type='related_to'), Relationship(source_name='Bhavin Parekh', target_name='Eli Lill

In [77]:
len(all_entities)

197

In [78]:
len(relationships.relationships)

821

In [79]:
# Create networkx graph from entities and relationships
import networkx as nx
from typing import Dict

# Initialize directed graph
G = nx.DiGraph()

# Add all entities as nodes
entity_nodes: Dict[str, Entity] = {}
for entity in all_entities:
    # Use entity name as node identifier
    node_id = entity.name
    G.add_node(node_id, **entity.model_dump())
    entity_nodes[node_id] = entity

# Add relationships as edges
for rel in relationships.relationships:
    G.add_edge(
        rel.source_name,
        rel.target_name,
        relation_type=rel.relation_type,
    )

print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")


Graph created with 224 nodes and 453 edges


In [1]:
import hvplot.networkx as hvnx

hvnx.draw(
    G,
    layout='kamada_kawai',
    # node_size=300,
    edge_line_width=1,
    arrows=True,
    width=800,
    height=600
)

NameError: name 'G' is not defined