In [1]:
from typing import Any, List, Dict
import re
import html
from collections.abc import Mapping
import networkx as nx
# from networkx.readwrite import json_graph
from openai import OpenAI

In [2]:
DEFAULT_TUPLE_DELIMITER = "<|>"
DEFAULT_RECORD_DELIMITER = "##"
DEFAULT_COMPLETION_DELIMITER = "<|COMPLETE|>"
DEFAULT_ENTITY_TYPES = ["organization", "person", "geo", "event"]

In [3]:
GRAPH_EXTRACTION_PROMPT = """
-Goal-
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
 
-Steps-
1. Identify all entities. For each identified entity, extract the following information:
- entity_name: Name of the entity, capitalized
- entity_type: One of the following types: [{entity_types}]
- entity_description: Comprehensive description of the entity's attributes and activities
Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
 
2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
 Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_strength>)
 
3. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
 
4. When finished, output {completion_delimiter}
 
######################
-Examples-
######################
Example 1:
Entity_types: ORGANIZATION,PERSON
Text:
The Verdantis's Central Institution is scheduled to meet on Monday and Thursday, with the institution planning to release its latest policy decision on Thursday at 1:30 p.m. PDT, followed by a press conference where Central Institution Chair Martin Smith will take questions. Investors expect the Market Strategy Committee to hold its benchmark interest rate steady in a range of 3.5%-3.75%.
######################
Output:
("entity"{tuple_delimiter}CENTRAL INSTITUTION{tuple_delimiter}ORGANIZATION{tuple_delimiter}The Central Institution is the Federal Reserve of Verdantis, which is setting interest rates on Monday and Thursday)
{record_delimiter}
("entity"{tuple_delimiter}MARTIN SMITH{tuple_delimiter}PERSON{tuple_delimiter}Martin Smith is the chair of the Central Institution)
{record_delimiter}
("entity"{tuple_delimiter}MARKET STRATEGY COMMITTEE{tuple_delimiter}ORGANIZATION{tuple_delimiter}The Central Institution committee makes key decisions about interest rates and the growth of Verdantis's money supply)
{record_delimiter}
("relationship"{tuple_delimiter}MARTIN SMITH{tuple_delimiter}CENTRAL INSTITUTION{tuple_delimiter}Martin Smith is the Chair of the Central Institution and will answer questions at a press conference{tuple_delimiter}9)
{completion_delimiter}

######################
Example 2:
Entity_types: ORGANIZATION
Text:
TechGlobal's (TG) stock skyrocketed in its opening day on the Global Exchange Thursday. But IPO experts warn that the semiconductor corporation's debut on the public markets isn't indicative of how other newly listed companies may perform.

TechGlobal, a formerly public company, was taken private by Vision Holdings in 2014. The well-established chip designer says it powers 85% of premium smartphones.
######################
Output:
("entity"{tuple_delimiter}TECHGLOBAL{tuple_delimiter}ORGANIZATION{tuple_delimiter}TechGlobal is a stock now listed on the Global Exchange which powers 85% of premium smartphones)
{record_delimiter}
("entity"{tuple_delimiter}VISION HOLDINGS{tuple_delimiter}ORGANIZATION{tuple_delimiter}Vision Holdings is a firm that previously owned TechGlobal)
{record_delimiter}
("relationship"{tuple_delimiter}TECHGLOBAL{tuple_delimiter}VISION HOLDINGS{tuple_delimiter}Vision Holdings formerly owned TechGlobal from 2014 until present{tuple_delimiter}5)
{completion_delimiter}

######################
Example 3:
Entity_types: ORGANIZATION,GEO,PERSON
Text:
Five Aurelians jailed for 8 years in Firuzabad and widely regarded as hostages are on their way home to Aurelia.

The swap orchestrated by Quintara was finalized when $8bn of Firuzi funds were transferred to financial institutions in Krohaara, the capital of Quintara.

The exchange initiated in Firuzabad's capital, Tiruzia, led to the four men and one woman, who are also Firuzi nationals, boarding a chartered flight to Krohaara.

They were welcomed by senior Aurelian officials and are now on their way to Aurelia's capital, Cashion.

The Aurelians include 39-year-old businessman Samuel Namara, who has been held in Tiruzia's Alhamia Prison, as well as journalist Durke Bataglani, 59, and environmentalist Meggie Tazbah, 53, who also holds Bratinas nationality.
######################
Output:
("entity"{tuple_delimiter}FIRUZABAD{tuple_delimiter}GEO{tuple_delimiter}Firuzabad held Aurelians as hostages)
{record_delimiter}
("entity"{tuple_delimiter}AURELIA{tuple_delimiter}GEO{tuple_delimiter}Country seeking to release hostages)
{record_delimiter}
("entity"{tuple_delimiter}QUINTARA{tuple_delimiter}GEO{tuple_delimiter}Country that negotiated a swap of money in exchange for hostages)
{record_delimiter}
{record_delimiter}
("entity"{tuple_delimiter}TIRUZIA{tuple_delimiter}GEO{tuple_delimiter}Capital of Firuzabad where the Aurelians were being held)
{record_delimiter}
("entity"{tuple_delimiter}KROHAARA{tuple_delimiter}GEO{tuple_delimiter}Capital city in Quintara)
{record_delimiter}
("entity"{tuple_delimiter}CASHION{tuple_delimiter}GEO{tuple_delimiter}Capital city in Aurelia)
{record_delimiter}
("entity"{tuple_delimiter}SAMUEL NAMARA{tuple_delimiter}PERSON{tuple_delimiter}Aurelian who spent time in Tiruzia's Alhamia Prison)
{record_delimiter}
("entity"{tuple_delimiter}ALHAMIA PRISON{tuple_delimiter}GEO{tuple_delimiter}Prison in Tiruzia)
{record_delimiter}
("entity"{tuple_delimiter}DURKE BATAGLANI{tuple_delimiter}PERSON{tuple_delimiter}Aurelian journalist who was held hostage)
{record_delimiter}
("entity"{tuple_delimiter}MEGGIE TAZBAH{tuple_delimiter}PERSON{tuple_delimiter}Bratinas national and environmentalist who was held hostage)
{record_delimiter}
("relationship"{tuple_delimiter}FIRUZABAD{tuple_delimiter}AURELIA{tuple_delimiter}Firuzabad negotiated a hostage exchange with Aurelia{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}QUINTARA{tuple_delimiter}AURELIA{tuple_delimiter}Quintara brokered the hostage exchange between Firuzabad and Aurelia{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}QUINTARA{tuple_delimiter}FIRUZABAD{tuple_delimiter}Quintara brokered the hostage exchange between Firuzabad and Aurelia{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}SAMUEL NAMARA{tuple_delimiter}ALHAMIA PRISON{tuple_delimiter}Samuel Namara was a prisoner at Alhamia prison{tuple_delimiter}8)
{record_delimiter}
("relationship"{tuple_delimiter}SAMUEL NAMARA{tuple_delimiter}MEGGIE TAZBAH{tuple_delimiter}Samuel Namara and Meggie Tazbah were exchanged in the same hostage release{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}SAMUEL NAMARA{tuple_delimiter}DURKE BATAGLANI{tuple_delimiter}Samuel Namara and Durke Bataglani were exchanged in the same hostage release{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}MEGGIE TAZBAH{tuple_delimiter}DURKE BATAGLANI{tuple_delimiter}Meggie Tazbah and Durke Bataglani were exchanged in the same hostage release{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}SAMUEL NAMARA{tuple_delimiter}FIRUZABAD{tuple_delimiter}Samuel Namara was a hostage in Firuzabad{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}MEGGIE TAZBAH{tuple_delimiter}FIRUZABAD{tuple_delimiter}Meggie Tazbah was a hostage in Firuzabad{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}DURKE BATAGLANI{tuple_delimiter}FIRUZABAD{tuple_delimiter}Durke Bataglani was a hostage in Firuzabad{tuple_delimiter}2)
{completion_delimiter}

######################
-Real Data-
######################
Entity_types: {entity_types}
Text: {input_text}
######################
Output:"""

CONTINUE_PROMPT = "MANY entities and relationships were missed in the last extraction. Remember to ONLY emit entities that match any of the previously extracted types. Add them below using the same format:\n"
LOOP_PROMPT = "It appears some entities and relationships may have still been missed.  Answer YES | NO if there are still entities or relationships that need to be added.\n"


In [4]:
def clean_str(input: Any) -> str:
    """Clean an input string by removing HTML escapes, control characters, and other unwanted characters."""
    # If we get non-string input, just give it back
    if not isinstance(input, str):
        return input

    result = html.unescape(input.strip())
    # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python
    return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result)

In [5]:
def _unpack_descriptions(data: Mapping) -> list[str]:
    value = data.get("description", None)
    return [] if value is None else value.split("\n")


def _unpack_source_ids(data: Mapping) -> list[str]:
    value = data.get("source_id", None)
    return [] if value is None else value.split(", ")

In [6]:
llm = OpenAI(api_key='sk-Ie9c0c27f1a1444b286ee5efa4e1e6db920a08f193e5ov9A', base_url='https://api.gptsapi.net/v1')

In [7]:
def chat(query: str, prompt: str = '', history: List = []) -> str:
    if prompt:
        messages = [
            *history,
            {'role': 'system', 'content': prompt},
            {'role': 'user', 'content': query}
        ]
    else:
        messages = [
            *history,
            {'role': 'user', 'content': query}
        ]
    response = llm.chat.completions.create(
        model='gpt-4o-mini',
        messages=messages,
        n=1,
        temperature=0
    )
    return response.choices[0].message.content

In [11]:
class GraphExtractor:
    def __init__(
        self, 
    ) -> None:
        self._max_gleanings = 3
        self._join_descriptions = True
        # self.llm = llm
        # self.extraction_prompt = GRAPH_EXTRACTION_PROMPT.format(
        #     entity_types=DEFAULT_ENTITY_TYPES, 
        #     tuple_delimiter=DEFAULT_TUPLE_DELIMITER, 
        #     record_delimiter=DEFAULT_RECORD_DELIMITER, 
        #     completion_delimiter=DEFAULT_COMPLETION_DELIMITER)
    
    def __call__(self, texts: list[str], text_ids: List[int]) -> Any:
        source_doc_map = {}
        results = {}
        for doc_index, text in enumerate(texts):
            source_doc_map[doc_index] = text
            result = self._process_documents(text)
            # results.append(result)
            # text_id = text_ids[i]
            results[doc_index] = result
        graph = self._process_results(results)
        # output = json_graph.tree_data(graph, root=1)
        return source_doc_map, graph

    def _process_documents(self, text: str):
        query = GRAPH_EXTRACTION_PROMPT.format(
            entity_types=DEFAULT_ENTITY_TYPES, 
            tuple_delimiter=DEFAULT_TUPLE_DELIMITER, 
            record_delimiter=DEFAULT_RECORD_DELIMITER, 
            completion_delimiter=DEFAULT_COMPLETION_DELIMITER,
            input_text=text)
        results = chat(query=query)
        history = [
            {"role": "user", "content": query},
            {"role": "assistant", "content": results},
        ]
        for i in range(self._max_gleanings):
            # TODO 以下两个prompt没有history，手动修改promtp加上
            response = chat(CONTINUE_PROMPT, history=history)
            results += response or ""
            history = [
                *history,
                {"role": "user", "content": CONTINUE_PROMPT},
                {"role": "assistant", "content": response},
            ]

            if i >= self._max_gleanings - 1:
                break

            response = chat(
                LOOP_PROMPT,
                history=history
            )
            history = [
                *history,
                {"role": "user", "content": LOOP_PROMPT},
                {"role": "assistant", "content": response},
            ]
            if response != "YES":
                break
        return results
    
    def _process_results(self, results: Dict):
        graph = nx.Graph()
        for source_doc_id, extracted_data in results.items():
            records = [r.strip() for r in extracted_data.split(DEFAULT_RECORD_DELIMITER)]
            # print(records)
            for record in records:
                record = re.sub(r"^\(|\)$", "", record.strip())
                record_attributes = record.split(DEFAULT_TUPLE_DELIMITER)

                if record_attributes[0] == '"entity"' and len(record_attributes) >= 4:
                    # add this record as a node in the G
                    entity_name = clean_str(record_attributes[1].upper())
                    entity_type = clean_str(record_attributes[2].upper())
                    entity_description = clean_str(record_attributes[3])

                    if entity_name in graph.nodes():
                        node = graph.nodes[entity_name]
                        if self._join_descriptions:
                            node["description"] = "\n".join(
                                list({
                                    *_unpack_descriptions(node),
                                    entity_description,
                                })
                            )
                        else:
                            if len(entity_description) > len(node["description"]):
                                node["description"] = entity_description
                        node["source_id"] = ", ".join(
                            list({
                                *_unpack_source_ids(node),
                                str(source_doc_id),
                            })
                        )
                        node["entity_type"] = (
                            entity_type if entity_type != "" else node["entity_type"]
                        )
                    else:
                        graph.add_node(
                            entity_name,
                            type=entity_type,
                            description=entity_description,
                            source_id=str(source_doc_id),
                        )

                if (
                    record_attributes[0] == '"relationship"'
                    and len(record_attributes) >= 5
                ):
                    # add this record as edge
                    source = clean_str(record_attributes[1].upper())
                    target = clean_str(record_attributes[2].upper())
                    edge_description = clean_str(record_attributes[3])
                    edge_source_id = clean_str(str(source_doc_id))
                    try:
                        weight = float(record_attributes[-1])
                    except ValueError:
                        weight = 1.0

                    if source not in graph.nodes():
                        graph.add_node(
                            source,
                            type="",
                            description="",
                            source_id=edge_source_id,
                        )
                    if target not in graph.nodes():
                        graph.add_node(
                            target,
                            type="",
                            description="",
                            source_id=edge_source_id,
                        )
                    if graph.has_edge(source, target):
                        edge_data = graph.get_edge_data(source, target)
                        if edge_data is not None:
                            weight += edge_data["weight"]
                            if self._join_descriptions:
                                edge_description = "\n".join(
                                    list({
                                        *_unpack_descriptions(edge_data),
                                        edge_description,
                                    })
                                )
                            edge_source_id = ", ".join(
                                list({
                                    *_unpack_source_ids(edge_data),
                                    str(source_doc_id),
                                })
                            )
                    graph.add_edge(
                        source,
                        target,
                        weight=weight,
                        description=edge_description,
                        source_id=edge_source_id,
                    )
        return graph

In [9]:
texts = [
    'Brian Johnson( born 1939 or 1940) is a British designer and director of film and television special effects.', 
    'Stuart Rosenberg( August 11, 1927 – March 15, 2007) was an American film and television director whose motion pictures include" Cool Hand Luke"( 1967)," Voyage of the Damned"( 1976)," The Amityville Horror"( 1979), and" The Pope of Greenwich Village"( 1984).', 
    'He was noted for his work with actor Paul Newman.']

In [12]:
extractor = GraphExtractor()
source_doc_map, graph = extractor(texts, list(range(1, len(texts) + 1)))
source_doc_map, graph

({0: 'Brian Johnson( born 1939 or 1940) is a British designer and director of film and television special effects.',
  1: 'Stuart Rosenberg( August 11, 1927 – March 15, 2007) was an American film and television director whose motion pictures include" Cool Hand Luke"( 1967)," Voyage of the Damned"( 1976)," The Amityville Horror"( 1979), and" The Pope of Greenwich Village"( 1984).',
  2: 'He was noted for his work with actor Paul Newman.'},
 <networkx.classes.graph.Graph at 0x7b66cd91bce0>)

In [13]:
for i in graph.nodes():
    print(i)

BRIAN JOHNSON
FILM AND TELEVISION SPECIAL EFFECTS
DIRECTOR
FILM
TELEVISION
STUART ROSENBERG
COOL HAND LUKE
VOYAGE OF THE DAMNED
THE AMITYVILLE HORROR
THE POPE OF GREENWICH VILLAGE
VOYAGE OF THE DAMNED (1976)
THE AMITYVILLE HORROR (1979)
THE POPE OF GREENWICH VILLAGE (1984)
COOL HAND LUKE (1967)
PAUL NEWMAN


In [14]:
for i, node in graph.nodes(data=True):
    print(i, node)

BRIAN JOHNSON {'type': 'PERSON', 'description': 'Brian Johnson is a British designer and director known for his work in film and television special effects.', 'source_id': '0'}
FILM AND TELEVISION SPECIAL EFFECTS {'type': 'EVENT', 'description': 'Film and television special effects refer to the techniques used in the production of films and television shows to create illusions and enhance storytelling.', 'source_id': '0'}
DIRECTOR {'type': 'PERSON', 'description': 'A director is a person who is in charge of the creative aspects of a film or television production, overseeing the artistic and technical elements.', 'source_id': '0'}
FILM {'type': 'EVENT', 'description': 'Film refers to the medium of moving images used to tell stories, entertain, and convey messages, often involving special effects.\nFilm is a medium of storytelling that uses moving images to convey narratives, often produced for entertainment and artistic expression.', 'source_id': '0, 1', 'entity_type': 'EVENT'}
TELEVISI

In [15]:
for i, j, edge in graph.edges(data=True):
    print(i, j, edge)

STUART ROSENBERG COOL HAND LUKE {'weight': 18.0, 'description': 'Stuart Rosenberg directed the film Cool Hand Luke, which is a significant work in his filmography.\nStuart Rosenberg directed the film Cool Hand Luke, which is one of his most recognized works.', 'source_id': '1'}
STUART ROSENBERG VOYAGE OF THE DAMNED {'weight': 18.0, 'description': 'Stuart Rosenberg directed the film Voyage of the Damned, which is notable for its historical context.\nStuart Rosenberg directed the film Voyage of the Damned, which is notable for its historical significance.', 'source_id': '1'}
STUART ROSENBERG THE AMITYVILLE HORROR {'weight': 18.0, 'description': 'Stuart Rosenberg directed the film The Amityville Horror, which is a significant entry in the horror genre.\nStuart Rosenberg directed the film The Amityville Horror, which is a well-known horror film.', 'source_id': '1'}
STUART ROSENBERG THE POPE OF GREENWICH VILLAGE {'weight': 2.0, 'description': 'Stuart Rosenberg directed the film The Pope of 

In [50]:
for line in nx.generate_graphml(graph):
    print(line)

<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">
  <key id="d5" for="edge" attr.name="source_id" attr.type="string" />
  <key id="d4" for="edge" attr.name="description" attr.type="string" />
  <key id="d3" for="edge" attr.name="weight" attr.type="double" />
  <key id="d2" for="node" attr.name="source_id" attr.type="string" />
  <key id="d1" for="node" attr.name="description" attr.type="string" />
  <key id="d0" for="node" attr.name="type" attr.type="string" />
  <graph edgedefault="undirected">
    <node id="BRIAN JOHNSON">
      <data key="d0">PERSON</data>
      <data key="d1">Brian Johnson is a British designer and director known for his work in film and television special effects.</data>
      <data key="d2">0</data>
    </node>
    <node id="BRITISH">
      <data key="d0" />
      <data key="d1" />
      <data

In [16]:
nx.clustering(graph)

{'BRIAN JOHNSON': 0,
 'FILM AND TELEVISION SPECIAL EFFECTS': 0,
 'DIRECTOR': 0,
 'FILM': 0,
 'TELEVISION': 0,
 'STUART ROSENBERG': 0,
 'COOL HAND LUKE': 0,
 'VOYAGE OF THE DAMNED': 0,
 'THE AMITYVILLE HORROR': 0,
 'THE POPE OF GREENWICH VILLAGE': 0,
 'VOYAGE OF THE DAMNED (1976)': 0,
 'THE AMITYVILLE HORROR (1979)': 0,
 'THE POPE OF GREENWICH VILLAGE (1984)': 0,
 'COOL HAND LUKE (1967)': 0,
 'PAUL NEWMAN': 0}

In [17]:
for i, node in graph.nodes(data=True):
    print(i, node)

BRIAN JOHNSON {'type': 'PERSON', 'description': 'Brian Johnson is a British designer and director known for his work in film and television special effects.', 'source_id': '0'}
FILM AND TELEVISION SPECIAL EFFECTS {'type': 'EVENT', 'description': 'Film and television special effects refer to the techniques used in the production of films and television shows to create illusions and enhance storytelling.', 'source_id': '0'}
DIRECTOR {'type': 'PERSON', 'description': 'A director is a person who is in charge of the creative aspects of a film or television production, overseeing the artistic and technical elements.', 'source_id': '0'}
FILM {'type': 'EVENT', 'description': 'Film refers to the medium of moving images used to tell stories, entertain, and convey messages, often involving special effects.\nFilm is a medium of storytelling that uses moving images to convey narratives, often produced for entertainment and artistic expression.', 'source_id': '0, 1', 'entity_type': 'EVENT'}
TELEVISI