In [None]:
# Shell escape: update the project's Poetry-managed dependencies (-q keeps the output quiet).
# NOTE(review): this re-resolves dependency versions on every run of the notebook,
# so installed versions may differ between runs — confirm this is intended.
!poetry update -q

import os
import sys

# Make the parent of the working directory importable so the project-local
# packages used in the following cells can be found.
project_path = os.path.abspath(os.pardir)
if sys.path.count(project_path) == 0:
    # Front of the list, so this directory is searched first.
    sys.path.insert(0, project_path)

In [None]:
from dotenv import load_dotenv

from configurations.configuration import Configuration
from configurations.chat_openai import ChatOpenAIConfiguration
from configurations.cypher_chat_openai import CypherChatOpenAIConfiguration
from configurations.neo4j import Neo4jConfiguration
from configurations.openai_embeddings import OpenAIEmbeddingsConfiguration

# Load variables from a local .env file into the process environment.
load_dotenv()

# Read the YAML configuration once, then let each configuration class grab its
# own object from it (presumably that component's settings — confirm in the
# `configurations` package). The calls run in the order listed.
configurations = Configuration.load('../configuration.yaml')
(
    neo4j,
    chat_openai,
    openai_embeddings,
    cypher_chat_openai,
) = (
    configuration_class.grab(configurations)
    for configuration_class in (
        Neo4jConfiguration,
        ChatOpenAIConfiguration,
        OpenAIEmbeddingsConfiguration,
        CypherChatOpenAIConfiguration,
    )
)

In [None]:
# Row window of the parquet table to import in this run; used as the slice
# [f:t] when the table is read (start row inclusive, end row exclusive).
f, t = 0, 5

In [23]:
import pyarrow.parquet as pq

# Positions of the two columns read from the table (named for the link and
# the text content of each row).
link_index = 1
content_index = 4
table = pq.read_table('train-00000-of-00001.parquet', memory_map=True)

# Take the same [f:t] row window from both columns, so imported_rows[i] is the
# link that belongs to strings[i]. `.as_py()` converts each Arrow scalar to a
# plain Python value.
imported_rows: list[str] = [link.as_py() for link in table[link_index][f:t]]
strings: list[str] = [content.as_py() for content in table[content_index][f:t]]

In [24]:
from langchain.chains.llm import LLMChain
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from sro import SRO

# Prompt asking the model to list the facts found in a text as
# Subject / Relationship / Object entries; `{text}` is the only placeholder.
prompt_template = """
Analyze the following text and extract all the facts in a structured way. Provide the facts in the format:

Subject: SUBJECT OF FACT
Relationship: TYPE OF RELATIONSHIP
Object: FACT OBJECT

Examples of facts:
Subject: Alice, Relationship: works at, Object: Acme Corp.
Subject: New York, Relationship: is, Object: city

Text: "{text}"
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=['text'],
)

# Chat model constructed from the `chat_openai` configuration object.
llm = ChatOpenAI(**chat_openai.dict())

# Chain that renders the prompt for a given text and sends it to the model.
entity_extraction_chain = LLMChain(prompt=prompt, llm=llm)

# Run the extraction chain over every imported text and collect the raw
# fact blocks returned by the model.
#
# `strings` already contains only rows [f:t] of the table (it was filled from
# that slice when the parquet file was read), so it is iterated as-is.
# Slicing it again with [f:t] would skip the first `f` texts — or all of
# them — whenever f > 0.
facts: list[str] = []
for string in strings:
    # Each reply is split on blank lines; in the sample output this yields one
    # fact per item, plus any introductory sentence the model adds.
    facts += entity_extraction_chain.run(string).split('\n\n')
facts

['Certainly! Here is a structured extraction of facts from the provided text:',
 '1. Subject: President of the United States (POTUS)\n   Relationship: is\n   Object: head of state and head of government of the United States of America',
 '2. Subject: President\n   Relationship: directs\n   Object: executive branch of the federal government',
 '3. Subject: President\n   Relationship: is\n   Object: commander-in-chief of the United States Armed Forces',
 '4. Subject: Presidential power\n   Relationship: has grown\n   Object: substantially since George Washington took office in 1789',
 '5. Subject: Franklin D. Roosevelt and George W. Bush\n   Relationship: had\n   Object: notable expansions of presidential power',
 "6. Subject: President\n   Relationship: is\n   Object: leader of the world's only remaining superpower",
 '7. Subject: Article II of the Constitution\n   Relationship: establishes\n   Object: the executive branch of the federal government',
 '8. Subject: President\n   Relation

In [None]:
# Parse every raw fact block into an SRO first, then keep only those that
# report themselves as full via `is_full()`.
candidate_sros = [SRO(fact) for fact in facts]
sros = [sro for sro in candidate_sros if sro.is_full()]

In [None]:
from langchain_community.graphs import Neo4jGraph

# Write the extracted triples to Neo4j in one query, then record which source
# links were imported.
#
# Subject and object names are sent as query parameters rather than being
# interpolated into the Cypher text: a name containing an apostrophe (e.g.
# "world's only remaining superpower") would otherwise end the quoted string
# early and break — or alter — the query.
clauses: list[str] = []
params: dict[str, str] = {}
for index, sro in enumerate(sros):
    # Relationship types cannot be passed as parameters, so the relation is
    # reduced to letters, digits and underscores, upper-cased, and
    # backtick-quoted (which also allows a type that starts with a digit).
    relation_type = ''.join(
        char if char.isalnum() else '_' for char in sro.relation
    ).upper()
    if not relation_type:
        # An empty relationship type is not valid Cypher; skip this triple
        # instead of failing the whole import.
        continue
    clauses.append(
        f"MERGE (s{index}:Entity {{name: $subject_{index}}})\n"
        f"MERGE (o{index}:Entity {{name: $object_{index}}})\n"
        f"MERGE (s{index})-[:`{relation_type}`]->(o{index})"
    )
    params[f"subject_{index}"] = sro.subject
    params[f"object_{index}"] = sro.object

query = '\n'.join(clauses)
graph = Neo4jGraph(**neo4j.dict())
if clauses:
    # Nothing is sent when no triple was extracted: an empty query is an error.
    graph.query(query, params=params)

# One imported link per line; the file is overwritten on every run.
with open('imported_links.txt', 'w') as file:
    file.writelines([f"{row}\n" for row in imported_rows])