In [None]:
import os
from dotenv import load_dotenv
import json
import matplotlib.pyplot as plt
import networkx as nx
from openai import OpenAI
import pandas as pd
from sentence_transformers import SentenceTransformer, util

In [None]:
# Lift pandas' display limits: render every column and row, and never
# truncate the text inside a cell.
for _option in ("display.max_columns", "display.max_rows", "display.max_colwidth"):
    pd.set_option(_option, None)

In [None]:
# Upper bound (exclusive) on file size, in bytes: the loading cell only reads
# .txt files whose os.path.getsize() is strictly below this value.
LIMIT = 4999

In [None]:
# NOTE(review): absolute, machine-specific path -- anyone else running this
# notebook has to edit it; consider making it configurable.
FOLDER_PATH  = "C:/Users/Dd/OneDrive/Documents/_github/knowledge-graph-rag/data"

# The working directory is switched to the data folder, so relative paths used
# later in the notebook are resolved from here.
os.chdir(FOLDER_PATH)
documents = []

# Read every .txt file smaller than LIMIT bytes. The listing is sorted because
# os.listdir() returns entries in arbitrary order, which would make the order
# of `documents` (and everything derived from it) differ between runs.
for file in sorted(os.listdir(FOLDER_PATH)):
    if not file.endswith(".txt"):
        continue
    file_path = os.path.join(FOLDER_PATH, file)
    size = os.path.getsize(file_path)
    if size >= LIMIT:
        continue
    # NOTE(review): the files are decoded as cp1252 -- confirm this matches
    # how they were actually saved.
    with open(file_path, 'r', encoding="cp1252") as f:
        documents.append(f.read())
len(documents)

In [None]:
# Load environment variables via python-dotenv, then build the OpenAI client
# from the API_KEY variable so the key is not hard-coded in the notebook.
# NOTE(review): os.getenv returns None when API_KEY is unset -- confirm how the
# client behaves in that case.
load_dotenv()
api_key = os.getenv("API_KEY")
client = OpenAI(api_key=api_key)

In [None]:
# System message for the extraction model. It is sent verbatim (it is never
# passed through str.format), so the JSON example must use single braces;
# doubled braces would reach the model literally and show it invalid JSON.
system_prompt = """
You are an expert on  knowledge graph specializing on extracting entities and relationships.

Your task is to identify the entities and relations requested with the user prompt, from a given document.

You must generate the output in a JSON containing a list with JSON objects having first level keys only: "head", "head_type",
"relation", "tail", and "tail_type".

The "head" key must contain the text of the extracted entity,
the "head_type" key must contain the type of the extracted head entity,
the "relation" key must contain the type of relation between the "head" and the "tail", the "tail" key must represent the text of an
extracted entity which is the tail of the relation, and the "tail_type" key must contain the type of the tail entity.

Attempt to extract as many entities and relations as you can but do not make things up.

The JSON must have only "head", "head_type", "relation", "tail", and "tail_type" as first-level keys.
Do not use any other keys.
Do not use entities or extracted entities as keys.

Use the following format as an example output:

[

  {
    "head": "President Biden",
    "head_type": "organization",
    "relation": "working for",
    "tail": "American people",
    "tail_type": "organization"
  }
]
"""

In [None]:
# Template for the user message. It is filled with str.format(document=...),
# which is why the literal braces of the JSON example are doubled ({{ }}):
# format() collapses them to single braces and only {document} is substituted.
user_prompt = """Based on the following example, extract entities and relations from the provided document.

--> Beginning of example

# Document
"We'll be in touch with the latest information on how President Biden and his administration
are working for the American people. Today, Israeli security forces conducted a successful
operation to rescue four hostages from the grips of Hamas in Gaza."

################

# Output
[

  {{
    "head": "President Biden",
    "head_type": "organization",
    "relation": "working for",
    "tail": "American people",
    "tail_type": "organization"
  }}
]

--> End of example

For the following document, generate extract entities and relations as in the provided example.

# Document
{document}


################
"""

In [None]:
def extract_information(text, model="gpt-3.5-turbo"):
    """Ask the chat model to extract entity/relation triples from ``text``.

    Args:
        text: Document text substituted into the ``{document}`` slot of
            ``user_prompt``.
        model: Name of the chat-completion model to call.

    Returns:
        The message content of the first completion choice, as returned by
        the API. JSON mode is requested, so it is expected to be a JSON
        document serialised as a string.
    """
    completion = client.chat.completions.create(
        # Honour the caller's choice instead of a hard-coded model name.
        model=model,
        temperature=0,
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                # Fill the template from the argument, not from whatever
                # notebook-level variable happens to be called `document`.
                "content": user_prompt.format(
                    document=text
                )
            }
        ],
        response_format={"type": "json_object"}
    )

    content = completion.choices[0].message.content
    print(content)
    return content

In [None]:
def _clean_text(raw):
    """Resolve backslash escapes and normalise whitespace in a raw document."""
    try:
        # Encoding to Latin-1 with `backslashreplace` keeps every character
        # intact through the `unicode-escape` decode. A plain .encode() yields
        # UTF-8 bytes that `unicode-escape` reads as Latin-1, which corrupts
        # all non-ASCII characters.
        text = raw.encode('latin-1', 'backslashreplace').decode('unicode-escape')
    except UnicodeDecodeError:
        # Malformed escape sequence (e.g. a trailing backslash): keep the
        # text as it is rather than aborting the whole run.
        text = raw
    # NOTE(review): stripping 'â' looks like a workaround for mis-decoded
    # punctuation -- confirm it should not be fixed at file-reading time.
    return text.replace('\xa0', ' ').replace('\t', ' ').replace('â', '')


def _as_triples(parsed):
    """Flatten one parsed model response into a list of triple dicts.

    Accepts a list of triples, a single triple object, or a wrapper object
    whose values hold lists of triples.
    """
    if isinstance(parsed, list):
        return [item for item in parsed if isinstance(item, dict)]
    if isinstance(parsed, dict):
        if "head" in parsed:
            return [parsed]
        triples = []
        for value in parsed.values():
            if isinstance(value, list):
                triples.extend(item for item in value if isinstance(item, dict))
        return triples
    return []


kg = []
# The loop variable keeps the name `document`; other cells may refer to it.
for document in documents:
    clean_document = _clean_text(document)
    extracted_relationships = extract_information(clean_document)
    try:
        clean_relationships = json.loads(extracted_relationships)
    except (json.JSONDecodeError, TypeError) as error:
        # One unusable response should not discard the results gathered so far.
        print(f"Skipping document: response is not valid JSON ({error})")
        continue
    # extend, not append: `kg` must stay a flat list of triple dicts so that
    # a DataFrame built from it gets head/relation/tail columns.
    kg.extend(_as_triples(clean_relationships))
kg

In [None]:
# Turn the collected extraction results in `kg` into a DataFrame.
relationships = pd.DataFrame(kg)

In [None]:
# Display the extracted relations (the display options set at the top of the
# notebook remove the row/column/width limits, so the full frame is rendered).
relationships

In [None]:
# Remove duplicate rows. Rows are compared on their string form (presumably so
# that unhashable cell values do not break drop_duplicates -- TODO confirm),
# and the surviving index labels select the original, uncast rows.
_unique_rows = relationships.astype(str).drop_duplicates().index
relationships = relationships.loc[_unique_rows]

In [None]:
# Discard every row that has a missing value in any column.
relationships = relationships.dropna(axis="index", how="any")

In [None]:
# Embed the text of every head entity with a sentence-transformer model.
heads = relationships['head'].values
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(heads)
# Cosine similarity between the first two head embeddings only; indexing [0]
# and [1] requires `relationships` to contain at least two rows.
similarity = util.cos_sim(embeddings[0], embeddings[1])

In [None]:
# Build an undirected graph with one edge per (head, tail) pair; the relation
# text is stored on the edge under the "label" attribute. A later row for the
# same pair overwrites the label of the earlier one.
G = nx.Graph()
G.add_edges_from(
  (head, tail, {'label': relation})
  for head, tail, relation in zip(
    relationships['head'], relationships['tail'], relationships['relation']
  )
)

In [None]:
# Spring layout with a fixed seed so the picture is reproducible.
pos = nx.spring_layout(G, k=0.9, seed=47)
labels = nx.get_edge_attributes(G, 'label')

# Styling is collected in dicts to keep the drawing calls short.
node_style = dict(
    with_labels=True,
    font_size=10,
    node_size=700,
    node_color='lightblue',
    edge_color='gray',
    alpha=0.6,
)
edge_label_style = dict(
    edge_labels=labels,
    font_size=8,
    label_pos=0.3,
    verticalalignment='baseline',
)

plt.figure(figsize=(15, 15))
nx.draw(G, pos, **node_style)
nx.draw_networkx_edge_labels(G, pos, **edge_label_style)
plt.show()

In [None]:
# Save the cleaned relations without the index column.
# NOTE(review): the path is relative, so the destination depends on the
# current working directory (changed earlier with os.chdir) -- confirm the
# file lands where intended.
relationships.to_csv('../data/relations.csv', index=False)