In [4]:
# %%capture
!pip install -r requirements.txt

Collecting openai (from -r requirements.txt (line 1))
  Downloading openai-1.63.2-py3-none-any.whl.metadata (27 kB)
Collecting graphdatascience (from -r requirements.txt (line 2))
  Downloading graphdatascience-1.13-py3-none-any.whl.metadata (7.5 kB)
Collecting retry==0.9.2 (from -r requirements.txt (line 3))
  Downloading retry-0.9.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting langchain>=0.0.216 (from -r requirements.txt (line 4))
  Downloading langchain-0.3.19-py3-none-any.whl.metadata (7.9 kB)
Collecting streamlit==1.23.1 (from -r requirements.txt (line 5))
  Downloading streamlit-1.23.1-py2.py3-none-any.whl.metadata (7.4 kB)
Collecting streamlit-chat==0.0.2.2 (from -r requirements.txt (line 6))
  Downloading streamlit_chat-0.0.2.2-py3-none-any.whl.metadata (1.2 kB)
Collecting streamlit-chat-media==0.0.4 (from -r requirements.txt (line 7))
  Downloading streamlit_chat_media-0.0.4-py3-none-any.whl.metadata (4.2 kB)
Collecting plotly==5.15.0 (from -r requirements.txt (line 10))


In [11]:
import os
import openai
from string import Template
import json
from neo4j import GraphDatabase
import glob
from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep

In [18]:
# Load environment variables
load_dotenv('Code/.env')

False

In [19]:
# OpenAI API configuration
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_version = os.getenv("OPENAI_API_VERSION")
openai_deployment = "chat-gpt35"
print(openai.api_key)

None


In [16]:
# Neo4j configuration & constraints
neo4j_url = os.getenv("NEO4J_CONNECTION_URL")
neo4j_user = os.getenv("NEO4J_USER")
neo4j_password = os.getenv("NEO4J_PASSWORD")
gds = GraphDatabase.driver(neo4j_url, auth=(neo4j_user, neo4j_password))

### 2. Helper Functions

In [None]:

# Function to call the OpenAI API
def process_gpt(file_prompt, system_msg):
    completion = openai.ChatCompletion.create(
        engine=openai_deployment,
        max_tokens=15000,
        temperature=0,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": file_prompt},
        ],
    )
    nlp_results = completion.choices[0].message.content
    sleep(8)
    return nlp_results


# Function to take folder of files and a prompt template, and return a json-object of all the entities and relationships
def extract_entities_relationships(folder, prompt_template):
    start = timer()
    files = glob.glob(f"./data/{folder}/*")
    system_msg = "You are a helpful IT-project and account management expert who extracts information from documents."
    print(f"Running pipeline for {len(files)} files in {folder} folder")
    results = []
    for i, file in enumerate(files):
        print(f"Extracting entities and relationships for {file}")
        try:
            with open(file, "r") as f:
                text = f.read().rstrip()
                prompt = Template(prompt_template).substitute(ctext=text)
                result = process_gpt(prompt, system_msg=system_msg)
                results.append(json.loads(result))
        except Exception as e:
            print(f"Error processing {file}: {e}")
    end = timer()
    print(f"Pipeline completed in {end-start} seconds")
    return results


# Function to take a json-object of entitites and relationships and generate cypher query for creating those entities
def generate_cypher(json_obj):
    e_statements = []
    r_statements = []

    e_label_map = {}

    # loop through our json object
    for i, obj in enumerate(json_obj):
        print(f"Generating cypher for file {i+1} of {len(json_obj)}")
        for entity in obj["entities"]:
            label = entity["label"]
            id = entity["id"]
            id = id.replace("-", "").replace("_", "")
            properties = {k: v for k, v in entity.items() if k not in ["label", "id"]}

            cypher = f'MERGE (n:{label} {{id: "{id}"}})'
            if properties:
                props_str = ", ".join(
                    [f'n.{key} = "{val}"' for key, val in properties.items()]
                )
                cypher += f" ON CREATE SET {props_str}"
            e_statements.append(cypher)
            e_label_map[id] = label

        for rs in obj["relationships"]:
            src_id, rs_type, tgt_id = rs.split("|")
            src_id = src_id.replace("-", "").replace("_", "")
            tgt_id = tgt_id.replace("-", "").replace("_", "")

            src_label = e_label_map[src_id]
            tgt_label = e_label_map[tgt_id]

            cypher = f'MERGE (a:{src_label} {{id: "{src_id}"}}) MERGE (b:{tgt_label} {{id: "{tgt_id}"}}) MERGE (a)-[:{rs_type}]->(b)'
            r_statements.append(cypher)

    with open("cyphers.txt", "w") as outfile:
        outfile.write("\n".join(e_statements + r_statements))

    return e_statements + r_statements


# Final function to bring all the steps together
def ingestion_pipeline(folders):
    # Extrating the entites and relationships from each folder, append into one json_object
    entities_relationships = []
    for key, value in folders.items():
        entities_relationships.extend(extract_entities_relationships(key, value))

    # Generate and execute cypher statements
    cypher_statements = generate_cypher(entities_relationships)
    for i, stmt in enumerate(cypher_statements):
        print(f"Executing cypher statement {i+1} of {len(cypher_statements)}")
        try:
            gds.execute_query(stmt)
        except Exception as e:
            with open("failed_statements.txt", "w") as f:
                f.write(f"{stmt} - Exception: {e}\n")

### 3. Defining Prompts

In [None]:
amazon_products_prompt_template = """
From the list of products below, extract the following Entities & Relationships in the specified format:
0. ALWAYS FINISH THE OUTPUT. Never send partial responses.
1. First, identify these Entity types in the text and generate them as comma-separated values similar to the entity type format.
    The `id` property of each entity must be alphanumeric and unique among all entities. Use this property to define relationships between entities. Do not create new entity types beyond those listed below. Generate as many entities as needed based on the types provided:
    
    Entity Types:
   label:'Product',id:string,title:string,description:string,price:float,averageRating:float //Represents a product. 'id' should be a unique alphanumeric identifier (e.g., 'anomieBonhomie'), 'title' is the product's name, 'description' is the full product description, 'price' is the product's price (use 0.0 for 'nan' values), and 'averageRating' is the average customer rating.
    
   label:'Category',id:string,name:string //Represents the product category. 'id' should be a camel-case version of the category name (e.g., 'digitalMusic'), and 'name' is the full category name.
    
   label:'Store',id:string,name:string //Represents the store or brand offering the product. 'id' should be a camel-case version of the store name (e.g., 'scrittiPolitti'), and 'name' is the full store name.
    
   label:'Format',id:string,name:string //Represents the product format (e.g., 'Audio CD'). 'id' should be a camel-case version of the format name (e.g., 'audioCD'), and 'name' is the format's full name.
    
2. Next, generate each relationship as triples of head, relationship, and tail. Use the respective `id` properties to refer to the head and tail entities. Relationship properties should be mentioned within brackets as comma-separated values. Generate as many relationships as needed, using the following relationship types:

    Relationship Types:
    productid|BELONGS_TO|categoryid
    productid|SOLD_BY|storeid
    productid|HAS_FORMAT|formatid

3. The output should be formatted as:
{
    "entities": [
        {"label":"Product","id":string,"title":string,"description":string,"price":float,"averageRating":float},
        {"label":"Category","id":string,"name":string},
        {"label":"Store","id":string,"name":string},
        {"label":"Format","id":string,"name":string}
    ],
    "relationships": [
        "productid|BELONGS_TO|categoryid",
        "productid|SOLD_BY|storeid",
        "productid|HAS_FORMAT|formatid"
    ]
}

Case Sheet:
$ctext
"""


### 4. Running the pipeline

In [None]:
folders = {
    "amazon_products": amazon_products_prompt_template,
}
ingestion_pipeline(folders)