In [5]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import SequentialChain, LLMChain

load_dotenv()

True

In [6]:
import neo4j

host = "localhost"
username = "neo4j"
password = "1234qwer"

driver = neo4j.GraphDatabase.driver(f'bolt://localhost:7687', auth=(username, password))
session = driver.session()

def create_graph(query):
    session.run(query)

In [14]:
template1 = """You are expert software analyst, you can easily split and classify entity from sentence 
- Categorized as CORE if the entity is a specific feature of software requirement domain, example (system, application, payment, login) 
- Categorized as USER if the entity is a specific user of the software, example (Admin, developer, Student, customer, buyer, cashier) 
- Categorized as HARDWARE if the entity is a component of computer hardware, example (Keyboard, mouse, CPU) 
- Categorized as PLATFORM if the entity is a third party application or software platform, example (Linux, Paypal, eBay, Apache) 

Given sentence : {sentence}

- Output must separated by comma like [entity],[category]
- Each line is separated by '\n'
- Don't display anything in the output except the output csv format

"""
template2 = """From the output of extraction
^^^
{extraction}
^^^
extraction format is seperated by comma like [entity],[category]

and in context sentence : $$$
{sentence}
$$$

- Generate semantic triple that consist subject, predicate, and object from entities (which is delimited by ^^^)  base on context sentence (which is delimited by $$$) 
- Output must separated by comma the format is [category]:[entity],[predicate],[category]:[entity]
- Each line is separated by '\n'
- Don't display anything in the output except the output csv format
"""


prompt1 = ChatPromptTemplate.from_template(template1)
prompt2 = ChatPromptTemplate.from_template(template2)

model = ChatOpenAI(model="gpt-4o-mini", api_key=os.getenv('OPEN_API_KEY'), temperature=0)

chain1 = prompt1 | model
chain2 = prompt2 | model

chain_one = LLMChain(llm=model, prompt=prompt1, output_key="extraction")
chain_two = LLMChain(llm=model, prompt=prompt2, output_key="triple")

overal_chain = SequentialChain(
    chains=[chain_one, chain_two],
    input_variables=["sentence"],
    output_variables=["extraction", "triple"]
)

In [16]:
import pandas as pd
import csv

df_dataset = pd.read_csv('data/odoo_pos_datasets.csv')
for index, row in df_dataset.iterrows():
    sentence = row['Sentence']
    uid = row['id']
    feature = row['Feature']
    message = overal_chain({"sentence": sentence})
    ner = message['extraction'].split('\n')

    for row in ner:
        with open("data/odoo_pos_ner_result.csv", "a", newline="\n") as csvfile:
            writer = csv.writer(csvfile)
            row = f"{row}"
            writer.writerow(row.split(","))

    triples = message['triple'].split('\n')

    for row in triples:
        with open("data/odoo_pos_result.csv", "a", newline="\n") as csvfile:
            writer = csv.writer(csvfile)
            row = f"{uid},{feature},{row}"
            writer.writerow(row.split(","))

In [7]:
import pandas as pd
df_dataset = pd.read_csv('data/odoo_pos_datasets.csv')

In [None]:
def df_to_create_cql(df, uid):
    cql_query = "CREATE "
    entities = []
    for index, row in df.iterrows():
        if (row['subject'] in entities) == False:
            entities.append(row['subject'].strip())
        if (row['object'] in entities) == False:
            entities.append(row['object'].strip())
    init = 'a'
    obj = {}
    for i in range(len(entities)):
        char = chr(ord(init) + i)
        obj[entities[i]] = char
        category, entity = entities[i].split(':')
        cql_query += f"({char}:{category}" + "{" + f"id: '{uid}', name: '{entity}'" + "}), \n"

    idx = 1
    for _, row in df.iterrows():
        subjek = obj[row['subject'].strip()]
        objek = obj[row['object'].strip()]
        predicate = row['predicate'].strip()
        cql_query += f"({subjek})-[:RELATED_TO " + "{ type:"+ f"'{predicate}'" + "}]" + f"->({objek})"
        if(len(df) != idx):
            cql_query += ", \n"
        idx += 1
    return cql_query

In [13]:
import pandas as pd
df_result = pd.read_csv('data/odoo_pos_result.csv')

for index, row in df_dataset.iterrows():
    df_filter = df_result[df_result['id'] == row['id']]
    cql_query = df_to_create_cql(df_filter, row['id'])
    session.run(cql_query)

CREATE (a:CORE{id: 'dbfc0c1b-7196-4a79-99f0-6ea0e5a25eea', name: 'Odoo Point of Sale'}), 
(b:CORE{id: 'dbfc0c1b-7196-4a79-99f0-6ea0e5a25eea', name: 'shops'}), 
(c:CORE{id: 'dbfc0c1b-7196-4a79-99f0-6ea0e5a25eea', name: 'app'}), 
(d:HARDWARE{id: 'dbfc0c1b-7196-4a79-99f0-6ea0e5a25eea', name: 'device'}), 
(e:PLATFORM{id: 'dbfc0c1b-7196-4a79-99f0-6ea0e5a25eea', name: 'web browser'}), 
(f:CORE{id: 'dbfc0c1b-7196-4a79-99f0-6ea0e5a25eea', name: 'Product'}), 
(g:CORE{id: 'dbfc0c1b-7196-4a79-99f0-6ea0e5a25eea', name: 'stock'}), 
(h:CORE{id: 'dbfc0c1b-7196-4a79-99f0-6ea0e5a25eea', name: 'real-time statistics'}), 
(i:CORE{id: 'dbfc0c1b-7196-4a79-99f0-6ea0e5a25eea', name: 'data'}), 
(j:CORE{id: 'dbfc0c1b-7196-4a79-99f0-6ea0e5a25eea', name: 'shops'}), 
(a)-[:RELATED_TO { type:'runs'}]->(j), 
(c)-[:RELATED_TO { type:'works on'}]->(d), 
(d)-[:RELATED_TO { type:'has'}]->(e), 
(f)-[:RELATED_TO { type:'moves'}]->(g), 
(h)-[:RELATED_TO { type:'provide'}]->(i), 
(i)-[:RELATED_TO { type:'is consolidated acr

In [14]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
def bert_embedding(text):
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    pooler_output = output['pooler_output'][0]
    return pooler_output.detach().numpy()

In [18]:
for index, row in df_dataset.iterrows():
    uid = row['id']
    records, summary, key = driver.execute_query("MATCH(n {id: $id})-[k]->(m {id: $id}) RETURN n.name, k.type, m.name", id=uid)
    for record in records:
        embedding_subject = bert_embedding(record['n.name'])
        session.run("MATCH (n {id: $id, name: $name}) WHERE n.embedding is null CALL db.create.setNodeVectorProperty(n, 'embedding', $embedding)", id=uid, name=record['n.name'], embedding=embedding_subject)
        embedding_object = bert_embedding(record['m.name'])
        session.run("MATCH (m {id: $id, name: $name}) WHERE m.embedding is null CALL db.create.setNodeVectorProperty(m, 'embedding', $embedding)", id=uid, name=record['m.name'], embedding=embedding_object)
        embedding_relation = bert_embedding(record['k.type'])
        session.run("MATCH(n {id: $id})-[k {type: $name}]->(m {id: $id}) WHERE k.embedding is null CALL db.create.setRelationshipVectorProperty(k, 'embedding', $embedding)", id=uid, name=record['k.type'], embedding=embedding_relation)

In [19]:
for index, row in df_dataset.iterrows():
    session.run("MATCH (n {id: $id}) SET n.app = 'Odoo POS', n.feature = $feature", id=row['id'], feature=row['Feature'])

In [None]:
# session.run("""
# CREATE VECTOR INDEX core_embedding IF NOT EXISTS
# FOR (n:Core)
# ON n.embedding
# OPTIONS {indexConfig: {
#  `vector.dimensions`: 768,
#  `vector.similarity_function`: 'cosine'
# }}""")

# session.run("""
# CREATE VECTOR INDEX core_embedding IF NOT EXISTS
# FOR (n:Core)
# ON n.embedding
# OPTIONS {indexConfig: {
#  `vector.dimensions`: 768,
#  `vector.similarity_function`: 'cosine'
# }}""")