In [6]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import SequentialChain, LLMChain

load_dotenv()

True

In [7]:
template1 = """You are expert software analyst, you can easily split and classify entity from sentence 
- Categorized as CORE if the entity is a specific feature of software requirement domain, example (system, application, payment, login) 
- Categorized as USER if the entity is a specific user of the software, example (Admin, developer, Student, customer, buyer, cashier) 
- Categorized as HARDWARE if the entity is a component of computer hardware, example (Keyboard, mouse, CPU) 
- Categorized as PLATFORM if the entity is a third party application or software platform, example (Linux, Paypal, eBay, Apache) 

Given sentence : {sentence}

- Output must separated by comma like [entity],[category]
- Each line is separated by '\n'
- Don't display anything in the output except the output csv format

"""
template2 = """From the output of extraction
^^^
{extraction}
^^^
extraction format is seperated by comma like [entity],[category]

and in context sentence : $$$
{sentence}
$$$

- Generate semantic triple that consist subject, predicate, and object from entities (which is delimited by ^^^)  base on context sentence (which is delimited by $$$) 
- Output must separated by comma the format is [category]:[entity],[predicate],[category]:[entity]
- Each line is separated by '\n'
- Don't display anything in the output except the output csv format
"""


prompt1 = ChatPromptTemplate.from_template(template1)
prompt2 = ChatPromptTemplate.from_template(template2)

model = ChatOpenAI(model="gpt-4o-mini", api_key=os.getenv('OPEN_API_KEY'), temperature=0)

chain1 = prompt1 | model
chain2 = prompt2 | model

chain_one = LLMChain(llm=model, prompt=prompt1, output_key="extraction")
chain_two = LLMChain(llm=model, prompt=prompt2, output_key="triple")

overal_chain = SequentialChain(
    chains=[chain_one, chain_two],
    input_variables=["sentence"],
    output_variables=["extraction", "triple"]
)

  warn_deprecated(


In [8]:
sentence = "A customer can inspect items online add them to a web cart and can purchase the item. Before paying for an item or adding an item to a web cart, the system checks the current inventory to make sure the item is not out of stock"

In [9]:
message = overal_chain({"sentence": sentence})
ner = message['extraction'].split('\n')
triples = message['triple'].split('\n')

  warn_deprecated(


In [10]:
print(triples)

['CORE:system,checks,CORE:inventory  ', 'USER:customer,inspects,CORE:item  ', 'USER:customer,adds,CORE:web cart  ', 'USER:customer,purchases,CORE:item  ', 'CORE:web cart,contains,CORE:item  ']


In [11]:
import pandas as pd

split_data = [item.split(',') for item in triples]
df = pd.DataFrame(split_data, columns=['subject', 'predicate', 'object'])
df

Unnamed: 0,subject,predicate,object
0,CORE:system,checks,CORE:inventory
1,USER:customer,inspects,CORE:item
2,USER:customer,adds,CORE:web cart
3,USER:customer,purchases,CORE:item
4,CORE:web cart,contains,CORE:item


In [81]:
def find_inverse_key(obj, key):
    return next((k for k, v in obj.items() if v == key), None)

def triplet_unique_flatten(data):
    flattened_unique = []
    for triplet in data:
        for element in triplet:
            if element not in flattened_unique:
                flattened_unique.append(element)
    return flattened_unique

def df_map_to_cql(df):
    entities = []
    for index, row in df.iterrows():
        if (row['subject'] in entities) == False:
            entities.append(row['subject'].strip())
        if (row['object'] in entities) == False:
            entities.append(row['object'].strip())
    verbObj = {}
    for index, row in df.iterrows():
        verbObj[row['predicate'].strip()] = f'r{index}'

    init = 'a'
    entityObj = {}
    for i in range(len(entities)):
        char = chr(ord(init) + i)
        entityObj[entities[i]] = char

    xyz = []
    for _, row in df.iterrows():
        subjek = entityObj[row['subject'].strip()]
        objek = entityObj[row['object'].strip()]
        predicate = verbObj[row['predicate'].strip()]
        xyz.append([subjek, predicate, objek])
    return xyz, entityObj, verbObj

def arr_map_to_cql(arr, flattenArr):
    cql_string = ""
    for i in range(len(arr)):
        cql_string += f"MATCH ({arr[i][0]})-[{arr[i][1]}:RELATED_TO]->({arr[i][2]}) \n"
    cql_string += "WITH "
    for i in range(len(flattenArr)):
        cql_string += f"vector.similarity.cosine({flattenArr[i]}.embedding, $embed_{flattenArr[i]}) as score_{flattenArr[i]}, "
    cql_string += "a.id as id, a.feature as feature, a.app as app \n RETURN id, feature, app, ("
    for i in range(len(flattenArr)):
        cql_string += f"score_{flattenArr[i]}+"
    cql_string = cql_string[:-1]
    cql_string += f")/{len(flattenArr)} as score ORDER BY score DESC LIMIT 10"
    return cql_string


MATCH (n)-[r:RELATED_TO]->(m)
WITH n.name as subject, m.name as object, r.type as verb, n.app as app, n.feature as feature, vector.similarity.cosine(n.embedding, $embed_one) as subjectScore, 
    vector.similarity.cosine(m.embedding, $embed_two) as objectScore,
     vector.similarity.cosine(r.embedding, $embed_verb) as verbScore
RETURN subject, object, verb, app, feature, ((subjectScore * 0.5) + (objectScore * 0.3) + (verbScore * 0.2)) as score
ORDER BY score DESC LIMIT 10

MATCH (n {name: "real-time statistics"})-[r:RELATED_TO {type: "provide"}]->(p {name: "data"})
MATCH (x {name: "data"})-[r2:RELATED_TO {type: "is consolidated across" }]->(y {name: "shops"})
return *

In [82]:
mapTriplet, entityObj, verbObj = df_map_to_cql(df)
flatten = triplet_unique_flatten(mapTriplet)
cql_string = arr_map_to_cql(mapTriplet, flatten)
cql_string

'MATCH (a)-[r0:RELATED_TO]->(b) \nMATCH (c)-[r1:RELATED_TO]->(g) \nMATCH (c)-[r2:RELATED_TO]->(e) \nMATCH (c)-[r3:RELATED_TO]->(g) \nMATCH (e)-[r4:RELATED_TO]->(g) \nWITH vector.similarity.cosine(a.embedding, $embed_a) as score_a, vector.similarity.cosine(r0.embedding, $embed_r0) as score_r0, vector.similarity.cosine(b.embedding, $embed_b) as score_b, vector.similarity.cosine(c.embedding, $embed_c) as score_c, vector.similarity.cosine(r1.embedding, $embed_r1) as score_r1, vector.similarity.cosine(g.embedding, $embed_g) as score_g, vector.similarity.cosine(r2.embedding, $embed_r2) as score_r2, vector.similarity.cosine(e.embedding, $embed_e) as score_e, vector.similarity.cosine(r3.embedding, $embed_r3) as score_r3, vector.similarity.cosine(r4.embedding, $embed_r4) as score_r4, a.id as id, a.feature as feature, a.app as app \n RETURN id, feature, app, (score_a+score_r0+score_b+score_c+score_r1+score_g+score_r2+score_e+score_r3+score_r4)/10 as score ORDER BY score DESC LIMIT 10'

MATCH (a)-[r0:RELATED_TO]->(b)
MATCH (c)-[r1:RELATED_TO]->(g)
MATCH (c)-[r2:RELATED_TO]->(e)
MATCH (c)-[r3:RELATED_TO]->(g)
MATCH (e)-[r4:RELATED_TO]->(g)
WITH vector.similarity.cosine(a.embedding, $embed_a) as score_a, vector.similarity.cosine(r0.embedding, $embed_r0) as score_r0, vector.similarity.cosine(b.embedding, $embed_b) as score_b, vector.similarity.cosine(c.embedding, $embed_c) as score_c, vector.similarity.cosine(r1.embedding, $embed_r1) as score_r1, vector.similarity.cosine(g.embedding, $embed_g) as score_g, vector.similarity.cosine(r2.embedding, $embed_r2) as score_r2, vector.similarity.cosine(e.embedding, $embed_e) as score_e, vector.similarity.cosine(r3.embedding, $embed_r3) as score_r3, vector.similarity.cosine(r4.embedding, $embed_r4) as score_r4, a.id as id, a.feature as feature, a.app as app
RETURN id, feature, app, (score_a+score_r0+score_b+score_c+score_r1+score_g+score_r2+score_e+score_r3+score_r4)/10 as score ORDER BY score DESC LIMIT 10

In [83]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

In [84]:
def bert_embedding(text):
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    pooler_output = output['pooler_output'][0]
    return pooler_output.detach().numpy()

In [85]:
embedding = {}
for i in range(len(flatten)):
    if flatten[i][0] == 'r':
        verb = find_inverse_key(verbObj, flatten[i])
        embedding[f"embed_{flatten[i]}"] = bert_embedding(verb)
    else:
        raw = find_inverse_key(entityObj, flatten[i])
        category, entity = raw.split(':')
        embedding[f"embed_{flatten[i]}"] = bert_embedding(entity)

In [86]:
import neo4j

host = "localhost"
username = "neo4j"
password = "1234qwer"

driver = neo4j.GraphDatabase.driver(f'bolt://localhost:7687', auth=(username, password))
session = driver.session()

In [87]:
records, summary, key = driver.execute_query(cql_string, embedding)

for row in records:
    print(row['id'], row['feature'], row['app'], row['score'])

e612c4fa-4e7e-4fcf-96f3-d9f57abc0ea1 Online Sales POS Connect 0.9983813345432282
6d0c8b9e-9a9a-4d53-b40b-23fed2c39c0b Conflict Resolution POS Connect 0.9976717293262481
e8937dda-8bac-42e3-a343-9c5b7f14cbe5 Client Account POS Connect 0.9975002527236938
e8937dda-8bac-42e3-a343-9c5b7f14cbe5 Client Account POS Connect 0.9972433149814606
1ae14148-d054-4533-81d0-f5c33fc93898 Pricing Features Discounts Odoo POS 0.9968249917030334
877c1ca4-6795-4e61-821e-2822db5bc1f0 Inventory Management POS Connect 0.9967495918273925
54bec78a-5b57-45eb-b0c2-715cfb9bc165 Windows Service POS Connect 0.9967009127140045
ecdf70b7-2462-476f-aec0-91e57cf603e2 Windows Service POS Connect 0.9967009127140045
877c1ca4-6795-4e61-821e-2822db5bc1f0 Inventory Management POS Connect 0.9966874957084656
ecdf70b7-2462-476f-aec0-91e57cf603e2 Windows Service POS Connect 0.9965225160121918
