In [1]:
import os

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import SequentialChain, LLMChain
import torch

# Load key/value pairs from a local .env file into the process environment so
# later cells can read them through os.getenv(); the echoed True is the
# call's return value.
load_dotenv()

True

In [50]:
import neo4j

# Connection settings are read from the environment (populated by load_dotenv()
# in the first cell). The fallbacks reproduce the previous local-development
# values so the notebook keeps working without any new configuration.
# NOTE(review): remove the password fallback once NEO4J_PASSWORD is defined in
# .env, so that no credential is stored in the notebook itself.
host = os.getenv("NEO4J_HOST", "localhost")
username = os.getenv("NEO4J_USERNAME", "neo4j")
password = os.getenv("NEO4J_PASSWORD", "1234qwer")

driver = neo4j.GraphDatabase.driver(f"bolt://{host}:7687", auth=(username, password))

# One long-lived session shared by the cells below; it is never closed in this
# notebook, so call session.close() / driver.close() when finished.
session = driver.session()

def create_graph(query):
    """Execute a Cypher statement on the shared notebook ``session``.

    Args:
        query: Cypher text to run as an auto-commit query.

    Returns:
        None. Driver errors propagate to the caller.
    """
    # Results are fetched lazily by the driver; consuming here makes any
    # server-side failure surface in this call instead of on a later query.
    session.run(query).consume()

In [14]:
# Prompt asking the LLM to turn one sentence (the {sentence} placeholder,
# delimited by $$$) into comma-separated subject,predicate,object lines.
# NOTE(review): this is a regular (non-raw) string, so the '\n' in the
# "Each line is separated by" instruction reaches the model as a real line
# break between the quotes rather than as the two characters backslash + n.
template1 = """You are expert english linguists 

Given sentence : $$$ {sentence} $$$

- Generate semantic triple that consist subject, predicate, and object  base on context sentence (which is delimited by $$$) 
- Output must separated by comma the format is [subject],[predicate],[object]
- Avoid using pronouns such as (you, he, she, it, we, they etc), replace with more relevant subject
- Each line is separated by '\n'
- Don't display anything in the output except the output csv format

"""

prompt1 = ChatPromptTemplate.from_template(template1)

# temperature=0 is requested for the extraction model.
# NOTE(review): the key is read from OPEN_API_KEY; confirm this matches the
# name used in .env (OpenAI's conventional variable is OPENAI_API_KEY).
# NOTE(review): the name `model` is rebound to a BertModel by a later cell;
# chain_one keeps its own reference to this ChatOpenAI instance.
model = ChatOpenAI(model="gpt-4o-mini", api_key=os.getenv('OPEN_API_KEY'), temperature=0)

# Runnable-style pipeline; not referenced by any other cell visible here.
chain1 = prompt1 | model

# The chain's result is exposed under the "triple" key.
chain_one = LLMChain(llm=model, prompt=prompt1, output_key="triple")

# Single-step sequential chain: takes "sentence", returns "triple".
overal_chain = SequentialChain(
    chains=[chain_one],
    input_variables=["sentence"],
    output_variables=["triple"]
)

In [None]:
import pandas as pd
import csv

# NOTE(review): a later cell reloads this dataset from
# 'data/datasets_all_sentence.csv'; confirm which directory is the right one.
df_dataset = pd.read_csv('output/datasets_all_sentence.csv')

triple_csv_path = "data/all_triplet_lang_result.csv"

# Later cells read this file with pandas and select the columns 'id',
# 'subject', 'predicate' and 'object' by name, so a new or empty file needs a
# header row. An existing non-empty file is appended to unchanged.
needs_header = (
    not os.path.exists(triple_csv_path) or os.path.getsize(triple_csv_path) == 0
)

# Open the output once for the whole run instead of once per triple.
# newline="" is what the csv module asks for on files it writes to.
with open(triple_csv_path, "a", newline="") as csvfile:
    writer = csv.writer(csvfile)
    if needs_header:
        writer.writerow(["id", "app", "feature", "subject", "predicate", "object"])

    for index, row in df_dataset.iterrows():
        sentence = row['sentence']
        app = row['app']
        uid = row['id']
        feature = row['feature']
        message = overal_chain({"sentence": sentence})

        for triple_line in message['triple'].split('\n'):
            # The model may emit blank lines between or after triples.
            if not triple_line.strip():
                continue

            # Split into at most three fields so commas inside the object stay
            # in the object column (csv.writer quotes them) instead of
            # creating extra columns.
            parts = triple_line.split(",", 2)
            if len(parts) != 3:
                print(f"Skipping malformed triple for {uid!r}: {triple_line!r}")
                continue

            # Pass the fields as a list: joining everything into one string and
            # re-splitting on "," shifted columns whenever app/feature
            # contained a comma.
            writer.writerow([uid, app, feature, *parts])

        # Keep finished sentences on disk if a later LLM call fails.
        csvfile.flush()

In [51]:
import pandas as pd
# (Re)load the sentence dataset used by the graph-building cells below.
# NOTE(review): the extraction cell reads 'output/datasets_all_sentence.csv'
# while this one reads from 'data/'; confirm both point at the same data.
df_dataset = pd.read_csv('data/datasets_all_sentence.csv')

In [52]:
def _cypher_quote(value):
    """Render ``value`` as a single-quoted Cypher string literal."""
    # Backslashes first, so the backslash added for quotes is not doubled.
    escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return "'" + escaped + "'"


def df_to_create_cql(df, uid):
    """Build one Cypher statement that merges a requirement and its triples.

    The statement merges a ``Requirement`` node identified by ``uid`` and, for
    every row of ``df``, a ``CORE`` subject node, a ``VERB`` predicate node and
    a ``CORE`` object node, chained with ``RELATED_TO`` and each attached to
    the requirement with ``CONTAINS``.

    Args:
        df: DataFrame with ``subject``, ``predicate`` and ``object`` columns.
            Its index labels are used as suffixes of the Cypher variable names.
        uid: Requirement identifier; converted with ``str()`` before quoting.

    Returns:
        The Cypher text, one MERGE clause per line. With an empty ``df`` only
        the requirement MERGE is returned.
    """
    clauses = ["MERGE (req:Requirement {id: " + _cypher_quote(uid) + "}) \n"]

    for index, row in df.iterrows():
        # A triple with a missing part cannot be represented as three nodes
        # (and used to abort the whole query with a TypeError), so skip it.
        if row[["subject", "predicate", "object"]].isna().any():
            continue

        # Values are escaped so apostrophes/backslashes in the extracted text
        # cannot terminate the string literal and corrupt the query.
        clauses.append(f"MERGE (s{index}:CORE" + " {name: " + _cypher_quote(row["subject"]) + "}) \n")
        clauses.append(f"MERGE (p{index}:VERB" + " {name: " + _cypher_quote(row["predicate"]) + "}) \n")
        clauses.append(f"MERGE (o{index}:CORE" + " {name: " + _cypher_quote(row["object"]) + "}) \n")
        clauses.append(f"MERGE (s{index})-[:RELATED_TO]->(p{index})-[:RELATED_TO]->(o{index}) \n")
        clauses.append(f"MERGE (req)-[:CONTAINS]->(s{index}) \n")
        clauses.append(f"MERGE (req)-[:CONTAINS]->(p{index}) \n")
        clauses.append(f"MERGE (req)-[:CONTAINS]->(o{index}) \n")

    return "".join(clauses)

In [53]:
import pandas as pd
df_result = pd.read_csv('data/all_triplet_lang_result.csv')

# Build and run one MERGE statement per requirement in the dataset.
for index, row in df_dataset.iterrows():
    try:
        # Triples that were extracted for this requirement.
        df_filter = df_result[df_result['id'] == row['id']]
        cql_query = df_to_create_cql(df_filter, row['id'])
        # Consume the result inside the try block so a server-side failure is
        # raised here, for this requirement, rather than on a later query.
        session.run(cql_query).consume()
    except Exception as e:
        # Stop at the first failure, and say which requirement caused it.
        print(f"Failed to import requirement {row['id']!r}: {e}")
        break

In [54]:
# Store the app and the original sentence on every Requirement node.
# The values are sent as query parameters: the previous f-string nested double
# quotes (a syntax error before Python 3.12), left `app` unescaped, and had to
# strip apostrophes from the sentence to keep the Cypher valid. With
# parameters the sentence is stored verbatim.
for index, row in df_dataset.iterrows():
    session.run(
        "MATCH (r:Requirement) WHERE r.id = $id SET r.app = $app, r.sentence = $sentence",
        # The id was previously compared as a quoted string literal; keep
        # comparing it as a string.
        id=str(row['id']),
        app=row['app'],
        sentence=row['sentence'],
    ).consume()

In [45]:
from transformers import BertTokenizer, BertModel

# Load the pretrained "bert-base-uncased" tokenizer and encoder used by
# bert_embedding() below.
# NOTE(review): this rebinds the global `model`, which an earlier cell bound to
# the ChatOpenAI instance; re-running that earlier cell afterwards would leave
# bert_embedding() pointing at the wrong object.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

In [46]:
def bert_embedding(text):
    """Return the BERT pooled-output vector for ``text`` as a NumPy array.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: String to embed. Inputs longer than the tokenizer's maximum
            length are truncated instead of raising inside the model.

    Returns:
        A 1-D ``numpy.ndarray``: the first (only) row of the model's
        ``pooler_output`` for this single input.
    """
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_input)
    pooler_output = output['pooler_output'][0]
    return pooler_output.detach().numpy()

In [47]:
# Attach a BERT embedding to every named node contained in a requirement.
# Names already handled in this run are skipped: the update only touches nodes
# whose embedding is null, so embedding the same name again is wasted work.
embedded_names = set()

for index, row in df_dataset.iterrows():
    uid = row['id']
    records, summary, key = driver.execute_query("MATCH(r:Requirement {id: $id})-[:CONTAINS]->(e) RETURN DISTINCT e.name", id=uid)
    for record in records:
        name = record['e.name']
        if name is None or name in embedded_names:
            continue
        embedded_names.add(name)

        # Send a plain list of floats; NumPy arrays are only accepted as query
        # parameters by some driver versions.
        embedding = bert_embedding(name).tolist()
        session.run(
            "MATCH (n {name: $name}) WHERE n.embedding is null SET n.embedding = $embedding",
            name=name,
            embedding=embedding,
        ).consume()

In [48]:
# Store BERT embeddings of each requirement's feature and sentence text on its
# Requirement node.
for index, row in df_dataset.iterrows():
    uid = row['id']
    # Convert to plain lists of floats; NumPy arrays are only accepted as query
    # parameters by some driver versions.
    embedding_feature = bert_embedding(row['feature']).tolist()
    embedding_sentence = bert_embedding(row['sentence']).tolist()
    session.run(
        "MATCH (n:Requirement {id: $id}) SET n.embedding_feature = $embedding_feature, n.embedding_sentence = $embedding_sentence",
        id=uid,
        embedding_feature=embedding_feature,
        embedding_sentence=embedding_sentence,
    ).consume()

In [13]:
# Copy each dataset row's app and feature onto the node(s) whose id matches.
set_app_feature_query = "MATCH (n {id: $id}) SET n.app = $app, n.feature = $feature"

for _idx, dataset_row in df_dataset.iterrows():
    session.run(
        set_app_feature_query,
        id=dataset_row['id'],
        app=dataset_row['app'],
        feature=dataset_row['feature'],
    )