In [50]:
import os

import neo4j

host = "localhost"
username = "neo4j"
password = "1234qwer"

# driver = neo4j.GraphDatabase.driver("neo4j://100.27.33.222:7687",
#   auth=neo4j.basic_auth("neo4j", "price-oxygens-scores")
# )
driver = neo4j.GraphDatabase.driver(f'bolt://localhost:7687', auth=(username, password))
session = driver.session()

def create_graph(query):
    session.run(query)

In [51]:
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

In [52]:
def bert_embedding(text):
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    pooler_output = output['pooler_output'][0]
    return pooler_output.detach().numpy()


def sentence_transformer_embedding(text):
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings = model.encode(text)
    return embeddings

In [None]:
records, summary, key = driver.execute_query("""
MATCH (n:CORE|VERB) RETURN n.name""")
entities = []
for record in records:
    name = record['n.name']
    entities.append(name)

embedding = sentence_transformer_embedding(entities)



In [37]:
for i in range(len(entities)):
    name = entities[i]
    session.run(f"""
    MATCH (n:CORE|VERB {{name: "{name}"}})
    SET n.embedding = $embedding
    """, embedding=embedding[i])

In [38]:
session.run(""" 
MATCH (source:CORE)
RETURN gds.graph.project(
  'newCoreGraph',
  source,
  null,
  {
    sourceNodeProperties: source { .embedding },
    targetNodeProperties: {}
  },
  { undirectedRelationshipTypes: ['*'] }
)
""")

<neo4j._sync.work.result.Result at 0x38b6ec800>

In [45]:
import pandas as pd

entities = []
records, summary, key = driver.execute_query("""
CALL gds.knn.stream('newCoreGraph', {
    topK: 2,
    nodeProperties: ['embedding'],
    randomSeed: 1337,
    concurrency: 1,
    sampleRate: 1.0,
    deltaThreshold: 0.0
})
YIELD node1, node2, similarity
WHERE similarity > 0.8
RETURN gds.util.asNode(node1).name AS Req1, gds.util.asNode(node2).name AS Req2, similarity
ORDER BY similarity DESCENDING, Req1, Req2""")
for record in records:
    entities.append({"entity1": record['Req1'], "entity2": record['Req2'], "similarity": record['similarity']})
df_entity = pd.DataFrame(entities)

In [46]:
df_entity['sorted_pair'] = df_entity.apply(lambda row: tuple(sorted([row['entity1'], row['entity2']])), axis=1)
df_reduced = df_entity.drop_duplicates(subset='sorted_pair').drop(columns='sorted_pair')
df_reduced.to_csv('data/entity_similarity.csv', index=False)

In [48]:
for index, row in df_reduced.iterrows():
    session.run("""
MATCH (old:CORE {name: $entity1})
MATCH (new:CORE {name: $entity2})

WITH old, new
MATCH (old)<-[:CONTAINS]-(target)
MERGE (new)<-[:CONTAINS]-(target)
                
WITH old, new
MATCH (old)-[:RELATED_TO]->(target)
MERGE (new)-[:RELATED_TO]->(target)

WITH old, new
MATCH (source)-[:RELATED_TO]->(old)
MERGE (source)-[:RELATED_TO]->(new)

WITH old, new
DETACH DELETE old
    """, entity1=row['entity1'], entity2=row['entity2'])