In [14]:
import neo4j
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from py2neo import Graph, Node, Relationship


In [15]:
driver = neo4j.GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "Sed@261510040202"))

In [16]:
query1 = "MATCH (l)-[:has]->(m)-[:has]->(n) Where l.id='Project Management' RETURN n"
query2 = "MATCH (l)-[:has]->(m)-[:has]->(n) Where l.id='Software Engineering' RETURN n"

In [17]:
with driver.session() as session:
    result1 = session.run(query1)
    nodes1 = [record for record in result1]
    result2 = session.run(query2)
    nodes2 = [record for record in result2]

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
for node in nodes1:
    print(node)

<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:93' labels=frozenset({'Node'}) properties={'id': 'Closing the Projects'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:92' labels=frozenset({'Node'}) properties={'id': 'Project Leadership and Ethics'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:76' labels=frozenset({'Node'}) properties={'id': 'Software Configuration Management'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:75' labels=frozenset({'Node'}) properties={'id': 'Formal Technical Reviews'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:74' labels=frozenset({'Node'}) properties={'id': 'Risk Analysis & Management'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:91' labels=frozenset({'Node'}) properties={'id': 'Project Contracting'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:90' labels=frozenset({'Node'}) properties={'id': '

In [20]:
for node in nodes2:
    print(node)

<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:93' labels=frozenset({'Node'}) properties={'id': 'Closing the Projects'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:92' labels=frozenset({'Node'}) properties={'id': 'Project Leadership and Ethics'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:76' labels=frozenset({'Node'}) properties={'id': 'Software Configuration Management'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:75' labels=frozenset({'Node'}) properties={'id': 'Formal Technical Reviews'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:74' labels=frozenset({'Node'}) properties={'id': 'Risk Analysis & Management'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:91' labels=frozenset({'Node'}) properties={'id': 'Project Contracting'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:90' labels=frozenset({'Node'}) properties={'id': '

In [21]:
embeddings1 = {}
embeddings2 = {}
for node in nodes1:
    text = node[property=='id']
    text = str(text)
    text = text.lower()
    tokens = tokenizer.encode(text, add_special_tokens=True)
    inputs = torch.tensor([tokens])
    outputs = model(inputs)
    embeddings1[node[property=='id']] = outputs[0].detach().numpy()
for node in nodes2:
    text = node[property=='id']
    text = str(text)
    text = text.lower()
    tokens = tokenizer.encode(text, add_special_tokens=True)
    inputs = torch.tensor([tokens])
    outputs = model(inputs)
    embeddings2[node[property=='id']] = outputs[0].detach().numpy()


In [22]:
similarity_matrix = np.zeros((len(nodes1), len(nodes2)))
for i, node1 in enumerate(nodes1):
    for j, node2 in enumerate(nodes2):
        id1 = (node1[property=='id'])
        id2 = (node2[property=='id'])
        if id1 in embeddings1 and id2 in embeddings2:
            vec1 = embeddings1[id1].flatten()
            vec1.resize(700000,)
            # print(vec1.shape)

            vec2 = embeddings2[id2].flatten()
            vec2.resize(700000,)
            similarity = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
            similarity_matrix[i][j] = similarity
global_similarity = np.linalg.norm(similarity_matrix) / 100

In [23]:
print(similarity_matrix)

[[1.         0.92128551 0.96790767 0.96935022 0.91753942 0.93198597
  0.92512584 0.90392137 0.92824554 0.97223198 0.9268474  0.92345637
  0.9051578  0.92562449 0.92621636 0.9203161  0.86745954 0.97391087
  0.97132349 0.92407972 0.8978194  0.97310448 0.97315812 0.97356719
  0.9662838  0.92383695 0.92741865 0.97442985 0.96380538 0.89793789
  0.91875446]
 [0.92128551 0.99999994 0.92279047 0.92247957 0.97079611 0.91642982
  0.96547383 0.92264485 0.91153377 0.92031074 0.90908742 0.97444308
  0.92110997 0.91218555 0.91327542 0.96511453 0.86099106 0.92588687
  0.92635518 0.97375697 0.89835024 0.9292767  0.92895895 0.92892164
  0.92177707 0.9104684  0.96495116 0.92910302 0.92126387 0.90849769
  0.96277827]
 [0.96790767 0.92279047 1.         0.97462916 0.92239356 0.93189603
  0.92187631 0.90427041 0.93926781 0.9738968  0.93373567 0.92602724
  0.91166037 0.93569022 0.9323498  0.92695665 0.86414188 0.97745228
  0.9855237  0.92643768 0.89119297 0.9740591  0.97585881 0.97805691
  0.98495692 0.93290

In [33]:
final_global_similarity = np.mean(global_similarity)*100
final_global_similarity=round(final_global_similarity,2)
print('The Similarity percentage between Software Engineering and Project Management is',final_global_similarity,'%')

The Similarity percentage between Software Engineering and Project Management is 28.85 %


In [25]:
mean_similarity = np.mean(global_similarity)
print(mean_similarity)

0.2885281695414675


In [26]:
for row in similarity_matrix:
    for element in row:
        print(element, end=' ')
    print()


1.0 0.9212855100631714 0.9679076671600342 0.9693502187728882 0.9175394177436829 0.9319859743118286 0.9251258373260498 0.903921365737915 0.9282455444335938 0.9722319841384888 0.9268473982810974 0.9234563708305359 0.9051578044891357 0.9256244897842407 0.9262163639068604 0.9203161001205444 0.8674595355987549 0.9739108681678772 0.9713234901428223 0.9240797162055969 0.8978193998336792 0.9731044769287109 0.9731581211090088 0.9735671877861023 0.9662837982177734 0.9238369464874268 0.9274186491966248 0.9744298458099365 0.9638053774833679 0.8979378938674927 0.9187544584274292 
0.9212855100631714 0.9999999403953552 0.9227904677391052 0.9224795699119568 0.9707961082458496 0.9164298176765442 0.9654738306999207 0.922644853591919 0.911533772945404 0.9203107357025146 0.9090874195098877 0.9744430780410767 0.9211099743843079 0.912185549736023 0.913275420665741 0.9651145339012146 0.8609910607337952 0.925886869430542 0.9263551831245422 0.9737569689750671 0.8983502388000488 0.929276704788208 0.928958952426