In [15]:
import neo4j
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from py2neo import Graph, Node, Relationship


In [16]:
driver = neo4j.GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "neo4j"))

In [17]:
query1 = "MATCH (l)-[:has]->(m)-[:has]->(n) Where l.id='Machine Learning' RETURN n"
query2 = "MATCH (l)-[:has]->(m)-[:has]->(n) Where l.id='Artificial Intelligence' RETURN n"

In [18]:
with driver.session() as session:
    result1 = session.run(query1)
    nodes1 = [record for record in result1]
    result2 = session.run(query2)
    nodes2 = [record for record in result2]

In [19]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
for node in nodes1:
    print(node)

<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:9' labels=frozenset({'Node'}) properties={'id': 'Singular Valued Decomposition'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:142' labels=frozenset({'Node'}) properties={'id': 'Linear Discriminant Analysis'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:141' labels=frozenset({'Node'}) properties={'id': 'Principal Component Analysis'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:140' labels=frozenset({'Node'}) properties={'id': 'Dimensionality Reduction Techniques'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:8' labels=frozenset({'Node'}) properties={'id': 'Density Based Clustering: DBSCAN'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:7' labels=frozenset({'Node'}) properties={'id': 'Expectation Maximization Algorithm'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:6' labels=froze

In [21]:
for node in nodes2:
    print(node)

<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:90' labels=frozenset({'Node'}) properties={'id': 'Banking'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:89' labels=frozenset({'Node'}) properties={'id': 'Retail'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:88' labels=frozenset({'Node'}) properties={'id': 'Applications in Healthcare'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:87' labels=frozenset({'Node'}) properties={'id': 'Problems Robotics can solve'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:86' labels=frozenset({'Node'}) properties={'id': 'Robot hardware'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:85' labels=frozenset({'Node'}) properties={'id': 'Robots'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a79-cc48ba99c1e8:84' labels=frozenset({'Node'}) properties={'id': 'Robotics'}>>
<Record n=<Node element_id='4:155f2b74-3f03-4186-8a7

In [22]:
embeddings1 = {}
embeddings2 = {}
for node in nodes1:
    text = node[property=='id']
    text = str(text)
    text = text.lower()
    tokens = tokenizer.encode(text, add_special_tokens=True)
    inputs = torch.tensor([tokens])
    outputs = model(inputs)
    embeddings1[node[property=='id']] = outputs[0].detach().numpy()
for node in nodes2:
    text = node[property=='id']
    text = str(text)
    text = text.lower()
    tokens = tokenizer.encode(text, add_special_tokens=True)
    inputs = torch.tensor([tokens])
    outputs = model(inputs)
    embeddings2[node[property=='id']] = outputs[0].detach().numpy()


In [23]:
similarity_matrix = np.zeros((len(nodes1), len(nodes2)))
for i, node1 in enumerate(nodes1):
    for j, node2 in enumerate(nodes2):
        id1 = (node1[property=='id'])
        id2 = (node2[property=='id'])
        if id1 in embeddings1 and id2 in embeddings2:
            vec1 = embeddings1[id1].flatten()
            vec1.resize(700000,)
            # print(vec1.shape)

            vec2 = embeddings2[id2].flatten()
            vec2.resize(700000,)
            similarity = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
            similarity_matrix[i][j] = similarity
global_similarity = np.linalg.norm(similarity_matrix) / 100

In [24]:
print(similarity_matrix)

[[0.90114731 0.90079856 0.95735854 ... 0.92304593 0.91350758 0.91025037]
 [0.87808782 0.87742567 0.88240826 ... 0.88419133 0.88329488 0.88271809]
 [0.90186059 0.90364951 0.96491671 ... 0.92525429 0.91473395 0.90667415]
 ...
 [0.89726454 0.89693868 0.92788899 ... 0.9082796  0.97622168 0.90458292]
 [0.89503199 0.89474517 0.92628658 ... 0.90615451 0.97287196 0.90015745]
 [0.89655137 0.89606869 0.92407614 ... 0.90990281 0.97650099 0.90287906]]


In [25]:
final_global_similarity = np.mean(global_similarity)*100
final_global_similarity=round(final_global_similarity,2)
print('The Similarity percentage between Artificial Intelligence and Machine Learning is',final_global_similarity,'%')

The Similarity percentage between Artificial Intelligence and Machine Learning is 57.04 %


In [26]:
for row in similarity_matrix:
    for element in row:
        print(element, end=' ')
    print()


0.9011473059654236 0.9007985591888428 0.9573585391044617 0.9122915267944336 0.9180262684822083 0.904172420501709 0.90529865026474 0.924995481967926 0.9233511090278625 0.9243983030319214 0.9158735275268555 0.9654698967933655 0.9158455729484558 0.9563771486282349 0.9117161631584167 0.9135382175445557 0.9230326414108276 0.9697195291519165 0.9672355055809021 0.925800085067749 0.9234510064125061 0.9767163395881653 0.9036552906036377 0.9647734761238098 0.9012932181358337 0.9181299209594727 0.8923527598381042 0.9100490808486938 0.9207579493522644 0.912461519241333 0.9138266444206238 0.8861016631126404 0.9709042906761169 0.9704002141952515 0.9194340705871582 0.9657498002052307 0.9757285714149475 0.8798525333404541 0.9647321701049805 0.9012029767036438 0.9181942343711853 0.919590413570404 0.8995956182479858 0.9227431416511536 0.8892325162887573 0.898747980594635 0.8964406847953796 0.9631302952766418 0.9160908460617065 0.9691061973571777 0.8932597041130066 0.9721059203147888 0.9716989994049072 0