In [1]:
from transformers import TFBertModel
from transformers import AutoTokenizer, BertModel
import os
os.environ['CURL_CA_BUNDLE'] = ''
import torch
from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
import json
import pandas as pd
from openai import AzureOpenAI

#from azure.identity import DefaultAzureCredential, get_bearer_token_provider  

Using Azure OpenAI embedding models

In [2]:
client_small = AzureOpenAI(
  api_key = os.getenv("OPENAI_API_KEY"),
  azure_endpoint = os.getenv("EMBEDDING_SMALL_ENDPOINT"),
  api_version = os.getenv("API_VERSION"))


In [3]:
#client_large = AzureOpenAI(
#  api_key = os.getenv("EMBEDDING_LARGE_API_KEY"),
#  azure_endpoint = os.getenv("EMBEDDING_LARGE_ENDPOINT"),
#  api_version = os.getenv("API_VERSION"))

In [4]:
# The function receives a sencence and returns the embedding (1D numpy array)
def get_embeddings_openAI(text):
    model = "text-embedding-3-small"
    embedding_small = client_small.embeddings.create(input = text, model=model)
    #embedding_large = client_large.embeddings.create(input = [text], model=model)
    return embedding_small.data #.data[0].embedding


Load BERT Model (later we can change to other model)

In [9]:
# Choose the pretrained model
model_checkpoint = 'bert-base-multilingual-cased' #'bert-base-multilingual-cased'   #'google/bert_uncased_L-2_H-128_A-2' (Tiny BERT)
# Max number of tokens in the sentence
max_length= 512 #512 #128

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length= max_length)

# Load the model
#bert_model = TFBertModel.from_pretrained(model_checkpoint, from_pt=True)
bert_model = BertModel.from_pretrained(model_checkpoint)



In [10]:
# The function receives a sencence and returns the embedding (1D numpy array)
def get_embeddings(text):
  input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True, truncation=True)).unsqueeze(0)  # Batch size 1
  outputs = bert_model(input_ids)
  last_hidden_states = outputs[0]
  pooler_output = torch.mean(last_hidden_states, dim=1) 
  return(pooler_output.detach().numpy()[0].tolist())

Connecting to Neo4j

In [5]:
load_dotenv()

# Neo4j variables
NEO4J_URL = os.getenv("NEO4J_URL")
NEO4J_USERNAME =os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

#Connecting to the graph
graph = Neo4jGraph(
    url=NEO4J_URL,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD
)

In [5]:
#Quering the thesis from graph unsing Cypher
result = graph.query("""
MATCH (thesis:Thesis)
RETURN thesis.uri, thesis.title, thesis.abstract
""")

In [6]:
### Transformamos o resultado em um dicionário. Em uma próxima versão, o resultado deve ser transformado diretamente em um DataFrame e CSV.

try:
    embedding_csv = pd.read_csv('data/embedding_openai.csv') 
    #with open('data/embeddings_BERT.json', 'r') as fp:
    #with open('data/embeddings_openai.json', 'r') as fp:
    #    dic_embeddings = json.load(fp)

except:
    embedding_csv = pd.DataFrame(data={'uri':[], 'embeddings': []})
    #dic_embeddings = {}

batch_size = 100
n = 0

texts_list = []
thesis_uri = []
embeddings = []

for thesis in result:
    n = n + 1   

    texts_list.append(str(thesis['thesis.title']) + '\n   \n' + str(thesis['thesis.abstract']))
    thesis_uri.append(thesis['thesis.uri'])

    if n % batch_size == 0:
        print ("Batch number: ", n)
        embs_list = get_embeddings_openAI(texts_list)

        for i in range(len(thesis_uri)):
            #dic_embeddings[thesis_uri[i]] = embs_list[i].embedding
            embeddings.append(embs_list[i].embedding)

        #with open('data/embeddings_BERT.json', 'w') as fp:
        #with open('data/embeddings_openai.json', 'w') as fp:
        #    json.dump(dic_embeddings, fp)
        
        embedding_csv_batch = pd.DataFrame(data={'uri':thesis_uri, 'embeddings': embeddings})
        embedding_csv = pd.concat([embedding_csv, embedding_csv_batch], ignore_index=True)
        embedding_csv.to_csv('data/embedding_openai.csv', index=False)
        #embedding_csv.to_csv('data/embedding_BERT.csv', index=False)

        texts_list = []
        thesis_uri = []
        embeddings = []


    #try:
    #    
    #    dic_embeddings[thesis['thesis.uri']] = get_embeddings_openAI(str(thesis['thesis.title']) + '\n   \n' + str(thesis['thesis.abstract']))
    #    #dic_embeddings[thesis['thesis.uri']] = get_embeddings(str(thesis['thesis.title']) + '\n   \n' + str(thesis['thesis.abstract']))
    #except:
    #    print(thesis['thesis.uri'])
    #    print(thesis['thesis.title'])
    #    print(thesis['thesis.abstract'])
    #    print('--------------')



Batch number:  100
Batch number:  200
Batch number:  300
Batch number:  400
Batch number:  500
Batch number:  600
Batch number:  700
Batch number:  800
Batch number:  900
Batch number:  1000
Batch number:  1100
Batch number:  1200
Batch number:  1300
Batch number:  1400
Batch number:  1500
Batch number:  1600
Batch number:  1700
Batch number:  1800
Batch number:  1900
Batch number:  2000
Batch number:  2100
Batch number:  2200
Batch number:  2300
Batch number:  2400
Batch number:  2500
Batch number:  2600
Batch number:  2700
Batch number:  2800
Batch number:  2900
Batch number:  3000
Batch number:  3100
Batch number:  3200
Batch number:  3300
Batch number:  3400
Batch number:  3500
Batch number:  3600
Batch number:  3700
Batch number:  3800
Batch number:  3900
Batch number:  4000
Batch number:  4100
Batch number:  4200
Batch number:  4300
Batch number:  4400
Batch number:  4500
Batch number:  4600
Batch number:  4700
Batch number:  4800
Batch number:  4900
Batch number:  5000
Batch num

In [None]:
# Salvando o dicionário

#with open('data/embeddings_BERT.json', 'w') as fp:
#with open('data/embeddings_openai.json', 'w') as fp:
#    json.dump(dic_embeddings, fp)

#with open('data/embeddings_BERT.json', 'r') as fp:
#with open('data/embeddings_openai.json', 'r') as fp:
#    dic_embeddings = json.load(fp)

In [None]:
# Transformando o dicionário em CSV

#n = 0
#uri = []
#embeddings = []
#for i in dic_embeddings:
#    n = n + 1
#    uri.append(i)
#    embeddings.append(dic_embeddings[i])
    #if n == 10:
    #    break

#embedding_csv = pd.DataFrame(data={'uri':uri, 'embeddings': embeddings})
#embedding_csv.to_csv('data/embedding_openai.csv', index=False)
#embedding_csv.to_csv('data/embedding_BERT.csv', index=False)

In [7]:
# Loading the embedding as property of Thesis in No4j
#graph.query("""
#    LOAD CSV WITH HEADERS
#    FROM 'file:///C:/Users/facordei/OneDrive%20-%20Capgemini/Documents/GitHub/Indigenous-Slavery-KG/data/embedding_BERT.csv' AS row
#    MATCH (n:Thesis) Where n.uri = row.uri
#    CALL db.create.setNodeVectorProperty(n, 'embedding', apoc.convert.fromJsonList(row.embeddings))
#    """)

graph.query("""
    LOAD CSV WITH HEADERS
    FROM 'file:///C:/Users/facordei/OneDrive%20-%20Capgemini/Documents/GitHub/Indigenous-Slavery-KG/data/embedding_openai.csv' AS row
    MATCH (n:Thesis) Where n.uri = row.uri
    CALL db.create.setNodeVectorProperty(n, 'embedding_openai', apoc.convert.fromJsonList(row.embeddings))
    """)

[]

In [25]:
dimension = len(embedding_csv["embeddings"][0])
#dimension = len(get_embeddings('test'))
#dimension

# Creating a vector index
graph.query("""
    CREATE VECTOR INDEX Thesis_Embeddings IF NOT EXISTS
    FOR (n:Thesis)
    ON n.embedding_openai
    OPTIONS {indexConfig: {
    `vector.dimensions`: """ + str(dimension) + """,
    `vector.similarity_function`: 'cosine'}}
    """)

[]

Buscando teses com vetores similares a um determinado texto

In [95]:
#texto que será usado na busca
#query_text = 'Nelson Mandela'
personagem = 'Kabengele Munanga'
query_text = 'Quem foi ' + personagem + ' ?'
# Transformando as query em vetor
#query_embedding = get_embeddings(query_text)
query_embedding = get_embeddings_openAI(query_text)[0].embedding


In [96]:
# Buscando no índice de vetores 
result = graph.query("""
    CALL db.index.vector.queryNodes('Thesis_Embeddings', 5, """ + str(query_embedding) + """)
    YIELD node, score
    RETURN node.title, node.abstract
    """)

contexto = ''
for r in result:
    contexto = contexto + 'Título: ' + r['node.title'] + ' \n'
    contexto = contexto + 'Título: ' + r['node.abstract'] + ' \n \n'
print(contexto)

Título: MNU representa Zumbi (1970-2005): cultura histórica, movimento negro e ensino de história 
Título: This research aimed to do an appreciation of the historical representations (historiography and historical culture) of a black protagonist of the seventeenth century, Zumbi dos Palmares, produced between 1970 and 2005. The analysis was around of the intellectual production (Clovis Moura, Decio Freitas, Birth and Joel Rufino dos Santos) and of the performance of the leaderships of the Black Movement Unified (MNU), established in 1978 (Amauri Mendes Pereira, Edson Cardoso, Helena Machado, Marcos Cardoso, Milton Barbosa, Yedo Ferreira, Oliveira Silveira and Lelia Gonzalez). The proposal was understand through the historiographical review and oral history, the implications political and pedagogical (teaching of history) of the choices of activist intellectuals and black "militants" on the interpretations of the experiences of black resistance and the construction of meanings of the as

Buscando usando full text search

In [27]:
# Criand o indice Full Text

graph.query("""
    CREATE FULLTEXT INDEX Thesis_fulltext IF NOT EXISTS FOR (n:Thesis) ON EACH [n.title, n.abstract]
    """)

#    OPTIONS {
#    indexConfig: {
#        `fulltext.analyzer`: 'brazilian',
#        `fulltext.eventually_consistent`: true
#        }
#            }

[]

In [28]:
result = graph.query("""
CALL db.index.fulltext.queryNodes("Thesis_fulltext", '""" + query_text + """') YIELD node, score
RETURN score, node.title, node.abstract 
LIMIT 5 
""")
result

[{'score': 9.481579780578613,
  'node.title': 'A escrita de si como construção da subjetividade da autora Carolina Maria de Jesus: análise de fragmentos discursivos da obra Quarto de Despejo.',
  'node.abstract': 'A história de Carolina Maria de Jesus ainda não é bastante conhecida do público brasileiro, nem mesmo daqueles que hoje produzem ou consomem a literatura considerada marginal. Entretanto, a representatividade de sua obra para as vozes que continuam silenciadas é fundamental como precursora e incentivadora. Moradora de uma favela no Canindé, zona norte de São Paulo, Carolina conheceu o jornalista Audálio Dantas em uma das visitas que este fazia ao local. Esta aproximação resultou na publicação dos seus diários, em que descrevia como ninguém o cotidiano da comunidade em cadernos que encontrava no lixo. Mulher e negra, a autora é uma das pioneiras da Literatura Negra. Protagonista de sua história, trazia em seu discurso a resistência e o enfrentamento ao poder, a fim de superar 

## Gerando texto com base nas teses buscadas

Carregando modelo GPT-35

In [8]:
client_gpt35 = AzureOpenAI(
  api_key = os.getenv("OPENAI_API_KEY"),
  azure_endpoint = os.getenv("GPT35_ENDPOINT"),
  api_version = os.getenv("API_VERSION"))

In [None]:
generate_questions_prompt = {}

In [99]:
question = client_gpt35.chat.completions.create(
      model="gpt-35-turbo",  
      messages=[
     {
        "role": "system",
        "content": "Seu objetivo é gerar uma pergunta sobre um personagem com base em teses de mestrado e doutorado. Caso o personagem não seja mencionado no centexto, não gere resposta.\n\nVocê receberá um contexto e o nome do personagem.\nO contexto contém uma lista de textos proveniente de tese de mestrado e doutorado com seus respectivos títulos.\nO personagem é uma figura histórica que está presente nos textos do contexto.\n\nInstruções:\n- Caso a personagem não seja citado nos textos do contexto informe retorne 'None'.\n- A perguntas deve ser direta.\n- As perguntas devem estar relacionadas com a personagem.\n- A pergunta deve estar relacionada com os textos do contexto.\n- A pergunta deve focar na personagem e não nos textos."
    },
    {
        "role": "user",
        "content": "- contexto: {{contexto}}\n- personagem: {{personagem}}\n"
    },
    {
        "role": "assistant",
        "content": ""
    },
    {
        "role": "user",
        "content": contexto + "- personagem: " + personagem 

    }
],  
          #past_messages=10,  
          max_tokens=800,  
          temperature=1.0,  
          top_p=0.90,  
          frequency_penalty=0,  
          presence_penalty=0,  
          stop=None,  
          stream=False  
      )  
        
question = question.to_dict()['choices'][0]['message']['content']

In [101]:
question

'Qual é a contribuição de Kabengele Munanga para o estudo da produção literária moçambicana e das literaturas africanas de língua portuguesa?'

Dois agentes conversando

In [102]:
answer = client_gpt35.chat.completions.create(
      model="gpt-35-turbo",  
      messages=[
     {
        "role": "system",
        "content": "Seu objetivo é responder a pergunta com base no contexto."
    },
    {
        "role": "user",
        "content": ""
    },
    {
        "role": "assistant",
        "content": contexto},
    {
        "role": "user",
        "content": question

    }
],  
          #past_messages=10,  
          max_tokens=800,  
          temperature=1.0,  
          top_p=0.90,  
          frequency_penalty=0,  
          presence_penalty=0,  
          stop=None,  
          stream=False  
      )  

answer = answer.to_dict()['choices'][0]['message']['content']

In [103]:
print(answer)

O texto menciona que Kabengele Munanga contribuiu para a pesquisa ao fornecer dados críticos e descritivos de natureza histórica e social que possibilitaram o contato com o mundo colonial nos moldes euroafricanos, o que era necessário para a pesquisa sobre a produção literária moçambicana. Portanto, sua contribuição está em fornecer informações que permitem uma melhor compreensão do contexto histórico e social em que a literatura moçambicana foi produzida. Não é mencionado especificamente qual é a contribuição de Munanga para as literaturas africanas de língua portuguesa.


In [104]:
replica = client_gpt35.chat.completions.create(
      model="gpt-35-turbo",  
      messages=[
     {
        "role": "system",
        "content": "Seu objetivo é gerar uma pergunta de aprofundamento com base nas respostas dada pelos assistant."
    },
    {
        "role": "user",
        "content": ""
    },
    {
        "role": "assistant",
        "content": question},
    {
        "role": "user",
        "content": answer

    }
],  
          #past_messages=10,  
          max_tokens=800,  
          temperature=1.0,  
          top_p=0.90,  
          frequency_penalty=0,  
          presence_penalty=0,  
          stop=None,  
          stream=False  
      )  

replica = replica.to_dict()['choices'][0]['message']['content']

In [105]:
tréplica = client_gpt35.chat.completions.create(
      model="gpt-35-turbo",  
      messages=[
     {
        "role": "system",
        "content": "Seu objetivo é responder a pergunta."
    },
    {
        "role": "user",
        "content": question
    },
    {
        "role": "assistant",
        "content": answer},
    {
        "role": "user",
        "content": replica
    }
],  
          #past_messages=10,  
          max_tokens=800,  
          temperature=1.0,  
          top_p=0.90,  
          frequency_penalty=0,  
          presence_penalty=0,  
          stop=None,  
          stream=False  
      )  

tréplica = tréplica.to_dict()['choices'][0]['message']['content']

In [106]:
print(tréplica)

As informações fornecidas por Kabengele Munanga sobre o contexto histórico e social em que a literatura moçambicana foi produzida podem ser extrapoladas para outras literaturas africanas de língua portuguesa, já que muitas delas também foram produzidas em contextos coloniais e pós-coloniais. Dessa forma, as informações fornecidas por Munanga podem contribuir para uma melhor compreensão das literaturas africanas de língua portuguesa como um todo, permitindo uma análise mais aprofundada do impacto do colonialismo e de outras questões sociais e políticas na produção literária desses países.


# RASCUNHO

In [32]:
n = 0
small_dic = {}
for i in dic_embeddings:
    n = n + 1
    small_dic[i]=dic_embeddings[i]
    if n == 10:
        break

with open('data/small_embeddings.json', 'w') as fp:
    json.dump(small_dic, fp)

In [30]:
ini = 'CALL apoc.import.json("'
dic = str(small_dic)
fim = '" YELD'
print(ini + dic + fim)
result = graph.query(ini + dic + fim)

result

CALL apoc.import.json("{'tag:stardog:api:_e_teko_e_arandu_e_producao_de_subjetividades_e_educacao_superior_e_educacoes_outras_modos_de_vida_criados_e_afirmados_por_kaiowas_e_guaranis': [-0.036806486546993256, 0.053812891244888306, 0.4343889653682709, 0.07008563727140427, 0.06726423650979996, -0.0707978904247284, -0.1256973147392273, 0.12517786026000977, -0.15045271813869476, -0.0530383475124836, -0.14858010411262512, 0.36490532755851746, 0.21294069290161133, 0.293968141078949, 0.08269951492547989, -0.24902383983135223, 0.5886573195457458, -0.3843301236629486, -0.5035964846611023, -0.09140083193778992, -0.20768438279628754, 0.14955902099609375, -0.16800528764724731, -0.12197914719581604, -0.01872563175857067, 0.49669182300567627, -0.49017098546028137, -0.07764355838298798, -0.5541002750396729, -0.4451574385166168, 0.019914740696549416, 0.2981143593788147, -0.08936356753110886, 0.43163129687309265, -0.08834492415189743, -0.1539146602153778, 0.0964585542678833, -0.12782658636569977, 0.520

CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input 'YELD': expected
  "!="
  "%"
  ")"
  "*"
  "+"
  ","
  "-"
  "/"
  "::"
  "<"
  "<="
  "<>"
  "="
  "=~"
  ">"
  ">="
  "AND"
  "CONTAINS"
  "ENDS"
  "IN"
  "IS"
  "OR"
  "STARTS"
  "XOR"
  "^"
  "||" (line 1, column 16429 (offset: 16428))
"CALL apoc.import.json("{'tag:stardog:api:_e_teko_e_arandu_e_producao_de_subjetividades_e_educacao_superior_e_educacoes_outras_modos_de_vida_criados_e_afirmados_por_kaiowas_e_guaranis': [-0.036806486546993256, 0.053812891244888306, 0.4343889653682709, 0.07008563727140427, 0.06726423650979996, -0.0707978904247284, -0.1256973147392273, 0.12517786026000977, -0.15045271813869476, -0.0530383475124836, -0.14858010411262512, 0.36490532755851746, 0.21294069290161133, 0.293968141078949, 0.08269951492547989, -0.24902383983135223, 0.5886573195457458, -0.3843301236629486, -0.5035964846611023, -0.09140083193778992, -0.20768438279628754, 0.14955902099609375, -0.16800528764724731, -0.12197914719581604, -0.01872563175857067, 0.49669182300567627, -0.49017098546028137, -0.07764355838298798, -0.5541002750396729, -0.4451574385166168, 0.019914740696549416, 0.2981143593788147, -0.08936356753110886, 0.43163129687309265, -0.08834492415189743, -0.1539146602153778, 0.0964585542678833, -0.12782658636569977, 0.5204546451568604, 0.3569797873497009, -0.09574247896671295, 0.5851715207099915, -0.024746665731072426, -0.1744711697101593, -0.1689671277999878, -0.42350468039512634, 0.27099552750587463, -0.15467196702957153, -0.37508246302604675, 0.1486402302980423, -0.33220288157463074, 0.019050251692533493, -0.23955626785755157, -0.12216554582118988, 0.24688196182250977, 0.5575321912765503, 0.040253423154354095, 0.17994076013565063, 0.45033422112464905, -0.1231883093714714, 0.2035515457391739, 0.2420579344034195, 0.0943511575460434, -0.30139604210853577, -0.15325340628623962, -0.017730465158820152, -0.11004705727100372, -0.04367056116461754, 0.1733042597770691, 0.5425084233283997, -0.2925083339214325, 0.26019787788391113, 0.1334245800971985, 0.14416450262069702, -0.19256027042865753, 1.0496826171875, 0.2723519206047058, -0.025706758722662926, -0.7517722845077515, -0.5359048247337341, 0.2102508246898651, -0.04782823100686073, -0.08196313679218292, 0.1251509040594101, -0.27085018157958984, 0.3166036009788513, 0.12066400796175003, 0.2616274952888489, 0.8659851551055908, -1.158420443534851, 0.09697829186916351, -0.15251240134239197, 0.8620539307594299, 0.23916827142238617, -0.4029153287410736, 0.6717545390129089, -0.04087100550532341, 0.1306651383638382, 0.25199094414711, 0.3619392514228821, -0.46044445037841797, -0.21613812446594238, 0.40657442808151245, -0.6441735625267029, 0.36409685015678406, -0.06511378288269043, 0.29430222511291504, 0.09570177644491196, -0.5689263939857483, 0.745375394821167, 0.20830976963043213, 0.21324975788593292, -0.055858124047517776, -0.002565853064879775, 0.2906344532966614, -0.5652074813842773, -0.1805146336555481, 0.07987534254789352, 0.6998064517974854, -0.289205938577652, -0.18692804872989655, 0.16323241591453552, -0.3325956463813782, 0.5160807967185974, 0.4816710650920868, 0.040324050933122635, 0.09003300964832306, 0.17669767141342163, 0.2395826131105423, -0.08634478598833084, -0.21828053891658783, -0.0075728027150034904, -0.19899965822696686, -0.02031492441892624, -0.05122145265340805, 0.011652868241071701, 0.22332774102687836, -0.15877936780452728, -0.5877932906150818, 0.18917396664619446, -0.1830974817276001, -0.049260783940553665, -0.7023374438285828, -0.013571192510426044, -0.1136179268360138, 0.131465882062912, 0.15052121877670288, 0.684983491897583, -0.2687850892543793, -0.4228517711162567, 0.11176556348800659, 0.22271759808063507, 0.5270820260047913, -0.35214361548423767, -0.2900218367576599, 0.29539379477500916, 0.36316540837287903, 0.053185656666755676, -0.1692861020565033, 0.2058946043252945, 0.014560955576598644, -0.05824558064341545, -0.7237663269042969, 0.4362011253833771, -0.3335190415382385, -0.07948721945285797, 0.07944896817207336, -0.013394394889473915, 0.20050360262393951, 0.4522116482257843, -0.11048179864883423, -0.15891686081886292, -0.580199658870697, 0.4347684979438782, 0.20387627184391022, 0.4906925559043884, -0.10749483853578568, -0.08001214265823364, 0.1230357438325882, -0.2094496190547943, -0.4751521050930023, 0.10117720067501068, -0.6457124352455139, 0.2415587306022644, 0.1630667746067047, -0.007458262611180544, -0.07008293271064758, -0.16304613649845123, -0.05718468502163887, -0.5091074109077454, -0.16173960268497467, 0.12171870470046997, -0.2850736081600189, -0.09505921602249146, 0.07363772392272949, 0.7812104225158691, -0.3122028708457947, 0.04906456917524338, 0.1219024509191513, -0.007526932284235954, -0.7098046541213989, -0.16547656059265137, -0.09307794272899628, -0.018706725910305977, 0.036525506526231766, 0.7419152855873108, 0.3753834366798401, -0.150615856051445, 0.006848562508821487, 0.926007866859436, 0.06590724736452103, 0.20702992379665375, 0.5064778923988342, -0.07348021119832993, -0.09931609779596329, 0.101242795586586, 0.33217430114746094, -0.07292855530977249, 0.5018674731254578, 0.0846385583281517, 0.47264787554740906, -0.05103665590286255, -0.10303100943565369, -0.4755307137966156, -0.17911632359027863, -0.04028373211622238, 0.03408683463931084, 0.5620173811912537, -0.04392801970243454, -0.5967426300048828, -0.5992370843887329, -0.8481064438819885, -0.45079413056373596, 0.5121033787727356, -0.049586016684770584, 0.037597279995679855, 0.23165594041347504, -0.039163000881671906, -0.08645161241292953, 0.11808662861585617, 0.06075352802872658, 0.4273255467414856, 0.0726994201540947, -0.21794484555721283, -0.016814954578876495, -0.05858556553721428, -0.4278324246406555, -0.14020255208015442, -0.1879248023033142, 0.033254336565732956, -0.13461287319660187, 0.14000657200813293, -0.3112565577030182, 0.055553704500198364, -0.03366812318563461, 0.004065123852342367, -0.00748042855411768, 0.00590812973678112, 0.4207923412322998, -0.3530111312866211, 0.04494248330593109, -0.6730908751487732, 0.03341153636574745, 0.39244967699050903, 0.026293206959962845, 0.1277383267879486, 0.030291058123111725, 0.1504867672920227, -0.07985880225896835, 0.28957414627075195, 0.3364022672176361, 0.6659277081489563, 0.13540516793727875, 0.3722918629646301, -0.0641898661851883, -0.006337354425340891, 0.06200205162167549, -0.0750192180275917, -0.45173802971839905, 0.3801213800907135, 0.008708189241588116, 0.02868851087987423, 0.07515929639339447, -0.005578231997787952, -0.400903582572937, -0.602798581123352, 0.4236920475959778, -0.29844897985458374, -0.2580201327800751, 0.05942957103252411, 0.3395899832248688, -0.15048670768737793, 0.30301734805107117, 0.4061506688594818, -0.000515780586283654, -0.08735349029302597, -0.49821755290031433, 0.40125352144241333, -0.11984893679618835, -0.0018652357393875718, 0.2588847279548645, 0.11237931251525879, -0.1914864331483841, 0.026339247822761536, -0.3060506582260132, 0.08472336828708649, -0.16335321962833405, -0.6934089660644531, 0.6969538927078247, -0.0668063685297966, -0.6443238258361816, 0.7637065052986145, -0.1616797149181366, 0.6056482791900635, 0.3139175772666931, -0.04050855711102486, -0.0014039636589586735, -0.059299200773239136, -0.6684141755104065, 0.2892480194568634, -0.103827103972435, -0.0014900752576068044, 0.34043216705322266, 0.41488978266716003, -0.7031764388084412, 0.2792683243751526, 0.0702560544013977, -0.5914579033851624, 0.10556195676326752, 0.00247579300776124, -0.36405113339424133, 1.4506739377975464, -0.17806510627269745, -0.2206554412841797, -0.07892238348722458, -0.0839608833193779, -0.09407958388328552, 0.05532388389110565, 0.044497136026620865, -0.3046165406703949, 0.06773766130208969, -0.35299599170684814, 0.7364513874053955, -0.17217350006103516, 0.6601009368896484, -0.2895190417766571, -0.22760307788848877, 0.9728447198867798, -0.2414723038673401, -0.9503423571586609, 0.1261790245771408, -0.16946743428707123, 0.03339770808815956, 0.31570354104042053, -0.16340459883213043, 0.2879990339279175, -0.78266841173172, -0.04475128650665283, -0.12389801442623138, 0.13581496477127075, 0.16625306010246277, 0.29932311177253723, 0.2820899188518524, 0.16727317869663239, -0.6582158803939819, -0.062383975833654404, 0.06336093693971634, -0.16147896647453308, 0.2584836184978485, 0.870462954044342, 0.3291264474391937, 0.8973677158355713, -0.5320374965667725, -0.9292318224906921, 0.23981621861457825, 0.0834980458021164, -0.05227680876851082, -0.6196547150611877, 0.9467188119888306, 0.205891415476799, 0.23345687985420227, -0.08266235888004303, 0.48741415143013, -0.13618429005146027, 0.10006466507911682, 0.03141402825713158, -0.11334028840065002, -0.2063293606042862, 0.5202634930610657, 0.043246183544397354, 0.06662154942750931, 0.17533652484416962, 0.2004539519548416, 0.1722433865070343, -0.4948872923851013, 0.4031597375869751, 0.29805076122283936, 0.09303156286478043, -0.6884803771972656, 0.16361910104751587, 0.15562497079372406, 0.5679760575294495, -0.0993085652589798, 0.361441045999527, -0.21798303723335266, 0.3124881684780121, 0.24387012422084808, 0.06558947265148163, -0.009975820779800415, -0.06717627495527267, -0.12804453074932098, -0.021390276029706, 0.39942672848701477, 0.4787154793739319, 0.2753368020057678, 0.4034196138381958, -0.7955411076545715, 0.16468995809555054, 0.13037653267383575, 0.45991256833076477, 0.07123998552560806, -0.04132118821144104, 0.019554544240236282, -3.10337495803833, 0.14367182552814484, 0.05130111053586006, -0.4998648464679718, 0.058306071907281876, -0.5737856030464172, -0.014005083590745926, -0.5478608012199402, -0.33574414253234863, -0.38209646940231323, -0.34398043155670166, 0.2776748538017273, -0.22988124191761017, 0.08562253415584564, 0.06221606209874153, -0.6436825394630432, -0.08552724123001099, 0.02809659019112587, -0.41647714376449585, 0.21162813901901245, 0.04245224595069885, -0.21032878756523132, -0.08615320175886154, -0.32296034693717957, -0.07756182551383972, 0.03411606699228287, 0.33238810300827026, 0.04788536950945854, -0.5810317993164062, -0.05614905804395676, -0.4203481078147888, -0.1447417140007019, -0.3549673855304718, -0.4242382347583771, -0.04124489426612854, 0.09348097443580627, -0.9713863730430603, -0.3121427893638611, -0.09005945175886154, 0.4389905333518982, 0.06374242901802063, -0.2640017569065094, -0.6336422562599182, 0.3266758620738983, 0.18159209191799164, 0.6622101664543152, 0.759299635887146, -0.4681280553340912, 0.10730404406785965, 0.050087641924619675, -0.09551433473825455, 0.22068119049072266, -0.36389151215553284, 0.09519820660352707, -0.32013073563575745, 0.1383190155029297, -0.38705867528915405, 0.05081828683614731, 0.15700721740722656, 0.8194381594657898, 0.13596872985363007, 0.02834869548678398, -0.11106879264116287, 0.37156981229782104, -0.26924335956573486, 0.3467080891132355, 0.08404766023159027, 0.2693069577217102, -0.47331807017326355, -0.4500104486942291, -1.133683443069458, 0.1181020736694336, -0.054942142218351364, 0.06620825827121735, -0.22113747894763947, -0.30853715538978577, 0.3094252943992615, -0.07971072942018509, 0.3559972643852234, 0.015277906320989132, -0.0564146563410759, -0.11073598265647888, -0.21770372986793518, 0.008336847648024559, 0.3699151575565338, -0.38423535227775574, 0.018939265981316566, -0.3627963960170746, -1.0632411241531372, 0.058587756007909775, 0.2736249268054962, -0.21576891839504242, 0.2615467309951782, 0.3571455776691437, -0.5689948201179504, 0.23435254395008087, 0.18397323787212372, -0.39202505350112915, 0.5194091200828552, 0.5028443336486816, -0.3239791989326477, -0.03642410412430763, -0.7657902836799622, -0.17847460508346558, 0.02820981852710247, 0.25901681184768677, 0.010204923339188099, -0.4259410798549652, 0.2129589021205902, -0.2746521532535553, 0.5326789617538452, 0.3275492489337921, 0.46343088150024414, 0.17945240437984467, -0.28293898701667786, -0.18518921732902527, 0.5869547128677368, -0.3162815570831299, 0.17845100164413452, 0.12875996530056, 0.30749189853668213, 0.0215251836925745, 0.20127101242542267, -0.3143974244594574, 0.3915400207042694, -0.06712089478969574, -0.11068248748779297, -0.03448980301618576, -0.21517249941825867, 0.3146260678768158, 0.03846476227045059, -0.06817299872636795, 0.0054930467158555984, 0.4624660611152649, -0.4558231234550476, 0.22079028189182281, -0.026105541735887527, -0.6129524111747742, -0.2821694314479828, -0.05861193314194679, 0.1267559677362442, 0.1900716871023178, 0.11565132439136505, -0.3430553674697876, -0.12814301252365112, -0.15289951860904694, 0.296073853969574, 0.12763711810112, 0.43266233801841736, 0.15092670917510986, 0.2570955455303192, -0.09588196873664856, 0.2906711995601654, -0.7203313708305359, -0.26352059841156006, 0.051465824246406555, -0.5431920289993286, 0.2312544286251068, -0.23748216032981873, 0.19304996728897095, -0.14434534311294556, -0.15149779617786407, -0.060618799179792404, -0.2966369092464447, 0.3846270740032196, -0.08132834732532501, -0.010962027125060558, -0.31222736835479736, -0.21180903911590576, -0.8403549790382385, -0.16143378615379333, 0.31779369711875916, 0.2892802059650421, 0.1549306958913803, -0.5027759075164795, 0.2281264364719391, 0.04510853812098503, 0.24771730601787567, -0.5778154134750366, -0.2811427116394043, -0.5366131663322449, -0.9456926584243774, 0.061266567558050156, -0.05834982916712761, -0.3077985942363739, -0.4257233440876007, -0.3459209203720093, 0.5326559543609619, -0.5770834684371948, 0.11906369030475616, -0.1303664594888687, -0.24386905133724213, -0.4090850055217743, -0.5106832385063171, 0.43670016527175903, 0.09327193349599838, -0.11630453914403915, 0.09623892605304718, -0.09757299721240997, 0.6600512862205505, -0.4136124551296234, -0.1251986026763916, 0.2547442615032196, -0.10614131391048431, -0.2152504324913025, -0.07474596053361893, -0.1315256953239441, -0.18302452564239502, -0.048101283609867096, -0.2077019363641739, -0.19010354578495026, 0.09722279757261276, 0.5433810949325562, -0.1965523660182953, -0.13903187215328217, -0.10775981843471527, -0.2660888731479645, -0.22001966834068298, -0.2868545949459076, -0.09699976444244385, -0.4138107895851135, -0.029112139716744423, 0.20078280568122864, -0.15013813972473145, 0.2496599704027176, -0.39744892716407776, -0.31926441192626953, 0.6412363052368164, 0.04506243020296097, 0.3453604280948639, 0.04647071659564972, 0.1844177395105362, 0.8335568308830261, 0.3504738509654999, -0.3176579475402832, -0.8537718653678894, 1.087005853652954, 0.002446346450597048, 0.5235180258750916, 0.2941656708717346, -0.2946045398712158, -0.25709667801856995, 0.6333131790161133, -0.766395092010498, -0.02969951555132866, 0.16698706150054932, -0.2359257936477661, -0.12757904827594757, 0.34332478046417236, -0.14696374535560608, -0.028228826820850372, 0.4782811403274536, -0.041586317121982574, -0.0018059908179566264, 0.032425668090581894, -0.00652418052777648, 0.3697277009487152, -0.2359236776828766, 0.23186947405338287, -0.036857206374406815, -0.19306866824626923, 0.023063169792294502, 0.5822622179985046, 0.2779204845428467, -0.1121533066034317, -0.3293464779853821, -0.11161371320486069, 0.2682311236858368, 0.21995402872562408, -0.4345397651195526, -0.16072750091552734, -0.25096064805984497, 0.24569937586784363, 0.17825914919376373, 0.23557443916797638, 0.46774768829345703, -0.054598111659288406, 0.06436552107334137, -0.4023549556732178, 0.28184187412261963, -0.7877952456474304, -0.4727360010147095, -0.26809778809547424, -0.14783278107643127, -0.6777615547180176, 0.07579608261585236, -0.00722146313637495, 0.08224307745695114, -0.22985722124576569, -0.3660401701927185, -0.32023489475250244, -0.1846027374267578, 0.0730714201927185, -0.14398889243602753, -0.6078234910964966, -0.021661698818206787, 0.23938366770744324, -0.17554572224617004, 0.12014421820640564, -0.6448988318443298, -0.36167916655540466, 0.19343046844005585, 0.22346261143684387, -0.11185696721076965, 0.18022707104682922, 0.0027052073273807764, -0.17564751207828522, 0.14021940529346466, -0.8693497776985168, 0.0702766478061676, 0.05100807920098305, 0.7028763890266418, -0.0011337322648614645, 0.021937739104032516, 0.7342085242271423, 0.11790359020233154, 0.03333105519413948, 0.1729896366596222, -0.9519023895263672, -0.11588999629020691, 0.22025737166404724, -0.03768922761082649, -0.057152170687913895, 0.29677334427833557, 0.1350308209657669, 0.24726355075836182, -0.3732370138168335, -0.22581365704536438, 0.10424363613128662, -0.251447468996048, 0.11008425801992416, 0.12352444231510162, 0.17542633414268494, -0.4598976969718933, 0.5608384609222412, 0.11584111303091049, 0.22917954623699188, -0.3656335175037384, -1.1583527326583862, -0.17720860242843628, -0.09547062963247299, 0.3539263606071472, 0.027361787855625153, -0.03391031175851822, -0.0005696265143342316]}" YELD"
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             ^}

In [8]:
str(small_dic)

"{'tag:stardog:api:_e_teko_e_arandu_e_producao_de_subjetividades_e_educacao_superior_e_educacoes_outras_modos_de_vida_criados_e_afirmados_por_kaiowas_e_guaranis': [-0.036806486546993256, 0.053812891244888306, 0.4343889653682709, 0.07008563727140427, 0.06726423650979996, -0.0707978904247284, -0.1256973147392273, 0.12517786026000977, -0.15045271813869476, -0.0530383475124836, -0.14858010411262512, 0.36490532755851746, 0.21294069290161133, 0.293968141078949, 0.08269951492547989, -0.24902383983135223, 0.5886573195457458, -0.3843301236629486, -0.5035964846611023, -0.09140083193778992, -0.20768438279628754, 0.14955902099609375, -0.16800528764724731, -0.12197914719581604, -0.01872563175857067, 0.49669182300567627, -0.49017098546028137, -0.07764355838298798, -0.5541002750396729, -0.4451574385166168, 0.019914740696549416, 0.2981143593788147, -0.08936356753110886, 0.43163129687309265, -0.08834492415189743, -0.1539146602153778, 0.0964585542678833, -0.12782658636569977, 0.5204546451568604, 0.35697

In [None]:
result = graph.query("""
    WITH apoc.util.compress('""" + small_dic + """
    CALL apoc.import.json(jsonCompressed, {compression: 'DEFLATE'})
    CALL apoc.load.json("file:///Users/facordei/OneDrive - Capgemini/Documents/GitHub/Indigenous-Slavery-KG/data/embeddings.json")
    YIELD value
    """)

result

WITH apoc.util.compress('{"type":"node","id":"2","labels":["User"],"properties":{"age":12}}', {compression: 'DEFLATE'}) AS jsonCompressed
CALL apoc.import.json(jsonCompressed, {compression: 'DEFLATE'})
YIELD source, format, nodes, relationships, properties
RETURN source, format, nodes, relationships, properties

In [None]:
#Load the embeddings in the Noe4j database
n = 0
for uri in dic_embeddings:

    result = graph.query("""
    MATCH (thesis:Thesis) 
    Where (thesis.uri = '""" + uri + """')
    CALL db.create.setNodeVectorProperty(thesis, 'embedding', """ + str(dic_embeddings[uri]) + """)
    """)

    n = n + 1
    if n % 50 == 0:
        print (uri)
    

#MATCH (thesis:Thesis) 
#Where (thesis.uri = "tag:stardog:api:_nos_aqui_e_o_espaco_dos_sem_vez__quilombolas_e_educacao_em_poconemt")
#CALL db.create.setNodeVectorProperty(thesis, 'embedding', [0,1,2,3,4])
#RETURN thesis.uri, thesis.title, thesis.abstract

In [6]:
result = graph.query("""
    CALL apoc.load.json("file:///Users/facordei/OneDrive - Capgemini/Documents/GitHub/Indigenous-Slavery-KG/data/embeddings.json")
    YIELD value
    """)

result

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `apoc.load.json`: Caused by: java.lang.RuntimeException: Import from files not enabled, please set apoc.import.file.enabled=true in your apoc.conf}

In [148]:
import os
from getpass import getpass

import numpy as np
import tensorflow as tf
from transformers import TFBertModel
from transformers import AutoTokenizer

from langchain_community.graphs import Neo4jGraph

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt, mpld3

### Load BERT model, NEO4J database and generating embeddings

In [2]:
# Choose the pretrained model
model_checkpoint = 'bert-base-multilingual-cased' #'bert-base-multilingual-cased'   #'google/bert_uncased_L-2_H-128_A-2' (Tiny BERT)
# Max number of tokens in the sentence
max_length= 512 #512 #128

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length= max_length)

In [3]:
# Load the model
bert_model = TFBertModel.from_pretrained(model_checkpoint, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [4]:
# The function receives a sencence and returns the embedding (1D numpy array)
def get_embeddings(text):
  inputs = tokenizer(text, truncation=True, return_tensors="tf")
  outputs = bert_model(inputs)

  last_hidden_states = outputs.last_hidden_state
  pooler_output = outputs.pooler_output
  return(pooler_output.numpy()[0].tolist())

In [5]:
# Example
text = "Hello, my dog is cute"
get_embeddings(text)

[0.3043421804904938,
 0.0655800998210907,
 0.2867922782897949,
 -0.18544849753379822,
 -0.13707706332206726,
 0.5639879703521729,
 0.22897112369537354,
 0.20353536307811737,
 -0.47550225257873535,
 0.43214404582977295,
 -0.10113587975502014,
 -0.2794475257396698,
 -0.2155974805355072,
 -0.1278197020292282,
 0.21039064228534698,
 -0.21768882870674133,
 0.709071934223175,
 -0.009180756285786629,
 0.19599683582782745,
 -0.4344290494918823,
 -0.9999848008155823,
 -0.120848149061203,
 -0.34574177861213684,
 -0.21058939397335052,
 -0.3446536064147949,
 0.1665513664484024,
 -0.2625880837440491,
 0.07631348818540573,
 0.19793269038200378,
 -0.19297268986701965,
 0.1113186627626419,
 -0.999985933303833,
 0.5718837380409241,
 0.7091876864433289,
 0.24170824885368347,
 -0.09670403599739075,
 0.22048485279083252,
 0.27047964930534363,
 0.2205585539340973,
 -0.3865535855293274,
 -0.26536279916763306,
 -0.09083323925733566,
 -0.1782226413488388,
 0.24492590129375458,
 -0.18337216973304749,
 -0.24588

Connecting Neo4j database

In [6]:
# Neo4j variables
NEO4J_URL = getpass("NEO4J_URL")
NEO4J_USERNAME = getpass("NEO4J_USERNAME")
NEO4J_PASSWORD = getpass("NEO4J_PASSWORD")

NEO4J_URL ········
NEO4J_USERNAME ········
NEO4J_PASSWORD ········


In [313]:
#Connecting to the graph
graph = Neo4jGraph(
    url=NEO4J_URL,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD
)

In [314]:
#Quering the graph using Cypher
result_LRA = graph.query("""
MATCH (n:LRA) RETURN ID(n), n.name, n.data_description;
""")

In [9]:
# Loop to creating the embeddings for LRA
embeddings_LRA = []

i = 0
for text in result_LRA:

    if text['n.name'] == None:
        text['n.name'] = " "
    if text['n.data_description'] == None:
        text['n.data_description'] = " "
   
    name_embeddings = get_embeddings(text['n.name'])
    description_embeddings = get_embeddings(text['n.data_description'])
    nameANDdescription_embeddings = get_embeddings(text['n.name']+ ' - ' +text['n.data_description'])
   
    embeddings_LRA.append({'ID': text['ID(n)'],
                           'name': text['n.name'],
                           'descriptions': text['n.data_description'],
                           'name_embeddings': name_embeddings,  
                           'description_embeddings': description_embeddings,
                           'nameANDdescription_embeddings': nameANDdescription_embeddings})

    graph.query("MATCH (n:LRA) WHERE ID(n) = " + str(text['ID(n)']) + " SET n.name_embeddings = " + str(name_embeddings) + " SET n.description_embeddings = " + str(description_embeddings) + " SET n.nameANDdescription_embeddings = " + str(nameANDdescription_embeddings) + ";")


In [23]:
#Quering the graph using Cypher
result_BII = graph.query("""
MATCH (n:BII) RETURN ID(n), n.name, n.information_asset_details;
""")

In [25]:
# Loop to creating the embeddings for BII
embeddings_BII = []

i = 0
for text in result_BII:

    if text['n.name'] == None:
        text['n.name'] = " "
    if text['n.information_asset_details'] == None:
        text['n.information_asset_details'] = " "
   
    name_embeddings = get_embeddings(text['n.name'])
    description_embeddings = get_embeddings(text['n.information_asset_details'])
    nameANDdescription_embeddings = get_embeddings(text['n.name']+ ' - ' +text['n.information_asset_details'])
   
    embeddings_BII.append({'ID': text['ID(n)'],
                           'name': text['n.name'],
                           'descriptions': text['n.information_asset_details'],
                           'name_embeddings': name_embeddings,  
                           'description_embeddings': description_embeddings,
                           'nameANDdescription_embeddings': nameANDdescription_embeddings})

    graph.query("MATCH (n:BII) WHERE ID(n) = " + str(text['ID(n)']) + " SET n.name_embeddings = " + str(name_embeddings) + " SET n.description_embeddings = " + str(description_embeddings) + " SET n.nameANDdescription_embeddings = " + str(nameANDdescription_embeddings) + ";")

In [294]:
# Lists with name, descriptions and Embeddings

emb_LRA = []
name_LRA = []
desc_LRA = []
ID_LRA = []

for data in embeddings_LRA:
    name_LRA.append(data['name'])
    desc_LRA.append(data['descriptions'])
    emb_LRA.append(data['nameANDdescription_embeddings'])
    ID_LRA.append(data['ID'])


emb_BII = []
name_BII = []
desc_BII = []
ID_BII = []

for data in embeddings_BII:
    name_BII.append(data['name'])
    desc_BII.append(data['descriptions'])
    emb_BII.append(data['nameANDdescription_embeddings'])
    ID_BII.append(data['ID'])

### LRA x BII similarity

In [27]:
# Calculating a matrix with cosine similarity between LRA x BII
LRA_BII_cos = cosine_similarity(np.array(emb_LRA), np.array(emb_BII))

In [28]:
# Printing most similar BII data for each LRA data above the threshold.

th = 0.995

for n in range (len(LRA_BII_cos)):

    most_sim = np.argmax(LRA_BII_cos[n])
    
    if LRA_BII_cos[n][most_sim] > th:
            
        print ("LRA data: ", name_LRA[n])
        print ("LRA description: ", desc_LRA[n])
        print (" ")
        
        print("Most similar BII data: ", name_BII[most_sim])
        print("Most similar BII description: ", desc_BII[most_sim])
        print ("cosine similarity: ", LRA_BII_cos[n][most_sim])
        print ("------------")

LRA data:  Real-time well data
LRA description:  Real-Time drilling and logging data - Incl.cement log data. SiteCom + ROP (Rate of Penetration) + WOB (Weight on bit)+ TQ (Torque) + Hookload + MWD (Measurements while drilling) + LWD (Logging while drilling) + WITSML (Well Site Information Transfer Standard Markup Language)
 
Most similar BII data:  Real-Time log data
Most similar BII description:  Real-Time drilling and logging data - Incl.cement log data. SiteCom + ROP (Rate of Penetration) + WOB (Weight on bit)+ TQ (Torque) + Hookload + MWD (Measurements while drilling) + LWD (Logging while drilling) + WITSML (Well Site Information Transfer Standard Markup Language)
cosine similarity:  0.9954749943918912
------------
LRA data:  Disposal of surplus
LRA description:  Sales documention (description of equipment, certificates) + Sales order
 
Most similar BII data:  Disposal of surplus
Most similar BII description:  Sales documention (description of equipment, certificates) + Sales order

In [39]:
# For each data point in LRA finding all data in BII greater than the threshold. 
# Creating a [:STRONG_IM_LINK] relation between those nodes 

th = 0.995

for n in range(len(LRA_BII_cos)):
    for m in np.where(LRA_BII_cos[n] > th)[0]:
        print ("LRA ID: ", embeddings_LRA[n]['ID'])
        print ("LRA data: ", name_LRA[n])
        print ("LRA description: ", desc_LRA[n])
        print (" ")

        print ("BII ID: ", embeddings_BII[m]['ID'])
        print("Most similar BII data: ", name_BII[m])
        print("Most similar BII description: ", desc_BII[m])
        print ("cosine similarity: ", LRA_BII_cos[n][m])
        print ("------------")
        
        graph.query("MATCH (lra:LRA) WHERE ID(lra) = " + str(embeddings_LRA[n]['ID']) + " MATCH (bii:BII) WHERE ID(bii) = " + str( embeddings_BII[m]['ID']) + " MERGE (bii)-[:STRONG_IM_LINK]-(lra)")


LRA ID:  0
LRA data:  Real-time well data
LRA description:  Real-Time drilling and logging data - Incl.cement log data. SiteCom + ROP (Rate of Penetration) + WOB (Weight on bit)+ TQ (Torque) + Hookload + MWD (Measurements while drilling) + LWD (Logging while drilling) + WITSML (Well Site Information Transfer Standard Markup Language)
 
BII ID:  2650
Most similar BII data:  Real-Time log data
Most similar BII description:  Real-Time drilling and logging data - Incl.cement log data. SiteCom + ROP (Rate of Penetration) + WOB (Weight on bit)+ TQ (Torque) + Hookload + MWD (Measurements while drilling) + LWD (Logging while drilling) + WITSML (Well Site Information Transfer Standard Markup Language)
cosine similarity:  0.9954749943918912
------------
LRA ID:  324
LRA data:  Disposal of surplus
LRA description:  Sales documention (description of equipment, certificates) + Sales order
 
BII ID:  2506
Most similar BII data:  Disposal of surplus
Most similar BII description:  Sales documention (d

### LRA x LRA similarity

In [260]:
# Calculating a matrix with cosine similarity between LRA x LRA
LRA_LRA_cos = cosine_similarity(np.array(emb_LRA), np.array(emb_LRA))

In [265]:
# For each data point in LRA finding all data in LRA greater than the threshold. 
# Creating a [:STRONG_IM_LINK] relation between those nodes 

th = 0.995

for n in range(len(LRA_LRA_cos)):
    for m in np.where(LRA_LRA_cos[n] > th)[0]:
        if embeddings_LRA[n]['ID'] != embeddings_LRA[m]['ID']:
            # print ("LRA ID: ", embeddings_LRA[n]['ID'])
            # print ("LRA data: ", name_LRA[n])
            # print ("LRA description: ", desc_LRA[n])
            # print (" ")
    
            # print ("LRA ID: ", embeddings_LRA[m]['ID'])
            # print("Most similar LRA data: ", name_LRA[m])
            # print("Most similar LRA description: ", desc_LRA[m])
            # print ("cosine similarity: ", LRA_LRA_cos[n][m])
            # print ("------------")
            
            graph.query("MATCH (lra:LRA) WHERE ID(lra) = " + str(embeddings_LRA[n]['ID']) + " MATCH (lra2:LRA) WHERE ID(lra2) = " + str( embeddings_LRA[m]['ID']) + " MERGE (lra)-[:STRONG_IM_LINK]-(lra2)")

### BII x BII similarity

In [268]:
# Calculating a matrix with cosine similarity between LRA x LRA
BII_BII_cos = cosine_similarity(np.array(emb_BII), np.array(emb_BII))

In [269]:
# For each data point in BII finding all data in BII greater than the threshold. 
# Creating a [:STRONG_IM_LINK] relation between those nodes 

th = 0.995

for n in range(len(BII_BII_cos)):
    for m in np.where(BII_BII_cos[n] > th)[0]:
        if embeddings_BII[n]['ID'] != embeddings_BII[m]['ID']:
            print ("BII ID: ", embeddings_BII[n]['ID'])
            print ("BII data: ", name_BII[n])
            print ("BII description: ", desc_BII[n])
            print (" ")
    
            print ("BII ID: ", embeddings_BII[m]['ID'])
            print("Most similar BII data: ", name_BII[m])
            print("Most similar BII description: ", desc_BII[m])
            print ("cosine similarity: ", BII_BII_cos[n][m])
            print ("------------")
            
            graph.query("MATCH (bii:BII) WHERE ID(bii) = " + str(embeddings_BII[n]['ID']) + " MATCH (bii2:BII) WHERE ID(bii2) = " + str( embeddings_BII[m]['ID']) + " MERGE (bii)-[:STRONG_IM_LINK]-(bii2)")

BII ID:  2119
BII data:  Safety risk documentation
BII description:  Includes risk evaluation, assessment, decision and report. E.g. Hazid report
 
BII ID:  2477
Most similar BII data:  Notification of incident
Most similar BII description:  All correspondence related to notification of incident to relevant entities. Examples of typical documents are notification form, follow-up correspondence (e.g. documentation dispatch)
cosine similarity:  0.9953949952057396
------------
BII ID:  2155
BII data:  Geophysical reports (PETEC)
BII description:  Interpretation, processing or other geophysical reports
 
BII ID:  2169
Most similar BII data:  Geological reports (PETEC)
Most similar BII description:  Geological description, characterization and evaluation reports
cosine similarity:  0.9954115932184938
------------
BII ID:  2169
BII data:  Geological reports (PETEC)
BII description:  Geological description, characterization and evaluation reports
 
BII ID:  2155
Most similar BII data:  Geophy

### Clustering

In [295]:
#Join the lists from LRA and BII in the same arrays 
clust_emb = np.array(emb_LRA + emb_BII)
clust_name = np.array(name_LRA + name_BII)
clust_desc = np.array(desc_LRA + desc_BII)
clust_ID = np.array(ID_LRA + ID_BII)
clust_dataset = np.array(["LRA"] * len(emb_LRA) + ["BII"] * len(emb_BII))


In [296]:
# Clustering algorithm with minimun of 5 elemnets with maximuns distance of "alpha"
alpha = 0.008
clustering = DBSCAN(eps=alpha, min_samples=5, metric='cosine').fit(clust_emb)
#clustering = KMeans(n_clusters=500, random_state=0, n_init="auto").fit(clust_emb)

Reductions of dimensionality and visualization

In [297]:
#PCA_emb = PCA(n_components=2).fit_transform(clust_emb)

In [298]:
TSNE_emb = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3).fit_transform(clust_emb)

In [299]:
fig, ax = plt.subplots(figsize=(20, 20))#, layout='constrained')

plt.scatter(TSNE_emb.T[0], TSNE_emb.T[1], c=clustering.labels_, s=2)

for i in range(len(clust_emb)):
    if clustering.labels_[i] > -1:
        plt.text(x=TSNE_emb.T[0][i], 
                 y=TSNE_emb.T[1][i],
                 #s='(' + str(clustering.labels_[i]) + ') ' + str(i), fontdict=dict(size=1),)
                 s='(' + str(clustering.labels_[i]) + ') ' + ' - ' + clust_dataset[i] + ' - ' + clust_name[i], fontsize= 2)
        # if i > 1000:
        #     break
    
mpld3.display(fig)

In [304]:
clu = 11
clust_n_name = clust_name[clustering.labels_ == clu]
clust_n_desc = clust_desc[clustering.labels_ == clu]
clust_n_ID = clust_ID[clustering.labels_ == clu]
clust_n_dataset = clust_dataset[clustering.labels_ == clu]

for data in range(len(clust_n_name)):
    print(clust_n_dataset[data], " / (", clust_n_ID[data], ") ", clust_n_name[data], " / ", clust_n_name[data])

LRA  / ( 185 )  Well integrity  /  Well integrity
LRA  / ( 4243 )  Well integrity  /  Well integrity
LRA  / ( 4457 )  Well integrity  /  Well integrity
BII  / ( 2373 )  Certificates (Incl. M4 PM Certificate)  /  Certificates (Incl. M4 PM Certificate)
BII  / ( 2494 )  License/Lease - equity and administrative  /  License/Lease - equity and administrative


In [325]:
# Creating a "Data Product" node for each cluster and connect the data points

for clu in set(clustering.labels_):
    if clu != -1:
        
        graph.query("MERGE (n:DataProduct {number: " + str(clu) +  "})")
        print ("MERGE DataProduct number ", clu)
        #graph.query("MATCH (bii:BII) WHERE ID(bii) = " + str(embeddings_BII[n]['ID']) + " MATCH (bii2:BII) WHERE ID(bii2) = " + str( embeddings_BII[m]['ID']) + " MERGE (bii)-[:STRONG_IM_LINK]-(bii2)")

        clust_n_ID = clust_ID[clustering.labels_ == clu]
        clust_n_dataset = clust_dataset[clustering.labels_ == clu]

        for data in range(len(clust_n_ID)):

            graph.query("MATCH (n:DataProduct {number: " + str(clu)  + "}) MATCH (m:" + clust_n_dataset[data] + ") WHERE ID(m) = " + str(clust_n_ID[data]) + " MERGE (m)-[:PART_OF]-(n);")
            print("MERGE  (n:DataProduct {number: " + str(clu)  + "}) <-[:PART_OF]-  (m:" + clust_n_dataset[data] + " ID(" + str(clust_n_ID[data]) + ")) ")


MERGE DataProduct number  0
MERGE  (n:DataProduct {number: 0}) <-[:PART_OF]-  (m:LRA ID(15)) 
MERGE  (n:DataProduct {number: 0}) <-[:PART_OF]-  (m:LRA ID(28)) 
MERGE  (n:DataProduct {number: 0}) <-[:PART_OF]-  (m:LRA ID(29)) 
MERGE  (n:DataProduct {number: 0}) <-[:PART_OF]-  (m:LRA ID(36)) 
MERGE  (n:DataProduct {number: 0}) <-[:PART_OF]-  (m:LRA ID(37)) 
MERGE  (n:DataProduct {number: 0}) <-[:PART_OF]-  (m:LRA ID(52)) 
MERGE  (n:DataProduct {number: 0}) <-[:PART_OF]-  (m:LRA ID(61)) 
MERGE  (n:DataProduct {number: 0}) <-[:PART_OF]-  (m:LRA ID(76)) 
MERGE  (n:DataProduct {number: 0}) <-[:PART_OF]-  (m:LRA ID(77)) 
MERGE  (n:DataProduct {number: 0}) <-[:PART_OF]-  (m:LRA ID(84)) 
MERGE  (n:DataProduct {number: 0}) <-[:PART_OF]-  (m:LRA ID(100)) 
MERGE  (n:DataProduct {number: 0}) <-[:PART_OF]-  (m:LRA ID(101)) 
MERGE  (n:DataProduct {number: 0}) <-[:PART_OF]-  (m:LRA ID(171)) 
MERGE  (n:DataProduct {number: 0}) <-[:PART_OF]-  (m:LRA ID(271)) 
MERGE  (n:DataProduct {number: 0}) <-[:PART_