In [1]:
from transformers import TFBertModel
from transformers import AutoTokenizer, BertModel
import os
os.environ['CURL_CA_BUNDLE'] = ''
import torch
from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
import json
import pandas as pd
from openai import AzureOpenAI

#from azure.identity import DefaultAzureCredential, get_bearer_token_provider  

Using Azure OpenAI embedding models

In [None]:
#client_small = AzureOpenAI(
#  api_key = os.getenv("OPENAI_API_KEY"),
#  azure_endpoint = os.getenv("EMBEDDING_SMALL_ENDPOINT"),
#  api_version = os.getenv("API_VERSION"))


In [3]:
# Populate the process environment from the local .env file *before* reading
# any setting from it; otherwise, on a fresh kernel, the os.getenv() calls
# below return None for values that are only defined in .env.
# load_dotenv() is safe to call more than once.
load_dotenv()

# Azure OpenAI client used for the embedding requests in this notebook.
# The key, endpoint and API version all come from environment variables.
client_large = AzureOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    azure_endpoint=os.getenv("EMBEDDING_LARGE_ENDPOINT"),
    api_version=os.getenv("API_VERSION"),
)

In [None]:
def get_embeddings_openAI(text, model="text-embedding-3-large"):
    """Request embeddings for `text` from the Azure OpenAI embeddings endpoint.

    Args:
        text: The input to embed, forwarded unchanged as the `input` argument
            of the embeddings request. This notebook passes either a single
            string or a list of strings.
        model: Name of the embedding model to request. Defaults to
            "text-embedding-3-large", the model this notebook uses.

    Returns:
        The `data` attribute of the API response (not a numpy array). Callers
        in this notebook index it per input and read the vector from each
        item's `.embedding` attribute, e.g. `result[0].embedding`.
    """
    response = client_large.embeddings.create(input=text, model=model)
    return response.data


Load the BERT model (it can later be swapped for another model)

In [5]:
# Choose the pretrained model
#model_checkpoint = 'bert-base-multilingual-cased' #'bert-base-multilingual-cased'   #'google/bert_uncased_L-2_H-128_A-2' (Tiny BERT)
# Max number of tokens in the sentence
#max_length= 512 #512 #128

# Load tokenizer
#tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length= max_length)

# Load the model
#bert_model = TFBertModel.from_pretrained(model_checkpoint, from_pt=True)
#bert_model = BertModel.from_pretrained(model_checkpoint)

In [None]:
# The function receives a sentence and returns its embedding as a flat Python list (mean of the last hidden states)
#def get_embeddings(text):
#  input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True, truncation=True)).unsqueeze(0)  # Batch size 1
#  outputs = bert_model(input_ids)
#  last_hidden_states = outputs[0]
#  pooler_output = torch.mean(last_hidden_states, dim=1) 
#  return(pooler_output.detach().numpy()[0].tolist())

Connecting to Neo4j

In [6]:
# Pull the settings stored in the local .env file into the environment.
load_dotenv()

# Neo4j connection settings, each read from the environment variable of the
# same name.
NEO4J_URL, NEO4J_USERNAME, NEO4J_PASSWORD = (
    os.getenv(setting)
    for setting in ("NEO4J_URL", "NEO4J_USERNAME", "NEO4J_PASSWORD")
)

# Open the connection to the graph database.
graph = Neo4jGraph(url=NEO4J_URL, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)

In [7]:
# Query every Thesis node from the graph using Cypher.
# Each returned row is read downstream through the keys 'thesis.uri',
# 'thesis.title' and 'thesis.abstract'.
result = graph.query("""
MATCH (thesis:Thesis)
RETURN thesis.uri, thesis.title, thesis.abstract
""")

In [137]:
### Embed the queried theses in batches and persist the result as a DataFrame / CSV.

EMBEDDING_CSV_PATH = 'data/embedding_openai_1.csv'
batch_size = 3

# Resume from a previous run when the CSV already exists. Only the "nothing
# cached yet" cases fall back to an empty frame; any other error (bad path
# permissions, corrupt file, ...) is allowed to surface instead of being
# silently swallowed.
try:
    embedding_csv = pd.read_csv(EMBEDDING_CSV_PATH)
except (FileNotFoundError, pd.errors.EmptyDataError):
    embedding_csv = pd.DataFrame(data={'uri': [], 'embeddings': []})


def _embed_and_save(uris, texts, cache):
    """Embed one batch, append it to `cache`, rewrite the CSV, return the new frame.

    Args:
        uris: Thesis URIs of the batch, in the same order as `texts`.
        texts: Texts to embed (title + abstract of each thesis).
        cache: DataFrame with columns 'uri' and 'embeddings' accumulated so far.

    Returns:
        `cache` with one extra row per thesis of the batch.
    """
    embs_list = get_embeddings_openAI(texts)
    batch_frame = pd.DataFrame(
        data={'uri': uris, 'embeddings': [item.embedding for item in embs_list]}
    )
    cache = pd.concat([cache, batch_frame], ignore_index=True)
    # Written after every batch so an interrupted run keeps what it already paid for.
    cache.to_csv(EMBEDDING_CSV_PATH, index=False)
    return cache


# URIs that already have an embedding in the CSV; skipping them keeps a re-run
# of this cell from re-requesting embeddings and appending duplicate rows.
already_embedded = set(embedding_csv['uri'])

n = 0  # number of theses queued for embedding so far
texts_list = []
thesis_uri = []

# NOTE: only the first 12 theses are processed.
for thesis in result[:12]:
    if thesis['thesis.uri'] in already_embedded:
        continue
    n = n + 1

    texts_list.append(str(thesis['thesis.title']) + '\n   \n' + str(thesis['thesis.abstract']))
    thesis_uri.append(thesis['thesis.uri'])

    if len(texts_list) == batch_size:
        print("Batch number: ", n)
        embedding_csv = _embed_and_save(thesis_uri, texts_list, embedding_csv)
        texts_list = []
        thesis_uri = []

# Flush the final, partial batch. The check is on the pending texts: testing a
# list that is emptied after every batch would never be true and the leftover
# theses would be dropped.
if texts_list:
    print("Batch number: ", n)
    embedding_csv = _embed_and_save(thesis_uri, texts_list, embedding_csv)
    texts_list = []
    thesis_uri = []

Batch number:  3
Batch number:  6
Batch number:  9
Batch number:  12


In [63]:
# Use Cypher to read the CSV file: each row is matched to the Thesis node with
# the same uri, and the row's `embeddings` column (parsed as a JSON list) is
# stored on that node as the vector property 'embedding_openai'.
# NOTE(review): the file URL below is an absolute path on one user's machine;
# it will not resolve elsewhere - confirm how the file should be located.
# NOTE(review): this reads 'embedding_openai.csv', while the embedding cell in
# this notebook writes 'data/embedding_openai_1.csv' - confirm which file is intended.
graph.query("""
    LOAD CSV WITH HEADERS
    FROM 'file:///C:/Users/facordei/OneDrive%20-%20Capgemini/Documents/GitHub/Indigenous-Slavery-KG/data/embedding_openai.csv' AS row
    MATCH (n:Thesis) Where n.uri = row.uri
    CALL db.create.setNodeVectorProperty(n, 'embedding_openai', apoc.convert.fromJsonList(row.embeddings))
    """)

[]

In [164]:
# Create the vector index.

# Embed a throwaway text once to discover the dimensionality of the vectors
# produced by the embedding model.
probe_items = get_embeddings_openAI('texto de teste')
dimension = len(probe_items[0].embedding)

# Vector index over Thesis.embedding_openai using cosine similarity.
# Doubled braces are literal braces in the f-string; only {dimension} is filled in.
graph.query(f"""
    CREATE VECTOR INDEX Thesis_Embeddings IF NOT EXISTS
    FOR (n:Thesis)
    ON n.embedding_openai
    OPTIONS {{indexConfig: {{
    `vector.dimensions`: {dimension},
    `vector.similarity_function`: 'cosine'}}}}
    """)

[]

In [165]:
# Create the full-text index over the title and abstract of Thesis nodes
# (skipped when an index with this name already exists).

graph.query("""
    CREATE FULLTEXT INDEX Thesis_fulltext IF NOT EXISTS FOR (n:Thesis) ON EACH [n.title, n.abstract]
    """)

[]

Buscando teses com vetores similares a um determinado texto

In [166]:
# Text used for the search: a question about the chosen person.
personagem = 'Kabengele Munanga'
query_text = f'Quem foi {personagem} ?'

# Turn the query text into a vector (the first item of the response holds it).
query_items = get_embeddings_openAI(query_text)
query_embedding = query_items[0].embedding


In [167]:
# Search the vector index for the 5 theses closest to the query embedding.
# The embedding is sent as a query parameter instead of being spliced into the
# Cypher text: no huge literal in the statement and no string-building of queries.
result = graph.query(
    """
    CALL db.index.vector.queryNodes('Thesis_Embeddings', 5, $query_embedding)
    YIELD node, score
    RETURN node.title, node.abstract
    """,
    params={"query_embedding": query_embedding},
)

# Build the context text: title and abstract of each hit.
# str() keeps a missing (None) title/abstract from raising on concatenation,
# and the abstract now carries its own label rather than a second 'Título: '.
contexto = ''.join(
    'Título: ' + str(r['node.title']) + ' \n'
    + 'Resumo: ' + str(r['node.abstract']) + ' \n \n'
    for r in result
)
print(contexto)

Título: Educação brasileira e identidade negra em Kabengele Munanga 
Título: This study aimed to identify the theoretical support that grounds the production of Kabengele Munanga and characterize the conception of black identity in the production of the author, and its contributions to the curriculum of the Brazilian Education. To achieve these objectives were researched two books of the author that deal specifically about the construction of the black identity. We also researched the bibliographic production realized by Brazilian and foreigner researchers that deal about the racial relations, the construction of the black identity and its implications for the Brazilian educational context, that are related to the academic production of Kabengele Munanga. This work was based on a theoretical research, starting from the reading of the documents and bibliography about the construction of the black identity and education. The school is understood as a social institution that generates val

Buscando usando full text search

In [168]:
# Full-text search for the query text, keeping the 5 best-scoring theses.
# The text is passed as a query parameter: concatenating it between single
# quotes breaks the statement (or injects Cypher) as soon as the text contains
# a quote character. The value is still interpreted by the full-text index as
# a Lucene query string, exactly as before.
result = graph.query(
    """
CALL db.index.fulltext.queryNodes("Thesis_fulltext", $query_text) YIELD node, score
RETURN score, node.title, node.abstract 
LIMIT 5 
""",
    params={"query_text": query_text},
)
result

[{'score': 18.117076873779297,
  'node.title': 'Educação brasileira e identidade negra em Kabengele Munanga',
  'node.abstract': 'This study aimed to identify the theoretical support that grounds the production of Kabengele Munanga and characterize the conception of black identity in the production of the author, and its contributions to the curriculum of the Brazilian Education. To achieve these objectives were researched two books of the author that deal specifically about the construction of the black identity. We also researched the bibliographic production realized by Brazilian and foreigner researchers that deal about the racial relations, the construction of the black identity and its implications for the Brazilian educational context, that are related to the academic production of Kabengele Munanga. This work was based on a theoretical research, starting from the reading of the documents and bibliography about the construction of the black identity and education. The school is 