In [1]:
from dotenv import load_dotenv
import os
from langchain_community.graphs import Neo4jGraph
from openai import AzureOpenAI
import google.generativeai as genai

Conectando ao Neo4j e ao Azure OpenAI

In [2]:
# Load the variables declared in the local .env file into the environment.
load_dotenv()

# Neo4j connection settings, read from the environment.
NEO4J_URL, NEO4J_USERNAME, NEO4J_PASSWORD = (
    os.getenv(variable)
    for variable in ("NEO4J_URL", "NEO4J_USERNAME", "NEO4J_PASSWORD")
)

# Open the connection to the graph database.
graph = Neo4jGraph(url=NEO4J_URL, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)

In [3]:
# Connect to the Azure OpenAI embedding model.

# Disabled alternative: client for the small embedding endpoint.
# client_small = AzureOpenAI(
#     azure_endpoint=os.getenv("EMBEDDING_SMALL_ENDPOINT"),
#     api_key=os.getenv("OPENAI_API_KEY"),
#     api_version=os.getenv("API_VERSION"),
# )

# Client for the large embedding endpoint; all settings come from the environment.
client_large = AzureOpenAI(
    azure_endpoint=os.getenv("EMBEDDING_LARGE_ENDPOINT"),
    api_key=os.getenv("OPENAI_API_KEY"),
    api_version=os.getenv("API_VERSION"),
)


def get_embeddings_openAI(text, model="text-embedding-3-large"):
    """Embed a piece of text with the Azure OpenAI embeddings client.

    Args:
        text: The sentence (or longer text) to embed.
        model: Value sent as ``model`` to the embeddings endpoint. Defaults to
            "text-embedding-3-large", the name that used to be hard-coded, so
            existing single-argument calls behave as before.

    Returns:
        The ``embedding`` attribute of the first item in the response. The
        OpenAI SDK documents this as a list of floats (not a numpy array).
    """
    # NOTE(review): the request goes through the module-level `client_large`;
    # a different model presumably needs a matching deployment on that
    # endpoint - confirm before changing `model`.
    response = client_large.embeddings.create(input=text, model=model)
    return response.data[0].embedding


# Conectando o modelo de Embedding da Google Gemini
#genai.configure(api_key=os.getenv('GEMINI_API'))

#def get_embeddings_openAI(text):
#    result = genai.embed_content(
#        model="models/text-embedding-004",
#        content=text)
#    return result['embedding']

### GRAPH RAG  
  
Definir uma função que: 1) recebe um texto; 2) usa o modelo de embedding para transformar o texto em um vetor; 3) usa o vetor para buscar outros textos do grafo (usando o índice de vetores).

In [4]:
#def busca_grafo_vetor(query_text):
#    query_embedding = get_embeddings_openAI(query_text)

    # Buscando no índice de vetores 
#    results = graph.query("""
#        CALL db.index.vector.queryNodes('Thesis_Embeddings', 15, """ + str(query_embedding) + """)
#        YIELD node, score
#        RETURN node.uri, node.title, node.abstract, node.description, score 
#        LIMIT 15 
#        """)
    
#    return (results)

#MATCH (node)-[:author]-(author)
#RETURN node.uri, node.title, node.abstract, node.created, author.label, score

In [5]:
#def busca_grafo_fulltext(query_text):

#    results = graph.query("""
#        CALL db.index.fulltext.queryNodes("Thesis_fulltext", '""" + query_text + """') 
#        YIELD node, score
#        RETURN node.uri, node.title, node.abstract, node.description, score  
#        LIMIT 15 
#        """)
#    return (results)

#        MATCH (node)-[:author]-(author)
#        RETURN node.uri, node.title, node.abstract, node.created , author.label, score 

In [6]:
def busca_grafo_vetor(query_text, top_k=15):
    """Search the graph by vector similarity to ``query_text``.

    The text is embedded with ``get_embeddings_openAI`` and the vector index
    'Thesis_Embeddings' is queried twice inside one Cypher statement: one
    branch keeps ``Thesis`` nodes, the other keeps ``Excerpt`` nodes and
    reports the title/date/author of the ``Thesis`` linked to them through
    ``BFO_0000051``. Both branches are merged, ordered by score and truncated.

    Args:
        query_text: Free text to search for.
        top_k: Number of neighbours requested from the index and maximum
            number of rows returned (default 15, the previous fixed value).

    Returns:
        Whatever ``graph.query`` returns for the statement; each row exposes
        ``score``, ``created``, ``node_uri``, ``title``, ``abstract``,
        ``description`` and ``author``.
    """
    query_embedding = get_embeddings_openAI(query_text)

    # The embedding is sent as a query parameter instead of being pasted into
    # the Cypher text twice via str(): the statement stays small and constant,
    # and no Python float repr ever has to be parsed as Cypher.
    cypher = """
        CALL {
        CALL db.index.vector.queryNodes('Thesis_Embeddings', $top_k, $embedding)
        YIELD node, score

        MATCH (node:Thesis)
        MATCH (node)-[:author]-(author)
        RETURN score, node.created AS created, node.uri AS node_uri, node.title AS title, node.abstract AS abstract, node.description AS description, author.label AS author
        LIMIT $top_k

        UNION ALL

        CALL db.index.vector.queryNodes('Thesis_Embeddings', $top_k, $embedding)
        YIELD node, score
        MATCH (node:Excerpt)<-[:BFO_0000051]-(t:Thesis)
        MATCH (t)-[:author]-(author)
        RETURN score, t.created AS created, node.uri AS node_uri, t.title AS title, null AS abstract, node.description AS description, author.label AS author
        LIMIT $top_k
        }

        RETURN score, created, node_uri, title, abstract, description, author
        ORDER BY score DESC
        LIMIT $top_k
        """

    results = graph.query(
        cypher,
        params={"embedding": query_embedding, "top_k": top_k},
    )
    return results

In [7]:
def busca_grafo_fulltext(query_text, top_k=15):
    """Search the graph through the full-text index "Thesis_fulltext".

    The index is queried twice inside one Cypher statement: one branch keeps
    ``Thesis`` nodes, the other keeps ``Excerpt`` nodes and reports the
    title/date/author of the ``Thesis`` linked to them through
    ``BFO_0000051``. Both branches are merged, ordered by score and truncated.

    Args:
        query_text: Text handed to the full-text index.
        top_k: Maximum number of rows per branch and overall (default 15, the
            previous fixed value).

    Returns:
        Whatever ``graph.query`` returns for the statement; each row exposes
        ``score``, ``created``, ``node_uri``, ``title``, ``abstract``,
        ``description`` and ``author``.
    """
    # The search text is passed as a parameter. It used to be concatenated
    # inside a single-quoted Cypher literal, so any apostrophe in the text
    # (e.g. a surname such as O'Brien) broke the statement or changed it.
    # NOTE(review): the full-text index still interprets the text with its own
    # query syntax, so characters such as ? * : keep their special meaning
    # there - confirm whether they should be escaped by the caller.
    cypher = """
        CALL {
        CALL db.index.fulltext.queryNodes("Thesis_fulltext", $query_text)
        YIELD node, score

        MATCH (node:Thesis)
        MATCH (node)-[:author]-(author)
        RETURN score, node.created AS created, node.uri AS node_uri, node.title AS title, node.abstract AS abstract, node.description AS description, author.label AS author
        LIMIT $top_k

        UNION ALL

        CALL db.index.fulltext.queryNodes("Thesis_fulltext", $query_text)
        YIELD node, score
        MATCH (node:Excerpt)<-[:BFO_0000051]-(t:Thesis)
        MATCH (t)-[:author]-(author)
        RETURN score, t.created AS created, node.uri AS node_uri, t.title AS title, null AS abstract, node.description AS description, author.label AS author
        LIMIT $top_k
        }

        RETURN score, created, node_uri, title, abstract, description, author
        ORDER BY score DESC
        LIMIT $top_k
        """

    results = graph.query(
        cypher,
        params={"query_text": query_text, "top_k": top_k},
    )
    return results


In [8]:
# Reciprocal Rank Fusion (RRF)
def RFF(rank1, rank2, w_rank1=1.0, w_rank2=1.0, k=60):
    """Fuse two ranked result lists with weighted Reciprocal Rank Fusion.

    Every document (identified by ``node_uri``) receives
    ``weight / (k + position + 1)`` from each list it appears in, where
    ``position`` is its 0-based index in that list; the contributions are
    summed and documents are returned from highest to lowest fused score.

    Fixes relative to the previous implementation:
      * metadata of documents found only in ``rank2`` is now read from
        ``rank2`` (it was read from ``rank1`` at the same index, yielding the
        wrong title/text/author or an IndexError when ``rank2`` was longer);
      * each weight is applied to every contribution of its list, not only to
        documents present in both lists;
      * a URI repeated inside one list counts once, at its best position;
      * a missing (None) abstract or description is left out of ``text``
        instead of being rendered as the literal string "None".

    Args:
        rank1: First ranking; a sequence of mappings with the keys
            ``node_uri``, ``title``, ``abstract``, ``description``,
            ``author`` and ``created``, best match first.
        rank2: Second ranking, same row format.
        w_rank1: Weight of the contributions coming from ``rank1``.
        w_rank2: Weight of the contributions coming from ``rank2``.
        k: Smoothing constant added to every rank (default 60).

    Returns:
        A tuple ``(documents, scores)`` of two parallel lists sorted by
        descending fused score. Each document is a dict with the keys
        ``title``, ``text``, ``author``, ``created`` and ``uri``.
    """
    score = {}
    title_text = {}

    for ranking, weight in ((rank1, w_rank1), (rank2, w_rank2)):
        seen_in_ranking = set()
        for position, row in enumerate(ranking):
            uri = row['node_uri']
            # Only the first (best-ranked) occurrence of a URI in a list counts.
            if uri in seen_in_ranking:
                continue
            seen_in_ranking.add(uri)

            score[uri] = score.get(uri, 0.0) + weight / (position + 1 + k)

            # Metadata comes from the first list in which the document appears.
            if uri not in title_text:
                text_parts = (row['abstract'], row['description'])
                title_text[uri] = {
                    'title': row['title'],
                    'text': " ".join(str(part) for part in text_parts if part is not None),
                    'author': row['author'],
                    'created': row['created'],
                    'uri': uri,
                }

    # sorted() is stable, so ties keep their insertion order (rank1 first).
    ordered_uris = sorted(score, key=score.get, reverse=True)
    sorted_title_text = [title_text[uri] for uri in ordered_uris]
    score_list = [score[uri] for uri in ordered_uris]
    return (sorted_title_text, score_list)



In [9]:
def busca_grafo_hibrida(query_text):
    """Run the vector and the full-text search for ``query_text`` and fuse them.

    The vector ranking is handed to ``RFF`` with weight 1.0 and the full-text
    ranking with weight 0.5; the value returned by ``RFF`` is passed through
    unchanged.
    """
    vector_hits = busca_grafo_vetor(query_text)
    fulltext_hits = busca_grafo_fulltext(query_text)
    fused = RFF(vector_hits, fulltext_hits, w_rank1=1.0, w_rank2=0.5)
    return fused

In [10]:
#Conectando Azure OpenAI
#endpoint = os.getenv("GPT35_ENDPOINT")  
#deployment = os.getenv("DEPLOYMENT_NAME", "gpt-35-turbo")
#endpoint = os.getenv("GPT4_ENDPOINT")  
#deployment = os.getenv("DEPLOYMENT_NAME", "gpt-4") 
#subscription_key = os.getenv("OPENAI_API_KEY")   
     
#client_chat = AzureOpenAI(  
#        azure_endpoint=endpoint,  
#        api_key=subscription_key,  
#        api_version="2024-05-01-preview",  
#    )  
    
# Connect to Google Gemini: configure the SDK with the key read from the
# GEMINI_API environment variable, then create the "gemini-1.5-flash" model
# object stored in `gemini_model`.
genai.configure(api_key=os.getenv('GEMINI_API'))
gemini_model = genai.GenerativeModel("gemini-1.5-flash")

In [11]:
def gen_respostas(query_text, top_n=5):
    """Answer ``query_text`` once per retrieved document.

    The hybrid search is run for the question and, for each of the best
    ``top_n`` documents, the Gemini model is asked to answer using only that
    document's text and to cite it in the form "Author. Title. Created.".

    Args:
        query_text: The question to answer.
        top_n: Maximum number of documents to consult (default 5, the
            previous fixed value). Fewer are used when the search returns
            fewer, instead of raising IndexError.

    Returns:
        A tuple ``(respostas, uris)`` of two parallel lists: the text of each
        model answer and the ``uri`` of the document it was based on.
    """
    # busca_grafo_hibrida returns (documents, scores); only documents are used.
    ranked_docs = busca_grafo_hibrida(query_text)[0]

    respostas = []
    uris = []

    # Slicing tolerates result lists shorter than top_n.
    for doc in ranked_docs[:top_n]:
        text = doc['text']

        # Missing (None) fields are skipped and non-string values converted,
        # so building the citation can no longer raise TypeError. With three
        # string fields the result is identical to the old concatenation.
        citation_parts = (doc['author'], doc['title'], doc['created'])
        citation = ". ".join(str(part) for part in citation_parts if part is not None) + "."

        # The separator before the quoted text is a real newline; it used to
        # be the two literal characters "/n".
        prompt = (
            'You will be provided with a text delimited by triple quotes and a question. '
            'Your task is to answer the question using only the provided text and to cite the passage(s) of the document used to answer the question. '
            'If the text does not contain the information needed to answer this question then simply write: "Informação Insuficiente". '
            'If an answer to the question is provided, it must be annotated with a citation. '
            'Use the following format for to cite relevant passages (' + citation + '). '
            'Your answers must be written in Brazilian Portuguese. \n  """' + text + '""" "QUESTION: "' + query_text
        )

        # Google Gemini completion
        resposta = gemini_model.generate_content(prompt).text

        respostas.append(resposta)
        uris.append(doc['uri'])

    return respostas, uris

In [12]:
def summarize_respostas(query_text, respostas):
    """Ask the Gemini model to merge several answers to one question.

    Args:
        query_text: The question the answers refer to.
        respostas: Iterable of answer strings to be summarised.

    Returns:
        The ``text`` of the model response.
    """
    # Every answer is preceded by a newline, so the block also starts with one.
    fontes = ''.join('\n' + resposta for resposta in respostas)

    instructions = (
        'You will be provided with one question and several answers for the question delimited by triple quotes. '
        'Your task is to summarize the provided answers. '
        'If all answers were "Informação Insuficiente", you must also return the text "Informação Insuficiente". '
        'You must maintain the citations and references for the original documents using the same format. '
        'Include facts present in the provided answers that directly address the provided question in the final answer. '
        'Your answers must be written in Brazilian Portuguese. \n """'
    )
    prompt = instructions + fontes + '"""' + 'QUESTION: ' + query_text

    # Google Gemini completion
    return gemini_model.generate_content(prompt).text

In [13]:
# Lista de perguntas

def perguntas(personagem):
    """Build the list of questions asked about ``personagem``.

    Each question is a fixed Portuguese prefix followed by the name and a
    question mark; the list order is the order of the prefixes below.
    """
    # Question prefixes currently disabled:
    #   'Qual o nome completo de '
    #   'Onde nasceu '
    #   'Qual a data de nascimento de '
    #   'Onde morreu '
    #   'Qual a data de morte de '
    # NOTE(review): "emprogos" looks like a typo for "empregos"; it is left
    # untouched here because it is part of the text sent at runtime - confirm.
    prefixes = (
        'Quem foi ',
        'Quais os principais feitos de ',
        'Quais os principais cargos, funções, ou emprogos de ',
    )
    return [prefix + personagem + '?' for prefix in prefixes]

In [14]:
def perguntas_e_respostas(personagem):
    """Generate and answer every question about ``personagem``.

    For each question produced by ``perguntas`` the per-document answers are
    obtained from ``gen_respostas`` and then condensed with
    ``summarize_respostas``.

    Returns:
        A dict keyed by question text. Each value is a dict with the keys
        "Resumo" (the summary), "Respostas" (the individual answers) and
        "uris" (the second value returned by ``gen_respostas``).
    """
    resultado = {}
    for pergunta in perguntas(personagem):
        respostas, uris = gen_respostas(pergunta)
        resultado[pergunta] = {
            "Resumo": summarize_respostas(pergunta, respostas),
            "Respostas": respostas,
            "uris": uris,
        }
    return resultado

In [19]:
# Run the whole question/answer pipeline for one person. The result maps each
# generated question to its summary, individual answers and source URIs.
personagem = 'Viviane Juguero'
dic_perguntas_respostas = perguntas_e_respostas(personagem)

In [20]:
# Show the generated questions (the keys of the result dictionary).
print(dic_perguntas_respostas.keys())

dict_keys(['Quem foi Viviane Juguero?', 'Quais os principais feitos de Viviane Juguero?', 'Quais os principais cargos, funções, ou emprogos de Viviane Juguero?'])


In [21]:
# Inspect the entry stored for the first question.
dic_perguntas_respostas['Quem foi Viviane Juguero?']

{'Resumo': 'Viviane Juguero foi uma figura multifacetada e atuante nas artes, letras e filosofia (Piegaz, Acevesmoreno Flores. Dramaturgias negras do Pampa : uma análise decolonial. 2023.), nascida em Bagé, no Pampa.  Dramaturga, pesquisadora, professora, atriz, poeta e produtora (Juguero, 2019, p. 4; Juguero, 2023), ela se doutorou em Artes Cênicas pela UFRGS com a tese "Dramaturgias radicais: poéticas matrísticas para uma arte dialógica" (Piegaz, Acevesmoreno Flores. Dramaturgias negras do Pampa : uma análise decolonial. 2023.; Juguero, 2019), que discute a relação entre criação e recepção dramatúrgica,  abordando a questão da autoria e sua intencionalidade, dialogando com Bakhtin (Juguero, 2019, p. 78-79; Juguero, 2019, p. 43).  Sua dramaturgia, descrita como "radical",  aborda a complexidade das relações humanas, focando em relações étnicas e de gênero, buscando provocar reflexões sobre a identidade brasileira e ampliando as relações emotivo-racionais do público, dando voz a grupos