In [1]:
%pip install --upgrade --quiet  langchain langchain-community openai langchain-experimental neo4j tiktoken yfiles_jupyter_graphs python-dotenv json-repair langchain-openai langchain_core PyPDF2


Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
neo4j-graphrag 1.6.1 requires json-repair<0.40.0,>=0.39.1, but you have json-repair 0.42.0 which is incompatible.

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from langchain_core.runnables import  RunnablePassthrough, RunnableLambda
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
from langchain_community.graphs import Neo4jGraph
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
from langchain_community.vectorstores import Neo4jVector
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
import os
from neo4j import  Driver

from dotenv import load_dotenv

load_dotenv()

In [36]:
# Read every connection setting from the process environment.
# A variable that is not set yields None.

# --- Azure OpenAI chat model ---
azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
azure_api_key = os.environ.get("AZURE_OPENAI_KEY")
azure_deployment = os.environ.get("GPT_ENGINE")
api_version = os.environ.get("API_VERSION")

# --- Azure OpenAI embeddings ---
azure_embedding_model = os.environ.get("AZURE_EMBEDDING_MODEL")
embedding_deployment = os.environ.get("EMBEDDING_DEPLOYMENT")
openai_api_type = os.environ.get("OPEN_API_TYPE")
embedding_openai_api_version = os.environ.get("EMBEDDING_OPEN_API_VERSION")

# --- Neo4j ---
neo4j_url = os.environ.get("NEO4J_URL")
neo4j_username = os.environ.get("NEO4J_USERNAME")
neo4j_password = os.environ.get("NEO4J_PASSWORD")

In [10]:
# Neo4j graph handle; later cells write graph documents through it and hand
# it to the Cypher QA chain.
graph = Neo4jGraph(
    url=neo4j_url,
    username=neo4j_username,
    password=neo4j_password,
)

In [26]:
# Folder that is scanned for the input PDF files.
FILES_FOLDER_PATH = "./material"

In [49]:
import os
import PyPDF2

def _extract_pdf_text(pdf_path):
    """Return the text of every page of ``pdf_path`` joined by newlines.

    Each page is extracted exactly once. Pages for which PyPDF2 returns no
    text (``None`` or an empty string) are left out.
    """
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = (page.extract_text() for page in reader.pages)
        # The join consumes the generator while the file is still open.
        return "\n".join(page_text for page_text in page_texts if page_text)


# One {"filename": ..., "content": ...} dict per PDF that could be read.
documents = []

if os.path.isdir(FILES_FOLDER_PATH):
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # document order (and the chunk order derived from it) repeatable.
    for filename in sorted(os.listdir(FILES_FOLDER_PATH)):
        # Only PDF files are loaded; anything else in the folder is ignored.
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(FILES_FOLDER_PATH, filename)
        try:
            text = _extract_pdf_text(file_path)
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole load.
            print(f"Error reading {filename}: {e}")
            continue

        documents.append({"filename": filename, "content": text})
        print(f"Text extracted from: {filename}")
else:
    print(f"The folder '{FILES_FOLDER_PATH}' does not exist.")


In [50]:
# Show how many characters were extracted from each PDF.
for doc in documents:
    content_length = len(doc["content"])
    print(f"{doc['filename']} - Length of content: {content_length}")

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Splitter configuration: chunk_size=512 with an overlap of 24.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=24)

# Turn every extracted PDF into Document chunks. Each chunk records the file
# it came from and its 1-based position within that file.
chunked_documents = [
    Document(
        page_content=chunk,
        metadata={"filename": doc["filename"], "chunk_id": chunk_id},
    )
    for doc in documents
    for chunk_id, chunk in enumerate(
        text_splitter.split_text(doc["content"]), start=1
    )
]

print(len(chunked_documents))
print(chunked_documents[:5])



In [52]:
import os
from langchain.document_loaders import TextLoader
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import SystemMessage


# Node labels and relationship types the extraction could be restricted to.
# Both lists are empty and are currently NOT passed to the transformer.
allowed_nodes = []
allowed_relationships = []

# Azure OpenAI chat model that performs the graph extraction.
llm = AzureChatOpenAI(
    temperature=0,
    azure_endpoint=azure_endpoint,
    azure_deployment=azure_deployment,
    model=azure_deployment,
    api_version=api_version,
    openai_api_key=azure_api_key,
)

# Graph transformer built with no node/relationship restrictions. To restrict
# it, pass allowed_nodes=allowed_nodes and
# allowed_relationships=allowed_relationships.
llm_transformer = LLMGraphTransformer(llm=llm)

# Convert every chunk into a graph document.
graph_documents = llm_transformer.convert_to_graph_documents(chunked_documents)

In [None]:
# Display the first extracted graph document as a quick sanity check.
graph_documents[0]

¡CUIDADO! La siguiente celda borra todos los nodos y relaciones del grafo. Ejecútala solo si es necesario resetear el grafo.

In [None]:
from neo4j import GraphDatabase

# Open a driver connection to the Neo4j database.
driver = GraphDatabase.driver(
    uri=neo4j_url,
    auth=(neo4j_username, neo4j_password),
)

def clear_graph(tx):
    """Delete every node in the graph together with its relationships.

    Args:
        tx: Object exposing ``run(query)``; one Cypher statement is sent to it.
    """
    delete_everything = "MATCH (n) DETACH DELETE n"
    tx.run(delete_everything)

def check_empty_graph(tx):
    """Count the nodes and relationships currently stored in the graph.

    Args:
        tx: Object exposing ``run(query)`` whose result can be iterated to
            obtain records indexable by column name.

    Returns:
        A ``(node_count, relationship_count)`` tuple.
    """
    # OPTIONAL MATCH produces one row per (node, outgoing relationship) pair,
    # so a plain count(n) would count a node once for each of its outgoing
    # relationships. DISTINCT keeps the node total correct. Every relationship
    # has exactly one start node, so count(r) needs no DISTINCT.
    result = tx.run(
        "MATCH (n) OPTIONAL MATCH (n)-[r]->() "
        "RETURN count(DISTINCT n) AS node_count, count(r) AS relationship_count"
    )
    for record in result:
        return record["node_count"], record["relationship_count"]
    # No row came back: report zeros so callers can always unpack the result.
    return 0, 0

def execute_clear_graph():
    """Run ``clear_graph`` in a write transaction on the module-level ``driver``.

    Prints a confirmation message once the transaction function has returned.
    """
    with driver.session() as neo4j_session:
        neo4j_session.execute_write(clear_graph)
        print("Graph database cleared successfully.")

def execute_check_empty_graph():
    """Run ``check_empty_graph`` in a read transaction and print both counts."""
    with driver.session() as neo4j_session:
        counts = neo4j_session.execute_read(check_empty_graph)
        node_count, relationship_count = counts
        print(f"Node count: {node_count}, Relationship count: {relationship_count}")

# Wipe the graph, then print the remaining node/relationship counts to
# confirm it is empty. The driver is closed afterwards (even if one of the
# calls fails) so its connections are not left open.
try:
    execute_clear_graph()
    execute_check_empty_graph()
finally:
    driver.close()


In [33]:
# Write the extracted graph documents to Neo4j.
# NOTE(review): baseEntityLabel / include_source are presumably what provide
# the __Entity__ label and the Document / MENTIONS structure that later cells
# query — confirm against the LangChain Neo4jGraph documentation.
graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True)

In [None]:
def showGraph():
    """Return a GraphWidget showing every relationship that is not MENTIONS.

    The driver and session are opened only for the duration of the query and
    are closed before the widget is built, so repeated calls do not leave
    connections open.

    Returns:
        A ``GraphWidget`` whose node labels are mapped to the ``id`` property.
    """
    with GraphDatabase.driver(
        uri=neo4j_url,
        auth=(neo4j_username, neo4j_password),
    ) as driver:
        with driver.session() as session:
            # .graph() must be called while the session is open; the returned
            # graph object is then used after the session has been closed.
            result_graph = session.run(
                "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t"
            ).graph()

    widget = GraphWidget(graph=result_graph)
    widget.node_label_mapping = 'id'
    return widget

# Build the widget; as the cell's last expression it is rendered as output.
showGraph()

In [None]:
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain_openai import AzureOpenAIEmbeddings

# Azure OpenAI embeddings client.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=azure_endpoint,
    openai_api_key=azure_api_key,
    openai_api_type=openai_api_type,
    openai_api_version=embedding_openai_api_version,
    model=azure_embedding_model,
    deployment=embedding_deployment,
)

# Build the hybrid-search index in Neo4j over the "text" property of nodes
# labelled "Document"; "embedding" is the node property named for the vectors.
vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    url=neo4j_url,
    username=neo4j_username,
    password=neo4j_password,
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    search_type="hybrid",
)

# Expose the index through the retriever interface.
vector_retriever = vector_index.as_retriever()

In [38]:
# Driver used by the full-text index helpers in this cell.
driver = GraphDatabase.driver(
    uri=neo4j_url,
    auth=(neo4j_username, neo4j_password),
)

def create_fulltext_index(tx):
    """Create the ``fulltext_entity_id`` full-text index on ``__Entity__.id``.

    ``IF NOT EXISTS`` makes the statement idempotent, so running it again when
    the index is already present does not raise.

    Args:
        tx: Object exposing ``run(query)``; one Cypher statement is sent to it.
    """
    query = '''
    CREATE FULLTEXT INDEX `fulltext_entity_id` IF NOT EXISTS
    FOR (n:__Entity__)
    ON EACH [n.id];
    '''
    tx.run(query)

def create_index():
    """Run ``create_fulltext_index`` in a write transaction and report success."""
    with driver.session() as neo4j_session:
        neo4j_session.execute_write(create_fulltext_index)
        print("Fulltext index created successfully.")

# Create the index on a best-effort basis: a failure does not stop the
# notebook, but it is reported instead of being silently discarded. The
# driver is closed whether or not the creation succeeded.
try:
    create_index()
except Exception as exc:
    print(f"Fulltext index was not created: {exc}")
finally:
    driver.close()

In [39]:
from langchain.prompts import PromptTemplate
from langchain.chains import GraphCypherQAChain
from langchain.chat_models import AzureChatOpenAI
from langchain_core.prompts.prompt import PromptTemplate


# Azure OpenAI chat model handed to the Cypher QA chain.
llmGraph = AzureChatOpenAI(
    temperature=0,
    azure_endpoint=azure_endpoint,
    azure_deployment=azure_deployment,
    model=azure_deployment,
    api_version=api_version,
    openai_api_key=azure_api_key,
)


# GraphCypherQAChain ties the graph and the LLM together for question
# answering.
# NOTE(review): allow_dangerous_requests=True acknowledges that generated
# Cypher is executed against the database; make sure the Neo4j credentials
# are scoped accordingly.
chain = GraphCypherQAChain.from_llm(
    llmGraph,
    graph=graph,
    validate_cypher=True,
    allow_dangerous_requests=True,
    verbose=True,
)

def ask_question(query):
    """Send ``query`` to the Cypher QA chain and return the chain's response.

    Args:
        query: The question to ask against the graph.

    Returns:
        Whatever ``chain.invoke`` returns for ``{"query": query}``.
    """
    return chain.invoke({"query": query})


Lo que ocurre por detrás:
1. Consulta el esquema del grafo; de esta forma puede generar una consulta adecuada.
2. Genera la consulta Cypher.
3. Valida la consulta Cypher con el corrector de consultas (`cypher_query_corrector`).
4. Ejecuta la consulta contra el grafo.
5. Hace una última consulta a un LLM para obtener el resultado final.

In [None]:
# NOTE(review): the question is an empty string here — presumably a
# placeholder to be filled in before running this cell.
ask_question("")

In [42]:
def full_retriever(query: str):
    """Build one context string from the graph answer and the vector hits.

    The graph chain is queried first, then the vector retriever. The combined
    text is printed and returned.

    Args:
        query: The question to retrieve context for.

    Returns:
        The combined context string.
    """
    graph_answer = ask_question(query)
    retrieved_docs = vector_retriever.invoke(query)
    vector_block = "#Document ".join(hit.page_content for hit in retrieved_docs)

    # The layout, including the trailing newline and four spaces, is kept
    # exactly as the string has always been produced.
    final_data = f"Graph data:\n{graph_answer}\nvector data:\n{vector_block}\n    "
    print(final_data)
    return final_data

In [43]:
# Azure OpenAI chat model that writes the final answer.
llm_final_answer = AzureChatOpenAI(
    temperature=0,
    azure_endpoint=azure_endpoint,
    azure_deployment=azure_deployment,
    model=azure_deployment,
    api_version=api_version,
    openai_api_key=azure_api_key,
)

# Message templates for the final answer; {context} and {query} are filled in
# when the prompt is formatted.
_SYSTEM_MESSAGE = (
    "You are AI assistant and you will answers questions using graph data and the vector data."
)
_HUMAN_MESSAGE = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {query}\n"
    "Use natural language and be concise.\n"
    "Answer:"
)

final_prompt_structure = ChatPromptTemplate.from_messages(
    [
        ("system", _SYSTEM_MESSAGE),
        ("human", _HUMAN_MESSAGE),
    ]
)

def process_input(query):
    """Answer ``query`` using the combined graph and vector context.

    Args:
        query: The user's question.

    Returns:
        The text of the model's response.
    """
    # Gather the context from the graph chain and the vector retriever.
    context = full_retriever(query)

    # format_messages() keeps the system and human parts as separate chat
    # messages. format() flattens the template into a single string, which the
    # chat model would receive as one human message, losing the system role.
    final_messages = final_prompt_structure.format_messages(
        context=context,
        query=query,
    )
    response = llm_final_answer.invoke(final_messages)
    return response.text()

In [None]:
# NOTE(review): the question is an empty string — presumably a placeholder to
# be filled in before running the pipeline.
query = ""
response = process_input(query)
print("--------FINAL ANSWER--------", response, sep="\n")