In [1]:
import os
import pprint

from langchain_groq import ChatGroq
from langchain_community.document_loaders import WebBaseLoader
from langchain.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
import time
import numpy as np
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_community.embeddings import HuggingFaceBgeEmbeddings


# Load variables from a .env file into the process environment. The call is the
# last expression of the cell, so its boolean return value is shown as the cell
# output.
# NOTE(review): the recorded output is False, which presumably means no .env
# file was found/loaded — confirm whether one is expected here.
load_dotenv()


USER_AGENT environment variable not set, consider setting it to identify your requests.


False

In [41]:
# Embedding model used to turn query text into vectors.
# Alternative model that was tried: "jinaai/jina-embeddings-v2-base-es".
# NOTE(review): the BGE wrapper class is paired with a MiniLM model name here —
# confirm that the wrapper's query handling is what this model (and the
# persisted collection) expects.
embedding_model_kwargs = {'device': 'cpu', 'trust_remote_code': True}
embedding_encode_kwargs = {'normalize_embeddings': False, 'attn_implementation': "eager"}

huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="jaimevera1107/all-MiniLM-L6-v2-similarity-es",
    model_kwargs=embedding_model_kwargs,
    encode_kwargs=embedding_encode_kwargs,
)

In [42]:
import chromadb
from chromadb.utils import embedding_functions

# Open the on-disk Chroma database stored under ./db_subvenciones.
client = chromadb.PersistentClient(path='./db_subvenciones')

# LangChain vector-store wrapper over the "ayudas" collection; it embeds text
# with `huggingface_embeddings`. No retriever is built in this cell.
db = Chroma(
    embedding_function=huggingface_embeddings,
    collection_name="ayudas",
    client=client,
)

In [43]:
# Direct Chroma query that bypasses the LangChain wrapper.
collection = client.get_or_create_collection("ayudas")

# Sample search text; change it to any other question you are interested in.
query = """   Ayuntamiento de Madrid   """

# Embed the query ourselves (instead of passing query_texts=[query]) and ask
# for the 5 nearest entries of the collection.
query_vector = np.array(huggingface_embeddings.embed_query(query))
results = collection.query(query_embeddings=query_vector, n_results=5)


In [44]:
from langchain_community.llms import Ollama

# The LLM is served by an Ollama instance at the address below. The previous
# ChatGroq setup (model_name="gemma2-9b-it", keyed by groq_api_key) is disabled.
OLLAMA_BASE_URL = "http://127.0.0.1:11434"
OLLAMA_MODEL = "gemma2:9b"

llm = Ollama(base_url=OLLAMA_BASE_URL, model=OLLAMA_MODEL)


In [None]:
# Disabled experiment: this entire cell is a single bare string literal, so the
# MultiQueryRetriever code inside it is never executed and
# `multiquery_retriever` is never defined.
""" #MULTIQUERY RETRIEVER

from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

multiquery_retriever = MultiQueryRetriever.from_llm(
    retriever=db.as_retriever(search_kwargs={"k": 6}), llm=llm
) """

In [46]:
import logging

# Install the default root handler, then raise the multi-query retriever's
# logger to INFO so that its messages are emitted.
logging.basicConfig()
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
multi_query_logger.setLevel(logging.INFO)

In [47]:

from langchain.chains.query_constructor.base import AttributeInfo

# (name, description) for every metadata attribute described to the self-query
# retriever. All attributes are declared with type "string".
_METADATA_FIELDS = (
    (
        "Destinatarios",
        "Empresas, organizaciones o personas que reciben la ayuda",
    ),
    (
        "Organismo",
        "Organismo que da la ayuda (Ayuntamiento, Consejería, etc.)",
    ),
    (
        "Referencia",
        "Número de referencia y título de la convocatoria (breve descripción)",
    ),
    (
        "Sector",
        # The description enumerates the accepted sector values.
        "Sector al que se destina la convocatoria, puede ser uno de estos valores: "
        "[Turismo,Investigación y desarrollo,Artesano,Comercio,Agroalimentario,Industria,Cultura y comunicación,"
        "Agrario,Pesquero]",
    ),
    (
        "Subsector",
        "Subsector dentro del Sector, completa la información del atributo Sector.",
    ),
    (
        "Tipo",
        "Tipo de ayudas",
    ),
    (
        "Ámbito Geográfico",
        "Define en qué territorio (ciudad, comunidad o región), se concede la ayuda.",
    ),
)

metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in _METADATA_FIELDS
]

In [48]:

from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

# Build the self-query retriever from the LLM, the Chroma-backed vector store,
# a short description of the document contents, and the metadata attribute
# descriptions.
selfqueryRetriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=db,
    document_contents="Subvenciones y ayudas",
    metadata_field_info=metadata_field_info,
)

In [None]:
# Try the self-query retriever on a sample question and pretty-print the hits.
selfquery_question = "Qué subvenciones da el Ayuntamiento de Cilleros"
docs = selfqueryRetriever.invoke(selfquery_question)
pprint.pp(docs)

In [33]:
# Plain vector-store retriever: similarity search with k=5.
similarity_search_kwargs = {"k": 5}
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs=similarity_search_kwargs,
)

In [None]:
# Same sample question through the plain similarity retriever, for comparison
# with the self-query retriever. This overwrites `docs`.
similarity_question = "Qué subvenciones da el Ayuntamiento de Cilleros"
docs = retriever.invoke(similarity_question)
pprint.pp(docs)

In [None]:
from langchain.retrievers import MergerRetriever

# Merge the self-query retriever and the similarity retriever into one retriever.
# NOTE(review): no later visible cell references `mergeRetriever` — confirm
# whether it was meant to feed the reranking step.
merged_sources = [selfqueryRetriever, retriever]
mergeRetriever = MergerRetriever(retrievers=merged_sources)



In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

# Reranking stage: wrap the plain similarity `retriever` so that its results go
# through a cross-encoder reranker configured with top_n=5.
model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-v2-m3")
compressor = CrossEncoderReranker(top_n=5, model=model)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

# Run a sample query through the reranking retriever and pretty-print the hits.
rerank_query = "Ayuntamiento de Madrid"
compressed_docs = compression_retriever.invoke(rerank_query)
pprint.pp(compressed_docs)

In [None]:
# Print a report of the documents returned as sources for a query.
# NOTE(review): `result` is not assigned anywhere in the visible notebook; this
# cell needs the output of a chain that returns 'source_documents' to be
# computed in an earlier cell — confirm which chain was intended.
relevant_docs = result['source_documents']

# Summary line, printed once before the listing (it used to be repeated after
# every document inside the loop).
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for position, doc in enumerate(relevant_docs, start=1):
    # 'source' and 'page' may be absent from a document's metadata; fall back to
    # a placeholder instead of raising KeyError.
    source_file = doc.metadata.get('source', 'unknown')
    page_number = doc.metadata.get('page', 'unknown')
    print(
        f"Relevant Document #{position}:\n"
        f"Source file: {source_file}, Page: {page_number}\n"
        f"Content: {doc.page_content}"
    )
    print("-" * 100)

In [None]:
# Dump the raw list of source documents (plain repr, unformatted).
print(relevant_docs)