In [None]:
import os
from langchain_groq import ChatGroq
from langchain_community.document_loaders import WebBaseLoader
from langchain.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
import time
import numpy as np
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_community.embeddings import HuggingFaceBgeEmbeddings


# Run python-dotenv's loader so values from a .env file (if any) reach os.environ.
load_dotenv()

# Groq API key; a missing GROQ_API_KEY raises KeyError right here.
groq_api_key = os.environ["GROQ_API_KEY"]

In [2]:
# Load the PDF files found in a local directory
def load_and_split_text(pdf_path):
    """Load the PDFs under ``pdf_path`` and return the loaded documents.

    Despite the name, no splitting happens here: whatever
    ``PyPDFDirectoryLoader(pdf_path).load()`` returns is handed back
    unchanged (presumably one document per PDF page -- confirm against the
    loader's documentation).

    A chunking step with ``RecursiveCharacterTextSplitter``
    (chunk_size=512, chunk_overlap=100) existed here previously and is
    currently disabled.

    Args:
        pdf_path: Directory that contains the PDF files to load.

    Returns:
        The list of documents produced by the loader.
    """
    return PyPDFDirectoryLoader(pdf_path).load()
    

In [3]:
import re

def extract_substring_index(text, start_marker, end_marker):
    """Return the part of ``text`` between ``start_marker`` and ``end_marker``.

    The first occurrence of ``start_marker`` is used; ``end_marker`` is then
    searched only in what follows it, so an earlier occurrence of the end
    marker is ignored. Neither marker is included in the result.

    Args:
        text: String to search.
        start_marker: Substring that precedes the wanted content.
        end_marker: Substring that follows the wanted content.

    Returns:
        The text strictly between the two markers (may be empty).

    Raises:
        ValueError: If either marker cannot be found.
    """
    remainder = text[text.index(start_marker) + len(start_marker):]
    return remainder[:remainder.index(end_marker)]

In [4]:

def get_metadata(text):
    """Extract the labelled header fields of a page into a dictionary.

    Each tag in ``document_tags`` is paired with the tag that follows it,
    and the text found between the two (via ``extract_substring_index``)
    becomes the value stored under the first tag. The last tag,
    'Plazo de solicitud', therefore only acts as an end delimiter and never
    gets an entry of its own.

    Args:
        text: Page text containing the tags.

    Returns:
        dict mapping each tag (except the last one) to its stripped value.

    Raises:
        ValueError: Propagated from ``extract_substring_index`` when a tag
            is not present in ``text``.
    """
    AMBITO = 'Ámbito Geográfico'
    INFORMACION = 'Información Detallada'

    document_tags = ['Referencia', 'Organismo', 'Sector', 'Subsector',
                     AMBITO, 'Tipo', 'Destinatarios', 'Plazo de solicitud']

    metadata = {}
    # Walk consecutive (tag, next_tag) pairs.
    for start, end in zip(document_tags, document_tags[1:]):
        value = extract_substring_index(text, start, end)
        if start == AMBITO:
            # This field's raw slice can contain the two section titles
            # again; remove them before storing the value.
            value = value.replace(AMBITO, '').replace(INFORMACION, '')
        metadata[start] = value.strip()

    return metadata

In [5]:
from pathlib import Path
from urllib.parse import urlparse

import requests
def download_file(url, output_path, filename, timeout=30):
    """Download ``url`` and store the response body as ``output_path/filename``.

    The download is best-effort: failures are reported with ``print`` and do
    not raise, so one bad link does not abort the caller's loop.

    Args:
        url: Address to fetch with an HTTP GET.
        output_path: Existing directory in which the file is written.
        filename: Name of the file to create inside ``output_path``.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a stalled server would block the call indefinitely.

    Returns:
        None.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts, invalid URLs, ...: report and move on,
        # exactly like a non-200 answer.
        print(f"Failed to download {url}: {exc}")
        return

    if response.status_code == 200:
        with open(os.path.join(output_path, filename), 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {filename}")
    else:
        print(f"Failed to download {url}")

In [6]:

from uuid import uuid4


def download_linked_files(page, output_path):
    """Download every URI linked from a PDF page's annotations.

    The page's "/Annots" entries are scanned; each annotation that has an
    "/A" action with a "/URI" contributes one URL. ``output_path`` is created
    if needed (even when no URL is found) and each distinct URL is passed to
    ``download_file``.

    File names are derived from a hash of the URL instead of a random UUID,
    so running the notebook again overwrites the same files rather than
    piling up duplicates in ``output_path``. A URL that appears in several
    annotations of the page is downloaded only once.

    Args:
        page: Mapping-like PDF page object; only ``"/Annots" in page`` and
            ``page["/Annots"]`` are used.
        output_path: Directory that receives the downloaded files.

    Returns:
        None.
    """
    import hashlib  # local import: only this function needs it

    urls = []
    if "/Annots" in page:
        for annot in page["/Annots"]:
            annot_obj = annot.get_object()
            if "/A" in annot_obj:
                uri = annot_obj.get("/A").get("/URI")
                if uri is not None and uri not in urls:
                    print("[+] URL Found:", uri)
                    urls.append(uri)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_path, exist_ok=True)
    for url in urls:
        # Every link is saved with a .pdf extension, as before.
        digest = hashlib.sha256(str(url).encode("utf-8")).hexdigest()
        download_file(url, output_path, digest + ".pdf")

In [None]:
# Embedding model shared by indexing and querying, run on CPU.
# (Judging by the "-es" suffix the model targets Spanish text -- confirm on
# its model card.)
# Alternative tried earlier: "jaimevera1107/all-MiniLM-L6-v2-similarity-es".
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="jinaai/jina-embeddings-v2-base-es",
    # trust_remote_code lets the model repository's own code be executed.
    model_kwargs=dict(device='cpu', trust_remote_code=True),
    encode_kwargs=dict(normalize_embeddings=False, attn_implementation="eager"),
)

In [None]:
import PyPDF2
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import chromadb
from chromadbx import UUIDGenerator
import os
from urllib.parse import urlparse
from chromadb.utils import embedding_functions


# Load embedding model
# NOTE(review): `tokenizer` and `model` are not referenced by any other cell
# visible in this notebook (embeddings come from `huggingface_embeddings`) --
# confirm whether they are still needed.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Create a collection in ChromaDB, starting from an empty one on every run.
client = chromadb.PersistentClient('./db_subvenciones')
# delete_collection fails when the collection does not exist (e.g. the first
# run against a fresh database directory), so make sure it exists before
# dropping it. This keeps the cell re-runnable from a clean state.
client.get_or_create_collection("ayudas")
client.delete_collection("ayudas")
collection = client.create_collection("ayudas")

# Folder scanned for the source PDFs, and root folder for downloaded files.
pathToMetadata = './ayudas/metadatos'
pathToText = './ayudas/texto'
# Process a single PDF
def process_pdf(pdf_path, max_detail_pages=9):
    """Index the documents linked from the detail pages of one PDF.

    For every page whose text contains "Ayudas e incentivos (detalle)":
      1. the page header fields are parsed with ``get_metadata``;
      2. the files linked from the page are downloaded into
         ``pathToText/<pdf file name>/Page_<page index>``;
      3. the downloaded PDFs are loaded, embedded and added to the global
         ``collection``, each chunk carrying the page's metadata.

    Changes with respect to the previous implementation:
      * Chunks are embedded with ``embed_documents`` rather than
        ``embed_query``. ``HuggingFaceBgeEmbeddings.embed_query`` prepends
        the query instruction to its input, which is meant for search
        queries, not for the passages being indexed.
      * Line breaks inside a chunk are replaced by a space (as is already
        done for the page text) instead of being removed, which glued the
        last word of a line to the first word of the next one.
      * The hard-coded page limit is now the ``max_detail_pages`` parameter
        (the default keeps the former limit of 9 indexed pages).

    Args:
        pdf_path: Path of the PDF file to process.
        max_detail_pages: Maximum number of detail pages that may contribute
            documents to the collection; scanning stops once it is reached.

    Returns:
        None. Results are written to ``collection`` and to disk.

    Raises:
        ValueError: Propagated from ``get_metadata`` when a detail page lacks
            one of the expected tags.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        indexed_pages = 0
        for page_num in range(len(reader.pages)):
            if indexed_pages >= max_detail_pages:
                break

            page = reader.pages[page_num]
            page_text = (page.extract_text() or "").replace("\n", " ")
            if "Ayudas e incentivos (detalle)" not in page_text:
                continue

            parsed_path = urlparse(pdf_path)
            output_dir = pathToText+"/"+os.path.basename(parsed_path.path)+"/"+"Page_"+str(page_num)

            # Get metadata from page
            page_metadata = get_metadata(page_text)
            download_linked_files(page, output_dir)

            linked_docs = load_and_split_text(output_dir)
            if not linked_docs:
                # Nothing could be loaded for this page; it does not count
                # towards max_detail_pages.
                continue

            docs = [doc.page_content.replace("\n", " ") for doc in linked_docs]
            embeddings = huggingface_embeddings.embed_documents(docs)

            # Add to ChromaDB; every chunk shares the metadata of its page.
            collection.add(
                ids=UUIDGenerator(len(docs)),
                documents=docs,
                embeddings=embeddings,
                metadatas=[page_metadata]*len(docs)
            )

            indexed_pages += 1

# Process every PDF found in the metadata folder
import os
pdf_names = [entry for entry in os.listdir(pathToMetadata) if entry.endswith(".pdf")]
for pdf_name in pdf_names:
    process_pdf(os.path.join(pathToMetadata, pdf_name))



In [9]:
# Sample question; change it to any other query you are interested in.
query = """   71572   """

# Example search: embed the query and ask the collection for 5 results.
query_vector = np.array(huggingface_embeddings.embed_query(query))
results = collection.query(
    query_embeddings=query_vector,
    #query_texts = [query],
    n_results=5,
)
print(results)
     

In [10]:
import chromadb

# Wrap the "ayudas" collection in a LangChain Chroma vector store and expose it
# as a similarity-search retriever that returns 10 documents per query (k=10).
db = Chroma(
    client=client,
    collection_name="ayudas",
    embedding_function=huggingface_embeddings,
)

retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10},
)