In [2]:
import os
import PyPDF2

from urllib.request import urlretrieve
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [3]:
# Load pdf files in the local directory
def load_and_split_text(pdf_path):
    loader = PyPDFDirectoryLoader(pdf_path)

    #docs_before_split = loader.load()


    #text_splitter = RecursiveCharacterTextSplitter(
    #    chunk_size = 1024,
    #    chunk_overlap  = 100,
    #)
    #docs_after_split = text_splitter.split_documents(docs_before_split)

    #return docs_after_split
    doc_pages = loader.load()
    return doc_pages
    

In [4]:
import re

def extract_substring_index(text, start_marker, end_marker):
    start_index = text.index(start_marker) + len(start_marker)
    end_index = text.index(end_marker, start_index)
    return text[start_index:end_index]

In [5]:

def get_metadata(text):
    AMBITO ='Ámbito Geográfico'
    INFORMACION='Información Detallada'

    document_tags = ['Referencia','Título','Organismo','Sector','Subsector',
                    AMBITO,'Tipo','Destinatarios','Plazo de solicitud']

    tagIndex = 0
    metadata = {}
    metadataInText=""
    while tagIndex < len(document_tags)-1:
        start = document_tags[tagIndex]
        end = document_tags[tagIndex+1]
        if(start=='Ámbito Geográfico'):
            metadata[start]=extract_substring_index(text,start,end).replace(AMBITO,'').replace(INFORMACION,'').strip()
        else:
            metadata[start]=extract_substring_index(text,start,end).strip()
        metadataInText = metadataInText+", "+start+" es "+metadata[start]
        tagIndex+=1
            
        
    return [ metadata, metadataInText ]

In [6]:
from pathlib import Path
from urllib.parse import urlparse

import requests
def download_file(url,output_path,filename):
    response = requests.get(url)
    if response.status_code == 200:        
        with open(output_path+"/"+filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {filename}")
    else:
        print(f"Failed to download {url}")

In [7]:

from uuid import uuid4


def download_linked_files(page, output_path):
    urls=[]
    if "/Annots" in page:
        for annot in page["/Annots"]:
            annotObj = annot.get_object()
            if("/A" in annotObj):
                uri = annotObj.get("/A").get("/URI")
                if uri is not None:
                    print("[+] URL Found:", uri)
                    urls.append(uri)
    
    if(not os.path.exists(output_path)):
        os.makedirs(output_path)
    for url in urls:
        download_file(url, output_path, str(uuid4())+".pdf")             

In [8]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="jaimevera1107/all-MiniLM-L6-v2-similarity-es",  # alternatively use "sentence-transformers/all-MiniLM-l6-v2" for a light and faster experience.
    model_kwargs={'device':'cpu'}, 
    encode_kwargs={'normalize_embeddings': False}
)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import chromadb
from chromadbx import UUIDGenerator
import os
from urllib.parse import urlparse

# Cargar modelo de embedding
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Crear una colección en ChromaDB
client = chromadb.PersistentClient('./db_subvenciones')
client.delete_collection("ayudas")
collection = client.create_collection("ayudas")
pathToMetadata = './ayudas/metadatos'
pathToText = './ayudas/texto'
# Función para procesar un PDF
def process_pdf(pdf_path):

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        limit = 1
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text = page.extract_text().replace("\n"," ")
            if limit==2:
                break
            if (text.find("Ayudas e incentivos (detalle)") > -1):
                a = urlparse(pdf_path)
                output_dir = pathToText+"/"+os.path.basename(a.path)+"/"+"Page_"+str(page_num)
                
                #Get metadata from page
                page_metadata = get_metadata(text)[0]
                page_metadataInText = get_metadata(text)[1]
                download_linked_files(page, output_dir)
    
                splitted_text = load_and_split_text(output_dir)
                
                if(len(splitted_text) > 0):
                    embeddings=[]
                    docs=[]

                    for text in splitted_text:
                        cleanstr=text.page_content.replace("\n","")
                        #Add the metadata in text format to associate it with every chung, since I
                        #consider it important search criteria
                        completestr = "("+page_metadataInText+")"+"."+cleanstr
                        docs.append(completestr)                        
                        embeddings.append(np.array(huggingface_embeddings.embed_query(completestr)))
                    # Agregar a ChromaDB
                    collection.add(
                        ids=UUIDGenerator(len(docs)),
                        documents=docs,
                        embeddings=embeddings,
                        metadatas=[page_metadata]*len(docs)
                    )
                    
                    limit+=1

# Procesar todos los PDFs en una carpeta
import os
for file in os.listdir(pathToMetadata):
    if file.endswith(".pdf"):
        process_pdf(os.path.join(pathToMetadata, file))



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[+] URL Found: https://www.cofides.es/financiacion/internacionalizacion/pyme-invierte
Failed to download https://www.cofides.es/financiacion/internacionalizacion/pyme-invierte
[+] URL Found: https://wapis.ipyme.org/servicioayudas/ayudas/detalle?id=71572&fichero=
[+] URL Found: https://wapis.ipyme.org/servicioayudas/ayudas/detalle?id=71572&fichero=
[+] URL Found: https://wapis.ipyme.org/servicioayudas/ayudas/detalle?id=71572&fichero=
Downloaded 69ffe0d4-c2fe-4132-80a8-1bb983299eaf.pdf
Downloaded 5f80cf3c-88ed-453b-9557-8b0088f7977f.pdf
Downloaded 8492d7dc-4a8e-4dcb-88bb-6ddba7a1d0b7.pdf


In [10]:
query = """  Ayuntamiento de Hernani """  
         # Sample question, change to other questions you are interested in.
# Ejemplo de búsqueda

results = collection.query(
    #query_embeddings = np.array(huggingface_embeddings.embed_query(query)),
    query_texts = [query],
    n_results=2,
    )
print(results)
     

{'ids': [['caba1308-85af-4766-b97b-08e90f2a785c', '874bc68d-126a-441f-87bb-0d42bf7ff6a3']], 'embeddings': None, 'documents': [['(, Referencia es 71572, Título es Se convoca, en régimen de concurrencia competitiva, la concesión de subvenciones  públicas destinadas al incremento de la competitividad del comercio de proximidad en  el territorio de Ceuta, anualidad 2019, Organismo es Consejería de Economía, Hacienda, Administraciones Públicas y Empleo, Sector es Comercio, Subsector es , Ámbito Geográfico es Ceuta, Tipo es Subvención, Destinatarios es Organizaciones interprofesionales, con personalidad jurídica propia y corporaciones  de derecho público).Boletín Oficial de la Ciudad de Ceuta  -  Plaza de África S/N                                            4.639 \uf0b7 Identificación de la deuda cuyo aplazamiento o fraccionamiento se solicita indicando, al menos, su importe, concepto y fe-cha de finalización del plazo de ingreso voluntario. \uf0b7 Causas que motivan la solicitud de aplazam

In [11]:
import chromadb

# Use similarity searching algorithm and return 3 most relevant documents.

db = Chroma(client=client, collection_name="ayudas",embedding_function=huggingface_embeddings)

retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

  db = Chroma(client=client, collection_name="ayudas",embedding_function=huggingface_embeddings)


In [12]:
# Remote huggingface execution
# from langchain_community.llms import HuggingFaceHub

# hf = HuggingFaceHub(
#     repo_id="stabilityai/stablelm-2-1_6b",
#     model_kwargs={"temperature":0.1, "max_length":500})

# query = """What were the trends in median household income across different states in the United States between 2021 and 2022."""  # Sample question, change to other questions you are interested in.
# hf.invoke(query)

In [13]:
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
#model_name = "datificate/gpt2-small-spanish"
model_name="bigscience/bloomz-560m"
#model_name="facebook/xglm-564M"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.01,
    do_sample=True,
    return_full_text=True,
    max_new_tokens=500,
    device="cpu"
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

  llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


In [14]:
llm.generate(["¿Cómo estás?"])

LLMResult(generations=[[Generation(text='¿Cómo estás?')]], llm_output=None, run=[RunInfo(run_id=UUID('9d356498-db3b-4adb-ab19-53d8757951cf'))], type='LLMResult')

In [None]:
from langchain.chains import ReduceDocumentsChain

qa_template = """Eres un asistente para responder a preguntas. "
    "Usa los fragmentos de texto proporcionados para responder "
    "a la pregunta. Si no conoces la respuesta, simplemente di "
    "que no lo sabes. Usa la frase con la respuesta para 
    responder a la pregunta."
    "\n\n"
    "{context}"

Pregunta: {question}
Respuesta:"""

prompt = PromptTemplate(template=qa_template,
                            input_variables=['context','question'])
combine_custom_prompt='''
Responde con todas las respuestas que encuentres en diferentes documentos.

Text:`{context}`
'''


combine_prompt_template = PromptTemplate(
    template=combine_custom_prompt, 
    input_variables=['context']
)
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever, chain_type="map_reduce",
 chain_type_kwargs= {
        "token_max":1024,
        "verbose": False,
        "question_prompt": prompt,
        "combine_prompt": combine_prompt_template,
        "combine_document_variable_name": "context"})

question = "Campaña para facilitar las compras" 

result=qa_chain.invoke(question)
print(result['result'])