In [None]:
import os
from urllib.request import urlretrieve
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [None]:
# Read every PDF found under ./ayudas/ into LangChain documents.
loader = PyPDFDirectoryLoader("./ayudas/")
docs_before_split = loader.load()

# Chunking is temporarily switched off, so the loaded documents are passed on
# unchanged. To re-enable it, replace the assignment below with:
#     RecursiveCharacterTextSplitter(chunk_size=1100, chunk_overlap=100)
#         .split_documents(docs_before_split)
# NOTE: docs_after_split is the very same list object as docs_before_split.
docs_after_split = docs_before_split

In [None]:
# avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)
# avg_char_before_split = avg_doc_length(docs_before_split)
# avg_char_after_split = avg_doc_length(docs_after_split)

# print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
# print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')


In [None]:
# Sentence-embedding model used both to index the documents and to embed
# queries; it runs on CPU and returns normalized vectors.
#
# HuggingFaceBgeEmbeddings prepends an English, BGE-specific instruction to
# every query by default. The checkpoint used here is a multilingual
# paraphrase model, not a BGE model, so that prefix would only push query
# embeddings away from the (prefix-free) document embeddings. It is disabled
# with query_instruction="" so queries and documents are encoded identically.
embedding = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True, 'max_length': 512},
    query_instruction="",
)

# from langchain.embeddings import SentenceTransformerEmbeddings
# embedding = SentenceTransformerEmbeddings(model_name="hiiamsid/sentence_similarity_spanish_es")


In [None]:
# Sanity check: embed the first loaded document and inspect the vector.
# The text is a document chunk, so it goes through embed_documents (the same
# path the vector store uses when indexing) rather than embed_query, which is
# the query-side encoder and may add a query-only instruction prefix.
sample_text = docs_after_split[0].page_content
sample_embedding = np.array(embedding.embed_documents([sample_text])[0])
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

In [None]:
# Build a Chroma vector store from the loaded documents, using `embedding`
# to turn each document into a vector.
vectorstore = Chroma.from_documents(
    documents=docs_after_split,
    embedding=embedding,
)


In [None]:
# Sample query; change it to any other text you are interested in.
query = """Ayuntamiento de Tomelloso"""

# Ask the vector store for the 5 stored documents closest to the query.
# `k` is passed by keyword so the meaning of the number is explicit.
relevant_documents = vectorstore.similarity_search(query, k=5)
print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query.\n')
for document in relevant_documents:
    print ('*********************')
    print(document)
     

In [None]:
# Expose the vector store as a retriever: plain similarity search that
# returns the 3 most relevant documents for each query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)

In [None]:
# Remote huggingface execution
# from langchain_community.llms import HuggingFaceHub

# hf = HuggingFaceHub(
#     repo_id="stabilityai/stablelm-2-1_6b",
#     model_kwargs={"temperature":0.1, "max_length":500})

# query = """What were the trends in median household income across different states in the United States between 2021 and 2022."""  # Sample question, change to other questions you are interested in.
# hf.invoke(query)

In [None]:
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Checkpoint that is run locally.
# Alternative kept for reference: "datificate/gpt2-small-spanish"
model_name = "bigscience/bloomz-560m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Settings forwarded to the transformers text-generation pipeline
# (sampling enabled, CPU execution, prompt kept in the returned text).
generation_settings = {
    "task": "text-generation",
    "do_sample": True,
    "temperature": 0.2,
    "max_new_tokens": 500,
    "return_full_text": True,
    "device": "cpu",
}
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

# Wrap the transformers pipeline so the chains below can use it as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [None]:
# Quick smoke test: send one prompt straight to the LLM wrapper; the result
# is the cell's last expression, so the notebook displays it.
smoke_test_prompts = ["Translate to English: Je t’aime."]
llm.generate(smoke_test_prompts)

In [None]:
# from langchain. chat_models import ChatOpenAI


# llm = ChatOpenAI (model_name="gpt-3.5-turbo", temperature=0)


In [None]:
# Prompt applied to each retrieved document (the "map" step).
# It is assembled from adjacent string literals: the previous triple-quoted
# version contained pasted literal fragments, so stray double quotes and
# indentation ended up inside the text sent to the model.
qa_template = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
    "\n\n"
    "Pregunta: {question}\n"
    "Respuesta:"
)

prompt = PromptTemplate(template=qa_template,
                        input_variables=['context', 'question'])

# Prompt for the "reduce" step: the per-document answers are passed in as
# {context} (see combine_document_variable_name below) and merged into one reply.
combine_custom_prompt='''
Combina todas las respuestas encontradas en la respuesta final, en diferentes líneas.

Text:`{context}`
'''


combine_prompt_template = PromptTemplate(
    template=combine_custom_prompt,
    input_variables=['context']
)

# map_reduce chain: answer the question against each retrieved document with
# `prompt`, then combine the partial answers with `combine_prompt_template`.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    chain_type="map_reduce",
    chain_type_kwargs={
        "verbose": False,
        "question_prompt": prompt,
        "combine_prompt": combine_prompt_template,
        "combine_document_variable_name": "context",
    },
)

question = "¿Cuál es el Titulo del documento?"

result = qa_chain.invoke(question)
print(result['result'])

In [None]:
# Build a "stuff" QA chain (all retrieved documents are placed into a single
# prompt) and call it with our query.

# Assembled from adjacent string literals so that no stray quote characters
# or indentation leak into the text sent to the model.
prompt_template = (
    "Eres un asistente de preguntas y respuestas, "
    "utiliza la información del contexto para dar una respuesta"
    "\n\n"
    "{context}"
    "\n\n"
    "Pregunta: {question}\n"
    "Respuesta:"
)

# Input variables are declared explicitly, matching how the other prompts in
# this notebook are built, instead of relying on them being inferred.
PROMPT = PromptTemplate(template=prompt_template,
                        input_variables=['context', 'question'])

# return_source_documents=True makes the chain include the retrieved documents
# under result['source_documents'], which the following cell reads.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
)

question = """¿Cuántas ayudas da el Ayuntamiento de La Pobla de Vallbona?"""

result = qa_chain.invoke(question)
print(result['result'])

In [None]:
# List the documents that back the last answer.
# `result` only carries 'source_documents' when the chain was created with
# return_source_documents=True; if the key is missing, run the same question
# through the retriever directly instead of failing with a KeyError.
relevant_docs = result.get('source_documents')
if relevant_docs is None:
    relevant_docs = retriever.invoke(question)

print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
# The summary line above is printed once; it used to be repeated after every
# document because a copy of it sat inside the loop body.
for i, doc in enumerate(relevant_docs, start=1):
    print(f"Relevant Document #{i}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    print("-"*100)
