In [1]:
!pip install -q torch \
    transformers \
    accelerate \
    bitsandbytes \
    langchain \
    sentence-transformers \
    faiss-gpu \
    openpyxl \
    pacmap \
    python-dotenv \
    pandas \
    datasets \
    matplotlib \
    ragatouille \
    langchain-openai \
    langchain_huggingface \
    ragas

In [3]:
import torch
# Sanity check: how many CUDA devices PyTorch reports for this runtime.
torch.cuda.device_count()

In [5]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_huggingface import HuggingFacePipeline
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from typing import Optional, List, Tuple
import dill



class SimpleRAG():
    """Minimal retrieval-augmented generation (RAG) pipeline.

    The methods store their results on the instance and later methods read
    them, so they are meant to be called in this order:

    1. ``set_prompts``                   -> ``system_prompt``, ``context_prompt``
    2. ``load_docs_from_hf``             -> ``ls_docs``
    3. ``load_embedding_model_from_hf``  -> ``embedding_model``,
       ``embedding_tokenizer``, ``chunk_size``
    4. ``splitting_docs``                -> ``ls_chunks``
    5. ``prepare_vector_db``             -> ``vector_db``
    6. ``load_generator_model_from_hf``  -> ``generator_model``
    7. ``rag_pipeline`` (or ``generate_answer`` directly)
    """

    def __init__(self, cache_dir, cache_chunks=True, cache_vector_db=True):
        """Remember the cache settings and create the cache directory.

        Args:
            cache_dir: Directory holding the cached chunks and vector DB.
                Created (including parents) if it does not exist.
            cache_chunks: Whether ``splitting_docs`` writes its result to
                the cache after computing it.
            cache_vector_db: Whether ``prepare_vector_db`` writes its result
                to the cache after computing it.
        """
        self.cache_dir = cache_dir
        self.cache_chunks = cache_chunks
        self.cache_vector_db = cache_vector_db
        Path(self.cache_dir).mkdir(parents=True, exist_ok=True)


    def set_prompts(self, system_prompt, context_prompt):
        """Store the prompt templates used by ``generate_answer``.

        Args:
            system_prompt: Instruction text for the generator.
            context_prompt: Template containing the ``{context}`` and
                ``{question}`` placeholders that ``generate_answer`` fills in.
        """
        self.system_prompt = system_prompt
        self.context_prompt = context_prompt


    def load_docs_from_hf(self, dataset_id, split, column):
        """Load a Hugging Face dataset split and wrap one column as Documents.

        Args:
            dataset_id: Dataset identifier passed to ``load_dataset``.
            split: Split name passed to ``load_dataset``.
            column: Name of the column whose value becomes each Document's
                ``page_content``.
        """
        self.hf_dataset_id = dataset_id
        self.hf_dataset_split = split
        self.hf_dataset_column = column

        print('- Loading HF Dataset...')
        ds = load_dataset(dataset_id, split=split)

        self.ls_docs = [
            Document(page_content=row[column])
            for row in tqdm(ds, desc='Converting to LangChain Document')
        ]


    def load_embedding_model_from_hf(self, model_id, device='cuda:0', multi_process=True):
        """Load the embedding model and its tokenizer.

        The model's ``max_seq_length`` is stored as ``self.chunk_size`` and is
        later used by ``splitting_docs`` as the chunk size in tokens.

        Args:
            model_id: Hugging Face model identifier.
            device: Device string forwarded to the embedding model.
            multi_process: Forwarded to ``HuggingFaceEmbeddings``. Defaults to
                ``True`` (the previous hard-coded value, intended for
                multi-GPU embedding).
                NOTE(review): with ``True`` the wrapper appears to start a
                worker pool for every embed call, including single queries;
                confirm, and consider ``False`` for interactive querying.
        """
        self.hf_embedding_model_id = model_id

        print(f'- Loading Embedding Model & Tokenizer: {self.hf_embedding_model_id}')
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=model_id,
            multi_process=multi_process, # multi GPU
            model_kwargs={'device': device},
            encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
        )

        self.embedding_tokenizer = AutoTokenizer.from_pretrained(model_id)

        self.chunk_size = self.embedding_model.client.max_seq_length
        print(f'Max Seq Length: {self.chunk_size}')


    def splitting_docs(self, percent_overlap=0.1):
        """Split ``self.ls_docs`` into de-duplicated chunks (``self.ls_chunks``).

        If ``<cache_dir>/cached_chunks.pkl`` exists it is loaded and no
        splitting happens. Note that the cache is read regardless of
        ``self.cache_chunks``; that flag only controls whether a freshly
        computed result is written. The cache is not invalidated when the
        dataset, tokenizer or overlap changes, so delete the file manually
        in that case.

        Args:
            percent_overlap: Overlap between consecutive chunks as a fraction
                of ``self.chunk_size`` (0.1 means 10%).
        """
        self.ls_text_seps = [
            "\n#{1,6} ",
            "\n\n",
            "\n",
            " ",
            "",
        ]

        self.text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
            self.embedding_tokenizer,
            chunk_size=self.chunk_size,
            chunk_overlap=int(self.chunk_size * percent_overlap),
            add_start_index=True,
            strip_whitespace=True,
            separators=self.ls_text_seps,
        )

        cache_path = f'{self.cache_dir}/cached_chunks.pkl'
        try:
            with open(cache_path, 'rb') as f:
                print(f'- Loading Chunks from Cache: "{cache_path}"')
                # SECURITY: dill/pickle can execute arbitrary code on load.
                # Only load cache files that this notebook wrote itself.
                self.ls_chunks = dill.load(f)
        except FileNotFoundError:
            # No cache yet: split every document, one at a time so tqdm can
            # report progress.
            ls_chunks_temp = []
            for doc in tqdm(self.ls_docs, desc='- Splitting Documents to Chunks'):
                ls_chunks_temp.extend(self.text_splitter.split_documents([doc]))

            # Keep the first occurrence of each distinct chunk text.
            set_unique_texts = set()
            ls_unique_chunks = []
            for chunk in tqdm(ls_chunks_temp, desc='- Removing Duplicated Chunks'):
                if chunk.page_content not in set_unique_texts:
                    set_unique_texts.add(chunk.page_content)
                    ls_unique_chunks.append(chunk)

            self.ls_chunks = ls_unique_chunks

            if self.cache_chunks:
                with open(cache_path, 'wb') as f:
                    print(f'- Caching Chunks at "{cache_path}"')
                    dill.dump(self.ls_chunks, f)

        print(f'{len(self.ls_docs):,} Documents splitted into {len(self.ls_chunks):,} Chunks')


    def prepare_vector_db(self):
        """Load the FAISS vector DB from cache, or build it from ``self.ls_chunks``.

        The cache lives in ``<cache_dir>/cached_vector_db``. When it cannot be
        read, the chunks are embedded with ``self.embedding_model`` and, if
        ``self.cache_vector_db`` is set, the new index is saved there.
        """
        cache_path = f'{self.cache_dir}/cached_vector_db'
        try:
            print(f'- Loading Vector DB from Cache: "{cache_path}"')
            # SECURITY: allow_dangerous_deserialization unpickles the stored
            # docstore. Only load caches that this notebook wrote itself.
            self.vector_db = FAISS.load_local(
                cache_path,
                self.embedding_model,
                distance_strategy=DistanceStrategy.COSINE,
                allow_dangerous_deserialization=True,
            )
        except (RuntimeError, FileNotFoundError):
            # RuntimeError is what the original code relied on for a missing
            # index (presumably raised by faiss when the index file cannot be
            # opened). FileNotFoundError covers a cache directory where the
            # index exists but the pickled docstore is missing.
            time_start = datetime.now().isoformat()
            print(f'- Vector DB: Start Embedding at {time_start}')
            self.vector_db = FAISS.from_documents(
                self.ls_chunks,
                self.embedding_model,
                distance_strategy=DistanceStrategy.COSINE,
            )
            time_end = datetime.now().isoformat()
            print(f'- Vector DB: Finished Embedding at {time_end}')

            if self.cache_vector_db:
                print(f'- Caching Vector DB at "{cache_path}"')
                self.vector_db.save_local(cache_path)


    def load_generator_model_from_hf(self, model_id, device='cuda:0', torch_dtype=torch.bfloat16, max_new_tokens=1024, trust_remote_code=True):
        """Load the causal LM used to generate answers.

        Args:
            model_id: Hugging Face model identifier.
            device: Passed as ``device_map`` when loading the model.
            torch_dtype: Dtype the model weights are loaded in.
            max_new_tokens: Generation length limit given to the pipeline.
            trust_remote_code: Forwarded to both the tokenizer and the model
                loader so they are loaded under the same policy.
        """
        self.hf_generator_model_id = model_id

        print(f'- Loading Generator Model & Tokenizer: "{self.hf_generator_model_id}"')
        # The tokenizer previously ignored trust_remote_code while the model
        # honoured it; pass it to both for consistency.
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code)
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code)
        pipe = pipeline(
            "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=max_new_tokens
        )
        self.generator_model = HuggingFacePipeline(pipeline=pipe)
        self.generator_model_output_parser = StrOutputParser()


    def generate_answer(self, question: str, context: Optional[str] = None) -> str:
        """Fill the prompt templates and run the generator model.

        Args:
            question: Value for the ``{question}`` placeholder.
            context: Value for the ``{context}`` placeholder. ``None`` is
                rendered as an empty string instead of the literal text
                ``None``.

        Returns:
            Whatever the prompt | generator chain returns. In this notebook's
            recorded output that text starts with the rendered prompt
            ("Human: ...") followed by the model's continuation.
        """
        if context is None:
            context = ''

        if 'phi' in self.hf_generator_model_id.lower():
            # phi3 does not follow system prompt instructions.
            rag_prompt = ChatPromptTemplate.from_messages([
                ("user", self.system_prompt + '\n' + self.context_prompt)
            ])
        else:
            rag_prompt = ChatPromptTemplate.from_messages([
                ("system", self.system_prompt),
                ("user", self.context_prompt)
            ])
        # The output parser is deliberately left out of the chain for now.
        chain = rag_prompt | self.generator_model #| self.generator_model_output_parser
        return chain.invoke({'context': context, 'question': question})


    def rag_pipeline(self, question, reranker=None, num_retrieved_docs=4):
        """Retrieve similar chunks for ``question`` and generate an answer.

        Args:
            question: The user question; used both as the retrieval query and
                in the generation prompt.
            reranker: Reserved for a future reranking step. Must be left
                falsy for now.
            num_retrieved_docs: Number of chunks to fetch from the vector DB.

        Returns:
            Tuple ``(rag_answer, retrieved_docs)``.

        Raises:
            NotImplementedError: If a reranker is supplied. This branch used
                to fall through and return ``None`` after doing the
                retrieval, which broke callers that unpack the result.
        """
        if reranker:
            # TODO: rerank the retrieved page contents with
            # reranker.rerank(question, [...], k=...) and generate from the
            # reranked texts.
            raise NotImplementedError(
                'Reranking is not implemented yet; call rag_pipeline() without a reranker.'
            )

        print(f'- Vector DB: Retrieving {num_retrieved_docs} Similar Documents')
        retrieved_docs = self.vector_db.similarity_search(
            query=question,
            k=num_retrieved_docs,
        )

        print(f'Vector DB: Retrieved {len(retrieved_docs):,} Documents')

        # Number the documents so the model can cite them by id.
        prompt_context = '\nExtracted documents:\n' + '\n'.join(
            f"Document {i}:::\n{doc.page_content}"
            for i, doc in enumerate(retrieved_docs)
        )

        print('- Generating Answer')
        rag_answer = self.generate_answer(question, prompt_context)
        return (rag_answer, retrieved_docs)




In [None]:
# Development aid: shows IPython help for `pipeline` (imported from transformers above).
# Stray introspection like this should be removed before the notebook is shared.
pipeline?

In [8]:
# import yaml
# with open("config.yaml") as stream:
#     try:
#         yaml_dict = yaml.safe_load(stream)
#         print(yaml_dict['system_prompt'])
#     except yaml.YAMLError as exc:
#         print(exc)


Using the information contained in the context, give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
If the answer cannot be deduced from the context, do not generate any response on your own.
Place your response in the following JSON:
{
    "found_the_answer": <True or False>,
    "actual_response": <Str>,
    "id_of_relevant_documents": <List(Int)>,
}


In [9]:
# Instantiate the pipeline; the constructor creates the 'test_cache' directory if it is missing.
# NOTE: the same construction is repeated in the setup cell below, which rebinds `rag`.
rag = SimpleRAG('test_cache')

In [11]:
# Instructions for the generator. The doubled braces `{{` / `}}` are escapes: the prompt
# template renders them as literal `{` / `}` instead of treating them as placeholders.
system_prompt = """Using the information contained in the context, give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
If the answer cannot be deduced from the context, do not generate any response on your own.
Place your response only in the following JSON and do not generate anything else:
{{
    "found_the_answer": <true or false>,
    "actual_response": <Str>,
    "id_of_relevant_documents": <List(Int)>,
}}"""

#system_prompt = yaml_dict['system_prompt']

# Per-question template; `SimpleRAG.generate_answer` fills in `{context}` and `{question}`.
context_prompt = """Context:
{context}
---
Now here is the question you need to answer.
{question}""" # keywords {context} & {question} should remain the same for every prompt.

# Build the retrieval side of the pipeline. Order matters: each call stores state on `rag`
# that the following calls read (documents -> embedding model/tokenizer -> chunks -> vector DB).
rag = SimpleRAG('test_cache')
rag.set_prompts(system_prompt, context_prompt)
rag.load_docs_from_hf('luisespinosa/maec-2020', 'train', 'text')
rag.load_embedding_model_from_hf('thenlper/gte-small')
rag.splitting_docs()
rag.prepare_vector_db()

- Loading HF Dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Converting to LangChain Document: 100%|██████████| 3443/3443 [00:00<00:00, 8884.46it/s]


- Loading Embedding Model & Tokenizer: thenlper/gte-small




Max Seq Length: 512
- Loading Chunks from Cache: "test_cache/cached_chunks.pkl"
3,443 Documents splitted into 25,863 Chunks
- Loading Vector DB from Cache: "test_cache/cached_vector_db"


In [12]:
# Load the generator with the method's defaults (device 'cuda:0', bfloat16, max_new_tokens=1024).
rag.load_generator_model_from_hf('microsoft/Phi-3-mini-4k-instruct')

- Loading Generator Model & Tokenizer: "microsoft/Phi-3-mini-4k-instruct"


tokenizer_config.json:   0%|          | 0.00/3.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

In [19]:
# Sample question for the RAG pipeline.
test_q = 'In what sectors is Barcelona playing an important role?'

In [14]:
# Alternative sample question. NOTE: this rebinds `test_q`, so when the cells run top to
# bottom this is the question that reaches `rag.rag_pipeline` below.
test_q = 'Name 3 Persian dishes.'

In [20]:
# Run retrieval + generation; `rd` is the `(answer, retrieved_docs)` tuple returned by `rag_pipeline`.
rd = rag.rag_pipeline(test_q)

- Vector DB: Retrieving 4 Similar Documents
Vector DB: Retrieved 4 Documents
- Generating Answer


In [22]:
# Show only the generated text (first element of the tuple); `rd[1]` holds the retrieved documents.
print(rd[0])

Human: Using the information contained in the context, give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
If the answer cannot be deduced from the context, do not generate any response on your own.
Place your response in the following JSON:
{
    "found_the_answer": <True or False>,
    "actual_response": <Str>,
    "id_of_relevant_documents": <List(Int)>,
}
Context:

Extracted documents:
Document 0:::
Yes, I mean the ---+ one joint venture is related to our joint venture in Spain with Santander.
And really they've had changing priorities with respect to what they want to focus on.
And as we think about Spain, it continues to be an area that has some economic challenges.
So from our standpoint we really do believe that we have the opportunity to continue to grow in that market on our own with a focus on the right customers.
And I would be very clear we are not exiting the market.
With respect to the 