In [1]:
from typing import List

# 1. Document loader

In [2]:
from langchain_community.document_loaders import PyPDFLoader

  from .autonotebook import tqdm as notebook_tqdm
  import pynvml  # type: ignore[import]


In [3]:
pdf_files: List[str]  = [
    "https://arxiv.org/pdf/2312.16862.pdf", # TinyGPT-V
    "https://arxiv.org/pdf/2308.10792.pdf", # Instruction Tuning
    "https://arxiv.org/pdf/2401.10020.pdf", # Self-Reward
    "https://arxiv.org/pdf/2201.11903.pdf", #Chain-of-Thought Prompting
    "https://arxiv.org/pdf/2401.18059v1.pdf", # RAPTOR
    "https://arxiv.org/pdf/2307.09288.pdf", # LLama 2
    "https://arxiv.org/pdf/2302.13971.pdf", # LLama 1
]

In [4]:
def remove_non_utf8_characters(text):
    utf8_encoded_text = ""
    for char in text:
        try:
            char.encode('utf-8')
            utf8_encoded_text += char
        except UnicodeEncodeError:
            pass
    return utf8_encoded_text

In [5]:
doc_loaded = []

for pdf_file in pdf_files:
    docs = PyPDFLoader(pdf_file, extract_images=True).load()
    for doc in docs:
        doc.page_content = remove_non_utf8_characters(doc.page_content)
    doc_loaded.extend(docs)

In [6]:
print(f"Total number of page loaded: {len(doc_loaded)}")

Total number of page loaded: 250


# 2. Document split

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [8]:
chunk_size = 500
chunk_overlap = 0
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [9]:
doc_splitted = splitter.split_documents(doc_loaded)

In [10]:
print(f"Total number of chunks: {len(doc_splitted)}")

Total number of chunks: 1979


# 3. Vector database

In [11]:
from langchain_community.embeddings import HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings()

  embedding_model = HuggingFaceEmbeddings()
  embedding_model = HuggingFaceEmbeddings()


In [12]:
from langchain_chroma import Chroma
chroma_db = Chroma.from_documents(doc_splitted, embedding=embedding_model)

# 4. Retriever

In [13]:
retriever = chroma_db.as_retriever(
    search_type="similarity", 
    search_kwargs={
        "k": 10
    }
)

# 5. LLM 

In [15]:
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

model_name: str = "/data_hdd_16t/khanhtran/LLM/.hf_models/Phi-3-mini-4k-instruct"

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    low_cpu_mem_usage=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
max_new_token = 1024

model_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=max_new_token,
    pad_token_id=tokenizer.eos_token_id
)

gen_kwargs = {
    "temperature": 0.9
}

llm = HuggingFacePipeline(
    pipeline=model_pipeline,
    model_kwargs=gen_kwargs
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.78s/it]
Device set to use cuda:0
  llm = HuggingFacePipeline(


# 5. RAG prompt

In [None]:
%pip install langchainhub

In [16]:
from langchain import hub

prompt = hub.pull("rlm/rag-prompt")

In [17]:
prompt.messages[0].prompt.input_variables

['context', 'question']

In [44]:
print(prompt.messages[0].prompt.template)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:


# 6. Str Parser

In [18]:
from langchain_core.output_parsers import StrOutputParser

str_parser = StrOutputParser()

# 6. RAG Chain

In [19]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [21]:
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | str_parser
)

In [22]:
user_question = "What is Instruction Tuning?"

output = rag_chain.invoke(user_question)

# 7. Post processing

In [23]:
import re

In [25]:
def extract_answer(
        text_response: str, 
        pattern: str = r"Answer:\s*(.*)"
    ) -> str:
    match = re.search(pattern, text_response)

    if match:
        answer_text = match.group(1).strip()
        return answer_text
    else:
        return "Answer not found."

In [None]:
res = extract_answer(output)
print(res)


'Instruction tuning (IT) focuses on the quality and variety of synthetic data rather than manual annotations which can be slower and costlier. It utilizes two primary methods: distillation, which imparts knowledge from pre-trained language models like GPT-3, PaLM, and LLaMA, and self-improvement, which uses a seed set of (evaluation instruction prompt, evaluation result response) examples for training. This enables the model to perform tasks by making educated guesses instead of truly grasping the underlying tasks.'