In [1]:
!pip -q install langchain huggingface_hub tiktoken
!pip -q install chromadb
!pip -q install PyPDF2 pypdf sentence_transformers
!pip -q install --upgrade together

!pip -q install -U FlagEmbedding

## RetrievalQA with LLaMA 2-7B

In [2]:
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
!pip install -q datasets loralib sentencepiece
!pip -q install bitsandbytes accelerate xformers einops
!pip -q install langchain

In [None]:
!nvidia-smi

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [5]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                          use_auth_token=True,)

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                             device_map='auto',
                                             torch_dtype=torch.float16,
                                             use_auth_token=True,
                                            #  load_in_8bit=True,
                                            #  load_in_4bit=True
                                             )

In [7]:

!wget http://imlab.postech.ac.kr/dkim/class/csed514_2019s/DeepLearningBook.pdf

--2023-08-29 15:31:48--  http://imlab.postech.ac.kr/dkim/class/csed514_2019s/DeepLearningBook.pdf
Resolving imlab.postech.ac.kr (imlab.postech.ac.kr)... 141.223.12.103
Connecting to imlab.postech.ac.kr (imlab.postech.ac.kr)|141.223.12.103|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21642066 (21M) [application/pdf]
Saving to: ‘DeepLearningBook.pdf’


2023-08-29 15:31:53 (4,55 MB/s) - ‘DeepLearningBook.pdf’ saved [21642066/21642066]



# LangChain multi-doc retriever with ChromaDB

***Key Points***
- Multiple Files - PDFs
- ChromaDB
- LLaMA-2 LLM
- BGE Embeddings


## Setting up LangChain


In [8]:
import os

In [9]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader



## Load multiple and process documents

In [10]:
# Load and process the text files
# loader = TextLoader('single_text_file.txt')
loader = DirectoryLoader('./', glob="./*.pdf", loader_cls=PyPDFLoader)

documents = loader.load()

In [11]:
len(documents)

802

In [12]:
#splitting the text into
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

len(texts)

2410

In [13]:
texts[3]

Document(page_content='CONTENTS3.2RandomVariables. . . . .. . . . . . . .. . . . . . . . .. . . .563.3ProbabilityDistributions. . . . . . . . .. . . . . . . .. . . . . .563.4MarginalProbability. . . . . . . . .. . . . . . . . .. . . . . . .583.5ConditionalProbability. .. . . . . . . .. . . . . . . .. . . . .593.6TheChainRuleofConditionalProbabilities. . . . . . . . .. . .593.7IndependenceandConditionalIndependence. . . . . . . . .. . .603.8Expectation,VarianceandCovariance. . . . . . . . . .. . . . .603.9CommonProbabilityDistributions. . . . . . . . . . . . . . .. .623.10UsefulPropertiesofCommonFunctions. . .. . . . . . . . .. .673.11Bayes’Rule. . . . . . . . . .. . . . . . . .. . . . . . . .. . . .703.12TechnicalDetailsofContinuousVariables. . . . . .. . . . . . .713.13InformationTheory. . . . . . . . . .. . . . . . . .. . . . . . . .723.14StructuredProbabilisticModels. . . .. . . . . . . .. . . . . . .754NumericalComputation804.1OverﬂowandUnderﬂow. . . . . . . . .. . . . . . . .. . .

## HF BGE Embeddings

In [14]:

from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs
)


In [None]:
!nvidia-smi

## create the DB


In [17]:
%%time
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk

persist_directory = 'db'

## Here is the nmew embeddings being used
embedding = model_norm

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

CPU times: user 1min 8s, sys: 866 ms, total: 1min 9s
Wall time: 57.3 s


## Make a retriever

In [18]:
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

## Make a chain

In [19]:
## Default LLaMA-2 prompt style
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

In [20]:
sys_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. """

instruction = """CONTEXT:/n/n {context}/n

Question: {question}"""
get_prompt(instruction, sys_prompt)

"[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. \n<</SYS>>\n\nCONTEXT:/n/n {context}/n\n\nQuestion: {question}[/INST]"

In [21]:
# Use a pipeline for later
from transformers import pipeline

pipe = pipeline("text-generation",
                model=model,
                tokenizer= tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens = 512,
                do_sample=True,
                top_k=30,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id
                )

In [23]:
from langchain import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain
import csv

In [24]:
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})

In [25]:
from langchain.prompts import PromptTemplate
prompt_template = get_prompt(instruction, sys_prompt)

llama_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [26]:
chain_type_kwargs = {"prompt": llama_prompt}


In [27]:
from langchain.schema import prompt
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type="stuff",
                                       retriever=retriever,
                                       chain_type_kwargs=chain_type_kwargs,
                                       return_source_documents=True)



In [28]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response['result']))
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [29]:
# full example
query = "What is a CNN, a RNN and a LSTM?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

  A CNN (Convolutional Neural Network) is a type of neural network that is particularly well-suited for image
classification tasks. It uses convolutional layers to extract features from images, followed by pooling layers
to reduce the dimensionality of the feature space.
A RNN (Recurrent Neural Network) is a type of neural network that is well-suited for sequential data, such as
text or time series data. It has a feedback loop that allows information from previous time steps to influence
the current step, allowing it to capture long-term dependencies in the data.
An LSTM (Long Short-Term Memory) is a type of RNN that is designed to handle the vanishing gradient problem
that can occur in traditional RNNs. It uses a special type of cell state that allows it to selectively forget
or remember information from previous time steps, allowing it to learn long-term dependencies in the data.


Sources:
new_papers/new_papers/Flash-attention.pdf
new_papers/new_papers/Flash-attention.pdf
DeepLearni

In [30]:
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x7f8c697878d0>)

In [31]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

[INST]<<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. 
<</SYS>>

CONTEXT:/n/n {context}/n

Question: {question}[/INST]
