#**Llama 2/OpenAI + Pinecone + LangChain**

##**Step 1: Install All the Required Packages**

In [None]:
!pip -q install langchain
!pip -q install pypdf
!pip -q install unstructured
!pip -q install sentence_transformers
!pip -q install pinecone-client
!pip -q install huggingface_hub
!pip -q install openai
!pip -q install tiktoken
!pip -q install bitsandbytes accelerate xformers einops
!pip -q install datasets loralib sentencepiece
!pip -q install chromadb
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir

#**Step 2: Import All the Required Libraries**

In [None]:
import os
import sys

from langchain.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.vectorstores import Pinecone
import pinecone
from langchain import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from huggingface_hub import notebook_login

from langchain.llms import LlamaCpp, OpenAI
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from huggingface_hub import hf_hub_download

from langchain.prompts import PromptTemplate
from langchain.prompts.chat import SystemMessagePromptTemplate

import torch
import transformers
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline

from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI

  from tqdm.autonotebook import tqdm


#**Step 3: Load the Data**

In [None]:
# Copy the source documents folder from Google Drive into /content/ (giving /content/docs).
# NOTE(review): assumes Drive is already mounted at /content/drive - no mount call is visible
# in this notebook. The loading loop below reads the relative path "docs", so this also
# presumes the working directory is /content - confirm.
!cp -r "/content/drive/MyDrive/Colab Notebooks/GenAI LLM Projects/docs" /content/

In [None]:
# Load every supported file found in ./docs into one flat list of LangChain documents.
DOCS_DIR = "./docs"

# File-name suffix -> loader class. Files with any other suffix are skipped.
LOADER_BY_SUFFIX = {
    ".pdf": PyPDFLoader,
    ".docx": Docx2txtLoader,
    ".doc": Docx2txtLoader,
    ".txt": TextLoader,
}

document = []
# os.listdir returns entries in arbitrary order; sorting keeps the document (and therefore
# chunk) order reproducible from one run to the next.
for file in sorted(os.listdir(DOCS_DIR)):
    loader_cls = next(
        (cls for suffix, cls in LOADER_BY_SUFFIX.items() if file.endswith(suffix)),
        None,
    )
    if loader_cls is None:
        continue   # unsupported file type
    document.extend(loader_cls(os.path.join(DOCS_DIR, file)).load())

#**Step 4: Split the Text into Chunks**

In [None]:
# Splitter configured for chunks of at most 700 units with a 100-unit overlap between neighbours
# (units are presumably characters, the splitter's default length function - confirm).
text_splitter=RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100)

In [None]:
# Split every loaded document into overlapping chunks; these chunks are what the Chroma cell embeds and indexes.
splitted_docs=text_splitter.split_documents(document)

In [None]:
len(splitted_docs)

51

#**Step 5: Setup the Environment. Pinecone and HfHub API keys**

In [None]:
# Credentials are read from the environment so that no secret has to be saved in the notebook.
# For a throwaway session a key can be assigned to these names directly - never commit it.
hf_auth = os.environ.get("HUGGINGFACEHUB_API_TOKEN", "")   # Your HuggingFace key
pinecone_auth = os.environ.get("PINECONE_API_KEY", "")   # Your Pinecone key
openai_auth = os.environ.get("OPENAI_API_KEY", "")   # Your openai key

# Export only non-empty values: writing '' would overwrite a key that is already configured
# in the environment and make later API calls fail with an authentication error.
if hf_auth:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_auth

# An environment variable, when present, takes precedence over the value assigned above.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', pinecone_auth)
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

if openai_auth:
    os.environ["OPENAI_API_KEY"] = openai_auth

#**Step 6: Download the Embeddings**

In [None]:
# Embedding model handed to the vector store below to embed the document chunks.
embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
# Embed a throwaway string to measure the vector dimension; the commented-out Pinecone cell
# passes this value as the index `dimension`.
embeddings_size = len(embeddings.embed_query('c'))   # 384

In [None]:
# embeddings = OpenAIEmbeddings()
# embeddings_size = len(embeddings.embed_query('c'))   # 1536

#**Step 7: Pinecone - Create Embeddings for Each Text Chunk**

In [None]:
# # initialize pinecone
# pinecone.init(
#     api_key=PINECONE_API_KEY,  # find at app.pinecone.io
#     environment=PINECONE_API_ENV  # next to api key in console
# )

In [None]:
# index_name = "new-index-pc" # put in the name of your pinecone index here
# pinecone.delete_index(index_name)
# pinecone.create_index(index_name, dimension=embeddings_size, metric="dotproduct")

In [None]:
# # pinecone_db = Pinecone.from_texts([t.page_content for t in splitted_docs], embeddings, index_name=index_name)
# pinecone_db = Pinecone.from_documents(splitted_docs, embeddings, index_name=index_name)

## If you already have an index, you can load it like this


In [None]:
# pinecone_db = Pinecone.from_existing_index(index_name, embeddings)

#**Step 8: Chroma - Create Embeddings for Each Text Chunk**

In [None]:
# Delete any previously persisted Chroma directory so the next cell builds its index in a fresh one.
!rm -rf ./chroma_data

In [None]:
# Embed every chunk with `embeddings` and store the result in a Chroma collection under ./chroma_data.
chroma_db=Chroma.from_documents(splitted_docs, embedding=embeddings, persist_directory='./chroma_data')
# Explicitly write the collection to disk.
# NOTE(review): newer chromadb releases reportedly persist automatically, which would make this
# call redundant - confirm for the installed version.
chroma_db.persist()

#**Step 9: Similarity Search**

In [None]:
# Sanity check of the index: fetch the chunks most similar to a sample question and display them.
tmp_docs=chroma_db.similarity_search("what is attention mechanism?")
tmp_docs

#**Step 10: Query the Docs to Get the Answer Back (Llama 2 Model)**

##  Quantized Llama CPP from the Hugging Face Community

In [None]:
# ### From: https://huggingface.co/TheBloke/Llama-2-13B-GGUF/tree/main
# model_name_or_path = "TheBloke/Llama-2-13B-GGUF"
# model_basename = "llama-2-13b.Q5_K_S.gguf"
# model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)


# n_gpu_layers = 40  # Change this value based on your model and your GPU VRAM pool.
# n_batch = 256  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# # Loading model,
# llm = LlamaCpp(
#     model_path=model_path,
#     max_tokens=512,
#     n_gpu_layers=n_gpu_layers,
#     n_batch=n_batch,
#     callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]), # Callbacks support token-wise streaming
#     n_ctx=1024,
#     # verbose=False,
# )

##  Official Meta Llama 2 Model from the Hugging Face

In [None]:
### Quantization config

# (torch and transformers are also imported in the Step 2 import cell)
import torch
import transformers

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
# NOTE(review): the compute dtype here is bfloat16, while the model-loading cell below passes
# torch_dtype=torch.float16 - confirm that this mix is intended and supported by the target GPU.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
### Model config

# Hugging Face Hub repo id of the chat model to load.
model_id = 'meta-llama/Llama-2-13b-chat-hf'

# Fetch the model's configuration, authenticating with the HuggingFace key from Step 5
# (presumably required because the repo is gated - confirm).
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

In [None]:
# Tokenizer matching `model_id`, fetched with the same HuggingFace key.
tokenizer = AutoTokenizer.from_pretrained(model_id,
                                          use_auth_token=hf_auth,)


# Load the causal LM with the quantization settings from `bnb_config` and the config fetched above.
# NOTE(review): torch_dtype is float16 here but bnb_config uses bfloat16 as compute dtype and the
# pipeline cell below passes bfloat16 - confirm which dtype is intended.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             device_map='auto',
                                             config=model_config,
                                             quantization_config=bnb_config,
                                             torch_dtype=torch.float16,
                                             use_auth_token=hf_auth,
                                            #   load_in_8bit=True,
                                            #   load_in_4bit=True
                                             )

In [None]:
# Text-generation pipeline wrapping the already-loaded `model` and `tokenizer`; it is the object
# the commented-out HuggingFacePipeline line in the next cell would consume.
# NOTE(review): `model` was instantiated above with its own device_map/torch_dtype - confirm
# whether the torch_dtype/device_map values passed here have any effect.
llama_pipe = pipeline(task="text-generation",
              model=model,
              tokenizer=tokenizer,
              torch_dtype=torch.bfloat16,
              device_map='auto',
              max_new_tokens=512,
              min_new_tokens=-1,
            #   top_k=30,   ### Top k tokens of the output tokens
              top_p=0.1,   ### Top p cumulative probabilities of the output tokens
            #   repetition_penalty=1.2
              )

In [None]:
# from langchain.llms import HuggingFaceHub
# llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})

# llm=HuggingFacePipeline(pipeline=llama_pipe, model_kwargs={'temperature':0})

# 'gpt-3.5-turbo' is a chat-completions model, so it is created through ChatOpenAI (imported in
# Step 2) rather than the completion-style OpenAI class. The QA chain below accepts either kind.
llm=ChatOpenAI(temperature=0.2, model_name='gpt-3.5-turbo')
# Show only the model name: the full repr of the LLM object can include the API key in some
# LangChain versions, and cell outputs are saved with the notebook.
llm.model_name

# Query

In [None]:
def count_chars_in_docs(docs):
    """Return the combined length of ``page_content`` over every item in ``docs``.

    Args:
        docs: sequence of objects exposing a ``page_content`` string.

    Returns:
        int: total number of characters; 0 for an empty sequence.
    """
    return sum(len(doc.page_content) for doc in docs)

def reduce_retrived_tokens(retrived_docs, max_char_limit=3000):
    """Trim the retrieved documents so their combined text fits a character budget.

    The overflow is split evenly across the documents (the per-document share is rounded up
    to the next multiple of 10) and cut from the END of each ``page_content``. When a document
    is shorter than its share, the part it could not absorb is redistributed over the
    documents that still have text, so the total never ends up above ``max_char_limit``.
    If every document is longer than its share, a single pass is enough.

    Args:
        retrived_docs: list of objects with a writable ``page_content`` string.
        max_char_limit: maximum total number of characters to keep (default 3000).

    Returns:
        The same list object. The documents are modified IN PLACE when trimming is needed.
    """
    lengths = [len(doc.page_content) for doc in retrived_docs]
    char_count_og = sum(lengths)

    if char_count_og <= max_char_limit:
        print("No need for token reduction...")
        print(f"Original size = {char_count_og}")
        return retrived_docs

    overflow = char_count_og - max_char_limit
    while overflow > 0:
        trimmable = [i for i, length in enumerate(lengths) if length > 0]
        if not trimmable:
            break   # nothing left to cut (only reachable with a negative limit)
        # Rounding the share up to the nearest 10's, eg, 10,20,30,etc. Eg: 57-->60
        chars_to_strip = ((overflow // len(trimmable)) // 10 + 1) * 10
        for i in trimmable:
            # A document cannot give up more characters than it has.
            lengths[i] -= min(chars_to_strip, lengths[i])
        overflow = sum(lengths) - max_char_limit

    for doc, keep in zip(retrived_docs, lengths):
        doc.page_content = doc.page_content[:keep]

    char_count_new = sum(lengths)
    print("Reduced token size...")
    print(f"Original size = {char_count_og}. After reduction = {char_count_new}")
    return retrived_docs

def get_similar_docs(query, db):
    """Fetch the documents most similar to ``query`` and trim them to the size budget.

    Args:
        query: question text passed to ``db.similarity_search``.
        db: vector store exposing ``similarity_search(query)``.

    Returns:
        The search results after being passed through ``reduce_retrived_tokens``.
    """
    return reduce_retrived_tokens(db.similarity_search(query))

In [None]:
# Prompt for the question-answering chain. It is assembled from adjacent string literals so that
# source-code indentation does not leak into the text sent to the model (a backslash-continued
# triple-quoted string keeps the leading spaces of every continued line).
# `{context}` receives the retrieved chunks and `{question}` the user query.
# NOTE(review): "answers in short" and "as detailed as possible" pull in opposite directions -
# decide which behaviour is wanted.
prompt_template = (
    "You are a helpful assistant who gives answers in short. "
    "Remember you are talking to a technical audience so you can be as detailed as possible. "
    "Strike a friendly and conversational tone. "
    "Use the following pieces of context to answer the question at the end. "
    "If you don't know the answer, just say that you don't know it, don't try to make up an answer.\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Answer in short in Hindi:\n"
)
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [None]:
# Sample question; the matching chunks are pulled from the Chroma store and trimmed to the
# character budget by get_similar_docs.
query = "what is attention mechanism?"
retrived_docs = get_similar_docs(query=query, db=chroma_db)

No need for token reduction...
Original size = 2573


In [None]:
# Question-answering chain of type "stuff" built on `llm`, using the custom PROMPT defined above.
chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT)

In [None]:
# Run the chain on the retrieved chunks and the question; `reply` presumably holds the answer text.
reply = chain.run(input_documents=retrived_docs, question=query)

In [None]:
reply