In [None]:
"""
Loads LLAMA2 model, loads and prepares the phones_dataset, performs RAG, generates results for the given queries.
This notebook works on Google Colab with a GPU runtime.
Code inspired by: https://www.kaggle.com/code/gpreda/rag-using-llama-2-langchain-and-chromadb
"""

In [None]:
# Install the required Python packages (pinned versions) on Colab
!pip install -q accelerate==0.22.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.33.0 trl==0.4.7 langchain==0.0.300 sentence_transformers==2.2.2 chromadb==0.4.12

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.5/426.5 kB[0m [31m36.7 MB/s

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    logging,
)

In [None]:
# Silence non-critical log output of the Transformers library.
# WARNING is already the library's default verbosity, so selecting it would not
# suppress anything; ERROR hides warnings and shows only errors and above.
logging.set_verbosity_error()

In [None]:
# Quantization settings (bitsandbytes, 4-bit) applied when the LLM is loaded
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the model weights in 4-bit precision
    bnb_4bit_use_double_quant=False,  # nested (double) quantization stays disabled
    bnb_4bit_quant_type="nf4",  # select the "nf4" 4-bit quantization type
    bnb_4bit_compute_dtype=torch.float16,  # data type used for the computations (float16)
)

In [None]:
# Place the whole model on GPU 0. A Colab T4 accelerator is sufficient.
device_map = {"": 0}
# Repository name of the LLM on the Hugging Face Hub
model_name = "NousResearch/Llama-2-13b-chat-hf"

In [None]:
# Instantiate the LLM from the Hub repository, quantized according to
# bnb_config and placed according to device_map
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/175 [00:00<?, ?B/s]



In [None]:
# Fetch the pre-trained tokenizer stored in the same Hugging Face repository as the LLM
# NOTE(review): trust_remote_code=True allows code shipped in the repository to run — confirm it is actually needed
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"  # pad at the end of each sequence
tokenizer.pad_token = tokenizer.eos_token  # reuse the end-of-sequence token as the padding token

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
# Build a text-generation inference pipeline around the loaded model and tokenizer
pipe = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
    # Upper bound on the number of generated tokens; the input prompt is not counted
    max_new_tokens=2048,
)

In [None]:
# Wrap the Transformers pipeline in LangChain's HuggingFacePipeline so it can
# serve as the LLM inside LangChain chains (required for the RAG steps below).
llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Build the text corpus that RAG retrieves from
df = (
    pd.read_csv("/content/all_phones_processed.csv")  # the CSV must be stored in the current Colab session
    .loc[lambda frame: frame["Language"] == "eng"]  # keep English items only
    .dropna()  # discard rows that contain any missing value
)
# One text per row: the "Root" column, a space, then the "Comment" column
one_line_data = df["Root"] + " " + df["Comment"]
df_text = one_line_data.reset_index(drop=True).to_frame(name="text")
df_text.head()

Unnamed: 0,text
0,Hello guys\nCan you help me\nI am bit confused...
1,"i have 13 pro with iOS 16.7 , but i want to up..."
2,"i have 13 pro with iOS 16.7 , but i want to up..."
3,not good . wait for 17.1 i already updated to ...
4,"i have 13 pro with iOS 16.7 , but i want to up..."


In [None]:
# Convert the pandas DataFrame into LangChain documents; the "text" column
# supplies the page content of each document.
loader = DataFrameLoader(df_text, page_content_column="text")
documents = loader.load()

In [None]:
# Data split step. Splits each item of the dataset into smaller chunks. This is only useful if the items are very long. For our dataset, this step is not needed and has no effect.

# from langchain.text_splitter import RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=8)
# documents = text_splitter.split_documents(documents)

In [None]:
# Create the sentence-embedding model (from Hugging Face) used to vectorize the documents.
# A dedicated variable name is used so that `model_name`, which holds the LLM
# repository, is not overwritten; re-running the model/tokenizer cells stays correct.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"  # Sentence-transformers model. Maps sentences to a multi-dimensional vector space.
model_kwargs = {"device": "cuda"}  # use GPU for calculations

embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs=model_kwargs)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Embed the documents and store the vectors in a ChromaDB vector database
# that uses the "chroma_db" directory for persistence.
# NOTE(review): re-running this cell against an existing "chroma_db" directory
# may insert the same documents again — confirm before re-executing.
vectordb = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory="chroma_db")

In [None]:
# Create a retriever on top of the Chroma vector database, using its default
# search settings. Given a query, the retriever returns the most relevant
# documents of the dataset.
retriever = vectordb.as_retriever()

In [None]:
# Assemble the retrieval-augmented question-answering chain
qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    # "stuff": every retrieved document is inserted into the final prompt
    chain_type="stuff",
    verbose=True,
)

In [None]:
# Inference helper: sends the same prompt to the plain LLM (no RAG) and to the QA chain (RAG)
def evaluate_rag(query):
    """Print the answers to ``query`` produced without and with RAG.

    The prompt is echoed first. It is then passed to the module-level ``llm``
    (plain model) and afterwards to the module-level ``qa`` chain (retrieval
    augmented). Each answer is printed under its own header. Nothing is returned.
    """
    print(f"Query:\n{query}\n-------------")
    baseline_answer = llm(prompt=query)
    # Header and answer are emitted in one call; the newline separator keeps
    # the blank line between them.
    print("Answer - Without RAG\n", baseline_answer, sep="\n")
    print("\n\n-------------\nAnswer - With RAG\n")
    rag_answer = qa.run(query)
    print(rag_answer)

In [None]:
# Perform a similarity search: find the dataset items most similar to the query
# and print each one together with its score.
def similarity_search(query, k=4):
    """Print the documents of the vector store that best match ``query``.

    Args:
        query: Text to search for in the module-level ``vectordb``.
        k: Number of documents to retrieve. The default of 4 matches the
            default used by LangChain's Chroma wrapper, so existing calls
            behave as before.

    Prints the number of retrieved documents, then one line per document with
    its score (rounded to 3 decimals) and its text. Nothing is returned.
    """
    hits = vectordb.similarity_search_with_score(query, k=k)
    print(f"Retrieved documents: {len(hits)}")
    for document, score in hits:
        # The number is the retrieval score, so it is labelled "Score"
        # (it used to be printed under the misleading label "Source").
        # NOTE(review): for Chroma this is presumably a distance (lower = more similar) — confirm.
        print("Score: ", round(score, 3), "Text: ", document.page_content, "\n")

In [None]:
# Question to answer, once without and once with RAG
question = "Describe Samsung S23 Ultra from a photographers point of view."
# Full prompt: system instructions, the user question and an open assistant turn
query = f"""SYSTEM: You are a helpful, respectful and honest technical assistant. Answer only the qestion. Give a long answer. Your answer should be based on the users opinions. Start the answer with: Based on users opinions...

USER: {question}

ASSISTANT:
"""

evaluate_rag(query)
print("-------------------------")
# List the dataset items retrieved for the bare question (not the full prompt)
similarity_search(question)
print("-------------------------------------------------------------------------------------------------------------------")

Query:
SYSTEM: You are a helpful, respectful and honest technical assistant. Answer only the qestion. Give a long answer. Your answer should be based on the users opinions. Start the answer with: Based on users opinions...

USER: Describe Samsung S23 Ultra from a photographers point of view.

ASSISTANT:

-------------
Answer - Without RAG


Based on users opinions, the Samsung S23 Ultra is a top-notch smartphone that offers exceptional camera capabilities for photographers. The quad-camera setup, which includes a 50-megapixel primary sensor, a 12-megapixel front camera, a 3D depth sensor, and a macro camera, provides a wide range of photography options.

The primary sensor, with its high resolution and large pixel size, delivers excellent image quality with detailed and vibrant colors. The camera also features advanced features such as optical zoom, digital zoom, and bokeh mode, allowing photographers to capture high-quality images with ease.

The front camera, with its 12-megapixel re