In [None]:
"""
Loads the Llama 2 model, loads and prepares the phones dataset, loads the fine-tuned LLM, performs RAG, and generates results for the given queries.
This notebook works on Google Colab with a GPU runtime.
Code inspired by:
https://www.kaggle.com/code/gpreda/rag-using-llama-2-langchain-and-chromadb
https://github.com/mlabonne/llm-course/blob/main/Fine_tune_Llama_2_in_Google_Colab.ipynb
Descriptions are mainly taken from the documentation of the libraries.
"""

In [None]:
# Install the required pip libraries on Colab
!pip install -q accelerate==0.22.0 peft==0.5.0 bitsandbytes==0.40.2 transformers==4.33.0 trl==0.4.7 langchain==0.0.300 sentence_transformers==2.2.2 sentence_transformers==2.2.2 chromadb==0.4.12

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/251.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m204.8/251.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m96.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from google.colab import drive
from langchain.chains import RetrievalQA
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Chroma
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging,
    pipeline,
)
from trl import SFTTrainer

In [None]:
# Mount Google Drive so files can be stored on and loaded from Drive and are not lost when the Colab session expires.
drive.mount('/content/gdrive')
# `logging` here is the Transformers logging utility imported from `transformers` in the imports cell
# (not the standard-library module). Limit Transformers log output to warnings and above.
logging.set_verbosity_warning()

Mounted at /content/gdrive


In [None]:
# Quantization settings applied when the LLM weights are loaded
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=False,  # nested (double) quantization stays off
    bnb_4bit_compute_dtype=torch.float16,  # computations run in half precision
    bnb_4bit_quant_type="nf4",  # use the "nf4" 4-bit quantization type
    load_in_4bit=True,  # load the model with 4-bit precision
)

In [None]:
# Put the whole model on GPU 0. A T4 accelerator is enough.
device_map = {"": 0}
# Hugging Face Hub id of the LLM whose tokenizer is loaded below
model_name = "NousResearch/Llama-2-13b-chat-hf"

In [None]:
# Fetch the pre-trained tokenizer that ships with the LLM repository on the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Pad on the right-hand side of each sequence
tokenizer.padding_side = "right"
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
# Build the text corpus used for RAG. The CSV must be uploaded to the current Colab session.
df = (
    pd.read_csv("/content/all_phones_processed.csv")
    .loc[lambda frame: frame["Language"] == "eng"]  # keep English rows only
    .dropna()  # discard rows that contain any missing value
)
# One line of text per row: the "Root" column, a space, then the "Comment" column
one_line_data = df["Root"] + " " + df["Comment"]
df_text = one_line_data.to_frame(name="text").reset_index(drop=True)
df_text.head()

Unnamed: 0,text
0,Hello guys\nCan you help me\nI am bit confused...
1,"i have 13 pro with iOS 16.7 , but i want to up..."
2,"i have 13 pro with iOS 16.7 , but i want to up..."
3,not good . wait for 17.1 i already updated to ...
4,"i have 13 pro with iOS 16.7 , but i want to up..."


In [None]:
# Shuffle the rows. The fixed seed makes the row order (and therefore any subset taken
# with the optional slice below) identical on every run of the notebook.
df_text = df_text.sample(frac=1, random_state=42).reset_index(drop=True)
# df_text = df_text.iloc[0:20000]  # smaller dataset
print(df_text.shape)
df_text.head()

(30014, 1)


Unnamed: 0,text
0,Sure mahn! That 4.7 inches stands out and make...
1,Is it worth upgrading from 14 pro max to 15 pr...
2,"Sucks that they're doing this to reduce costs,..."
3,I've bought the phone and the battery life suc...
4,Yes extremely draining even the phone is turne...


In [None]:
# Read the fine-tuned model from the mounted Google Drive, quantized with `bnb_config`
# and placed according to `device_map`.
loaded_model = AutoModelForCausalLM.from_pretrained(
    "/content/gdrive/MyDrive/colab_data/model_13B_30000",
    device_map=device_map,
    quantization_config=bnb_config,
)

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/175 [00:00<?, ?B/s]



In [None]:
# Text-generation pipeline that runs inference with the fine-tuned model and its tokenizer.
pipe_tuned = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=loaded_model,
    # Upper bound on the number of generated tokens; the input prompt is not counted.
    max_new_tokens=512,
)

In [None]:
# Wrap the Transformers pipeline as a LangChain LLM so it can be used by the LangChain RAG tools.
llm_tuned = HuggingFacePipeline(
    pipeline=pipe_tuned,
)

In [None]:
# Turn the pandas DataFrame into LangChain documents; the "text" column provides the page content.
loader = DataFrameLoader(
    df_text,
    page_content_column="text",
)
documents = loader.load()

In [None]:
# Create the embedding model from the Hugging Face Hub.
# The name is kept in its own variable: reusing `model_name` here would overwrite the LLM id
# that the tokenizer cell reads, so re-running that cell afterwards would load the wrong tokenizer.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"  # Sentence-transformers model. Maps sentences to a multi-dim. vector space.
model_kwargs = {"device": "cuda"}  # use GPU for calculations

embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs=model_kwargs)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Create (or reopen) the ChromaDB vector database that stores the document embeddings.
# The store is persisted in "chroma_db"; adding the documents again on every run of this cell
# would insert each of them a second time and make the retriever return duplicated passages.
# Therefore the store is opened first and only populated while it is still empty.
# To rebuild it after the dataset has changed, delete the "chroma_db" directory and re-run this cell.
vectordb = Chroma(persist_directory="chroma_db", embedding_function=embeddings)
if not vectordb.get(limit=1)["ids"]:
    vectordb.add_documents(documents)

In [None]:
# Expose the Chroma vector database as a retriever.
# For a given query, the retriever looks up the most relevant entries of the dataset.
retriever = vectordb.as_retriever()

In [None]:
# Question-answering chain: combines the retriever with the fine-tuned LLM.
qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm_tuned,
    chain_type="stuff",
    verbose=True,
)

In [None]:
# Run inference: performs RAG with the fine-tuned LLM.
question = "Is Apple phone better than an Android phone?"
# NOTE(review): the complete prompt below (including the SYSTEM/USER/ASSISTANT scaffolding) is
# passed to the chain as its query, so presumably it is also the text used for retrieval —
# consider retrieving with `question` only and supplying the scaffolding via a prompt template.
query = f'''SYSTEM: You are a helpful, respectful and honest technical assistant. Answer only the question. Answer in one paragraph.

USER: {question}

ASSISTANT:
'''
qa.run(query)



[1m> Entering new RetrievalQA chain...[0m





[1m> Finished chain.[0m


" \n \nThe choice between an Apple phone and an Android phone depends on your personal preferences and needs. Apple phones are known for their sleek design, user-friendly interface, and seamless integration with other Apple devices. They also have a reputation for being more secure and having better battery life. However, Android phones offer more customization options, a wider range of choices in terms of price and features, and the ability to use third-party apps. Ultimately, the decision comes down to what you value most in a phone. \n \nRespectful Answer: \n \nI understand that you are looking for a new phone and are considering both Apple and Android options. Both have their strengths and weaknesses, and the best choice for you will depend on your specific needs and preferences. I would be happy to help you explore your options and find the best phone for you. \n \nHonest Answer: \n \nI'm just an AI, I don't have personal preferences or biases. However, I can provide you with info