In [None]:
"""
Loads LLAMA2 model, loads and prepares the phones_dataset, performs Finetuning, generates results for the given queries.
This notebook works on Google Colab with a GPU runtime.
Code inspired from: https://github.com/mlabonne/llm-course/blob/main/Fine_tune_Llama_2_in_Google_Colab.ipynb
Descriptions are mainly taken from the documentation of the libraries.
"""

In [None]:
# Install the required pip libraries on Colab
!pip install -q accelerate==0.22.0 peft==0.5.0 bitsandbytes==0.40.2 transformers==4.33.0 trl==0.4.7 langchain==0.0.300

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/251.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m204.8/251.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m96.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from google.colab import drive
from langchain.llms import HuggingFacePipeline
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging,
    pipeline,
)
from trl import SFTTrainer

In [None]:
# Mount Google Drive so files can be stored to and loaded from Drive and are not lost when the Colab session expires.
drive.mount('/content/gdrive')
# `logging` here is the transformers logging utility (imported from `transformers`), not the stdlib module.
# Limits the library's log output to warnings and above.
logging.set_verbosity_warning()

Mounted at /content/gdrive


In [None]:
# Quantization settings used when loading the base model (QLoRA-style 4-bit loading).
bnb_config = BitsAndBytesConfig(
    # Load the model weights in 4-bit precision.
    load_in_4bit=True,
    # Use the "nf4" 4-bit quantization type.
    bnb_4bit_quant_type="nf4",
    # Data type used for the computation; float16 speeds up the calculation in a QLoRA setup.
    bnb_4bit_compute_dtype=torch.float16,
    # Nested (double) quantization is switched off.
    bnb_4bit_use_double_quant=False,
)

In [None]:
# Name of the LLM checkpoint to be loaded from the Hugging Face Hub
model_name = "NousResearch/Llama-2-13b-chat-hf"
# Use the GPU for calculations: the whole model is mapped to device 0. A T4 accelerator is enough (per the original author).
device_map = {"": 0}

In [None]:
# Download (if needed) and load the base LLM, quantized according to `bnb_config`
# and placed on the device(s) described by `device_map`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/175 [00:00<?, ?B/s]



In [None]:
# Load the pre-trained tokenizer that is stored in the same Hugging Face repo as the LLM.
# `trust_remote_code` is deliberately NOT enabled: the repo only ships plain tokenizer files
# (tokenizer.model / *.json), so there is no custom code to run, and the model itself is
# loaded without that flag as well. Enabling it would allow repo-provided Python to execute.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right; per the original author this helps avoid an overflow issue.
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
# Load and prepare the dataset that is used for fine-tuning.
# The CSV file has to be uploaded to the current Colab session beforehand.
df = (
    pd.read_csv("/content/all_phones_processed.csv")
    .loc[lambda frame: frame["Language"] == "eng"]  # keep only the English items
    .dropna()  # drop rows that contain missing values
)
# Join the "Root" and "Comment" columns of each row into a single line of text
one_line_data = df["Root"] + " " + df["Comment"]
# One-column frame ("text") with a fresh 0..n-1 index
df_text = one_line_data.to_frame(name="text").reset_index(drop=True)
df_text.head()

Unnamed: 0,text
0,Hello guys\nCan you help me\nI am bit confused...
1,"i have 13 pro with iOS 16.7 , but i want to up..."
2,"i have 13 pro with iOS 16.7 , but i want to up..."
3,not good . wait for 17.1 i already updated to ...
4,"i have 13 pro with iOS 16.7 , but i want to up..."


In [None]:
# Shuffle the rows. A fixed seed makes the resulting row order reproducible across runs
# (previously every run produced a different, unrecoverable ordering).
df_text = df_text.sample(frac=1, random_state=42).reset_index(drop=True)
# df_text = df_text.iloc[0:20000]  # optional: train on a smaller dataset
print(df_text.shape)
df_text.head()

(30014, 1)


Unnamed: 0,text
0,Sure mahn! That 4.7 inches stands out and make...
1,Is it worth upgrading from 14 pro max to 15 pr...
2,"Sucks that they're doing this to reduce costs,..."
3,I've bought the phone and the battery life suc...
4,Yes extremely draining even the phone is turne...


In [None]:
# Convert the pandas DataFrame to a Hugging Face `datasets.Dataset`, the format passed to the trainer below
dataset_phones = Dataset.from_pandas(df_text)

In [None]:
# LoRA configuration.
# PEFT (Parameter-Efficient Fine-Tuning) adapts a pretrained LLM to downstream applications
# without fine-tuning all of the model's parameters.
# Low-Rank Adaptation (LoRA) is a PEFT method that decomposes a large matrix into two smaller
# matrices, which reduces the number of parameters that are fine-tuned.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",  # causal language modeling: predict the next token given all previous ones
    r=64,  # LoRA attention dimension (rank of the two small matrices)
    lora_alpha=32,  # alpha parameter for LoRA scaling
    lora_dropout=0.1,  # dropout probability for the LoRA layers
    bias="none",  # bias type for LoRA
)

In [None]:
# Training hyper-parameters, grouped by concern.
training_arguments = TrainingArguments(
    # --- output and logging ---
    output_dir="/content/",  # where model predictions and checkpoints are written
    logging_steps=25,  # number of update steps between two log entries
    report_to="tensorboard",  # send logs and metrics to TensorBoard
    # --- amount of training ---
    num_train_epochs=1.0,  # 1.0 = one full pass over the dataset
    per_device_train_batch_size=4,  # batch size per GPU
    gradient_accumulation_steps=1,  # update steps over which gradients are accumulated before an update pass
    # --- optimizer ---
    optim="paged_adamw_32bit",  # AdamW (Adam with weight decay) with paging for better memory management
    learning_rate=2e-5,  # learning rate for the paged AdamW optimizer
    weight_decay=0.001,  # applied to all layers except bias and LayerNorm weights
    max_grad_norm=0.3,  # maximum gradient norm (gradient clipping)
    # --- learning-rate schedule ---
    lr_scheduler_type="cosine",  # learning rate follows a cosine schedule
    warmup_ratio=0.03,  # fraction of the total training steps used for the warmup phase of the schedule
    # --- numeric precision ---
    fp16=False,  # no fp16 mixed-precision training (16-bit would be faster than 32-bit)
    bf16=False,  # no bf16 mixed-precision training
)

In [None]:
# Supervised fine-tuning trainer: wraps the quantized base model with the LoRA adapter
# described by `peft_config` and trains it on the "text" column of `dataset_phones`.
trainer = SFTTrainer(
    model=model,  # the model to train
    tokenizer=tokenizer,  # the tokenizer used for training
    args=training_arguments,  # the training hyper-parameters
    peft_config=peft_config,  # PeftConfig used to initialize the PeftModel
    train_dataset=dataset_phones,  # the dataset to train on
    dataset_text_field="text",  # name of the dataset column that holds the text
    max_seq_length=None,  # no explicit maximum sequence length; the library default applies
    packing=False,  # do not pack multiple short examples into one input sequence
)



Map:   0%|          | 0/30014 [00:00<?, ? examples/s]

In [None]:
# Target folder on Google Drive, so the result survives the end of the Colab session
finetuned_model_dir = "/content/gdrive/MyDrive/colab_data/model_13B_30000"

# Run the fine-tuning
trainer.train()
# Persist the fine-tuned model to Google Drive
trainer.save_model(finetuned_model_dir)

In [None]:
# Build a Transformers pipeline that uses the fine-tuned model for inference.
#
# `loaded_model` was referenced here but never defined in the notebook, so this cell failed with a
# NameError on a fresh "Restart & Run All". It is now bound to the model held by the trainer, i.e.
# the model that was just fine-tuned. `.eval()` switches it to inference mode so that dropout
# (including the LoRA dropout) is disabled during generation; it returns the model itself.
loaded_model = trainer.model.eval()

pipe_tuned = pipeline(
    task="text-generation",
    model=loaded_model,
    tokenizer=tokenizer,
    max_new_tokens=512,  # maximum number of tokens to generate, not counting the input prompt
)

In [None]:
# Wrap the Transformers pipeline in a LangChain `HuggingFacePipeline` LLM so it can be queried with a prompt.
llm_tuned = HuggingFacePipeline(pipeline=pipe_tuned)

In [None]:
# Run inference: send a prompt to the fine-tuned LLM.
question = "Which one is better for games, Samsung or Huawei?"
# Prompt layout: a SYSTEM instruction, the USER question and an open ASSISTANT turn,
# separated by blank lines and terminated by a newline.
query = (
    "SYSTEM: You are a helpful, respectful and honest technical assistant. "
    "Answer only the qestion. Answer in one paragraph.\n"
    "\n"
    f"USER: {question}\n"
    "\n"
    "ASSISTANT:\n"
)
llm_tuned(prompt=query)



"\nBoth Samsung and Huawei are great brands with their own strengths and weaknesses. However, when it comes to gaming, Samsung's Exynos chipset is generally considered to be better than Huawei's Kirin chipset. Samsung's Exynos 2200 chipset is the latest flagship chipset from the company and it is designed to provide a seamless gaming experience. It features a powerful CPU and GPU, along with a dedicated AI engine that helps to improve performance. Additionally, Samsung's Exynos chipset is known for its power efficiency, which means that it can provide a longer battery life while gaming. On the other hand, Huawei's Kirin chipset is also a powerful chipset, but it is not as efficient as Samsung's Exynos chipset. However, Huawei's Kirin chipset is known for its excellent performance and it is considered to be one of the best chipsets for gaming. Ultimately, the choice between Samsung and Huawei will depend on your personal preferences and needs. If you are looking for a powerful and effic