In [1]:
# install necessary libraries
!pip install accelerate peft bitsandbytes transformers trl langchain -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/261.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m256.0/261.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m136.0/136.0 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m69.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.9/133.9 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

In [2]:
# install libraries to work with kaggle dataset
!pip install opendatasets -q
!pip install pandas -q

In [3]:
import warnings
warnings.simplefilter("ignore")

In [4]:
import os
import re
import string
import pandas as pd
from string import Template
from pathlib import Path
from tqdm.notebook import tqdm
import bitsandbytes as bnb
import numpy as np
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from datasets import load_dataset
from langchain.prompts import PromptTemplate

import opendatasets as od
import pandas
import datasets

from IPython.display import Markdown, display

### Base Model configurations

In [5]:
model_name = "NousResearch/Llama-2-7b-chat-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtyp=torch.bfloat16,
)

### Load Model

In [6]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
)
# set configurations for fine tunning
model.config.use_cache = False

Downloading (…)lve/main/config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

### Load Tokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)okenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

### Download a dataset for fine-tunning

In [8]:
od.download("https://www.kaggle.com/datasets/dineshpiyasamara/sentiment-analysis-dataset")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: dineshpiyasamara
Your Kaggle Key: ··········
Downloading sentiment-analysis-dataset.zip to ./sentiment-analysis-dataset


100%|██████████| 460k/460k [00:00<00:00, 149MB/s]







### Load dataset

In [9]:
dataset = pd.read_csv("/content/sentiment-analysis-dataset/sentiment_analysis.csv")

In [10]:
dataset.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


### Data Preprocessing

In [11]:
dataset['label'] = dataset['label'].replace({0: 'Positive', 1: 'Negative'})

In [12]:
dataset.head()

Unnamed: 0,id,label,tweet
0,1,Positive,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,Positive,Finally a transparant silicon case ^^ Thanks t...
2,3,Positive,We love this! Would you go? #talk #makememorie...
3,4,Positive,I'm wired I know I'm George I was made that wa...
4,5,Negative,What amazing service! Apple won't even talk to...


In [13]:
# convert uppercase to lowercase
dataset["tweet"] = dataset["tweet"].apply(lambda x: " ".join(x.lower() for x in x.split()))

# remove urls within tweets
dataset["tweet"] = dataset['tweet'].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE) for x in x.split()))

# remove punctuations
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

dataset["tweet"] = dataset["tweet"].apply(remove_punctuations)

In [14]:
dataset.head()

Unnamed: 0,id,label,tweet
0,1,Positive,fingerprint pregnancy test android apps beaut...
1,2,Positive,finally a transparant silicon case thanks to ...
2,3,Positive,we love this would you go talk makememories un...
3,4,Positive,im wired i know im george i was made that way ...
4,5,Negative,what amazing service apple wont even talk to m...


In [15]:
# convert pandas dataframe into DatasetDict object
train_dataset = datasets.Dataset.from_dict(dataset)
dataset_dict = datasets.DatasetDict({"train":train_dataset})

In [16]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'tweet'],
        num_rows: 7920
    })
})

### Prepare the Template

In [17]:
template = """Analyze the sentiment to identify positive or negative. Answer should be one among [Negative, Positive]

Sentiment: {prompt}\n

### Answer: {answer}"""

prompt = PromptTemplate(template=template, input_variables=['prompt', 'answer'])

In [18]:
# get data sample
sample_data = dataset_dict['train'][0]
sample_data

{'id': 1,
 'label': 'Positive',
 'tweet': 'fingerprint pregnancy test  android apps beautiful cute health igers iphoneonly iphonesia iphone'}

In [19]:
display(Markdown(prompt.format(prompt=sample_data['tweet'],
                               answer=sample_data['label'])))

Analyze the sentiment to identify positive or negative. Answer should be one among [Negative, Positive]

Sentiment: fingerprint pregnancy test  android apps beautiful cute health igers iphoneonly iphonesia iphone


### Answer: Positive

In [20]:
# apply prompt formatting to whole dataset
def format_text(example):
    text = prompt.format(prompt=example['tweet'],
                         answer=example['label'])
    return {"text": text}

In [21]:
dataset_dict = dataset_dict.map(format_text)

Map:   0%|          | 0/7920 [00:00<?, ? examples/s]

In [22]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'tweet', 'text'],
        num_rows: 7920
    })
})

### Model Fine-tunning

In [23]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )


In [24]:
# this method used to find linear layers in the model
def find_linear_layers(model):
    lora_module_names = set()
    for name, module in model.named_modules():
        # 4 bits for qlora
        if isinstance(module, bnb.nn.Linear4bit):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    print(f"LoRA module names: {list(lora_module_names)}")
    return list(lora_module_names)

In [25]:
target_modules = find_linear_layers(model)

# set up qlora configurations
qlora_config = LoraConfig(
    r=16,
    lora_alpha=64,
    target_modules=target_modules,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

LoRA module names: ['up_proj', 'gate_proj', 'k_proj', 'o_proj', 'down_proj', 'q_proj', 'v_proj']


In [26]:
# set up training arguments
training_args = TrainingArguments(
    output_dir="./SFT-llama2-7b",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    logging_steps=20,
    logging_strategy="steps",
    warmup_steps=2,
    # num_train_epochs=1,
    max_steps=4,
    optim="paged_adamw_8bit",
    fp16=True,
    run_name="baseline-llama2-sft",
    save_total_limit=1,
    report_to="none"
)

In [27]:
# initialize the supervised fine-tunning object
supervised_finetuning_trainer = SFTTrainer(
    model,
    train_dataset=dataset_dict['train'],
    args=training_args,
    tokenizer=tokenizer,
    peft_config=qlora_config,
    dataset_text_field="text",
    max_seq_length=3000,
    data_collator=DataCollatorForCompletionOnlyLM(tokenizer=tokenizer,
                                                  response_template="Answer:")
)

Map:   0%|          | 0/7920 [00:00<?, ? examples/s]

In [28]:
# fine-tune the model with new dataset
supervised_finetuning_trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=4, training_loss=0.0, metrics={'train_runtime': 13.4284, 'train_samples_per_second': 2.383, 'train_steps_per_second': 0.298, 'total_flos': 100347951415296.0, 'train_loss': 0.0, 'epoch': 0.0})

In [29]:
# save fine-tunned model
model_to_save = supervised_finetuning_trainer.model.module if hasattr(supervised_finetuning_trainer.model, 'module') else supervised_finetuning_trainer.model
model_to_save.save_pretrained("SFT-llama2-7b")

### Get Predictions

In [30]:
# lora_config = LoraConfig.from_pretrained('SFT-llama2-7b')
# model = get_peft_model(model, lora_config)

In [31]:
# prepare template
template = """Analyze the sentiment to identify positive or negative. Answer should be one among [Negative, Positive]

Sentiment: {prompt}\n

### Answer: {answer}"""

prompt = PromptTemplate(template=template, input_variables=['prompt', 'answer'])

In [32]:
class Perplexity(nn.Module):
    def __init__(self, reduce: bool = True):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss()
        self.reduce = reduce

    def forward(self, logits, labels):
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        perplexity = []
        for i in range(labels.shape[0]):
            perplexity.append(self.loss_fn(shift_logits[i], shift_labels[i]))
        perplexity = torch.stack(perplexity, dim=0)
        if self.reduce:
            perplexity = torch.mean(perplexity)
        return perplexity

perp = Perplexity()

In [33]:
def format_text_test(example):
    text = prompt.format(prompt=example['tweet'],
                         answer='')
    return {"text": text}

In [37]:
# prepare test sample
data_train = {'tweet': ['Tried a new recipe, and it was a disaster. I should stick to ordering takeout']}
df_train = pd.DataFrame(data_train)
dataset_train = datasets.Dataset.from_pandas(df_train)
dataset_dict = datasets.DatasetDict({'train': dataset_train})

In [38]:
test_dataset = dataset_dict.map(format_text_test)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [39]:
# get prediction
for idx in tqdm(range(len(test_dataset["train"])), total=len(test_dataset["train"])):

    with torch.no_grad():
        cols = ["Negative", "Positive"]
        perps = []
        samples = []
        for col in cols:
            prompt = test_dataset['train'][idx]['text']
            samples.append(prompt + col)
        inputs = tokenizer(samples, return_tensors="pt", add_special_tokens=False, padding=True, truncation=True).to("cuda")

        print(samples)

        output = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        output = output.logits
        labels = inputs["input_ids"]
        labels.masked_fill_(~inputs["attention_mask"].bool(), -100)
        for j in range(len(cols)):
            p = perp(output[j].unsqueeze(0), labels[j].unsqueeze(0))
            perps.append(p.detach().cpu())

        del inputs
        del labels
        del output
        del p

    perps = np.array(perps)
    print(perps)

  0%|          | 0/1 [00:00<?, ?it/s]

['Analyze the sentiment to identify positive or negative. Answer should be one among [Negative, Positive]\n\nSentiment: Random acts of kindness make the world a better place. Someone bought my coffee today!\n\n\n### Answer: Negative', 'Analyze the sentiment to identify positive or negative. Answer should be one among [Negative, Positive]\n\nSentiment: Random acts of kindness make the world a better place. Someone bought my coffee today!\n\n\n### Answer: Positive']
[3.1746397 2.9995096]
