In [3]:
!pip install accelerate peft bitsandbytes transformers trl langchain -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.3/45.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import pandas as pd
from string import Template
from pathlib import Path


import os

import warnings
warnings.simplefilter("ignore")

from tqdm.notebook import tqdm

# for training
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
# for traing set
from datasets import load_dataset
from langchain.prompts import PromptTemplate
import matplotlib.pyplot as plt
import bitsandbytes as bnb
import numpy as np

from IPython.display import Markdown, display

In [5]:
# change model_name to the model of your choice.
# This can be either name of the model on huggingface (requires internet) or path to the model
model_name = "NousResearch/Llama-2-7b-chat-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtyp=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
)
# this should be set as False for finetuning
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)lve/main/config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [7]:
!pip install opendatasets -q
!pip install pandas -q

In [8]:
import opendatasets as od
import pandas

od.download("https://www.kaggle.com/datasets/dineshpiyasamara/sentiment-analysis-dataset")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: aizenittechnologies
Your Kaggle Key: ··········
Downloading sentiment-analysis-dataset.zip to ./sentiment-analysis-dataset


100%|██████████| 460k/460k [00:00<00:00, 118MB/s]







In [26]:
data = pd.read_csv("/content/sentiment-analysis-dataset/sentiment_analysis.csv")

In [27]:
data

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...
...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...
7916,7917,0,We would like to wish you an amazing day! Make...
7917,7918,0,Helping my lovely 90 year old neighbor with he...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...


In [28]:
data['label'] = data['label'].replace({0: 'Positive', 1: 'Negative'})

In [29]:
data

Unnamed: 0,id,label,tweet
0,1,Positive,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,Positive,Finally a transparant silicon case ^^ Thanks t...
2,3,Positive,We love this! Would you go? #talk #makememorie...
3,4,Positive,I'm wired I know I'm George I was made that wa...
4,5,Negative,What amazing service! Apple won't even talk to...
...,...,...,...
7915,7916,Positive,Live out loud #lol #liveoutloud #selfie #smile...
7916,7917,Positive,We would like to wish you an amazing day! Make...
7917,7918,Positive,Helping my lovely 90 year old neighbor with he...
7918,7919,Positive,Finally got my #smart #pocket #wifi stay conne...


In [35]:
import datasets
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_dict(data)
my_dataset_dict = datasets.DatasetDict({"train":train_dataset})

In [36]:
my_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'tweet'],
        num_rows: 7920
    })
})

In [37]:
# prepare template
template = """Analyze the sentiment to identify positive or negative. Answer should be one among [Negative, Positive]

Sentiment: {prompt}\n

### Answer: {answer}"""

prompt = PromptTemplate(template=template, input_variables=['prompt', 'answer'])

In [38]:
sample = my_dataset_dict['train'][0]

In [39]:
display(Markdown(prompt.format(prompt=sample['tweet'],
                               answer=sample['label'])))

Analyze the sentiment to identify positive or negative. Answer should be one among [Negative, Positive]

Sentiment: #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone


### Answer: Positive

In [21]:
def format_text(example):
    """ fill inputs in promt for a sample  """
    text = prompt.format(prompt=example['tweet'],
                         answer=example['label'])
    return {"text": text}

In [40]:
my_dataset_dict = my_dataset_dict.map(format_text)

Map:   0%|          | 0/7920 [00:00<?, ? examples/s]

In [41]:
my_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'tweet', 'text'],
        num_rows: 7920
    })
})

In [42]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )


In [43]:
def find_linear_layers(model):
    """ find linear layers in given transformer model """
    lora_module_names = set()
    for name, module in model.named_modules():
        # 4 bits for qlora
        if isinstance(module, bnb.nn.Linear4bit):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    print(f"LoRA module names: {list(lora_module_names)}")
    return list(lora_module_names)


target_modules = find_linear_layers(model)
#for llama 2 (they need different target module)
qlora_config = LoraConfig(
    r=16,  # dimension of the updated matrices
    lora_alpha=64,  # parameter for scaling
    target_modules=target_modules, # this chooses on which layers QLoRA is applied
    lora_dropout=0.1,  # dropout probability for layers
    bias="none",
    task_type="CAUSAL_LM",
)

LoRA module names: ['gate_proj', 'down_proj', 'k_proj', 'q_proj', 'up_proj', 'v_proj', 'o_proj']


In [45]:
# "max_steps=1" is just for testing execution
training_args = TrainingArguments(
    output_dir="./SFT-llama2-7b",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    logging_steps=20,
    logging_strategy="steps",
    warmup_steps=2,
#     num_train_epochs=1,
    max_steps=1,
    optim="paged_adamw_8bit",
    fp16=True,
    run_name="baseline-llama2-sft",
    save_total_limit=1,  # can be increased, but but beware of kaggle notebook output size limit
    report_to="none"
)

In [47]:
supervised_finetuning_trainer = SFTTrainer(
    model,
    train_dataset=my_dataset_dict['train'],
    args=training_args,
    tokenizer=tokenizer,
    peft_config=qlora_config,
    dataset_text_field="text",
    max_seq_length=3000,
    data_collator=DataCollatorForCompletionOnlyLM(tokenizer=tokenizer,
                                                  response_template="Answer:")
)

Map:   0%|          | 0/7920 [00:00<?, ? examples/s]

In [48]:
supervised_finetuning_trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=1, training_loss=0.0, metrics={'train_runtime': 7.2203, 'train_samples_per_second': 1.108, 'train_steps_per_second': 0.138, 'total_flos': 37331352354816.0, 'train_loss': 0.0, 'epoch': 0.0})

In [49]:
model_to_save = supervised_finetuning_trainer.model.module if hasattr(supervised_finetuning_trainer.model, 'module') else supervised_finetuning_trainer.model
model_to_save.save_pretrained("outputs")

In [68]:
# prepare template
template = """Analyze the sentiment to identify positive or negative. Answer should be one among [Negative, Positive]

Sentiment: {prompt}\n

### Answer: {answer}"""

prompt = PromptTemplate(template=template, input_variables=['prompt', 'answer'])

In [69]:
def format_text_test(example):
    """ fill inputs in promt for a sample  """
    text = prompt.format(prompt=example['tweet'],
                         answer='')
    return {"text": text}

In [70]:
from datasets import Dataset, DatasetDict

# Create sample data
data_train = {'tweet': ['i am dissapointed. hate you. it is false']}

df_train = pd.DataFrame(data_train)

dataset_train = Dataset.from_pandas(df_train)

dataset_dict = DatasetDict({'train': dataset_train})

In [71]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['tweet'],
        num_rows: 1
    })
})

In [72]:
test_dataset = dataset_dict.map(format_text_test)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [73]:
test_dataset

DatasetDict({
    train: Dataset({
        features: ['tweet', 'text'],
        num_rows: 1
    })
})

In [74]:
from torch import nn
class Perplexity(nn.Module):
    def __init__(self, reduce: bool = True):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss()
        self.reduce = reduce

    def forward(self, logits, labels):
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        perplexity = []
        for i in range(labels.shape[0]):
            perplexity.append(self.loss_fn(shift_logits[i], shift_labels[i]))
        perplexity = torch.stack(perplexity, dim=0)
        if self.reduce:
            perplexity = torch.mean(perplexity)
        return perplexity

perp = Perplexity()

In [75]:
preds = []
for idx in tqdm(range(len(test_dataset["train"])), total=len(test_dataset["train"])):

    with torch.no_grad():
        cols = ["Negative", "Positive"]
        perps = []
        samples = []
        for col in cols:
            prompt = test_dataset['train'][idx]['text']
            samples.append(prompt + col)
        inputs = tokenizer(samples, return_tensors="pt", add_special_tokens=False, padding=True, truncation=True).to("cuda")

        print(samples)

        output = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        output = output.logits
        labels = inputs["input_ids"]
        labels.masked_fill_(~inputs["attention_mask"].bool(), -100)
        for j in range(len(cols)):
            p = perp(output[j].unsqueeze(0), labels[j].unsqueeze(0))
            perps.append(p.detach().cpu())

        del inputs
        del labels
        del output
        del p

    perps = np.array(perps)
    print(perps)
    predictions = [np.array(cols)[np.argsort(perps)]]
    preds.append(predictions)

  0%|          | 0/1 [00:00<?, ?it/s]

['Analyze the sentiment to identify positive or negative. Answer should be one among [Negative, Positive]\n\nSentiment: i am dissapointed. hate you. it is false\n\n\n### Answer: Negative', 'Analyze the sentiment to identify positive or negative. Answer should be one among [Negative, Positive]\n\nSentiment: i am dissapointed. hate you. it is false\n\n\n### Answer: Positive']
[3.755079  3.8036175]


In [78]:
preds[0][0][0]

'Negative'