## Qlora fine tuning on llama2

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install trl

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

Installing necessary libraries

In [2]:
from datasets import load_dataset, Dataset, DatasetDict
from dataclasses import dataclass, field
from typing import Optional
import torch
from peft import LoraConfig
from tqdm import tqdm
import pandas as pd
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, AutoTokenizer, pipeline
from trl import SFTTrainer

tqdm.pandas()

# nli_data = load_dataset('klue', 'nli')

# print(nli_data)

# train_data = nli_data['train']
# validation_data = nli_data['validation']


train_data = pd.read_csv('train.csv', sep='\t')
validation_data = pd.read_csv('validation.csv', sep='\t')
dataset_dict = DatasetDict({
    'train': train_data,
    'validation': validation_data
})
train_data = dataset_dict['train']
print(train_data)
validation_data = dataset_dict['validation']
print(validation_data)

                                                   text     label
0     The production is to be liquidated before June...  negative
1     The market making in accordance with the agree...   neutral
2     The largest construction company in Finland , ...   neutral
3     The transaction , which includes US$ 1.5 billi...   neutral
4     MADISON , Wis. , Feb. 6 - PRNewswire - -- Fisk...  positive
...                                                 ...       ...
3401  Based on the first quarter result , existing o...  negative
3402  Aldata said that there are still a number of o...   neutral
3403  The casing comprises a first side casing membe...   neutral
3404  The most significant capital expenditure items...   neutral
3405  Stockmann MasterCard has widened the scope of ...  positive

[3406 rows x 2 columns]
                                                  text     label
0    According to Scanfil , demand for telecommunic...  negative
1    Kemira 's partner in the project is St. Peters..

Arrange the data

In [3]:
import random


# id_to_label = {0:'entailment', 1:'neutral', 2:'contradiction'}

# question_template = "### Human: 다음 두 문장의 관계를 entailment, neutral, contradiction 중 하나로 분류해줘. "
# train_instructions = [f'{question_template}\npremise: {x}\nhypothesis: {y}\n\n### Assistant: {id_to_label[z]}' for x,y,z in zip(train_data['premise'],train_data['hypothesis'],train_data['label'])]
# validation_instructions = [f'{question_template}\npremise: {x}\nhypothesis: {y}\n\n### Assistant: {id_to_label[z]}' for x,y,z in zip(validation_data['premise'],validation_data['hypothesis'],validation_data['label'])]

id_to_label = {'negative':0, 'neutral':1, 'positive':2}

question_template = "### Classify the sentiment of the following sentence as negative, neutral or positive."

train_instructions = [f'{question_template}\nStock comments: {x}\n\n### Answer: {y}' for x,y in zip(train_data['text'],train_data['label'])]
validation_instructions = [f'{question_template}\nStock comments: {x}\n\n### Answer: {y}' for x,y in zip(validation_data['text'],validation_data['label'])]


# random.seed(42)
# train_instructions = random.sample(train_instructions, len(train_instructions))
# train_instructions = train_instructions[:300]
# random.seed(42)
# validation_instructions = random.sample(validation_instructions, len(validation_instructions))
# # validation_instructions = validation_instructions[:100]


ds_train = Dataset.from_dict({"text": train_instructions})
ds_validation = Dataset.from_dict({"text": validation_instructions})
instructions_ds_dict = DatasetDict({"train": ds_train, "eval": ds_validation})

In [4]:
print(len(train_instructions))

3406


In [5]:
instructions_ds_dict['train']['text'][0]

'### Classify the sentiment of the following sentence as negative, neutral or positive.\nStock comments: The production is to be liquidated before June 2009 and 325 employees loose their jobs\n\n### Answer: negative'

In [6]:
instructions_ds_dict['eval']['text'][0]

'### Classify the sentiment of the following sentence as negative, neutral or positive.\nStock comments: According to Scanfil , demand for telecommunications network products has fluctuated significantly in the third quarter of 2006 , and the situation is expected to remain unstable for the rest of the year\n\n### Answer: negative'


Training, used beomi/llama-2

In [7]:
model_name = "beomi/llama-2-ko-7b"


@dataclass
class ScriptArguments:
    model_name: Optional[str] = field(default=model_name, metadata={"help": "the model name"})
    dataset_text_field: Optional[str] = field(default="text", metadata={"help": "the text field of the dataset"})
    log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"})
    learning_rate: Optional[float] = field(default=1.41e-5, metadata={"help": "the learning rate"})
    batch_size: Optional[int] = field(default=4, metadata={"help": "the batch size"})
    seq_length: Optional[int] = field(default=512, metadata={"help": "Input sequence length"})
    gradient_accumulation_steps: Optional[int] = field(
        default=2, metadata={"help": "the number of gradient accumulation steps"}
    )
    load_in_8bit: Optional[bool] = field(default=False, metadata={"help": "load the model in 8 bits precision"})
    load_in_4bit: Optional[bool] = field(default=True, metadata={"help": "load the model in 4 bits precision"})
    use_peft: Optional[bool] = field(default=True, metadata={"help": "Wether to use PEFT or not to train adapters"})
    trust_remote_code: Optional[bool] = field(default=True, metadata={"help": "Enable `trust_remote_code`"})
    output_dir: Optional[str] = field(default="output", metadata={"help": "the output directory"})
    peft_lora_r: Optional[int] = field(default=64, metadata={"help": "the r parameter of the LoRA adapters"})
    peft_lora_alpha: Optional[int] = field(default=16, metadata={"help": "the alpha parameter of the LoRA adapters"})
    logging_steps: Optional[int] = field(default=1, metadata={"help": "the number of logging steps"})
    use_auth_token: Optional[bool] = field(default=False, metadata={"help": "Use HF auth token to access the model"})
    num_train_epochs: Optional[int] = field(default=5, metadata={"help": "the number of training epochs"}) #train epoch
    max_steps: Optional[int] = field(default=-1, metadata={"help": "the number of training steps"})
    save_steps: Optional[int] = field(
        default=100, metadata={"help": "Number of updates steps before two checkpoint saves"}
    )
    save_total_limit: Optional[int] = field(default=10, metadata={"help": "Limits total number of checkpoints."})
    push_to_hub: Optional[bool] = field(default=False, metadata={"help": "Push the model to HF Hub"})
    hub_model_id: Optional[str] = field(default=None, metadata={"help": "The name of the model on HF Hub"})


script_args = ScriptArguments()

4-bit quantization implemented

In [8]:
if script_args.load_in_8bit and script_args.load_in_4bit:
    raise ValueError("You can't load the model in 8 bits and 4 bits at the same time")
elif script_args.load_in_8bit or script_args.load_in_4bit:
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=script_args.load_in_8bit, load_in_4bit=script_args.load_in_4bit
    )
    device_map = {"": 0}
    torch_dtype = torch.bfloat16
else:
    device_map = None
    quantization_config = None
    torch_dtype = None

model = AutoModelForCausalLM.from_pretrained(
    script_args.model_name,
    quantization_config=quantization_config,
    device_map=device_map,
    trust_remote_code=script_args.trust_remote_code,
    torch_dtype=torch_dtype,
    use_auth_token=script_args.use_auth_token,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/15 [00:00<?, ?it/s]

model-00001-of-00015.safetensors:   0%|          | 0.00/919M [00:00<?, ?B/s]

model-00002-of-00015.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00003-of-00015.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00004-of-00015.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00005-of-00015.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00006-of-00015.safetensors:   0%|          | 0.00/944M [00:00<?, ?B/s]

model-00007-of-00015.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00008-of-00015.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00009-of-00015.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00010-of-00015.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00011-of-00015.safetensors:   0%|          | 0.00/944M [00:00<?, ?B/s]

model-00012-of-00015.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00013-of-00015.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00014-of-00015.safetensors:   0%|          | 0.00/742M [00:00<?, ?B/s]

model-00015-of-00015.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

Qlora trainer defined

In [9]:
dataset = instructions_ds_dict

training_args = TrainingArguments(
    output_dir=script_args.output_dir,
    per_device_train_batch_size=script_args.batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    learning_rate=script_args.learning_rate,
    logging_steps=script_args.logging_steps,
    num_train_epochs=script_args.num_train_epochs,
    max_steps=script_args.max_steps,
    report_to=script_args.log_with,
    save_steps=script_args.save_steps,
    save_total_limit=script_args.save_total_limit,
    push_to_hub=script_args.push_to_hub,
    hub_model_id=script_args.hub_model_id,
)

if script_args.use_peft:
    peft_config = LoraConfig(
        r=script_args.peft_lora_r,
        lora_alpha=script_args.peft_lora_alpha,
        bias="none",
        task_type="CAUSAL_LM",
    )
else:
    peft_config = None

trainer = SFTTrainer(
    model=model,
    args=training_args,
    max_seq_length=script_args.seq_length,
    train_dataset=dataset['train'],
    eval_dataset=dataset['eval'],
    dataset_text_field=script_args.dataset_text_field,
    peft_config=peft_config,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


tokenizer_config.json:   0%|          | 0.00/842 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.55M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



Map:   0%|          | 0/3406 [00:00<?, ? examples/s]

Map:   0%|          | 0/378 [00:00<?, ? examples/s]



Trained on Google colab, 2hr on L4

In [11]:
# trainer.predict(instructions_ds_dict['eval']['text'][1])

Evaluation

In [12]:
model.eval()
# from transformers import AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, AutoTokenizer, pipeline
tokenizer = AutoTokenizer.from_pretrained(model_name)

pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map={'':0},
)

In [13]:
print(instructions_ds_dict['eval']['text'][1])
print(instructions_ds_dict['eval']['text'][1].split('### Answer: ')[0] + '### Answer:')

### Classify the sentiment of the following sentence as negative, neutral or positive.
Stock comments: Kemira 's partner in the project is St. Petersburg Water Works

### Answer: neutral
### Classify the sentiment of the following sentence as negative, neutral or positive.
Stock comments: Kemira 's partner in the project is St. Petersburg Water Works

### Answer:


In [18]:
results = []

for seq in sequences:
  # print(seq)
  result = seq[0]['generated_text'].split('### Answer:')[1]
  results.append(result)

labels = []
instructions_ds_dict['eval']['text'][0]
print(instructions_ds_dict['eval']['text'][0])
for label in instructions_ds_dict['eval']['text']:
  result = label.split('### Answer:')[1]
  labels.append(result)
print("label is",labels[0])

print("Accuracy: ", (len([1 for x, y in zip(results, labels) if y in x]) / len(labels)))

### Classify the sentiment of the following sentence as negative, neutral or positive.
Stock comments: According to Scanfil , demand for telecommunications network products has fluctuated significantly in the third quarter of 2006 , and the situation is expected to remain unstable for the rest of the year

### Answer: negative
label is  negative
Accuracy:  0.828042328042328


In [None]:
print(len(instructions_ds_dict['eval']))

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
output_merged_dir = "/content/drive/MyDrive/merged_result/news_classification_llama2_7b/final_merged_checkpoint"

In [21]:

model.save_pretrained( "/content/drive/MyDrive/merged_result/news_classification_llama2_7b/final_merged_checkpoint")



In [22]:
tokenizer.save_pretrained(output_merged_dir)

('/content/drive/MyDrive/merged_result/news_classification_llama2_7b/final_merged_checkpoint/tokenizer_config.json',
 '/content/drive/MyDrive/merged_result/news_classification_llama2_7b/final_merged_checkpoint/special_tokens_map.json',
 '/content/drive/MyDrive/merged_result/news_classification_llama2_7b/final_merged_checkpoint/tokenizer.json')