In [None]:
# Environment setup: upgrade pip, then install the fine-tuning stack.
# peft, bitsandbytes, trl and datasets are pinned to exact versions.
# NOTE(review): einops is installed without a version pin and without the quiet
# flags used on the other lines; transformers and torch are imported by this
# notebook but not installed here, so they are presumably supplied by the
# runtime or pulled in as dependencies — TODO pin them for reproducibility.
!pip install -Uqqq pip --progress-bar off
!pip install -qqq peft==0.5.0 --progress-bar off
!pip install -qqq bitsandbytes==0.41.1 --progress-bar off
!pip install -qqq trl==0.7.1 --progress-bar off
!pip install einops
!pip install -qqq datasets==2.15.0

In [None]:
import json
import re
from pprint import pprint
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from trl import SFTTrainer

# First CUDA device when one is available, otherwise the CPU.
# NOTE(review): DEVICE is not referenced by any other cell shown in this notebook.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Hub identifier of the base checkpoint loaded by create_model_and_tokenizer().
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

In [None]:
# Interactive Hugging Face Hub login; presumably required for downloading the
# base model and for the push_to_hub call in the last cell — confirm.
notebook_login()

In [None]:
def create_model_and_tokenizer():
    """Load ``MODEL_NAME`` in 4-bit precision together with its tokenizer.

    Returns:
        tuple: ``(model, tokenizer)`` where the model was loaded with a
        4-bit NF4 quantization config and the tokenizer pads on the right
        using its EOS token.
    """
    # 4-bit NF4 weights; float16 is used as the compute dtype.
    quantization = BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        load_in_4bit=True,
    )
    load_options = {
        "quantization_config": quantization,
        "trust_remote_code": True,
        "device_map": "auto",
    }
    base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_options)

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    # The EOS token doubles as the padding token; padding is appended on the right.
    tok.padding_side = "right"
    tok.pad_token = tok.eos_token

    return base_model, tok

In [None]:
# Instantiate the quantized model and its tokenizer via create_model_and_tokenizer().
model, tokenizer = create_model_and_tokenizer()
# Switch off `use_cache` on the model config before training
# (presumably the generation key/value cache, which training does not need — confirm).
model.config.use_cache = False

In [None]:
# Display the quantization settings recorded on the loaded model's config.
model.config.quantization_config.to_dict()

In [None]:
# LoRA hyperparameters: adapter rank, alpha, and dropout applied to the adapters.
lora_r = 16
lora_alpha = 64
lora_dropout = 0.1
# Names of the modules that receive LoRA adapters
# (presumably the attention q/k/v/o and MLP gate/up/down projections — confirm
# against the model's module names).
lora_target_modules = [
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj",
]


# Adapter configuration for a causal-LM task; bias parameters are not adapted.
# NOTE(review): the same names are assigned again, with identical values, in the
# cell that builds the TrainingArguments.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
# Load 'chienpham/vnpara_train' from the Hub and hold out 10% of its train
# split for evaluation.
dataset = load_dataset('chienpham/vnpara_train')
# A fixed seed keeps the train/validation partition identical across kernel
# restarts, so evaluation numbers from different runs stay comparable.
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
# train_test_split names the held-out part "test"; expose it as "validation",
# which is the key the later cells read.
dataset['validation'] = dataset['test']

In [None]:
# Instruction text describing a 0/1 answer format; used as the default value of
# the ``system_prompt`` parameter of generate_training_prompt().
DEFAULT_SYSTEM_PROMPT = (
    "Evaluate the semantic similarity of these two sentences, "
    "answer with only one character: "
    "0 means the two sentences has different meaning, "
    "1 means the two sentences has the same meaning."
)


def generate_training_prompt(
    s1: str, s2: str, lbl: int, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    """Render one supervised example as a question/answer text block.

    Args:
        s1: First sentence of the pair.
        s2: Second sentence of the pair.
        lbl: Pair label; a value equal to ``1`` yields the answer ``Yes.``,
            any other value yields ``No.``.
        system_prompt: Accepted for interface compatibility; it is not
            inserted into the returned text.

    Returns:
        The prompt text, ending with the expected answer and a newline.
    """
    if lbl == 1:
        answer = "Yes."
    else:
        answer = "No."

    return (
        "### Question: Do these two sentences have the same meaning? \n"
        f"Sentence 1: {s1}\n"
        f"Sentence 2: {s2}\n"
        'Answer "Yes" or "No".\n'
        "\n"
        "### Answer:\n"
        f"{answer}\n"
    )
def generate_text(row):
    """Build the ``text`` field for a single dataset row.

    Args:
        row: Mapping that provides the ``sentence1``, ``sentence2`` and
            ``label`` keys.

    Returns:
        A dict with one key, ``"text"``, holding the rendered training prompt.
    """
    rendered = generate_training_prompt(
        row['sentence1'], row['sentence2'], row['label']
    )
    return {"text": rendered}
def process_dataset(data: Dataset):
    """Shuffle a split, add the ``text`` column, and drop the raw columns.

    Args:
        data: Split containing ``sentence1``, ``sentence2`` and ``label``.

    Returns:
        The shuffled split (seed 42) with a ``text`` column produced by
        ``generate_text`` and without the three source columns.
    """
    source_columns = ["sentence1", "sentence2", "label"]
    shuffled = data.shuffle(seed=42)
    with_text = shuffled.map(generate_text)
    return with_text.remove_columns(source_columns)

In [None]:
# Text-generation pipeline built around the already-loaded model and tokenizer.
# NOTE(review): `model` was loaded with a 4-bit quantization config and
# device_map="auto", so the torch_dtype/device_map arguments here may have no
# effect (or conflict) — confirm. `pipe` is not used by any other cell shown
# in this notebook.
pipe = pipeline(
    "text-generation", 
    model=model, 
    tokenizer = tokenizer, 
    torch_dtype=torch.bfloat16, 
    device_map="auto"
)



In [None]:
# Hand-written sample prompt with one sentence pair.
# NOTE(review): it asks for a 0/1 answer, whereas generate_training_prompt()
# trains on a "Yes"/"No" answer format; `prompt` is not used by any other cell
# shown in this notebook.
prompt = """
Evaluate the semantic similarity of these two sentences, answer with only one character: 0 means the two sentences has different meaning, 1 means the two sentences has the same meaning.
Sentence 1: Hacker còn tuyên bố " sẽ dòm ngó các trung tâm và công ty bảo mật " khác.
Sentence 2: Thông điệp nhóm hacker này để lại là: "Chúng tôi sẽ dòm ngó các trung tâm và công ty bảo mật".
"""

In [None]:
# NOTE(review): this cell redefines process_dataset; an equivalent definition
# already exists in the cell that defines generate_training_prompt.
def process_dataset(data: Dataset):
    """Shuffle a split, add the ``text`` column, and drop the raw columns.

    Args:
        data: Split containing ``sentence1``, ``sentence2`` and ``label``.

    Returns:
        The shuffled split (seed 42) with a ``text`` column produced by
        ``generate_text`` and without the three source columns.
    """
    source_columns = ["sentence1", "sentence2", "label"]
    shuffled = data.shuffle(seed=42)
    with_text = shuffled.map(generate_text)
    return with_text.remove_columns(source_columns)

In [None]:
# Replace the raw train/validation splits with their prompt-formatted versions.
# NOTE(review): the splits are overwritten in place, so this cell is not
# idempotent — process_dataset() drops the sentence1/sentence2/label columns
# that generate_text() reads, so running the cell a second time is expected to fail.
dataset["train"] = process_dataset(dataset["train"])
dataset["validation"] = process_dataset(dataset["validation"])

In [None]:
# Display the dataset dictionary to inspect the available splits and columns.
dataset

In [None]:
# Directory passed to the trainer as `output_dir`.
OUTPUT_DIR = "experiments"
# Trainer hyperparameters for the fine-tuning run.
training_arguments = TrainingArguments(
    # 4 sequences per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    logging_steps=1,
    learning_rate=1e-4,
    # fp16 training; create_model_and_tokenizer() also uses float16 as the
    # 4-bit compute dtype.
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=2,
    evaluation_strategy="steps",
    # NOTE(review): a float below 1 is expected to be read as a fraction of the
    # total training steps — confirm against the installed transformers version.
    eval_steps=0.2,
    warmup_ratio=0.05,
    save_strategy="epoch",
    group_by_length=True,
    output_dir=OUTPUT_DIR,
    report_to="tensorboard",
    save_safetensors=True,
    lr_scheduler_type="cosine",
    seed=42,
)

# NOTE(review): these LoRA settings repeat, value for value, the assignments
# made in the earlier LoRA configuration cell; this cell simply rebinds the
# same names before the trainer is built.
lora_r = 16
lora_alpha = 64
lora_dropout = 0.1
# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj",
]


# Adapter configuration for a causal-LM task; bias parameters are not adapted.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
# Supervised fine-tuning trainer over the prompt-formatted splits.
# The training text is read from the "text" column produced by process_dataset();
# passing peft_config presumably makes the trainer attach the LoRA adapter to
# `model` itself — confirm against the pinned trl version.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=4096,
    tokenizer=tokenizer,
    args=training_arguments,
)

In [None]:
# Run the fine-tuning loop configured by `training_arguments`.
trainer.train()

In [None]:
# When training wrapped the model for distributed/parallel execution, the real
# model is exposed as `.module`; otherwise the trainer's model is used directly.
model_to_save = getattr(trainer.model, "module", trainer.model)
# Persist the trained model to the local "outputs" directory.
model_to_save.save_pretrained("outputs")

In [None]:
# Reload the adapter saved in "outputs" onto the base model so that the object
# pushed to the Hub carries the *trained* LoRA weights.
#
# The previous code read only the LoraConfig from "outputs" and passed it to
# get_peft_model(), which builds a brand-new adapter from a config with freshly
# initialised weights; the trained weights on disk were never loaded, so the
# subsequent push_to_hub() would have uploaded an untrained adapter.
# PeftModel.from_pretrained() loads both the adapter config and its weights.
model = PeftModel.from_pretrained(model, "outputs")

In [None]:
# Upload `model` to the Hugging Face Hub repository "Oillim/Mistral-7b-vnpara".
model.push_to_hub("Oillim/Mistral-7b-vnpara")