In [1]:
from datasets import Dataset, DatasetDict, load_dataset
import ast
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from unsloth import FastLanguageModel

In [3]:

PROMPT_TEMPLATE = "[INST] Extract span of text from the customer review associated with the topic - {}. Customer Review : '{}'[/INST] {}{}"
csv_path = "./arcelik_llm_training_set.csv"
base_model_id = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit"
max_seq_length = 130
load_in_4bit =True

In [4]:
def get_model_and_tokenizer(model_id: str, max_length, load_in_4bit,):
    model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )
    return model, tokenizer

In [5]:

model, tokenizer = get_model_and_tokenizer(base_model_id,
                                           max_seq_length,
                                           load_in_4bit)

==((====))==  Unsloth: Fast Mistral patching release 2024.2
   \\   /|    GPU: NVIDIA RTX A4000. Max memory: 15.731 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.24. FA = False.
 "-____-"     Apache 2 free license: http://github.com/unslothai/unsloth




In [25]:

EOS_TOKEN = tokenizer.eos_token

def format_row(row):
    indices = ast.literal_eval(row['indices'])
    span = row['sentence'][indices[-1]:indices[1]]
    try:
        assert span == row['span']
    except Exception:
        print(row['sentence'])
        print(span)
        print(row['span'])
        print(indices)
        print("next")
    formatted_string = PROMPT_TEMPLATE.format(row['topic'], row['sentence'], span, EOS_TOKEN)
    # result = tokenizer(formatted_string,truncation=True,
        # max_length=max_length,
        # padding="max_length",)
    # result["input_ids"].append(tokenizer.eos_token_id)
    # new_record ["labels"] = result["input_ids"].copy()
    return {'text' : formatted_string}

def get_data(csv_path, valid_set_ratio = 0.15):
    data = load_dataset('csv', data_files="arcelik_llm_training_set.csv")
    dataset = data.map(format_row, remove_columns=data.column_names['train'])
    # Split the dataset into training and evaluation sets
    train_test_split = dataset['train'].train_test_split(test_size=0.15)
    dataset = DatasetDict({
        'train': train_test_split['train'],
        'eval': train_test_split['test']
    })
    return dataset



In [26]:

train_dataset = get_data(csv_path)

In [29]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 22825
    })
    eval: Dataset({
        features: ['text'],
        num_rows: 4029
    })
})

In [27]:
train_dataset['train'][0]

{'text': "[INST] Extract span of text from the customer review associated with the topic - Value. Customer Review : 'I like this model and its heavy also.'[/INST] </s>"}

In [28]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        learning_rate = 2e-4,
        num_train_epochs=2,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

ValueError: Column to remove ['train', 'eval'] not in the dataset. Current columns in the dataset: ['text']