In [1]:
!pip install peft trl accelerate bitsandbytes
!pip install -U datasets



In [2]:
import os
import torch
import json
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import json
from os import listdir, makedirs
from os.path import isfile, join, splitext, exists

# Assume the data set is in the below subfolder
inputDataPrefix = "data/"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Loads the samples in the train, validation, or test set
def loadMOROCODataSamples(subsetName):
# Copyright for function (C) 2018  Andrei M. Butnaru, Radu Tudor Ionescu
    inputSamplesFilePath = (inputDataPrefix + "%s/samples.txt") % (subsetName)
    inputDialectLabelsFilePath = (inputDataPrefix + "%s/dialect_labels.txt") % (subsetName)
    inputCategoryLabelsFilePath = (inputDataPrefix + "%s/category_labels.txt") % (subsetName)
    
    IDs = []
    samples = []
    dialectLabels = []
    categoryLabels = []
    
    # Loading the data samples
    inputSamplesFile = open(inputSamplesFilePath, 'r')
    sampleRows = inputSamplesFile.readlines()
    inputSamplesFile.close()

    for row in sampleRows:
        components = row.split("\t")
        IDs += [components[0]]
        samples += [" ".join(components[1:])]

    # Loading the dialect labels
    inputDialectLabelsFile = open(inputDialectLabelsFilePath, 'r')
    dialectRows = inputDialectLabelsFile.readlines()
    inputDialectLabelsFile.close()
    
    for row in dialectRows:
        components = row.split("\t")
        dialectLabels += [int(components[1])]
    
    # Loading the category labels
    inputCategoryLabelsFile = open(inputCategoryLabelsFilePath, 'r')
    categoryRows = inputCategoryLabelsFile.readlines()
    inputCategoryLabelsFile.close()
    
    for row in categoryRows:
        components = row.split("\t")
        categoryLabels += [int(components[1])]

    # IDs[i] is the ID of the sample samples[i] with the dialect label dialectLabels[i] and the category label categoryLabels[i]
    return IDs, samples, dialectLabels, categoryLabels

def build_instruction_set(task_ids, task_samples, task_labels, format="mistral", task="dialect"):
    """
    Build an instruction set for a specified task in a given format.

    Parameters:
    - task_ids (list): ids from MOROCO
    - task_samples (list): text samples
    - task_labels (list): labels for the given task
    - format (str, optional): model to be used, for the moment Mistral
    - task (str, optional): unused, maybe to switch to other Vardial tasks

    Returns:
    - instruction_set (str): json set with raw and instruction texts
    """

    json_set = []
    for id, sample, label in zip(task_ids, task_samples, task_labels):
        instruction = f"[INST] O să primești un fragment dintr-un articol de știri scris în limba română. Trebuie să îl clasifici în dialectul standard al limbii române, sau în dialectul moldovenesc, folosit în Republica Moldova. Numele de persoane sau de locuri geografice au fost schimbate în \"$NE#\", ca să fie împiedicată folosirea de denumiri specifice pentru identificare, în loc de proprietăți lingvistice.\nFragmentul este acesta:\"{sample}\"\n Alege unul dintre cele doua dialecte pentru clasificare:\n1. dialectul moldovenesc\n2. dialectul standard\n[/INST]\n Dialectul din fragment este {label}."

        json_set.append({
            'id': id,
            'raw_sample': sample,
            'instr_sample': instruction,
            'dialect': label
        })
    
    return json_set

def get_set(split, format="mistral", task="dialect"):
    task_ids, task_samples, task_dialect, task_category = loadMOROCODataSamples(split)
    return build_instruction_set(task_ids, task_samples, task_dialect, format=format, task=task)

def write_set(split, out_root, format="mistral", task="dialect"):
    task_ids, task_samples, task_dialect, task_category = loadMOROCODataSamples(split)
    task0_set = build_instruction_set(task_ids, task_samples, task_dialect, format=format, task=task)
    task0_fp =  os.path.join(out_root, f'{split}_model={format}_task={task}.jsonl')
    with open(task0_fp, 'w') as f:
        for obj in task0_set:
            json.dump(obj, f)
            f.write('\n')

In [4]:
# json_dataset = get_set("train")
# dataset = []
# for obj in json_dataset:
#     instr = obj["instr_sample"]
#     dataset.append(instr)
# dataset = Dataset.from_dict({"text": dataset})
dataset = load_dataset('json', data_files='data/train_model=mistral_task=dialect.jsonl')
dataset.shuffle(seed=1337)

DatasetDict({
    train: Dataset({
        features: ['id', 'raw_sample', 'instr_sample', 'dialect'],
        num_rows: 21719
    })
})

In [5]:
# The model that you want to train from the Hugging Face hub
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Fine-tuned model name
new_model = "mistral-finetuned"

output_dir = "./results"

In [6]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # load_in_8bit=True,
    device_map="auto",
    attn_implementation = 'flash_attention_2'
)
# model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
def formatting_prompts_func(example):
    output_texts = []
    for instr in example['instr_sample']:
        output_texts.append(instr)
    return output_texts
response_format = 'din fragment este '

collator = DataCollatorForCompletionOnlyLM(tokenizer.encode(response_format, add_special_tokens = False), tokenizer=tokenizer)
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=256,
    lora_dropout=0.1,
    r=64,
    bias="none",
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=64,
    optim="adamw_torch_fused",
    save_steps=0,
    logging_steps=5,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    max_grad_norm=1.,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    # report_to="tensorboard"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    peft_config=peft_config,
    dataset_text_field="instr_sample",
    # max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments,
    max_seq_length=2200,
    packing=False,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

Your GPU supports bfloat16: accelerate training with bf16=True


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.07s/it]
Map: 100%|██████████| 21719/21719 [00:04<00:00, 5022.72 examples/s]
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtantarudragos[0m ([33mdtant[0m). Use [1m`wandb login --relogin`[0m to force relogin


Fragmentul este acesta:"Decizia lui $NE$ a înfuriat aliați puternici ai $NE$  propriul partid republican şi a provocat o undă de şoc pe $NE$ $NE$ unde indicele $NE$ a pierdut 420 de puncte . Cele mai afectate țări sunt $NE$ $NE$ şi $NE$ de $NE$ şi nu $NE$ relatează $NE$ . Cu riscul de a provoca un război comercial cu principalii săi parteneri comerciali şi în primul rând cu $NE$ $NE$ a anunţat că va promulga săptămâna viitoare taxe vamale mari pentru importurile de oţel şi aluminiu . Vor fi taxe de 25% pentru oţel şi 10% pentru aluminiu, a anunţat $NE$ . Fără să dea prea multe detalii, preşedintele a subliniat că importurile ieftine "distrug industria şi locurile de muncă  din $NE$ $NE$ . Condiţiile suportate de noi în ultimele decenii sunt pur şi simplu ruşinoase, consideră preşedintele american . Într - o reacţie imediată, congresmeni republicani au avertizat tăios ca o asemenea măsură ar fi resimţitaă tot de americanul de rând în preţul bunurilor de consum, de la conserve până la ma

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacty of 23.66 GiB of which 84.75 MiB is free. Including non-PyTorch memory, this process has 21.38 GiB memory in use. Of the allocated memory 20.88 GiB is allocated by PyTorch, and 197.92 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 1},
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"