In [5]:
# !pip install transformers
# !pip install peft
# !pip install datasets
# !pip install trl
# !pip install --upgrade wandb
# !pip install -U bitsandbytes
# !pip install scipy scikit-learn

In [6]:
# import torch

# # Clear CUDA memory cache
# torch.cuda.empty_cache()

# # Reset GPU by restarting the runtime (useful in Jupyter or Colab)
# import os
# os._exit(00)

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [7]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    set_seed,
    AutoConfig,
    PretrainedConfig,
    EvalPrediction,
    default_data_collator,
    DataCollatorWithPadding,
)
# from transformers.utils import main_process_first

from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset, ClassLabel
from trl import SFTTrainer, setup_chat_format
import numpy as np
import random
import datasets


from load_data import load_glue_datasets, load_ood_eval_datasets

import logging
logger = logging.getLogger(__name__)

from custom_trainer.ft_trainer import FtTrainer




In [8]:
from huggingface_hub import login
import wandb

# Log in to Hugging Face
hf_token = "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"
login(token=hf_token)

# Log in to Weights & Biases
wb_token = "6d093f67d658015661b11157166b74b60a25d11e"
wandb.login(key=wb_token)

# # Initialize a wandb run
# run = wandb.init(
#     project='Fine-tune Llama 3 8B on Medical Dataset', 
#     job_type="training", 
#     anonymous="allow"
# )

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [9]:
from task_utils import task_to_keys, save_dataset

config_name = None
base_model = "meta-llama/Meta-Llama-3-8B-Instruct"
task_name = "rte" #Options: rte, mnli, mnli-original, mnli-mismatched, hans, qqp, paws-qqp, cola, cola-ood 

padding = "max_length" #or "max_length" or False
target_tokens = None #or "ĠNo,ĠYes"
target_tokens_logits_only = False
max_seq_length = 256
pattern = "{text1} ?"
data_seed = 42
training_seed = 42
do_train = True
do_eval = True
do_predict = False
max_eval_samples = None
max_predict_samples = None
test_file = None
overwrite_cache = False
output_dir = None
dataset_cache_dir = None
pad_to_max_length = False
fp16 = False
main_process_first=True
max_train_samples=None



In [10]:
raw_datasets, label_list, num_labels, is_regression = load_glue_datasets(task_name, use_auth_token=True, cache_dir=None)
additional_evaluation_datasets = load_ood_eval_datasets()




Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/584k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/621k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/277 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/5.22k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

The repository for hans contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/hans.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


hans-lexical_overlap-entailment: 5000 examples
hans-lexical_overlap-contradiction: 5000 examples


Downloading data:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9832 [00:00<?, ? examples/s]

Map:   0%|          | 0/6703 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6703 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6703 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6703 [00:00<?, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/677 [00:00<?, ? examples/s]

Filter:   0%|          | 0/677 [00:00<?, ? examples/s]

Filter:   0%|          | 0/677 [00:00<?, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/516 [00:00<?, ? examples/s]

Filter:   0%|          | 0/516 [00:00<?, ? examples/s]

In [11]:
from transformers import (
    LlamaForSequenceClassification,
    AutoModelForSequenceClassification,
    LlamaConfig,
    BitsAndBytesConfig,
)
from peft import LoraConfig, TaskType
import torch

config = AutoConfig.from_pretrained(
    config_name if config_name else base_model,
    finetuning_task=task_name,
    num_labels=num_labels,
)
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.add_special_tokens({"pad_token":"<pad>"})

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
    

model = AutoModelForSequenceClassification.from_pretrained(base_model,
                                                        quantization_config=quant_config,
                                                        ignore_mismatched_sizes=True,
                                                        attn_implementation = "eager",
                                                        device_map="auto",
                                                        )

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 41,943,040 || all params: 7,546,875,904 || trainable%: 0.5558


In [15]:

sentence1_key, sentence2_key = task_to_keys[task_name]

label_to_id = None
if (
    model.config.label2id != PretrainedConfig(
        num_labels=num_labels).label2id
    and task_name is not None
    and not is_regression
):
    # Some have all caps in their config, some don't.
    label_name_to_id = {
        k.lower(): v for k, v in model.config.label2id.items()}
    if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
        label_to_id = {
            i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
    else:
        logger.warning(
            "Your model seems to have been trained with labels, but they don't match the dataset: ",
            f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
            "\nIgnoring the model labels as a result.",
        )

if label_to_id is not None:
    model.config.label2id = label_to_id
    model.config.id2label = {
        id: label for label, id in config.label2id.items()}
elif task_name is not None and not is_regression:
    model.config.label2id = {l: i for i, l in enumerate(label_list)}
    model.config.id2label = {
        id: label for label, id in config.label2id.items()}


if target_tokens is not None and not target_tokens_logits_only:
    # we need to convert the label ids to target ids
    target_tokens = [t.strip() for t in target_tokens.split(",")]
    target_tokens_ids = tokenizer.convert_tokens_to_ids(target_tokens)

    model.config.label2id = {
        l: target_tokens_ids[i] for i, l in enumerate(label_list)}
    model.config.id2label = {
        id: label for label, id in config.label2id.items()}

# Compute max_seq_length
if max_seq_length > tokenizer.model_max_length:
    logger.warning(
        f"The max_seq_length passed ({max_seq_length}) is larger than the maximum length for the"
        f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
    )

max_seq_length = min(max_seq_length, tokenizer.model_max_length)

def preprocess_function(examples):
    # Tokenize the texts

    # Apply a pattern to the inputs
    pattern_examples = [
        pattern.format(
            text1=examples[sentence1_key][idx],
            text2=examples[sentence2_key][idx] if sentence2_key is not None else None)
        for idx in range(len(examples[sentence1_key]))
    ]
    args = (pattern_examples,)
    result = tokenizer(*args, padding=padding,
                        max_length=max_seq_length, truncation=True)

    # Get mask for soft prompt tokens
    # TODO(mm): For GPT-J and GPT-NeoX we have a different tokenizer. Adjust accordingly
    # Get tokens
    result["input_tokens"] = [tokenizer.convert_ids_to_tokens(
        ids) for ids in result["input_ids"]]

    # Decode input
    result["input_text"] = [tokenizer.decode(
        ids) for ids in result["input_ids"]]

    # Replace labels by target tokens indices when using lm_head
    # - special case: when using target logits only, we keep class indices instead of token indices
    if target_tokens is not None and not target_tokens_logits_only:
        result["label"] = [target_tokens_ids[l] for l in examples["label"]]
    else:
        result["label"] = examples["label"]

    result["label_text"] = [model.config.id2label[l] if l != -1 else "unlabeled"
                            for l in result["label"]]

    return result



In [16]:
if target_tokens is not None and not target_tokens_logits_only:
    for split in raw_datasets:
        # raw_datasets[split].features["label"].num_classes = len(tokenizer)
        # raw_datasets[split].features["label"].names = [
        #     f"{idx}" for idx in np.arange(len(tokenizer))]

        new_features = raw_datasets[split].features.copy()
        names = [f"{idx}" for idx in np.arange(len(tokenizer))]
        new_features["label"] = ClassLabel(
            names=names, num_classes=len(tokenizer))
        raw_datasets[split] = raw_datasets[split].cast(new_features)

    for name, dataset in additional_evaluation_datasets.items():
        # dataset.features["label"].num_classes = len(tokenizer)
        # dataset.features["label"].names = [
        #     f"{idx}" for idx in np.arange(len(tokenizer))]

        new_features = dataset.features.copy()
        names = [f"{idx}" for idx in np.arange(len(tokenizer))]
        new_features["label"] = ClassLabel(
            names=names, num_classes=len(tokenizer))
        additional_evaluation_datasets[name] = dataset.cast(new_features)

# before running the pre-processing, subsample datsets if specified

# subsample datasets (if specified)

# we fix the random seed that controls the sampling of the training data
np.random.seed(data_seed)

if do_train:
    if "train" not in raw_datasets:
        raise ValueError("--do_train requires a train dataset")
    train_dataset = raw_datasets["train"]
    if max_train_samples is not None:
        # randomly select a subset of the training data
        max_train_samples = min(
            len(train_dataset), max_train_samples)
        indices = np.random.choice(
            range(len(train_dataset)), size=max_train_samples, replace=False)
        train_dataset = train_dataset.select(indices)

if do_eval:
    # we fix the random seed that controls the sampling of the validation data
    np.random.seed(123)  # we only use this for debugging

    if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
        raise ValueError("--do_eval requires a validation dataset")
    eval_dataset = raw_datasets["validation_matched" if task_name in
                                ["mnli", "mnli-original"] else "validation"]

    # (optional) subsample eval datasets
    if max_eval_samples is not None:
        max_eval_samples = min(
            len(eval_dataset), max_eval_samples)
        # randomly select a subset of the eval data
        indices = np.random.choice(
            range(len(eval_dataset)), size=max_eval_samples, replace=False)
        eval_dataset = eval_dataset.select(indices)

    for name, dataset in additional_evaluation_datasets.items():
        if max_eval_samples is not None:
            max_eval_samples = min(
                len(dataset), max_eval_samples)
            # randomly select a subset of the eval data
            indices = np.random.choice(
                range(len(dataset)), size=max_eval_samples, replace=False)
            dataset = dataset.select(indices)
            additional_evaluation_datasets[name] = dataset

if do_predict or task_name is not None or test_file is not None:
    # we fix the random seed that controls the sampling of the validation data
    np.random.seed(123)  # we only use this for debugging

    if "test" not in raw_datasets and "test_matched" not in raw_datasets:
        raise ValueError("--do_predict requires a test dataset")
    predict_dataset = raw_datasets["test_matched" if task_name in
                                    ["mnli", "mnli-original"] else "test"]
    if max_predict_samples is not None:
        max_predict_samples = min(
            len(predict_dataset), max_predict_samples)
        predict_dataset = predict_dataset.select(
            range(max_predict_samples))

# set all random seeds again (not sure if this is really needed)
set_seed(training_seed)

from accelerate import Accelerator
accelerator = Accelerator()
# tokenize and encode datasets

with accelerator.main_process_first():
    if do_train:
        train_dataset = train_dataset.map(
            preprocess_function,
            batched=True,
            batch_size=1000,
            load_from_cache_file=not overwrite_cache,
            desc="Running tokenizer on training dataset",
        )

    if do_eval:
        eval_dataset = eval_dataset.map(
            preprocess_function,
            batched=True,
            batch_size=1000,
            load_from_cache_file=not overwrite_cache,
            desc="Running tokenizer on validation dataset",
        )

    if do_predict:
        predict_dataset = predict_dataset.map(
            preprocess_function,
            batched=True,
            batch_size=1000,
            load_from_cache_file=not overwrite_cache,
            desc="Running tokenizer on test dataset",
        )

    for name, dataset in additional_evaluation_datasets.items():
        if "hans" in name:
            sentence1_key, sentence2_key = task_to_keys["hans"]
        elif "mnli" in name:
            sentence1_key, sentence2_key = task_to_keys["mnli"]
        elif "paws-qqp" in name:
            sentence1_key, sentence2_key = task_to_keys["paws-qqp"]
        elif "cola-ood" in name:
            sentence1_key, sentence2_key = task_to_keys["cola-ood"]

        dataset = dataset.map(
            preprocess_function,
            batched=True,
            batch_size=1000,
            load_from_cache_file=not overwrite_cache,
            desc=f"Running tokenizer on {name} validation dataset",
        )
        additional_evaluation_datasets[name] = dataset

# Log a few random samples from the training set:
if do_train:
    for index in random.sample(range(len(train_dataset)), 1):
        print(
            f"Sample {index} of the training set: {train_dataset[index]}.")

Running tokenizer on training dataset:   0%|          | 0/2490 [00:00<?, ? examples/s]

Running tokenizer on validation dataset:   0%|          | 0/277 [00:00<?, ? examples/s]

Running tokenizer on hans-lexical_overlap-entailment validation dataset:   0%|          | 0/5000 [00:00<?, ? e…

Running tokenizer on hans-lexical_overlap-contradiction validation dataset:   0%|          | 0/5000 [00:00<?, …

Running tokenizer on mnli_mm-entailment validation dataset:   0%|          | 0/3463 [00:00<?, ? examples/s]

Running tokenizer on mnli_mm-contradiction validation dataset:   0%|          | 0/3240 [00:00<?, ? examples/s]

Running tokenizer on paws-qqp-not-paraphrase validation dataset:   0%|          | 0/486 [00:00<?, ? examples/s…

Running tokenizer on paws-qqp-paraphrase validation dataset:   0%|          | 0/191 [00:00<?, ? examples/s]

Running tokenizer on cola-ood-unacceptable validation dataset:   0%|          | 0/162 [00:00<?, ? examples/s]

Running tokenizer on cola-ood-acceptable validation dataset:   0%|          | 0/354 [00:00<?, ? examples/s]

Sample 456 of the training set: {'sentence1': "A computer system failure closed down share trading at the Tokyo Stock Exchange for most of yesterday, the worst disruption to date for Asia's largest bourse.", 'sentence2': 'The Tokyo Stock Exchange was closed down by computer system failure.', 'label': 0, 'idx': 456, 'input_ids': [128000, 32, 6500, 1887, 8060, 8036, 1523, 4430, 11380, 520, 279, 27286, 12937, 19224, 369, 1455, 315, 13985, 11, 279, 12047, 44219, 311, 2457, 369, 13936, 596, 7928, 293, 5366, 13, 949, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128

In [17]:
output_dir = os.path.abspath("./data")
# Log training and evaluation examples to training_args.output_dir for reproducibility
if do_train:
    save_dataset(train_dataset, path=os.path.join(
        output_dir, f"{task_name}-train.csv"))
if do_eval:
    save_dataset(eval_dataset, path=os.path.join(
        output_dir, f"{task_name}-eval.csv"))
    for name, dataset in additional_evaluation_datasets.items():
        save_dataset(dataset, path=os.path.join(
            output_dir, f"{name}-eval.csv"))


In [18]:
# Get the metric function
if task_name is not None:
    # use default metrics
    metric_script = os.path.abspath("../llmft/metrics/glue.py")
    if task_name == "mnli-original":
        metric = datasets.load_metric(path=metric_script, config_name="mnli",
                                        cache_dir=dataset_cache_dir, keep_in_memory=False)
    else:
        metric = datasets.load_metric(path=metric_script, config_name=task_name,
                                        cache_dir=dataset_cache_dir, keep_in_memory=False)
else:
    metric = datasets.load_metric(
        "accuracy", cache_dir=dataset_cache_dir, keep_in_memory=False)        

# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(
        p.predictions, tuple) else p.predictions
    preds = np.squeeze(
        preds) if is_regression else np.argmax(preds, axis=1)

    if task_name is not None:
        result = metric.compute(predictions=preds, references=p.label_ids)

        # When using the lm_head, compute fraction of predictions that are not one of the target tokens
        if target_tokens is not None and not target_tokens_logits_only:
            unique_preds, counts_preds = np.unique(
                preds, return_counts=True)
            unique_preds_counts_dict = dict(
                zip(unique_preds, counts_preds))

            num_of_target_token_predictions = 0
            for idx in target_tokens_ids:
                num_of_target_token_predictions += unique_preds_counts_dict.get(
                    idx, 0)
            num_other_tokens = len(
                preds) - num_of_target_token_predictions
            result["frac_non_target_tokens"] = num_other_tokens / \
                len(preds)

        # # Combine eval metrics
        # if len(result) > 1:
        #     result["combined_score"] = np.mean(
        #         list(result.values())).item()

        return result

    elif is_regression:
        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
    else:
        return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

# Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
# we already did the padding.
if pad_to_max_length:
    data_collator = default_data_collator
elif fp16:
    data_collator = DataCollatorWithPadding(
        tokenizer, pad_to_multiple_of=8)
else:
    data_collator = None

# Initialize our Trainer
if do_eval:
    if len(additional_evaluation_datasets) > 0:
        # add the training task eval dataset
        additional_evaluation_datasets[task_name] = eval_dataset
        eval_datasets = additional_evaluation_datasets
    else:
        eval_datasets = eval_dataset
else:
    eval_datasets = None


  metric = datasets.load_metric(path=metric_script, config_name=task_name,


The repository for glue contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/glue.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


In [20]:
train_dataset[0]

{'sentence1': 'No Weapons of Mass Destruction Found in Iraq Yet.',
 'sentence2': 'Weapons of Mass Destruction Found in Iraq.',
 'label': 1,
 'idx': 0,
 'input_ids': [128000,
  2822,
  47664,
  315,
  9346,
  80847,
  12595,
  304,
  11340,
  14968,
  13,
  949,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256,
  128256

In [12]:
from metrics import CustomMetric

metric_script = os.path.abspath("../llmft/metrics/glue.py")
# Get the metric function
cm = CustomMetric(task_name=task_name, metric_script=metric_script)

cm.compute_metrics


<bound method CustomMetric.compute_metrics of <metrics.CustomMetric object at 0x7b9080fc7790>>

In [13]:
new_model = os.path.abspath("./new_model")
training_arguments = TrainingArguments(
    output_dir=new_model,
    # per_device_train_batch_size=1,
    # per_device_eval_batch_size=1,
    # gradient_accumulation_steps=2,
    # optim="paged_adamw_32bit",
    # num_train_epochs=1,
    # evaluation_strategy="steps",
    # eval_steps=0.2,
    # logging_steps=1,
    # warmup_steps=10,
    # logging_strategy="steps",
    # learning_rate=2e-4,
    # fp16=False,
    # bf16=False,
    # group_by_length=True,
    # report_to="wandb"
)


In [14]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_datasets,
    tokenizer=tokenizer,
    args=training_arguments,
    compute_metrics=cm.compute_metrics
)

In [15]:
trainer.train()



../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [120,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [120,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [120,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [120,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [120,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [120,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [120,

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [20]:
new_model = os.path.abspath("./new_model")
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset if do_train else None,
    eval_dataset=eval_datasets,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    # dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
    compute_metrics=compute_metrics
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
