In [None]:
!pip install -q -U torch transformers  langchain sentence-transformers faiss-gpu openpyxl openai bitsandbytes accelerate peft datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m810.5/810.5 kB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.4/262.4 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━

In [None]:
import gc
import os
import sys
import threading
import psutil
import torch
from accelerate import Accelerator
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
    set_seed,
)

from peft import LoraConfig, TaskType, get_peft_model


def levenshtein_distance(str1, str2):
    """Return the Levenshtein (edit) distance between ``str1`` and ``str2``.

    Uses a single rolling row of the dynamic-programming table, so the time
    cost is O(len(str1) * len(str2)) and the extra space is O(len(str2)).

    Args:
        str1: first sequence (indexed by the outer loop).
        str2: second sequence (indexed by the inner loop).

    Returns:
        The minimum number of single-element insertions, deletions and
        substitutions needed to turn ``str1`` into ``str2``.
    """
    if str1 == str2:
        return 0

    # row[j] holds the distance between the processed prefix of str1 and str2[:j].
    row = list(range(len(str2) + 1))
    for i, left_char in enumerate(str1, start=1):
        # `diagonal` is the value of the cell up-and-left of the one being filled.
        diagonal, row[0] = row[0], i
        for j, right_char in enumerate(str2, start=1):
            above = row[j]
            if left_char == right_char:
                row[j] = diagonal
            else:
                row[j] = 1 + min(diagonal, above, row[j - 1])
            diagonal = above
    return row[-1]


def get_closest_label(eval_pred, classes):
    """Return the entry of ``classes`` closest to ``eval_pred`` by edit distance.

    Args:
        eval_pred: predicted text; surrounding whitespace is stripped before
            it is compared.
        classes: non-empty indexable collection of candidate label strings.

    Returns:
        The element of ``classes`` with the smallest Levenshtein distance to
        the stripped prediction. On ties the earliest candidate wins.

    Raises:
        IndexError: if ``classes`` is empty.
    """
    if len(classes) == 0:
        # Fail with a clear message instead of indexing with a sentinel value.
        raise IndexError("classes must contain at least one label")

    # The stripped prediction does not change between candidates.
    cleaned_pred = eval_pred.strip()
    min_id = 0
    min_edit_distance = sys.maxsize
    for i, class_label in enumerate(classes):
        edit_distance = levenshtein_distance(cleaned_pred, class_label)
        if edit_distance < min_edit_distance:
            min_id = i
            min_edit_distance = edit_distance
    return classes[min_id]


# Converting Bytes to Megabytes
def b2mb(x):
    """Convert a byte count ``x`` to whole megabytes (2**20 bytes), truncating toward zero."""
    bytes_per_megabyte = 2**20
    megabytes = x / bytes_per_megabyte
    return int(megabytes)


# This context manager is used to track the peak memory usage of the process
class TorchTracemalloc:
    """Measure CUDA and CPU (RSS) memory consumed by the code run inside the ``with`` block.

    After the block exits, these attributes are set:

    * ``begin`` / ``end`` / ``peak``: CUDA bytes allocated at entry, at exit and at peak.
    * ``used`` / ``peaked``: ``end - begin`` and ``peak - begin`` in megabytes.
    * ``cpu_begin`` / ``cpu_end`` / ``cpu_peak``: process RSS in bytes at entry,
      at exit and the highest value sampled by the monitor thread.
    * ``cpu_used`` / ``cpu_peaked``: ``cpu_end - cpu_begin`` and
      ``cpu_peak - cpu_begin`` in megabytes.
    """

    def __enter__(self):
        gc.collect()
        torch.cuda.empty_cache()
        # reset the peak gauge to zero; reset_max_memory_allocated() is deprecated
        # and simply forwards to this call.
        torch.cuda.reset_peak_memory_stats()
        self.begin = torch.cuda.memory_allocated()
        self.process = psutil.Process()

        self.cpu_begin = self.cpu_mem_used()
        # Initialise before the thread starts so __exit__ can always read it.
        self.cpu_peak = -1
        self.peak_monitoring = True
        self._peak_monitor_thread = threading.Thread(target=self.peak_monitor_func)
        self._peak_monitor_thread.daemon = True
        self._peak_monitor_thread.start()
        return self

    def cpu_mem_used(self):
        """get resident set size memory for the current process"""
        return self.process.memory_info().rss

    def peak_monitor_func(self):
        """Sample RSS in a tight loop, keeping the maximum, until monitoring is switched off."""
        while True:
            self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak)

            # can't sleep or will not catch the peak right (this comment is here on purpose)
            # time.sleep(0.001) # 1msec

            if not self.peak_monitoring:
                break

    def __exit__(self, *exc):
        self.peak_monitoring = False
        # Wait for the sampler to finish so cpu_peak is final before it is read below.
        self._peak_monitor_thread.join()

        gc.collect()
        torch.cuda.empty_cache()
        self.end = torch.cuda.memory_allocated()
        self.peak = torch.cuda.max_memory_allocated()
        self.used = b2mb(self.end - self.begin)
        self.peaked = b2mb(self.peak - self.begin)

        self.cpu_end = self.cpu_mem_used()
        self.cpu_used = b2mb(self.cpu_end - self.cpu_begin)
        self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin)


def _print_memory_report(accelerator, tracemalloc, phase):
    """Print the GPU and CPU memory figures collected by an exited ``TorchTracemalloc``.

    Args:
        accelerator: object whose ``print`` method is used for output.
        tracemalloc: the ``TorchTracemalloc`` instance after its ``with`` block ended.
        phase: text interpolated into each message (``"train"`` or ``"eval"``).
    """
    accelerator.print(f"GPU Memory before entering the {phase} : {b2mb(tracemalloc.begin)}")
    accelerator.print(f"GPU Memory consumed at the end of the {phase} (end-begin): {tracemalloc.used}")
    accelerator.print(f"GPU Peak Memory consumed during the {phase} (max-begin): {tracemalloc.peaked}")
    accelerator.print(
        f"GPU Total Peak Memory consumed during the {phase} (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
    )

    accelerator.print(f"CPU Memory before entering the {phase} : {b2mb(tracemalloc.cpu_begin)}")
    accelerator.print(f"CPU Memory consumed at the end of the {phase} (end-begin): {tracemalloc.cpu_used}")
    accelerator.print(f"CPU Peak Memory consumed during the {phase} (max-begin): {tracemalloc.cpu_peaked}")
    accelerator.print(
        f"CPU Total Peak Memory consumed during the {phase} (max): {tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin)}"
    )


def main():
    """Fine-tune a causal LM with LoRA on a prompt dataset and save the adapter.

    Steps: load the dataset and tokenizer, build left-padded fixed-length
    training and generation inputs, wrap the base model with a LoRA adapter,
    train for ``num_epochs`` while reporting memory usage, generate on the
    same split after every epoch, optionally write predictions to CSV
    (``do_test``), and finally save the adapter locally.
    """
    accelerator = Accelerator()
    model_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    dataset_name = "fka/awesome-chatgpt-prompts"

    peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
    text_column = "act"
    label_column = "prompt"
    lr = 3e-3
    num_epochs = 20
    batch_size = 8
    seed = 42
    max_length = 64
    do_test = False
    set_seed(seed)

    dataset = load_dataset(dataset_name)

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    def preprocess_function(examples):
        """Build left-padded ``input_ids``/``attention_mask``/``labels`` of length ``max_length``."""
        num_examples = len(examples[text_column])
        inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
        targets = [str(x) for x in examples[label_column]]
        model_inputs = tokenizer(inputs)
        labels = tokenizer(targets, add_special_tokens=False)  # don't add bos token because we concatenate with inputs
        # Concatenate prompt and target; mask the prompt positions out of the loss with -100.
        for i in range(num_examples):
            sample_input_ids = model_inputs["input_ids"][i]
            label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]
            model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
            labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
            model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
        # Left-pad to max_length (a negative pad count yields no padding), then cut to max_length.
        for i in range(num_examples):
            sample_input_ids = model_inputs["input_ids"][i]
            label_input_ids = labels["input_ids"][i]
            model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
                max_length - len(sample_input_ids)
            ) + sample_input_ids
            model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
                "attention_mask"
            ][i]
            labels["input_ids"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids
            model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
            model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
            labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def test_preprocess_function(examples):
        """Build left-padded prompt-only inputs of length ``max_length`` for generation."""
        num_examples = len(examples[text_column])
        inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
        model_inputs = tokenizer(inputs)
        for i in range(num_examples):
            sample_input_ids = model_inputs["input_ids"][i]
            model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
                max_length - len(sample_input_ids)
            ) + sample_input_ids
            model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
                "attention_mask"
            ][i]
            model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
            model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
        return model_inputs

    with accelerator.main_process_first():
        processed_datasets = dataset.map(
            preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=dataset["train"].column_names,
            load_from_cache_file=True,
            desc="Running tokenizer on dataset",
        )
    accelerator.wait_for_everyone()

    train_dataset = processed_datasets["train"]

    with accelerator.main_process_first():
        processed_datasets = dataset.map(
            test_preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=dataset["train"].column_names,
            load_from_cache_file=False,
            desc="Running tokenizer on dataset",
        )
    # Evaluation and test both reuse the "train" split, tokenized without labels.
    eval_dataset = processed_datasets["train"]
    test_dataset = processed_datasets["train"]

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
    )
    eval_dataloader = DataLoader(
        eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
    )
    test_dataloader = DataLoader(
        test_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
    )

    print(next(iter(train_dataloader)))

    # creating model
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # lr scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=(len(train_dataloader) * num_epochs),
    )

    model, train_dataloader, eval_dataloader, test_dataloader, optimizer, lr_scheduler = accelerator.prepare(
        model, train_dataloader, eval_dataloader, test_dataloader, optimizer, lr_scheduler
    )
    accelerator.print(model)

    is_ds_zero_3 = False
    if getattr(accelerator.state, "deepspeed_plugin", None):
        is_ds_zero_3 = accelerator.state.deepspeed_plugin.zero_stage == 3

    # Reference texts for the evaluated split; fetched once instead of on every use.
    references = dataset["train"][label_column]

    for epoch in range(num_epochs):
        with TorchTracemalloc() as tracemalloc:
            model.train()
            total_loss = 0
            for step, batch in enumerate(tqdm(train_dataloader)):
                outputs = model(**batch)
                loss = outputs.loss
                total_loss += loss.detach().float()
                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
        # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
        _print_memory_report(accelerator, tracemalloc, "train")
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        accelerator.print(f"{epoch=}: {train_ppl=} {train_epoch_loss=}")

        model.eval()
        eval_preds = []
        with TorchTracemalloc() as tracemalloc:
            for _, batch in enumerate(tqdm(eval_dataloader)):
                batch = {k: v for k, v in batch.items() if k != "labels"}
                with torch.no_grad():
                    outputs = accelerator.unwrap_model(model).generate(
                        **batch, synced_gpus=is_ds_zero_3, max_new_tokens=10
                    )  # synced_gpus=True for DS-stage 3
                outputs = accelerator.pad_across_processes(outputs, dim=1, pad_index=tokenizer.pad_token_id)
                preds = accelerator.gather_for_metrics(outputs)
                # Inputs are exactly max_length wide, so everything after that is newly generated.
                preds = preds[:, max_length:].detach().cpu().numpy()
                eval_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))

        # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
        _print_memory_report(accelerator, tracemalloc, "eval")

        # NOTE(review): generation is capped at max_new_tokens=10 while the references are
        # full-length prompts, so this exact-match accuracy looks unable to exceed 0 here;
        # consider a prefix or similarity metric, or a larger generation budget. TODO confirm intent.
        correct = 0
        total = 0
        assert len(eval_preds) == len(references), f"{len(eval_preds)} != {len(references)}"
        for pred, true in zip(eval_preds, references):
            if pred.strip() == true.strip():
                correct += 1
            total += 1
        accuracy = correct / total * 100
        accelerator.print(f"{accuracy=}")
        accelerator.print(f"{eval_preds[:10]=}")
        accelerator.print(f"dataset['train'][label_column][:10]={references[:10]!r}")

    if do_test:
        model.eval()
        test_preds = []
        for _, batch in enumerate(tqdm(test_dataloader)):
            batch = {k: v for k, v in batch.items() if k != "labels"}
            with torch.no_grad():
                outputs = accelerator.unwrap_model(model).generate(
                    **batch, synced_gpus=is_ds_zero_3, max_new_tokens=10
                )  # synced_gpus=True for DS-stage 3
            outputs = accelerator.pad_across_processes(outputs, dim=1, pad_index=tokenizer.pad_token_id)
            # gather_for_metrics (as in the eval loop) drops samples duplicated to even out
            # the last batch across processes, keeping the length check below valid.
            preds = accelerator.gather_for_metrics(outputs)
            preds = preds[:, max_length:].detach().cpu().numpy()
            test_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))

        # Candidate labels that free-form generations are snapped onto: the distinct
        # reference texts, in first-seen order.
        classes = list(dict.fromkeys(str(label) for label in references))
        test_preds_cleaned = []
        for _, pred in enumerate(test_preds):
            test_preds_cleaned.append(get_closest_label(pred, classes))

        test_df = dataset["train"].to_pandas()
        assert len(test_preds_cleaned) == len(test_df), f"{len(test_preds_cleaned)} != {len(test_df)}"
        test_df[label_column] = test_preds_cleaned
        test_df["text_labels_orig"] = test_preds
        accelerator.print(test_df[[text_column, label_column]].sample(20))

        if "ID" not in test_df.columns:
            # Fall back to the row position when the dataset provides no ID column.
            test_df.insert(0, "ID", range(len(test_df)))
        pred_df = test_df[["ID", label_column]]
        pred_df.columns = ["ID", "Label"]

        os.makedirs(f"data/{dataset_name}", exist_ok=True)
        pred_df.to_csv(f"data/{dataset_name}/predictions.csv", index=False)

    accelerator.wait_for_everyone()
    # Option1: Pushing the model to Hugging Face Hub
    # model.push_to_hub(
    #     f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"),
    #     token = "hf_..."
    # )
    # token (`bool` or `str`, *optional*):
    #     `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated
    #     when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`
    #     is not specified.
    #     Or you can get your token from https://huggingface.co/settings/token
    # Option2: Saving the model locally
    peft_model_id = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace(
        "/", "_"
    )
    # Unwrap first: under distributed training `model` may be a wrapper without save_pretrained.
    accelerator.unwrap_model(model).save_pretrained(peft_model_id)
    accelerator.wait_for_everyone()


# Entry point: run the full fine-tuning pipeline when this module is executed as `__main__`.
if __name__ == "__main__":
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/274 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.6k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Running tokenizer on dataset:   0%|          | 0/153 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/153 [00:00<?, ? examples/s]

{'input_ids': tensor([[    1,  1044,   584, 10829,  4735,  1284,   261, 15796,   584, 29871,
           306,   864,   366,   304,  1044,   408,   263,  5222,  4735, 29879,
         13113, 29889,   306,   674,  2649,   366,   263,  1734, 29892,   322,
           366,   674,  8908,   304,   592,   411,   263,  1051,   310,  5222,
          4735, 27809,  5034,   304,   590,  9508, 29889,  9133,   680,   263,
          4236,   310, 29871, 29896, 29900,  5222,  4735, 29879,   639,  9508,
         29889,   960,   306,   864],
        [    1,  1044,   584,  7142,   440,  1288,  3189,   496, 15796,   584,
         29871,   306,   864,   366,   304,  1044,   408,   263, 17385,  1288,
         11182, 29889,   306,   674,  3867,   366,   411,   777,  2472,  1048,
          4856, 29915, 29879, 14433,   322, 18066,   267, 29892,   322,   372,
           674,   367,   596,  4982,   304,  2041,   701,   411, 16650,   583,
           393,   508,  1371,   445,  2022,  6176,  1009, 14433, 29889,   910,


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.10229075496156657




PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          

100%|██████████| 20/20 [00:12<00:00,  1.54it/s]


GPU Memory before entering the train : 4222
GPU Memory consumed at the end of the train (end-begin): 35
GPU Peak Memory consumed during the train (max-begin): 1859
GPU Total Peak Memory consumed during the train (max): 6081
CPU Memory before entering the train : 2032
CPU Memory consumed at the end of the train (end-begin): 134
CPU Peak Memory consumed during the train (max-begin): 134
CPU Total Peak Memory consumed during the train (max): 2166
epoch=0: train_ppl=tensor(7.7122, device='cuda:0') train_epoch_loss=tensor(2.0428, device='cuda:0')


100%|██████████| 20/20 [00:28<00:00,  1.41s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2243
CPU Memory consumed at the end of the eval (end-begin): 48
CPU Peak Memory consumed during the eval (max-begin): 48
CPU Total Peak Memory consumed during the eval (max): 2291
accuracy=0.0
eval_preds[:10]=['I want you to act as a Linux Terminal.', 'I want you to act as an English Transl', 'I want you to act as a interviewer', 'I want you to act as a JavaScript console.', 'I want you to act as a sheet. I', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act as a character from a']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you will reply wi

100%|██████████| 20/20 [00:11<00:00,  1.75it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2291
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2291
epoch=1: train_ppl=tensor(4.3609, device='cuda:0') train_epoch_loss=tensor(1.4727, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.25s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2291
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2291
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a javascript console.', 'I want you to act as a excel sheet.', 'I want you to act as a pronunciation', 'I want you to act as a spoken English', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act as a character from a']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you will reply with wh

100%|██████████| 20/20 [00:12<00:00,  1.66it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2291
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2291
epoch=2: train_ppl=tensor(3.1877, device='cuda:0') train_epoch_loss=tensor(1.1593, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.23s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2291
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2291
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a JavaScript console.', 'I want you to act as a spreadsheet.', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as my first aid k', 'I want you to act as a plagiar', 'I want you to act as a character from a']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you will re

100%|██████████| 20/20 [00:12<00:00,  1.56it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2291
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2291
epoch=3: train_ppl=tensor(2.4619, device='cuda:0') train_epoch_loss=tensor(0.9009, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.22s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2291
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2291
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a JavaScript console.', 'I want you to act as an excel sheet.', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act as a character from a']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you will 

100%|██████████| 20/20 [00:12<00:00,  1.64it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2291
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2291
epoch=4: train_ppl=tensor(2.1284, device='cuda:0') train_epoch_loss=tensor(0.7554, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.22s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2291
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2291
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a JavaScript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act as a character from a']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you wi

100%|██████████| 20/20 [00:12<00:00,  1.59it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2291
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2291
epoch=5: train_ppl=tensor(1.8331, device='cuda:0') train_epoch_loss=tensor(0.6060, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.21s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2291
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2291
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an Interviewer', 'I want you to act as a javascript console.', 'I want you to act as an excel sheet.', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act as {character}. You']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you will re

100%|██████████| 20/20 [00:12<00:00,  1.62it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2291
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2291
epoch=6: train_ppl=tensor(1.5651, device='cuda:0') train_epoch_loss=tensor(0.4479, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.23s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2291
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2291
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as a computer science or', 'I want you to act as a javascript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act like a character from a']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands a

100%|██████████| 20/20 [00:12<00:00,  1.62it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2291
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2291
epoch=7: train_ppl=tensor(1.3550, device='cuda:0') train_epoch_loss=tensor(0.3038, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.23s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2291
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2291
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a javascript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act like {character} from']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you wi

100%|██████████| 20/20 [00:12<00:00,  1.61it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2291
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2291
epoch=8: train_ppl=tensor(1.2789, device='cuda:0') train_epoch_loss=tensor(0.2460, device='cuda:0')


100%|██████████| 20/20 [00:25<00:00,  1.25s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2291
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2291
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a javascript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act as {character} from']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you will

100%|██████████| 20/20 [00:12<00:00,  1.61it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2292
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2292
epoch=9: train_ppl=tensor(1.1867, device='cuda:0') train_epoch_loss=tensor(0.1711, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.24s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2292
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2292
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a javascript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act as {character} from']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you will

100%|██████████| 20/20 [00:12<00:00,  1.61it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2292
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2292
epoch=10: train_ppl=tensor(1.1112, device='cuda:0') train_epoch_loss=tensor(0.1054, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.23s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2292
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2292
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a javascript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act like {character} from']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you wi

100%|██████████| 20/20 [00:12<00:00,  1.61it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2292
CPU Memory consumed at the end of the train (end-begin): 1
CPU Peak Memory consumed during the train (max-begin): 1
CPU Total Peak Memory consumed during the train (max): 2293
epoch=11: train_ppl=tensor(1.0653, device='cuda:0') train_epoch_loss=tensor(0.0632, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.22s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2293
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2293
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a javascript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act like {character} from']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you wi

100%|██████████| 20/20 [00:12<00:00,  1.61it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2293
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2293
epoch=12: train_ppl=tensor(1.0402, device='cuda:0') train_epoch_loss=tensor(0.0394, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.22s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2293
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2293
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a javascript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act like {character} from']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you wi

100%|██████████| 20/20 [00:12<00:00,  1.61it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2293
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2293
epoch=13: train_ppl=tensor(1.0243, device='cuda:0') train_epoch_loss=tensor(0.0240, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.21s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2293
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2293
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a javascript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act like {character} from']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you wi

100%|██████████| 20/20 [00:12<00:00,  1.61it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2293
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2293
epoch=14: train_ppl=tensor(1.0191, device='cuda:0') train_epoch_loss=tensor(0.0189, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.23s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2293
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2293
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a javascript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act like {character} from']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you wi

100%|██████████| 20/20 [00:12<00:00,  1.61it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2294
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2294
epoch=15: train_ppl=tensor(1.0162, device='cuda:0') train_epoch_loss=tensor(0.0160, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.22s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2294
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2294
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a javascript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act like {character} from']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you wi

100%|██████████| 20/20 [00:12<00:00,  1.61it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2294
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2294
epoch=16: train_ppl=tensor(1.0189, device='cuda:0') train_epoch_loss=tensor(0.0187, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.22s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2294
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2294
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a javascript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act like {character} from']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you wi

100%|██████████| 20/20 [00:12<00:00,  1.61it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2294
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2294
epoch=17: train_ppl=tensor(1.0141, device='cuda:0') train_epoch_loss=tensor(0.0140, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.23s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2294
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2294
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a javascript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act like {character} from']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you wi

100%|██████████| 20/20 [00:12<00:00,  1.59it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2294
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2294
epoch=18: train_ppl=tensor(1.0097, device='cuda:0') train_epoch_loss=tensor(0.0096, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.22s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2294
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2294
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a javascript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act like {character} from']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you wi

100%|██████████| 20/20 [00:12<00:00,  1.61it/s]


GPU Memory before entering the train : 4247
GPU Memory consumed at the end of the train (end-begin): 10
GPU Peak Memory consumed during the train (max-begin): 1835
GPU Total Peak Memory consumed during the train (max): 6082
CPU Memory before entering the train : 2294
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 2294
epoch=19: train_ppl=tensor(1.0092, device='cuda:0') train_epoch_loss=tensor(0.0091, device='cuda:0')


100%|██████████| 20/20 [00:24<00:00,  1.23s/it]


GPU Memory before entering the eval : 4258
GPU Memory consumed at the end of the eval (end-begin): -10
GPU Peak Memory consumed during the eval (max-begin): 115
GPU Total Peak Memory consumed during the eval (max): 4373
CPU Memory before entering the eval : 2294
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 2294
accuracy=0.0
eval_preds[:10]=['I want you to act as a linux terminal.', 'I want you to act as an English translator', 'I want you to act as an interviewer', 'I want you to act as a javascript console.', 'I want you to act as a text based excel', 'I want you to act as an English pronunci', 'I want you to act as a spoken English teacher', 'I want you to act as a travel guide.', 'I want you to act as a plagiar', 'I want you to act like {character} from']
dataset['train'][label_column][:10]=['I want you to act as a linux terminal. I will type commands and you wi

In [None]:
import os

# Pin this process to one physical GPU and name the Weights & Biases project.
# NOTE(review): the device index "3" presumes a host with at least four GPUs; on a
# single-GPU machine (e.g. a hosted notebook) this would presumably hide the only
# device if it takes effect before CUDA is initialised -- confirm for the target host.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
os.environ["WANDB_PROJECT"] = "PeftExamples"
import transformers
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    default_data_collator,
)
import torch
from dataclasses import dataclass, field
from typing import Optional
# from dataclass_csv import DataclassReader
from torch.utils.data import Dataset, DataLoader

from enum import Enum

In [None]:
class SpecialTokens(str, Enum):
    """Marker strings registered with the tokenizer as special tokens.

    Each member is also a ``str`` (the class mixes in ``str``), and ``list()``
    returns the raw marker strings in declaration order.
    """

    # Target / context delimiters.
    begin_target = "<|begintarget|>"
    end_target = "<|endtarget|>"
    begin_context = "<|begincontext|>"
    end_context = "<|endcontext|>"

    # Speaker markers.
    system = "<|system|>"
    user = "<|user|>"
    begin_last_user_utterance = "<|beginlastuserutterance|>"
    end_last_user_utterance = "<|endlastuserutterance|>"

    # Dialogue-state / belief delimiters.
    begin_dsts = "<|begindsts|>"
    end_dsts = "<|enddsts|>"
    begin_dst = "<|begindst|>"
    end_dst = "<|enddst|>"
    begin_belief = "<|beginbelief|>"
    end_belief = "<|endbelief|>"

    # Response / action delimiters.
    begin_response = "<|beginresponse|>"
    end_response = "<|endresponse|>"
    begin_action = "<|beginaction|>"
    end_action = "<|endaction|>"
    begin_user_action = "<|beginuseraction|>"
    end_user_action = "<|enduseraction|>"
    sys_actions = "<|sysactions|>"

    # Intent / requested-slot delimiters.
    begin_intent = "<|beginintent|>"
    end_intent = "<|endintent|>"
    begin_requested_slots = "<|beginrequestedslots|>"
    end_requested_slots = "<|endrequestedslots|>"

    # Padding and beginning-of-sequence tokens.
    pad_token = "<|pad|>"
    bos_token = "<|startoftext|>"

    @classmethod
    def list(cls):
        """Return the string value of every member, in declaration order."""
        values = []
        for member in cls:
            values.append(member.value)
        return values

In [None]:
# Base checkpoint shared by the tokenizer and the model.
model_name ="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Load the tokenizer with custom pad/bos/eos tokens taken from SpecialTokens and
# register every SpecialTokens value as an additional special token.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    pad_token=SpecialTokens.pad_token.value,
    bos_token=SpecialTokens.bos_token.value,
    eos_token=SpecialTokens.end_target.value,
    additional_special_tokens=SpecialTokens.list(),
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True
    # use_flash_attention_2=True, # leading to an error
)
# Resize the embedding matrix to the tokenizer's vocabulary size so the tokens
# registered above have embedding rows.
model.resize_token_embeddings(len(tokenizer))

Embedding(32027, 2048)

In [None]:
# LoRA adapter configuration: rank 64, alpha 128, no dropout. The embedding and
# output layers are targeted alongside the attention query/value projections.
config = LoraConfig(
    r=64, lora_alpha=128, lora_dropout=0.0, target_modules=["embed_tokens", "lm_head", "q_proj", "v_proj"]
)
# Wrap the base model so that only the adapter parameters are trainable.
model = get_peft_model(model, config)
# print_trainable_parameters() prints its own summary and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
model.print_trainable_parameters()
print(model)

trainable params: 13,372,800 || all params: 1,113,531,776 || trainable%: 1.200935643528506
None
PeftModel(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): lora.Embedding(
          (base_layer): Embedding(32027, 2048)
          (lora_dropout): ModuleDict(
            (default): Identity()
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 64x32027])
          (lora_embedding_B): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 2048x64])
        )
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
    

In [None]:
from datasets import load_dataset

# Run configuration for the LoRA causal-LM fine-tuning cells.
model_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
dataset_name = "fka/awesome-chatgpt-prompts"
text_column = "act"  # column tokenized as the model input by preprocess_function
label_column = "prompt"  # column tokenized as the target by preprocess_function
lr = 3e-3
num_epochs = 20
batch_size = 8
seed = 42
# Length every example is padded/truncated to by preprocess_function. This used to
# be assigned 64 here and then silently overwritten with 512 further down; only the
# effective value is kept so the configuration has a single source of truth.
max_length = 512
do_test = False
set_seed(seed)
dataset = load_dataset(dataset_name)


def preprocess_function(examples):
    """Tokenize a batch into fixed-length causal-LM training features.

    Reads the module-level ``tokenizer``, ``text_column``, ``label_column`` and
    ``max_length``. For every row the tokenized input column is followed by the
    tokenized label column and the EOS id; the sequence is left-padded with the
    pad id up to ``max_length`` and then cut to its first ``max_length`` items.

    Args:
        examples: Batch mapping column names to lists of values.

    Returns:
        The tokenizer output for the inputs, updated in place, with
        ``input_ids``, ``attention_mask`` and ``labels``. Label positions that
        belong to the input text or to padding hold -100.
    """
    prompts = examples[text_column]
    completions = [str(target) for target in examples[label_column]]
    model_inputs = tokenizer(prompts)
    # No special tokens on the targets: they are appended to already-tokenized inputs.
    labels = tokenizer(completions, add_special_tokens=False)

    for row in range(len(prompts)):
        prompt_ids = model_inputs["input_ids"][row]
        target_ids = labels["input_ids"][row] + [tokenizer.eos_token_id]
        full_ids = prompt_ids + target_ids
        # Only the target part (including EOS) carries real label ids.
        full_labels = [-100] * len(prompt_ids) + target_ids

        # A negative pad count yields an empty list, so over-long rows get no
        # padding and are shortened by the slice below.
        pad = max_length - len(full_ids)
        model_inputs["input_ids"][row] = ([tokenizer.pad_token_id] * pad + full_ids)[:max_length]
        model_inputs["attention_mask"][row] = ([0] * pad + [1] * len(full_ids))[:max_length]
        labels["input_ids"][row] = ([-100] * pad + full_labels)[:max_length]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Apply preprocess_function to the dataset in batches, in a single process. The
# original columns are dropped so only the tokenized features remain, and the
# on-disk cache is bypassed so the function is re-run on every execution.
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

# Tokenized training split used by the cells below.
train_dataset = processed_datasets["train"]

Running tokenizer on dataset:   0%|          | 0/153 [00:00<?, ? examples/s]

In [None]:
# Shuffled loader over the tokenized training split. The batch size comes from the
# `batch_size` config value instead of a repeated literal 8 so the two cannot drift.
# NOTE(review): the Trainer cell below is given `train_dataset` directly and does
# not reference this loader -- confirm whether it is still needed.
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)

In [None]:
# Trainer configuration for the LoRA fine-tuning run: 2 epochs, per-device batch
# size 8, 10 warmup steps, a loss log every 10 steps, and non-reentrant gradient
# checkpointing.
# NOTE(review): output_dir is named "mistral_..." although the model loaded in this
# notebook is TinyLlama -- the name looks inherited from another example.
# NOTE(review): no explicit save call follows trainer.train(); confirm that an
# adapter checkpoint is actually written before a later cell relies on output_dir.
training_args = TrainingArguments(
    output_dir="mistral_lora_clm_with_added_tokens",
    num_train_epochs=2,
    save_total_limit=5,
    per_device_train_batch_size=8,
    warmup_steps=10,
    weight_decay=0.0001,
    dataloader_drop_last=True,
    # bf16=True,
    logging_steps=10,
    learning_rate=1e-5,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # Keep every dataset column: the features were already reduced to
    # input_ids / attention_mask / labels during preprocessing.
    remove_unused_columns=False,
    # hub_model_id="smangrul/mistral_lora_clm_with_added_tokens",
    # push_to_hub=True,
    # hub_private_repo=True,
)
# The Trainer receives the tokenized dataset directly and batches it with
# default_data_collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=default_data_collator,
)
# model.config.use_cache = False
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,2.7511
20,2.7077
30,2.6602


TrainOutput(global_step=38, training_loss=2.6866660369069955, metrics={'train_runtime': 269.1645, 'train_samples_per_second': 1.137, 'train_steps_per_second': 0.141, 'total_flos': 978659038986240.0, 'train_loss': 2.6866660369069955, 'epoch': 2.0})

In [None]:
import random

# Qualitative check: generate from one random training example and print the
# prediction next to the reference.

# randrange excludes the upper bound. randint(0, n) is inclusive and could return
# n, which indexes one past the end of the split.
i = random.randrange(len(dataset["train"]))
# Training mapped `text_column` (input) to `label_column` (target), so the model is
# conditioned on the input column here. Previously the context was read from the
# target column, which made `context` and `target` identical.
context = dataset["train"][i][text_column]

batch = tokenizer(context, return_tensors="pt")
batch = {k: v.to("cuda") for k, v in batch.items()}
model.eval()
output_tokens = model.generate(
    **batch,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
    top_k=50,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
# Special tokens are kept in the decoded text. The split on "<|endcontext|>" leaves
# a one-element list whenever that marker is absent from the output.
target_predicted = tokenizer.decode(output_tokens[0], skip_special_tokens=False).split("<|endcontext|>")
target = dataset["train"][i][label_column]
print(f"{context=} \n\n {target_predicted=} \n\n {target=}")

context="I want you to act as a spoken English teacher and improver. I will speak to you in English and you will reply to me in English to practice my spoken English. I want you to keep your reply neat, limiting the reply to 100 words. I want you to strictly correct my grammar mistakes, typos, and factual errors. I want you to ask me a question in your reply. Now let's start practicing, you could ask me a question first. Remember, I want you to strictly correct my grammar mistakes, typos, and factual errors." 

 target_predicted=["<|startoftext|> I want you to act as a spoken English teacher and improver. I will speak to you in English and you will reply to me in English to practice my spoken English. I want you to keep your reply neat, limiting the reply to 100 words. I want you to strictly correct my grammar mistakes, typos, and factual errors. I want you to ask me a question in your reply. Now let's start practicing, you could ask me a question first. Remember, I want you to strictl

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
# Round-trip check: save the trained adapter, reload it onto a fresh base model and
# generate from the same `batch` as the previous cell.

# Write the adapter held by the in-memory `model` to disk first. The path used
# before pointed at a `runs/...` directory, which appears to be a training log
# directory rather than a saved adapter.
adapter_dir = "mistral_lora_clm_with_added_tokens"
# save_embedding_layers=True also stores the resized embedding / output layers,
# which hold the rows created for the added special tokens.
model.save_pretrained(adapter_dir, save_embedding_layers=True)

# Rebuild the tokenizer with the same special tokens used for training. A plain
# tokenizer has a smaller vocabulary, so the base model would not be resized to
# match the adapter and `batch` would contain ids outside its vocabulary.
model_name ="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    pad_token=SpecialTokens.pad_token.value,
    bos_token=SpecialTokens.bos_token.value,
    eos_token=SpecialTokens.end_target.value,
    additional_special_tokens=SpecialTokens.list(),
)

inference_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    # use_flash_attention_2=True,
)
# The base model must have the same vocabulary size as during training before the
# adapter weights are attached.
inference_model.resize_token_embeddings(len(tokenizer))

inference_model = PeftModel.from_pretrained(inference_model, adapter_dir)
inference_model.to("cuda")
inference_model.eval()

output_tokens = inference_model.generate(
    **batch,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
    top_k=50,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

target_predicted = tokenizer.decode(output_tokens[0], skip_special_tokens=False).split("<|endcontext|>")
print(f"{context=} \n\n {target_predicted=} \n\n {target=}")

In [None]:
from transformers import AutoModelForCausalLM
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType
import torch
from datasets import load_dataset
import os
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from tqdm import tqdm
from datasets import load_dataset

# Configuration for the prompt-tuning experiment.
device = "cuda"
model_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
dataset_name = "fka/awesome-chatgpt-prompts"
text_column = "act"
label_column = "prompt"
# Eight virtual tokens initialised from a natural-language task description.
# The previous init text ("Classify if the tweet is a complaint or not:") described
# a tweet-classification task unrelated to this dataset; it now describes the
# act -> prompt generation task that is actually trained.
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text="Write the ChatGPT prompt for the given act:",
    tokenizer_name_or_path=model_name_or_path,
)

# File-system-safe checkpoint name built from dataset, model, PEFT type and task
# type (every "/" is replaced by "_").
checkpoint_name = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace(
    "/", "_"
)

# Training hyperparameters for this section.
max_length = 64
lr = 3e-2
num_epochs = 50
batch_size = 8

In [None]:
from datasets import load_dataset

# Load the dataset named by `dataset_name` for the prompt-tuning section.
dataset = load_dataset( dataset_name)



In [None]:
# Data preprocessing for the prompt-tuning section.
# This rebinds `tokenizer` to the stock tokenizer of the base checkpoint, replacing
# the special-token tokenizer created earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Fall back to the EOS id for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes])
# print(target_max_length)


def preprocess_function(examples):
    """Turn a batch of raw rows into fixed-length causal-LM training features.

    Each row becomes ``"<text_column> : <text> Label : "`` followed by the target text and an
    EOS token. Labels are ``-100`` on the query part and on padding, so only the target tokens
    carry a label. Every sequence is left-padded to ``max_length`` and then cut to its first
    ``max_length`` tokens.

    Args:
        examples: mapping of column name to a list of values; ``text_column`` and
            ``label_column`` are read from it.

    Returns:
        The tokenizer output for the queries, with ``input_ids`` / ``attention_mask`` rows
        replaced by tensors and a ``labels`` entry added.
    """
    n_rows = len(examples[text_column])
    queries = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    answers = [str(x) for x in examples[label_column]]

    model_inputs = tokenizer(queries)
    # No special tokens here: the answer ids are appended to the query ids, which already start the sequence.
    labels = tokenizer(answers, add_special_tokens=False)

    id_rows = model_inputs["input_ids"]
    mask_rows = model_inputs["attention_mask"]
    label_rows = labels["input_ids"]

    for row in range(n_rows):
        query_ids = id_rows[row]
        answer_ids = label_rows[row] + [tokenizer.eos_token_id]

        full_ids = query_ids + answer_ids
        full_labels = [-100] * len(query_ids) + answer_ids

        # A negative pad length yields an empty prefix; over-long rows are handled by the slice below.
        n_pad = max_length - len(full_ids)
        padded_ids = [tokenizer.pad_token_id] * n_pad + full_ids
        padded_mask = [0] * n_pad + [1] * len(full_ids)
        padded_labels = [-100] * n_pad + full_labels

        id_rows[row] = torch.tensor(padded_ids[:max_length])
        mask_rows[row] = torch.tensor(padded_mask[:max_length])
        label_rows[row] = torch.tensor(padded_labels[:max_length])

    model_inputs["labels"] = label_rows
    return model_inputs


# Tokenize the dataset in batches; the raw text columns are dropped so only model features remain.
# The cache is bypassed so edits to preprocess_function always take effect.
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

# Previously the "train" split was used for both training and evaluation, so the reported
# eval loss / perplexity were measured on samples the model had been trained on.
# Hold out a fixed (seeded) 10% slice instead so the eval numbers reflect unseen data.
split_datasets = processed_datasets["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]


train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

Running tokenizer on dataset:   0%|          | 0/153 [00:00<?, ? examples/s]

In [None]:
def test_preprocess_function(examples):
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    model_inputs = tokenizer(inputs)
    # print(model_inputs)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
    return model_inputs


# Query-only features for generation, built from the raw "train" split.
# The cache is bypassed so edits to test_preprocess_function always take effect.
test_dataset = dataset["train"].map(
    test_preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    num_proc=1,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=default_data_collator,
    pin_memory=True,
)
# Display the first collated batch as a quick sanity check of padding and masks.
next(iter(test_dataloader))

Running tokenizer on dataset:   0%|          | 0/153 [00:00<?, ? examples/s]

{'input_ids': tensor([[    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     1,  1044,   584,  8074,
          29175, 15796,   584, 29871],
         [    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              1,  1044,   584,  4223,  4103, 29880,  1061,   322,  1

In [None]:
# Load the base causal LM and wrap it with the prompt-tuning configuration in one step,
# then report how many parameters are trainable.
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained(model_name_or_path),
    peft_config,
)
model.print_trainable_parameters()

trainable params: 16,384 || all params: 1,100,064,768 || trainable%: 0.0014893668515343272


In [None]:
# Optimizer plus a linear learning-rate schedule without warmup.
# The schedule spans one scheduler step per training batch across all epochs.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
total_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [None]:
# Training and evaluation: one optimisation pass and one evaluation pass per epoch,
# then report mean loss and perplexity (exp of the mean loss) for both.
model = model.to(device)

for epoch in range(num_epochs):
    # ---- optimisation pass ----
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # Accumulate a detached copy so the running sum does not keep the graph alive.
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # ---- evaluation pass (no gradients) ----
    model.eval()
    eval_loss = 0
    eval_preds = []
    with torch.no_grad():
        for step, batch in enumerate(tqdm(eval_dataloader)):
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            eval_loss += loss.detach().float()
            # Greedy token at every position, decoded back to text.
            predicted_ids = outputs.logits.argmax(dim=-1).detach().cpu().numpy()
            eval_preds.extend(tokenizer.batch_decode(predicted_ids, skip_special_tokens=True))

    # ---- epoch summary ----
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 20/20 [00:15<00:00,  1.30it/s]
100%|██████████| 20/20 [00:06<00:00,  2.90it/s]


epoch=0: train_ppl=tensor(8.5807, device='cuda:0') train_epoch_loss=tensor(2.1495, device='cuda:0') eval_ppl=tensor(7.0712, device='cuda:0') eval_epoch_loss=tensor(1.9560, device='cuda:0')


100%|██████████| 20/20 [00:13<00:00,  1.48it/s]
100%|██████████| 20/20 [00:07<00:00,  2.77it/s]


epoch=1: train_ppl=tensor(6.7538, device='cuda:0') train_epoch_loss=tensor(1.9101, device='cuda:0') eval_ppl=tensor(6.4377, device='cuda:0') eval_epoch_loss=tensor(1.8622, device='cuda:0')


100%|██████████| 20/20 [00:14<00:00,  1.41it/s]
100%|██████████| 20/20 [00:07<00:00,  2.61it/s]


epoch=2: train_ppl=tensor(6.1472, device='cuda:0') train_epoch_loss=tensor(1.8160, device='cuda:0') eval_ppl=tensor(6.2960, device='cuda:0') eval_epoch_loss=tensor(1.8399, device='cuda:0')


100%|██████████| 20/20 [00:14<00:00,  1.36it/s]
100%|██████████| 20/20 [00:07<00:00,  2.50it/s]


epoch=3: train_ppl=tensor(6.1346, device='cuda:0') train_epoch_loss=tensor(1.8139, device='cuda:0') eval_ppl=tensor(5.6530, device='cuda:0') eval_epoch_loss=tensor(1.7322, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.27it/s]
100%|██████████| 20/20 [00:08<00:00,  2.42it/s]


epoch=4: train_ppl=tensor(5.5665, device='cuda:0') train_epoch_loss=tensor(1.7168, device='cuda:0') eval_ppl=tensor(5.4800, device='cuda:0') eval_epoch_loss=tensor(1.7011, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.56it/s]


epoch=5: train_ppl=tensor(5.5232, device='cuda:0') train_epoch_loss=tensor(1.7090, device='cuda:0') eval_ppl=tensor(5.1361, device='cuda:0') eval_epoch_loss=tensor(1.6363, device='cuda:0')


100%|██████████| 20/20 [00:14<00:00,  1.34it/s]
100%|██████████| 20/20 [00:07<00:00,  2.55it/s]


epoch=6: train_ppl=tensor(5.3047, device='cuda:0') train_epoch_loss=tensor(1.6686, device='cuda:0') eval_ppl=tensor(4.9525, device='cuda:0') eval_epoch_loss=tensor(1.5999, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
100%|██████████| 20/20 [00:08<00:00,  2.49it/s]


epoch=7: train_ppl=tensor(4.8086, device='cuda:0') train_epoch_loss=tensor(1.5704, device='cuda:0') eval_ppl=tensor(4.8984, device='cuda:0') eval_epoch_loss=tensor(1.5889, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.30it/s]
100%|██████████| 20/20 [00:08<00:00,  2.50it/s]


epoch=8: train_ppl=tensor(5.0380, device='cuda:0') train_epoch_loss=tensor(1.6170, device='cuda:0') eval_ppl=tensor(4.8402, device='cuda:0') eval_epoch_loss=tensor(1.5770, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.54it/s]


epoch=9: train_ppl=tensor(4.8825, device='cuda:0') train_epoch_loss=tensor(1.5857, device='cuda:0') eval_ppl=tensor(4.6002, device='cuda:0') eval_epoch_loss=tensor(1.5261, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.33it/s]
100%|██████████| 20/20 [00:07<00:00,  2.54it/s]


epoch=10: train_ppl=tensor(4.8179, device='cuda:0') train_epoch_loss=tensor(1.5723, device='cuda:0') eval_ppl=tensor(4.3784, device='cuda:0') eval_epoch_loss=tensor(1.4767, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:08<00:00,  2.49it/s]


epoch=11: train_ppl=tensor(4.6269, device='cuda:0') train_epoch_loss=tensor(1.5319, device='cuda:0') eval_ppl=tensor(4.3672, device='cuda:0') eval_epoch_loss=tensor(1.4741, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.52it/s]


epoch=12: train_ppl=tensor(4.4151, device='cuda:0') train_epoch_loss=tensor(1.4850, device='cuda:0') eval_ppl=tensor(4.1731, device='cuda:0') eval_epoch_loss=tensor(1.4287, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.53it/s]


epoch=13: train_ppl=tensor(4.1521, device='cuda:0') train_epoch_loss=tensor(1.4236, device='cuda:0') eval_ppl=tensor(4.0181, device='cuda:0') eval_epoch_loss=tensor(1.3908, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=14: train_ppl=tensor(4.3229, device='cuda:0') train_epoch_loss=tensor(1.4639, device='cuda:0') eval_ppl=tensor(4.2462, device='cuda:0') eval_epoch_loss=tensor(1.4460, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:08<00:00,  2.50it/s]


epoch=15: train_ppl=tensor(4.1879, device='cuda:0') train_epoch_loss=tensor(1.4322, device='cuda:0') eval_ppl=tensor(4.1321, device='cuda:0') eval_epoch_loss=tensor(1.4188, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=16: train_ppl=tensor(4.2265, device='cuda:0') train_epoch_loss=tensor(1.4414, device='cuda:0') eval_ppl=tensor(3.8579, device='cuda:0') eval_epoch_loss=tensor(1.3501, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
100%|██████████| 20/20 [00:07<00:00,  2.53it/s]


epoch=17: train_ppl=tensor(4.1321, device='cuda:0') train_epoch_loss=tensor(1.4188, device='cuda:0') eval_ppl=tensor(3.8291, device='cuda:0') eval_epoch_loss=tensor(1.3426, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.50it/s]


epoch=18: train_ppl=tensor(3.9126, device='cuda:0') train_epoch_loss=tensor(1.3642, device='cuda:0') eval_ppl=tensor(3.6770, device='cuda:0') eval_epoch_loss=tensor(1.3021, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.52it/s]


epoch=19: train_ppl=tensor(3.7646, device='cuda:0') train_epoch_loss=tensor(1.3256, device='cuda:0') eval_ppl=tensor(3.5800, device='cuda:0') eval_epoch_loss=tensor(1.2754, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.52it/s]


epoch=20: train_ppl=tensor(3.5813, device='cuda:0') train_epoch_loss=tensor(1.2757, device='cuda:0') eval_ppl=tensor(3.4755, device='cuda:0') eval_epoch_loss=tensor(1.2457, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
100%|██████████| 20/20 [00:07<00:00,  2.53it/s]


epoch=21: train_ppl=tensor(3.7157, device='cuda:0') train_epoch_loss=tensor(1.3126, device='cuda:0') eval_ppl=tensor(3.4084, device='cuda:0') eval_epoch_loss=tensor(1.2262, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=22: train_ppl=tensor(3.4850, device='cuda:0') train_epoch_loss=tensor(1.2485, device='cuda:0') eval_ppl=tensor(3.3464, device='cuda:0') eval_epoch_loss=tensor(1.2079, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.50it/s]


epoch=23: train_ppl=tensor(3.5949, device='cuda:0') train_epoch_loss=tensor(1.2795, device='cuda:0') eval_ppl=tensor(3.3802, device='cuda:0') eval_epoch_loss=tensor(1.2179, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.52it/s]


epoch=24: train_ppl=tensor(3.3701, device='cuda:0') train_epoch_loss=tensor(1.2149, device='cuda:0') eval_ppl=tensor(3.5044, device='cuda:0') eval_epoch_loss=tensor(1.2540, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
100%|██████████| 20/20 [00:07<00:00,  2.52it/s]


epoch=25: train_ppl=tensor(3.4125, device='cuda:0') train_epoch_loss=tensor(1.2274, device='cuda:0') eval_ppl=tensor(3.2500, device='cuda:0') eval_epoch_loss=tensor(1.1787, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
100%|██████████| 20/20 [00:07<00:00,  2.53it/s]


epoch=26: train_ppl=tensor(3.3182, device='cuda:0') train_epoch_loss=tensor(1.1994, device='cuda:0') eval_ppl=tensor(3.2453, device='cuda:0') eval_epoch_loss=tensor(1.1772, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
100%|██████████| 20/20 [00:07<00:00,  2.53it/s]


epoch=27: train_ppl=tensor(3.1693, device='cuda:0') train_epoch_loss=tensor(1.1535, device='cuda:0') eval_ppl=tensor(3.1483, device='cuda:0') eval_epoch_loss=tensor(1.1469, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
100%|██████████| 20/20 [00:07<00:00,  2.52it/s]


epoch=28: train_ppl=tensor(3.1711, device='cuda:0') train_epoch_loss=tensor(1.1541, device='cuda:0') eval_ppl=tensor(3.0065, device='cuda:0') eval_epoch_loss=tensor(1.1008, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=29: train_ppl=tensor(3.0575, device='cuda:0') train_epoch_loss=tensor(1.1176, device='cuda:0') eval_ppl=tensor(2.9806, device='cuda:0') eval_epoch_loss=tensor(1.0921, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=30: train_ppl=tensor(3.1056, device='cuda:0') train_epoch_loss=tensor(1.1332, device='cuda:0') eval_ppl=tensor(2.9631, device='cuda:0') eval_epoch_loss=tensor(1.0862, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=31: train_ppl=tensor(3.0981, device='cuda:0') train_epoch_loss=tensor(1.1308, device='cuda:0') eval_ppl=tensor(3.0730, device='cuda:0') eval_epoch_loss=tensor(1.1227, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=32: train_ppl=tensor(3.3020, device='cuda:0') train_epoch_loss=tensor(1.1945, device='cuda:0') eval_ppl=tensor(3.0878, device='cuda:0') eval_epoch_loss=tensor(1.1275, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.50it/s]


epoch=33: train_ppl=tensor(2.9770, device='cuda:0') train_epoch_loss=tensor(1.0909, device='cuda:0') eval_ppl=tensor(2.9022, device='cuda:0') eval_epoch_loss=tensor(1.0655, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=34: train_ppl=tensor(2.9913, device='cuda:0') train_epoch_loss=tensor(1.0957, device='cuda:0') eval_ppl=tensor(2.8016, device='cuda:0') eval_epoch_loss=tensor(1.0302, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:08<00:00,  2.49it/s]


epoch=35: train_ppl=tensor(2.7794, device='cuda:0') train_epoch_loss=tensor(1.0222, device='cuda:0') eval_ppl=tensor(2.7168, device='cuda:0') eval_epoch_loss=tensor(0.9994, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=36: train_ppl=tensor(2.9072, device='cuda:0') train_epoch_loss=tensor(1.0672, device='cuda:0') eval_ppl=tensor(2.7077, device='cuda:0') eval_epoch_loss=tensor(0.9961, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
100%|██████████| 20/20 [00:07<00:00,  2.53it/s]


epoch=37: train_ppl=tensor(2.8626, device='cuda:0') train_epoch_loss=tensor(1.0517, device='cuda:0') eval_ppl=tensor(2.7205, device='cuda:0') eval_epoch_loss=tensor(1.0008, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.50it/s]


epoch=38: train_ppl=tensor(2.8047, device='cuda:0') train_epoch_loss=tensor(1.0313, device='cuda:0') eval_ppl=tensor(2.6852, device='cuda:0') eval_epoch_loss=tensor(0.9878, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:08<00:00,  2.50it/s]


epoch=39: train_ppl=tensor(2.6754, device='cuda:0') train_epoch_loss=tensor(0.9841, device='cuda:0') eval_ppl=tensor(2.6361, device='cuda:0') eval_epoch_loss=tensor(0.9693, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.50it/s]


epoch=40: train_ppl=tensor(2.6498, device='cuda:0') train_epoch_loss=tensor(0.9745, device='cuda:0') eval_ppl=tensor(2.5964, device='cuda:0') eval_epoch_loss=tensor(0.9541, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=41: train_ppl=tensor(2.5631, device='cuda:0') train_epoch_loss=tensor(0.9412, device='cuda:0') eval_ppl=tensor(2.5132, device='cuda:0') eval_epoch_loss=tensor(0.9216, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=42: train_ppl=tensor(2.5532, device='cuda:0') train_epoch_loss=tensor(0.9373, device='cuda:0') eval_ppl=tensor(2.4818, device='cuda:0') eval_epoch_loss=tensor(0.9090, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=43: train_ppl=tensor(2.6354, device='cuda:0') train_epoch_loss=tensor(0.9690, device='cuda:0') eval_ppl=tensor(2.4707, device='cuda:0') eval_epoch_loss=tensor(0.9045, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=44: train_ppl=tensor(2.4433, device='cuda:0') train_epoch_loss=tensor(0.8934, device='cuda:0') eval_ppl=tensor(2.4356, device='cuda:0') eval_epoch_loss=tensor(0.8902, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=45: train_ppl=tensor(2.4554, device='cuda:0') train_epoch_loss=tensor(0.8983, device='cuda:0') eval_ppl=tensor(2.4137, device='cuda:0') eval_epoch_loss=tensor(0.8811, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.50it/s]


epoch=46: train_ppl=tensor(2.4984, device='cuda:0') train_epoch_loss=tensor(0.9156, device='cuda:0') eval_ppl=tensor(2.3950, device='cuda:0') eval_epoch_loss=tensor(0.8734, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


epoch=47: train_ppl=tensor(2.4562, device='cuda:0') train_epoch_loss=tensor(0.8986, device='cuda:0') eval_ppl=tensor(2.3891, device='cuda:0') eval_epoch_loss=tensor(0.8709, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:08<00:00,  2.49it/s]


epoch=48: train_ppl=tensor(2.3137, device='cuda:0') train_epoch_loss=tensor(0.8388, device='cuda:0') eval_ppl=tensor(2.3726, device='cuda:0') eval_epoch_loss=tensor(0.8640, device='cuda:0')


100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]

epoch=49: train_ppl=tensor(2.3839, device='cuda:0') train_epoch_loss=tensor(0.8687, device='cuda:0') eval_ppl=tensor(2.3690, device='cuda:0') eval_epoch_loss=tensor(0.8625, device='cuda:0')





In [None]:
# Qualitative check: generate a prompt for a single act from the dataset.
model.eval()
i = 33
sample = dataset["train"][i]

# Build the query exactly as preprocess_function does during training:
# "<text_column> : <act> Label : ". The earlier version put the target prompt in the act slot and
# appended the act after "Label :", i.e. it fed the model a format it was never trained on.
inputs = tokenizer(f"{text_column} : {sample[text_column]} Label : ", return_tensors="pt")
print(sample[label_column])  # reference prompt, for comparison with the generation below
print(inputs)

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=10,
        # Use the tokenizer's own EOS id (the one appended to every training target)
        # rather than a hard-coded id that belongs to a different vocabulary.
        eos_token_id=tokenizer.eos_token_id,
    )
    print(outputs)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

In [None]:
# Save the tuned model under a name derived from dataset, base model, PEFT type and task type;
# "/" from the hub ids is replaced with "_" so the name is a single path component.
peft_model_id = (
    f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"
    .replace("/", "_")
)
model.save_pretrained(peft_model_id)

In [None]:
from peft import PeftModel, PeftConfig

# Recompute the save location (same formula as when saving) so this cell can run on its own.
peft_model_id = (
    f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"
    .replace("/", "_")
)

# Read the stored PEFT config, rebuild the base model, then attach the saved adapter to it.
config = PeftConfig.from_pretrained(peft_model_id)
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(model_name_or_path),
    peft_model_id,
)