<a href="https://colab.research.google.com/github/elephanti/NLPProject2024/blob/main/LAMBADA%2B_Data_Augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations & imports

In [1]:
# Runtime dependencies: fine-tuning stack (bitsandbytes, transformers, accelerate, peft, trl),
# data/evaluation helpers (datasets, evaluate) and the generation engine (vllm).
# NOTE(review): none of the versions are pinned, so a fresh run may resolve different
# releases than the ones recorded in this cell's output — consider pinning.
!pip install bitsandbytes
!pip install transformers
!pip install accelerate
!pip install peft
!pip install datasets
!pip install evaluate
# NOTE(review): --no-build-isolation presumably lets the flash-attn build see the torch
# that an earlier line already installed — keep this line after those installs; verify.
!pip install flash-attn --no-build-isolation
!pip install trl
!pip install vllm

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->bitsandbytes)
 

In [2]:
from huggingface_hub import login
from datasets import Dataset
import pandas as pd
import os
import gc
import torch

# DataAugmenter

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModel
from vllm import LLM, SamplingParams
import torch
from datasets import Dataset
from trl import SFTTrainer
import pandas as pd
import os
import gc
from sklearn.utils import shuffle

# Registry of supported base checkpoints, keyed by the short name passed to
# DataAugmenter(model_name=...). Each entry gives the Hub checkpoint id and the
# literal BOS / EOS / separator strings written into prompts and training samples.
MODELS = {
    "mistral": dict(
        base_model="mistralai/Mistral-7B-v0.3",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="[SEP]",
    ),
    "llama": dict(
        base_model="meta-llama/Meta-Llama-3-8B",
        bos_token="<|begin_of_text|>",
        eos_token="<|end_of_text|>",
        sep_token="[SEP]",
    ),
}


class DataAugmenter:
    """
    Fine-tune a causal language model on labelled text and sample new texts per label.

    Training samples and generation prompts share one textual layout (see ``format_sample``):
    ``<bos>label [SEP] description [SEP] text<eos>``. The intended call order is
    ``load_and_preprocess_dataset`` -> ``train_llm`` -> ``merge_and_upload`` -> ``augment``.

    Hub repositories used by an instance (all under ``repo_id``):
        ``{repo_id}/{experiment_name}-lora``    adapter + tokenizer written by ``train_llm``
        ``{repo_id}/{experiment_name}-merged``  merged weights written by ``merge_and_upload``
    """

    def __init__(self, experiment_name, token, model_name, repo_id, tokenizer_padding_side='right', add_special_tokens=False,
                 max_length=1024, quantize_4bit=True, quantize_8bit=False, use_flash_attn=False, use_peft=True):
        """
        Store the experiment configuration and resolve the model-specific tokens from MODELS.

        Args:
            experiment_name (str): Prefix of local output folders, Hub repository names and the generated csv file.
            token (str): Hugging Face access token passed to every download and upload call.
            model_name (str): Key of MODELS selecting the base checkpoint and its special tokens.
            repo_id (str): Hub namespace under which the adapter and the merged model are stored.
            tokenizer_padding_side (str, optional): Padding side set on the tokenizer. Defaults to 'right'.
            add_special_tokens (bool, optional): When False, the tokenizer's add_bos_token/add_eos_token flags
                are switched off (format_sample already writes both tokens into the text). Defaults to False.
            max_length (int, optional): Used as tokenizer.model_max_length, as the SFT max_seq_length and as
                vLLM's max_model_len. Defaults to 1024.
            quantize_4bit (bool, optional): Load the training model with a 4-bit NF4 bitsandbytes config. Defaults to True.
            quantize_8bit (bool, optional): Load the training model in 8-bit instead. Defaults to False.
            use_flash_attn (bool, optional): Request the "flash_attention_2" attention implementation. Defaults to False.
            use_peft (bool, optional): Wrap the training model in a LoRA adapter. Defaults to True.

        Raises:
            ValueError: If model_name is not a key of MODELS, or if both quantisation flags are True.
        """
        self.experiment_name = experiment_name
        self.model_name = model_name
        self.tokenizer_padding_side = tokenizer_padding_side
        self.add_special_tokens = add_special_tokens
        self.max_length = max_length
        self.quantize_4bit = quantize_4bit
        self.quantize_8bit = quantize_8bit
        self.use_flash_attn = use_flash_attn
        self.use_peft = use_peft
        self.repo_id = repo_id
        self.token = token

        if model_name not in MODELS:
            raise ValueError(f"Model {model_name} not supported. Supported models: {list(MODELS.keys())}")

        self.base_model = MODELS[model_name]["base_model"]
        self.bos_token = MODELS[model_name]["bos_token"]
        self.eos_token = MODELS[model_name]["eos_token"]
        self.sep_token = MODELS[model_name]["sep_token"]

        if quantize_4bit and quantize_8bit:
            raise ValueError("Only one of quantize_4bit and quantize_8bit can be True")

    def _init_tokenizer(self):
        """
        Load the base checkpoint's tokenizer and configure padding, special-token insertion and max length.

        Returns:
            The configured tokenizer.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.base_model, trust_remote_code=True, token=self.token)
        tokenizer.padding_side = self.tokenizer_padding_side

        # Pick a pad token per model family: Mistral reuses its unk token,
        # Llama registers one of its reserved special tokens.
        if self.model_name == "mistral":
            tokenizer.pad_token = tokenizer.unk_token

        elif self.model_name == "llama":
            tokenizer.add_special_tokens({"pad_token": "<|reserved_special_token_0|>"})

        if not self.add_special_tokens:
            # BOS/EOS are already part of the formatted text (see format_sample).
            tokenizer.add_eos_token = False
            tokenizer.add_bos_token = False

        tokenizer.model_max_length = self.max_length

        return tokenizer

    def _init_model_for_training(self, tokenizer):
        """
        Load the base model (optionally quantised) and prepare it for fine-tuning.

        Args:
            tokenizer: Tokenizer whose length is used to resize the model's token embeddings.

        Returns:
            tuple: (model, base_model). ``model`` is the LoRA-wrapped model when use_peft is True,
            otherwise it is the same object as ``base_model``.
        """
        if self.quantize_4bit:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit= True,
                bnb_4bit_quant_type= "nf4",
                bnb_4bit_compute_dtype= torch.bfloat16,
                bnb_4bit_use_double_quant= False,
            )

        elif self.quantize_8bit:
            bnb_config = BitsAndBytesConfig(
                load_in_8bit= True
            )

        else:
            bnb_config = None

        base_model = AutoModelForCausalLM.from_pretrained(
            self.base_model,
            quantization_config=bnb_config,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            attn_implementation="flash_attention_2" if self.use_flash_attn else None,
            trust_remote_code=True,
            token=self.token
        )

        if self.quantize_4bit or self.quantize_8bit:
            base_model = prepare_model_for_kbit_training(base_model)

        # Bind `model` only after the k-bit preparation so that, without PEFT,
        # the returned model is the prepared one.
        model = base_model

        if self.use_peft:
            lora_config = LoraConfig(
                lora_alpha=16,
                lora_dropout=0.1,
                r=64,
                bias="none",
                task_type="CAUSAL_LM",
                target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
            )

            model = get_peft_model(base_model, lora_config)

        model.config.use_cache = False # silence the warnings
        model.config.pretraining_tp = 1
        model.gradient_checkpointing_enable()
        model.enable_input_require_grads()
        model.resize_token_embeddings(len(tokenizer))

        return model, base_model

    def merge_and_upload(self):
        """
        Merge the trained LoRA adapter into the base model, save it locally and upload it.

        The adapter is read from ``{repo_id}/{experiment_name}-lora``; the merged model is saved to the
        local folder ``{experiment_name}-merged`` and pushed to ``{repo_id}/{experiment_name}-merged``,
        which is the repository ``augment`` loads by default. Model references are dropped and
        ``clean`` is called even when a step fails.
        """
        base_model = model = merged_model = None

        try:
            base_model = AutoModelForCausalLM.from_pretrained(
                self.base_model,
                token=self.token,
                torch_dtype=torch.float16,
                attn_implementation="flash_attention_2" if self.use_flash_attn else None,
                trust_remote_code=True,
                device_map="auto",
                offload_folder="offload")

            model = PeftModel.from_pretrained(
                base_model,
                f"{self.repo_id}/{self.experiment_name}-lora",
                torch_dtype=torch.float16,
                device_map="auto",
                offload_folder="offload",
                token=self.token
            )

            print("Merging model")
            merged_model = model.merge_and_unload()

            print("Saving model")
            merged_model.save_pretrained(f"{self.experiment_name}-merged")

            print("Uploading model")
            # Push into the repo_id namespace so the upload target matches the download path used by augment().
            merged_model.push_to_hub(f"{self.repo_id}/{self.experiment_name}-merged", token=self.token)

        finally:
            # Release the models even if merging or uploading failed.
            del base_model
            del model
            del merged_model
            self.clean()

    def load_and_preprocess_dataset(self, filename:str, random_state=None) -> "Dataset":
        """
        Load a dataset from a csv file, preprocess it and return it as a Hugging Face Dataset object

        The 'text' column is replaced by the full training sample built with ``format_sample``
        and the rows are shuffled.

        Args:
            filename (str): Path to the csv file containing the dataset
            random_state (optional): Seed forwarded to sklearn's shuffle for a reproducible row order.
                Defaults to None (unseeded shuffle).

        Returns:
            Dataset: Hugging Face Dataset object

        Raises:
            ValueError: If the file lacks any of the 'text', 'label' or 'description' columns.
        """
        df = pd.read_csv(filename)

        if not all(col in df.columns for col in ['text', 'label', 'description']):
            raise ValueError(f"File {filename} must have columns 'text' and 'label' and 'description'")

        df['text'] = df.apply(lambda row: self.format_sample(row['label'], row['text'], row['description']), axis=1)
        df = shuffle(df, random_state=random_state)
        dataset = Dataset.from_pandas(df)
        return dataset

    def train_llm(self, train_dataset: "Dataset", output_dir=None, optim="adamw_bnb_8bit", num_train_epochs=4,
              per_device_train_batch_size=4, gradient_accumulation_steps=1, save_steps=50, logging_steps=1,
              learning_rate=2.5e-5, weight_decay=0.001, fp16=False, bf16=False, max_grad_norm=0.3, max_steps=-1,
              warmup_ratio=0.03, group_by_length=True, gradient_checkpointing=True, lr_scheduler_type="constant", packing=False):
        """
        Train the model on the given dataset

        After training, the model and tokenizer are saved to the local folder ``{experiment_name}-lora``
        and pushed to ``{repo_id}/{experiment_name}-lora``, the repository ``merge_and_upload`` reads.
        Model, tokenizer and trainer references are dropped and ``clean`` is called even when a step fails.

        Args:
            train_dataset (Dataset): Hugging Face Dataset object containing the training data
            output_dir (str, optional): Path to the directory where the trainer writes its outputs.
                Defaults to None, which selects f'{experiment_name}_outputs'.
            optim (str, optional): Optimizer to use. Defaults to "adamw_bnb_8bit".
            num_train_epochs (int, optional): Number of training epochs. Defaults to 4.
            per_device_train_batch_size (int, optional): Batch size per GPU. Defaults to 4.
            gradient_accumulation_steps (int, optional): Number of steps to accumulate gradients for. Defaults to 1.
            save_steps (int, optional): Number of steps after which to save the model. Defaults to 50.
            logging_steps (int, optional): Number of steps after which to log the training metrics. Defaults to 1.
            learning_rate (float, optional): Learning rate. Defaults to 2.5e-5.
            weight_decay (float, optional): Weight decay. Defaults to 0.001.
            fp16 (bool, optional): Whether to use FP16 training. Defaults to False.
            bf16 (bool, optional): Whether to use BF16 training. Defaults to False.
            max_grad_norm (float, optional): Maximum gradient norm. Defaults to 0.3.
            max_steps (int, optional): Maximum number of training steps. Defaults to -1.
            warmup_ratio (float, optional): Warmup ratio. Defaults to 0.03.
            group_by_length (bool, optional): Whether to group samples by length. Defaults to True.
            gradient_checkpointing (bool, optional): Whether to use gradient checkpointing. Defaults to True.
            lr_scheduler_type (str, optional): Learning rate scheduler type. Defaults to "constant".
            packing (bool, optional): Whether to use packing. Defaults to False.
        """
        tokenizer = model = base_model = trainer = None

        try:
            print("Initialising tokenizer")
            tokenizer = self._init_tokenizer()

            print("Initialising model")
            model, base_model = self._init_model_for_training(tokenizer)

            model.config.pad_token_id = tokenizer.pad_token_id

            training_arguments = TrainingArguments(
                output_dir=output_dir if output_dir else f'{self.experiment_name}_outputs',
                num_train_epochs=num_train_epochs,
                per_device_train_batch_size=per_device_train_batch_size,
                gradient_accumulation_steps=gradient_accumulation_steps,
                optim=optim,
                save_steps=save_steps,
                logging_steps=logging_steps,
                learning_rate=learning_rate,
                weight_decay=weight_decay,
                fp16=fp16,
                bf16=bf16,
                max_grad_norm=max_grad_norm,
                max_steps=max_steps,
                warmup_ratio=warmup_ratio,
                group_by_length=group_by_length,
                gradient_checkpointing=gradient_checkpointing,
                lr_scheduler_type=lr_scheduler_type
            )

            trainer = SFTTrainer(
                model=model,
                args=training_arguments,
                train_dataset=train_dataset,
                dataset_text_field='text',
                max_seq_length=tokenizer.model_max_length,
                tokenizer=tokenizer,
                packing=packing
            )

            trainer.train()

            name = f"{self.experiment_name}-lora"
            # Push into the repo_id namespace so merge_and_upload() finds the adapter where it looks for it.
            hub_name = f"{self.repo_id}/{name}"
            model.save_pretrained(name)
            model.push_to_hub(hub_name, token=self.token)
            tokenizer.save_pretrained(name)
            tokenizer.push_to_hub(hub_name, token=self.token)

        finally:
            # Release everything that may hold GPU memory even if training or uploading failed.
            del base_model
            del model
            del tokenizer
            del trainer
            self.clean()

    def augment(self, labels, descriptions, num_of_samples_per_label=10, min_length=10, max_length=100, temperature=1, top_k=30, top_p=0.90,
                repetition_penalty=1.5, tokenizer_side_override=None, generation_folder='generated_data', gpu_memory_utilization=0.9, model_name=None, tokenizer_name=None):
        """
        Generate new samples for every (label, description) pair with vLLM and save them as csv.

        One prompt is built per pair with ``format_sample`` and ``num_of_samples_per_label`` completions
        are sampled for each prompt. The result is also written to
        ``{generation_folder}/{experiment_name}_augmented_data.csv``.

        Args:
            labels: Iterable of labels; labels[i] is paired with descriptions[i].
            descriptions: Iterable of label descriptions, same length as ``labels``.
            num_of_samples_per_label (int, optional): Completions sampled per prompt. Defaults to 10.
            min_length (int, optional): Passed to vLLM as min_tokens. Defaults to 10.
            max_length (int, optional): Passed to vLLM as max_tokens. Defaults to 100.
            temperature (float, optional): Sampling temperature. Defaults to 1.
            top_k (int, optional): Top-k sampling cutoff. Defaults to 30.
            top_p (float, optional): Nucleus sampling cutoff. Defaults to 0.90.
            repetition_penalty (float, optional): Repetition penalty. Defaults to 1.5.
            tokenizer_side_override (optional): Currently unused; kept so existing callers keep working.
            generation_folder (str, optional): Output folder for the csv. Defaults to 'generated_data'.
            gpu_memory_utilization (float, optional): Forwarded to vLLM. Defaults to 0.9.
            model_name (str, optional): Model to generate with. Defaults to ``{repo_id}/{experiment_name}-merged``.
            tokenizer_name (str, optional): Tokenizer to use. Defaults to the base checkpoint's tokenizer.

        Returns:
            pandas.DataFrame: Columns 'label' and 'text', one row per generated completion.

        Raises:
            ValueError: If ``labels`` and ``descriptions`` differ in length.
        """
        # Materialise first so generators are accepted and lengths can be compared;
        # zip() alone would silently drop the unmatched tail of the longer input.
        labels = list(labels)
        descriptions = list(descriptions)

        if len(labels) != len(descriptions):
            raise ValueError(
                f"labels and descriptions must have the same length, got {len(labels)} and {len(descriptions)}"
            )

        os.environ['CUDA_VISIBLE_DEVICES']="0"

        if not model_name:
            model_name = f"{self.repo_id}/{self.experiment_name}-merged"

        if not tokenizer_name:
            tokenizer_name = self.base_model

        inputs = [self.format_sample(label, description=description) for label, description in zip(labels, descriptions)]

        sampling_params = SamplingParams(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            max_tokens=max_length,
            min_tokens=min_length,
            n=num_of_samples_per_label)

        llm = None

        try:
            llm = LLM(model=model_name, tokenizer=tokenizer_name, max_model_len=self.max_length,  gpu_memory_utilization=gpu_memory_utilization, dtype="auto")
            outputs = llm.generate(inputs, sampling_params=sampling_params)

        finally:
            # Free the engine even if loading or generation failed.
            del llm
            self.clean()

        # vLLM returns one result per prompt in prompt order, so each label is taken from the
        # inputs instead of being re-parsed from the prompt text (which breaks when a label
        # itself contains the separator or BOS string).
        samples = [
            (str(label).strip(), completion.text)
            for label, output in zip(labels, outputs)
            for completion in output.outputs
        ]

        result_df = pd.DataFrame(samples, columns=['label', 'text'])

        os.makedirs(generation_folder, exist_ok=True)
        result_df.to_csv(f'{generation_folder}/{self.experiment_name}_augmented_data.csv', index=False)

        return result_df

    def format_sample(self, label, text = None, description=None):
        """
        Build the textual form of one sample.

        Args:
            label: Class label placed right after the BOS token.
            text (optional): Sample text. When falsy (None or empty) a generation prompt is returned.
            description (optional): Label description placed between the two separators.

        Returns:
            str: ``<bos>label [SEP] description [SEP]`` for a prompt, or the same string followed by
            `` text<eos>`` for a training sample.
        """
        if not text:
            return f"{self.bos_token}{label} {self.sep_token} {description} {self.sep_token}"

        return f"{self.bos_token}{label} {self.sep_token} {description} {self.sep_token} {text}{self.eos_token}"


    def clean(self):
        """Run Python garbage collection and then empty the CUDA cache."""
        gc.collect()
        torch.cuda.empty_cache()

In [4]:
# Notebook-wide settings: MODEL_TYPE is the key looked up in MODELS,
# EXPERIMENT_NAME prefixes every experiment (and therefore every Hub repo / output file).
MODEL_TYPE = "mistral"
EXPERIMENT_NAME = "Mistral_7B_lambada_plus"

# HuggingFace Login

In [5]:
# Never store the access token in the notebook: read it from the HF_TOKEN environment
# variable, or ask for it interactively (getpass does not echo the input).
# NOTE(review): a write-scoped token was previously committed in this cell — revoke it on the Hub.
from getpass import getpass

TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

In [6]:
# Authenticate this session with the Hub and also store the token in the git credential helper.
login(token=TOKEN, add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Datasets loading & preprocessing

In [7]:
!wget https://github.com/elephanti/NLPProject2024/raw/main/datasets/ATIS/sampled_subsets/ver1/atis_5_subset.csv -P /content/datasets/ATIS/VER1
!wget https://github.com/elephanti/NLPProject2024/raw/main/datasets/ATIS/sampled_subsets/ver1/atis_10_subset.csv -P /content/datasets/ATIS/VER1
!wget https://github.com/elephanti/NLPProject2024/raw/main/datasets/ATIS/sampled_subsets/ver1/atis_20_subset.csv -P /content/datasets/ATIS/VER1
!wget https://github.com/elephanti/NLPProject2024/raw/main/datasets/ATIS/sampled_subsets/ver1/atis_50_subset.csv -P /content/datasets/ATIS/VER1
!wget https://github.com/elephanti/NLPProject2024/raw/main/datasets/ATIS/sampled_subsets/ver1/atis_100_subset.csv -P /content/datasets/ATIS/VER1

--2024-07-25 10:52:21--  https://github.com/elephanti/NLPProject2024/raw/main/datasets/ATIS/sampled_subsets/ver1/atis_5_subset.csv
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/elephanti/NLPProject2024/main/datasets/ATIS/sampled_subsets/ver1/atis_5_subset.csv [following]
--2024-07-25 10:52:21--  https://raw.githubusercontent.com/elephanti/NLPProject2024/main/datasets/ATIS/sampled_subsets/ver1/atis_5_subset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12893 (13K) [text/plain]
Saving to: ‘/content/datasets/ATIS/VER1/atis_5_subset.csv’


2024-07-25 10:52:22 (37.5 MB/s) - ‘/content/datasets

In [8]:
!wget https://github.com/elephanti/NLPProject2024/raw/main/datasets/TREC/sampled_subsets/ver1/trec_5_subset.csv -P /content/datasets/TREC/VER1
!wget https://github.com/elephanti/NLPProject2024/raw/main/datasets/TREC/sampled_subsets/ver1/trec_10_subset.csv -P /content/datasets/TREC/VER1
!wget https://github.com/elephanti/NLPProject2024/raw/main/datasets/TREC/sampled_subsets/ver1/trec_20_subset.csv -P /content/datasets/TREC/VER1
!wget https://github.com/elephanti/NLPProject2024/raw/main/datasets/TREC/sampled_subsets/ver1/trec_50_subset.csv -P /content/datasets/TREC/VER1
!wget https://github.com/elephanti/NLPProject2024/raw/main/datasets/TREC/sampled_subsets/ver1/trec_100_subset.csv -P /content/datasets/TREC/VER1

--2024-07-25 10:52:24--  https://github.com/elephanti/NLPProject2024/raw/main/datasets/TREC/sampled_subsets/ver1/trec_5_subset.csv
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/elephanti/NLPProject2024/main/datasets/TREC/sampled_subsets/ver1/trec_5_subset.csv [following]
--2024-07-25 10:52:24--  https://raw.githubusercontent.com/elephanti/NLPProject2024/main/datasets/TREC/sampled_subsets/ver1/trec_5_subset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 32861 (32K) [text/plain]
Saving to: ‘/content/datasets/TREC/VER1/trec_5_subset.csv’


2024-07-25 10:52:24 (4.61 MB/s) - ‘/content/datasets

In [9]:
# Augmenter instance for the Mistral configuration selected above.
data_augmenter = DataAugmenter(experiment_name=f"{EXPERIMENT_NAME}_ATIS_VER_1_10", model_name=MODEL_TYPE,
                               repo_id="ALivshits", token=TOKEN)

In [10]:
# Load and format the five ATIS subsets, smallest first.
(
    atis_train_5_dataset,
    atis_train_10_dataset,
    atis_train_20_dataset,
    atis_train_50_dataset,
    atis_train_100_dataset,
) = (
    data_augmenter.load_and_preprocess_dataset(f'datasets/ATIS/VER1/atis_{n_samples}_subset.csv')
    for n_samples in (5, 10, 20, 50, 100)
)

In [11]:
# Load and format the five TREC subsets, smallest first.
(
    trec_train_5_dataset,
    trec_train_10_dataset,
    trec_train_20_dataset,
    trec_train_50_dataset,
    trec_train_100_dataset,
) = (
    data_augmenter.load_and_preprocess_dataset(f'datasets/TREC/VER1/trec_{n_samples}_subset.csv')
    for n_samples in (5, 10, 20, 50, 100)
)

In [12]:
# One row per distinct (label, description) pair found in the 5-sample subsets.
atis_labels_and_desriptions = atis_train_5_dataset.to_pandas()[['label', 'description']].drop_duplicates()
trec_labels_and_desriptions = trec_train_5_dataset.to_pandas()[['label', 'description']].drop_duplicates()

# Take labels and descriptions from the same de-duplicated rows so that element i of one
# array always belongs to element i of the other. De-duplicating each column separately
# (.unique()) can yield arrays of different length whenever a label or description repeats.
atis_intents = atis_labels_and_desriptions['label'].to_numpy()
trec_intents = trec_labels_and_desriptions['label'].to_numpy()
atis_descriptions = atis_labels_and_desriptions['description'].to_numpy()
trec_descriptions = trec_labels_and_desriptions['description'].to_numpy()

In [13]:
# Display the distinct ATIS (label, description) pairs as a sanity check.
atis_labels_and_desriptions

Unnamed: 0,label,description
0,ground_service,Requesting information about available ground ...
1,aircraft,Inquiring about the types of planes used by sp...
2,meal,Asking about meal services provided on specifi...
3,flight_no,Asking for the flight number of specific fligh...
4,airport,Requesting information about airports in speci...
5,city,Asking about cities serviced by specific airli...
6,airfare,"Seeking information on the cost of flights, in..."
9,capacity,Inquiring about the seating capacity of specif...
10,quantity,"Inquiring about the number of flights, stops, ..."
16,day_name,Inquiring about the days of the week when flig...


In [14]:
# Run the augmenter's gc / CUDA-cache cleanup, then drop the instance;
# a new one is created for each experiment.
data_augmenter.clean()
del data_augmenter

# Generation without fine-tuning

In [24]:
# Baseline: sample from the untouched base checkpoint for the first three ATIS labels.
data_augmenter = DataAugmenter(
    experiment_name = f"{EXPERIMENT_NAME}_raw_ATIS",
    model_name = MODEL_TYPE,
    repo_id="ALivshits",
    token=TOKEN
)

# Use the base checkpoint registered for MODEL_TYPE rather than a hardcoded id, so the
# baseline follows the model selected in the configuration cell.
generated_samples = data_augmenter.augment(
    atis_intents[:3],
    atis_descriptions[:3],
    num_of_samples_per_label=3,
    model_name=MODELS[MODEL_TYPE]["base_model"],
)
print(generated_samples.head(20))

data_augmenter.clean()
del data_augmenter

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

INFO 07-25 09:17:51 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='mistralai/Mistral-7B-v0.3', speculative_config=None, tokenizer='mistralai/Mistral-7B-v0.3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=mistralai/Mistral-7B-v0.3, use_v2_block_manager=False, enable_prefix_caching=False)


tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

INFO 07-25 09:17:55 model_runner.py:680] Starting to load model mistralai/Mistral-7B-v0.3...
INFO 07-25 09:17:55 weight_utils.py:223] Using model weights format ['*.safetensors']


model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

consolidated.safetensors:   0%|          | 0.00/14.5G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 07-25 09:21:48 model_runner.py:692] Loading model weights took 13.5083 GB
INFO 07-25 09:21:49 gpu_executor.py:102] # GPU blocks: 11593, # CPU blocks: 2048
INFO 07-25 09:21:49 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-25 09:21:49 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-25 09:22:12 model_runner.py:1181] Graph capturing finished in 23 secs.


Processed prompts: 100%|██████████| 3/3 [00:01<00:00,  1.89it/s, est. speed input: 57.84 toks/s, output: 561.40 toks/s]


            label                                               text
0        capacity  \n- ## What is Seat Capacity? How does it Impa...
1        capacity   How much luggage can I take for free?\n06/21/...
2        capacity  \n\ncarriage (also carriageway) [NOUN/VERB, SE...
3    abbreviation  \n(2017-3) 978-154625771-X / SBN: 05/22\nCover...
4    abbreviation  \nI’m traveling on a business trip and I need ...
5    abbreviation  \nRequesting that a passenger who is physicall...
6  ground_service   Ground transportation refers to any type of t...
7  ground_service   Ground service is the process of transporting...
8  ground_service  \n1 answer(s) 403 views What is the best airpo...


In [25]:
# Baseline: sample from the untouched base checkpoint for the first three TREC labels.
data_augmenter = DataAugmenter(
    experiment_name = f"{EXPERIMENT_NAME}_raw_TREC",
    model_name = MODEL_TYPE,
    repo_id="ALivshits",
    token=TOKEN
)

# Use the base checkpoint registered for MODEL_TYPE rather than a hardcoded id, so the
# baseline follows the model selected in the configuration cell.
generated_samples = data_augmenter.augment(
    trec_intents[:3],
    trec_descriptions[:3],
    num_of_samples_per_label=3,
    model_name=MODELS[MODEL_TYPE]["base_model"],
)
print(generated_samples.head(20))

data_augmenter.clean()
del data_augmenter

INFO 07-25 09:22:15 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='mistralai/Mistral-7B-v0.3', speculative_config=None, tokenizer='mistralai/Mistral-7B-v0.3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=mistralai/Mistral-7B-v0.3, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 07-25 09:22:16 model_runner.py:680] Starting to load model mistralai/Mistral-7B-v0.3...
INFO 07-25 09:22:16 weight_utils.py:

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 07-25 09:22:33 model_runner.py:692] Loading model weights took 13.5005 GB
INFO 07-25 09:22:34 gpu_executor.py:102] # GPU blocks: 11597, # CPU blocks: 2048
INFO 07-25 09:22:34 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-25 09:22:34 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-25 09:22:57 model_runner.py:1181] Graph capturing finished in 23 secs.


Processed prompts: 100%|██████████| 3/3 [00:01<00:00,  1.89it/s, est. speed input: 47.97 toks/s, output: 564.94 toks/s]


         label                                               text
0  LOC:country  \nCountry codes are used for several purposes ...
1  LOC:country  \n\n# LOCATION/Country Name Server (CN) - CN =...
2  LOC:country   SEPIA; Sepia – a brown or blackish pigment pr...
3    LOC:mount  \nMountaineering or mountain climbing is the s...
4    LOC:mount   The mountain ranges of the world are listed i...
5    LOC:mount  \nby Dirk Loeper & Jan Gullberg (Sweden) 2017-...
6   ENTY:other  \nA new section for 2015 is the FAQ about ETYM...
7   ENTY:other  \nMUSIC [AUG, SEP] Discussions about music in ...
8   ENTY:other  \n1) Are the NPC's in ETY, like Arianrhod and ...


In [21]:
# Collect unreachable Python objects first, then empty the CUDA cache.
gc.collect()
torch.cuda.empty_cache()

# Finetune model & generate

## ATIS

### ATIS Ver1 - 5 samples

In [15]:
# Augmenter for the ATIS 5-samples experiment.
data_augmenter = DataAugmenter(experiment_name=f"{EXPERIMENT_NAME}_ATIS_5", model_name=MODEL_TYPE,
                               repo_id="ALivshits", token=TOKEN)

In [28]:
# Fine-tune on the ATIS 5-samples subset, logging the loss every 10 steps.
data_augmenter.train_llm(train_dataset=atis_train_5_dataset, logging_steps=10)

Initialising tokenizer
Initialising model


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch


Map:   0%|          | 0/82 [00:00<?, ? examples/s]



Step,Training Loss
10,3.5713
20,2.8462
30,2.1569
40,1.7961
50,1.3855
60,1.308
70,1.0921
80,1.0176




adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

In [30]:
# Merge the trained adapter into the base model and upload the merged weights.
data_augmenter.merge_and_upload()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Merging model
Saving model
Uploading model


model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

In [16]:
# Generate 100 samples per ATIS label with the fine-tuned model
generated_samples = data_augmenter.augment(
    labels=atis_intents,
    descriptions=atis_descriptions,
    num_of_samples_per_label=100,
)
print(generated_samples.head(20))

# Cleanup, then drop the instance before the next experiment.
data_augmenter.clean()
del data_augmenter

config.json:   0%|          | 0.00/643 [00:00<?, ?B/s]

INFO 07-25 09:53:23 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='ALivshits/Mistral_7B_lambada_plus_ATIS_5-merged', speculative_config=None, tokenizer='mistralai/Mistral-7B-v0.3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=ALivshits/Mistral_7B_lambada_plus_ATIS_5-merged, use_v2_block_manager=False, enable_prefix_caching=False)


tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

INFO 07-25 09:53:26 model_runner.py:680] Starting to load model ALivshits/Mistral_7B_lambada_plus_ATIS_5-merged...
INFO 07-25 09:53:27 weight_utils.py:223] Using model weights format ['*.safetensors']


model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 07-25 09:58:15 model_runner.py:692] Loading model weights took 13.5083 GB
INFO 07-25 09:58:17 gpu_executor.py:102] # GPU blocks: 3067, # CPU blocks: 2048
INFO 07-25 09:58:19 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-25 09:58:19 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-25 09:58:44 model_runner.py:1181] Graph capturing finished in 25 secs.


Processed prompts: 100%|██████████| 18/18 [00:17<00:00,  1.03it/s, est. speed input: 30.01 toks/s, output: 1590.83 toks/s]


       label                                               text
0   aircraft          what kind of airplanes does southwest fly
1   aircraft                     what is an airbus 320 aircraft
2   aircraft   what aircraft does american use between bosto...
3   aircraft                         what type is an airbus 309
4   aircraft              what kinds and models are boeing 720s
5   aircraft   what type aircraft does united use for first ...
6   aircraft   what type plane does united fly from denver t...
7   aircraft   what are all airplanes that fly from boston t...
8   aircraft   what type aircraft is operated between san fr...
9   aircraft   what type of plane is flown between boston an...
10  aircraft                   what are all boeing 735 aircraft
11  aircraft   what airplanes does delta fly between boston ...
12  aircraft    what are all aircrafts with first class seating
13  aircraft   list all airplanes that fly from boston to de...
14  aircraft   what kind plane does amer

### ATIS Ver1 - 10 samples

In [15]:
# Set up the augmenter for the ATIS 10-sample experiment. The experiment name
# is the shared EXPERIMENT_NAME prefix with an "_ATIS_10" suffix.
data_augmenter = DataAugmenter(
    experiment_name=f"{EXPERIMENT_NAME}_ATIS_10",
    model_name=MODEL_TYPE,
    repo_id="ALivshits",
    token=TOKEN,
)

In [18]:
# Fine-tune on the ATIS 10-sample training set, reporting the training loss
# every 10 steps.
data_augmenter.train_llm(atis_train_10_dataset, logging_steps=10)

Initialising tokenizer
Initialising model


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch


Map:   0%|          | 0/155 [00:00<?, ? examples/s]



Step,Training Loss
10,3.57
20,2.8326
30,2.2351
40,1.8547
50,1.4947
60,1.2635
70,1.1582
80,1.1143
90,0.9168
100,0.9602




adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

In [19]:
# Merge the trained adapter into the base model and upload the result
# (the cell output reports the stages: merging, saving, uploading).
data_augmenter.merge_and_upload()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Merging model
Saving model
Uploading model


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

In [16]:
# Generate new ATIS utterances with the fine-tuned model and preview the
# first 20 rows of the returned frame.
# NOTE(review): the positional 100 is presumably the number of samples to
# generate per intent -- confirm against DataAugmenter.augment.
generated_samples = data_augmenter.augment(atis_intents, atis_descriptions, 100)
print(generated_samples.head(20))

# Tear the augmenter down once generation is done. clean() presumably frees
# the loaded model / GPU memory (confirm); `del` drops the notebook reference
# so the next experiment starts from a fresh DataAugmenter.
data_augmenter.clean()
del data_augmenter

config.json:   0%|          | 0.00/643 [00:00<?, ?B/s]

INFO 07-25 10:21:10 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='ALivshits/Mistral_7B_lambada_plus_ATIS_10-merged', speculative_config=None, tokenizer='mistralai/Mistral-7B-v0.3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=ALivshits/Mistral_7B_lambada_plus_ATIS_10-merged, use_v2_block_manager=False, enable_prefix_caching=False)


tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

INFO 07-25 10:21:13 model_runner.py:680] Starting to load model ALivshits/Mistral_7B_lambada_plus_ATIS_10-merged...
INFO 07-25 10:21:13 weight_utils.py:223] Using model weights format ['*.safetensors']


model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 07-25 10:21:59 model_runner.py:692] Loading model weights took 13.5083 GB
INFO 07-25 10:22:00 gpu_executor.py:102] # GPU blocks: 11060, # CPU blocks: 2048
INFO 07-25 10:22:02 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-25 10:22:02 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-25 10:22:25 model_runner.py:1181] Graph capturing finished in 23 secs.


Processed prompts: 100%|██████████| 18/18 [00:05<00:00,  3.50it/s, est. speed input: 101.96 toks/s, output: 5108.68 toks/s]


      label                                               text
0   airfare        how much is a flight from denver to atlanta
1   airfare                show me fares from boston to denver
2   airfare                show me fares from boston to denver
3   airfare        how much is a flight from atlanta to denver
4   airfare             show me fares from denver to baltimore
5   airfare      how much is a flight from denver to baltimore
6   airfare        how much is a flight from denver to oakland
7   airfare   how much is a flight from philadelphia to denver
8   airfare        how much is a flight from atlanta to boston
9   airfare        how much is a flight from atlanta to dallas
10  airfare      how much is a flight from denver to milwaukee
11  airfare                show me fares from boston to dallas
12  airfare   how much is a flight from washington to balti...
13  airfare   how much is an airline ticket from denver to ...
14  airfare            list fares from denver to washin

### ATIS Ver1 - 20 samples

In [15]:
# Set up the augmenter for the ATIS 20-sample experiment. The experiment name
# is the shared EXPERIMENT_NAME prefix with an "_ATIS_20" suffix.
data_augmenter = DataAugmenter(
    experiment_name=f"{EXPERIMENT_NAME}_ATIS_20",
    model_name=MODEL_TYPE,
    repo_id="ALivshits",
    token=TOKEN,
)

In [18]:
# Fine-tune on the ATIS 20-sample training set, reporting the training loss
# every 20 steps.
data_augmenter.train_llm(atis_train_20_dataset, logging_steps=20)

Initialising tokenizer
Initialising model


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch


Map:   0%|          | 0/287 [00:00<?, ? examples/s]



Step,Training Loss
20,3.1683
40,1.9841
60,1.4279
80,1.0843
100,1.0104
120,0.9062
140,0.8658
160,0.8401
180,0.7798
200,0.759




adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

In [20]:
# Merge the trained adapter into the base model and upload the result
# (the cell output reports the stages: merging, saving, uploading).
data_augmenter.merge_and_upload()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Merging model
Saving model
Uploading model


model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

In [16]:
# Generate new ATIS utterances with the fine-tuned model and preview the
# first 20 rows of the returned frame.
# NOTE(review): the positional 100 is presumably the number of samples to
# generate per intent -- confirm against DataAugmenter.augment.
generated_samples = data_augmenter.augment(atis_intents, atis_descriptions, 100)
print(generated_samples.head(20))

# Tear the augmenter down once generation is done. clean() presumably frees
# the loaded model / GPU memory (confirm); `del` drops the notebook reference
# so the next experiment starts from a fresh DataAugmenter.
data_augmenter.clean()
del data_augmenter

config.json:   0%|          | 0.00/643 [00:00<?, ?B/s]

INFO 07-25 10:52:29 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='ALivshits/Mistral_7B_lambada_plus_ATIS_20-merged', speculative_config=None, tokenizer='mistralai/Mistral-7B-v0.3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=ALivshits/Mistral_7B_lambada_plus_ATIS_20-merged, use_v2_block_manager=False, enable_prefix_caching=False)


tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

INFO 07-25 10:52:31 model_runner.py:680] Starting to load model ALivshits/Mistral_7B_lambada_plus_ATIS_20-merged...
INFO 07-25 10:52:32 weight_utils.py:223] Using model weights format ['*.safetensors']


model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 07-25 10:54:34 model_runner.py:692] Loading model weights took 13.5083 GB
INFO 07-25 10:54:35 gpu_executor.py:102] # GPU blocks: 3067, # CPU blocks: 2048
INFO 07-25 10:54:38 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-25 10:54:38 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-25 10:55:05 model_runner.py:1181] Graph capturing finished in 27 secs.


Processed prompts: 100%|██████████| 18/18 [00:16<00:00,  1.09it/s, est. speed input: 31.86 toks/s, output: 1651.27 toks/s]


             label                                               text
0   ground_service   what is the best way to get from orlando airp...
1   ground_service   what is the best way to get from orlando airp...
2   ground_service   what is the best way to get from orlando airp...
3   ground_service   what is the best way to get into downtown pit...
4   ground_service   what is the cheapest way to get from orlando ...
5   ground_service   what kinds of rental cars are there in dallas...
6   ground_service   what is the cheapest way to get into san fran...
7   ground_service   what types of transport are there from denver...
8   ground_service   what kinds of transport are there from atlant...
9   ground_service   what types of limousine service are there fro...
10  ground_service   what is the easiest way to get from orlando a...
11  ground_service   what kinds of rental cars are there at pittsb...
12  ground_service      i 'll need a rental car at the dallas airport
13  ground_service  

### ATIS Ver1 - 50 samples

In [17]:
# Set up the augmenter for the ATIS 50-sample experiment. The experiment name
# is the shared EXPERIMENT_NAME prefix with an "_ATIS_50" suffix.
data_augmenter = DataAugmenter(
    experiment_name=f"{EXPERIMENT_NAME}_ATIS_50",
    model_name=MODEL_TYPE,
    repo_id="ALivshits",
    token=TOKEN,
)

In [18]:
# Fine-tune on the ATIS 50-sample training set, reporting the training loss
# every 50 steps.
# NOTE(review): save_steps=500 is presumably the checkpoint interval; the loss
# table in the output ends at step 500 -- confirm against train_llm.
data_augmenter.train_llm(atis_train_50_dataset, save_steps=500, logging_steps=50)

Initialising tokenizer
Initialising model


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch


Map:   0%|          | 0/546 [00:00<?, ? examples/s]



Step,Training Loss
50,2.3255
100,1.1162
150,0.9779
200,0.8127
250,0.7863
300,0.7054
350,0.6563
400,0.595
450,0.4759
500,0.4586




adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

In [19]:
# Merge the trained adapter into the base model and upload the result
# (the cell output reports the stages: merging, saving, uploading).
data_augmenter.merge_and_upload()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Merging model
Saving model
Uploading model


model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
# Generate new ATIS utterances with the fine-tuned model and preview the
# first 20 rows of the returned frame.
# NOTE(review): the positional 100 is presumably the number of samples to
# generate per intent -- confirm against DataAugmenter.augment.
generated_samples = data_augmenter.augment(atis_intents, atis_descriptions, 100)
print(generated_samples.head(20))

# Tear the augmenter down once generation is done. clean() presumably frees
# the loaded model / GPU memory (confirm); `del` drops the notebook reference
# so the next experiment starts from a fresh DataAugmenter.
data_augmenter.clean()
del data_augmenter

config.json:   0%|          | 0.00/643 [00:00<?, ?B/s]

INFO 07-25 11:21:09 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='ALivshits/Mistral_7B_lambada_plus_ATIS_50-merged', speculative_config=None, tokenizer='mistralai/Mistral-7B-v0.3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=ALivshits/Mistral_7B_lambada_plus_ATIS_50-merged, use_v2_block_manager=False, enable_prefix_caching=False)


generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

INFO 07-25 11:21:10 model_runner.py:680] Starting to load model ALivshits/Mistral_7B_lambada_plus_ATIS_50-merged...
INFO 07-25 11:21:10 weight_utils.py:223] Using model weights format ['*.safetensors']


model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 07-25 11:26:22 model_runner.py:692] Loading model weights took 13.5005 GB
INFO 07-25 11:26:23 gpu_executor.py:102] # GPU blocks: 3204, # CPU blocks: 2048
INFO 07-25 11:26:23 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-25 11:26:23 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-25 11:26:51 model_runner.py:1181] Graph capturing finished in 28 secs.


Processed prompts: 100%|██████████| 18/18 [00:18<00:00,  1.01s/it, est. speed input: 28.82 toks/s, output: 1527.39 toks/s]


             label                                               text
0   ground_service   what kind of limousine service is there in bo...
1   ground_service   what kind of limousine service is there in bo...
2   ground_service            what is the limousine service in boston
3   ground_service        what type of limousine service is in boston
4   ground_service        what type of limousine service is in boston
5   ground_service   what kind of limousine service is there in at...
6   ground_service   what kind of limousine service is there in at...
7   ground_service   what type of limousine service is there in bo...
8   ground_service        what kind of limousine service is in boston
9   ground_service        what kind of limousine service is in boston
10  ground_service        what type of limousine service is in denver
11  ground_service            what is the limousine service in denver
12  ground_service   what type of limousine service is there in wa...
13  ground_service  

### ATIS Ver1 - 100 samples

In [21]:
# Set up the augmenter for the ATIS 100-sample experiment. The experiment name
# is the shared EXPERIMENT_NAME prefix with an "_ATIS_100" suffix.
data_augmenter = DataAugmenter(
    experiment_name=f"{EXPERIMENT_NAME}_ATIS_100",
    model_name=MODEL_TYPE,
    repo_id="ALivshits",
    token=TOKEN,
)

In [22]:
# Fine-tune on the ATIS 100-sample training set with an explicit epoch count
# (num_train_epochs=4), reporting the training loss every 50 steps.
# NOTE(review): save_steps=200 is presumably the checkpoint interval -- confirm
# against train_llm.
data_augmenter.train_llm(atis_train_100_dataset, num_train_epochs=4, save_steps=200, logging_steps=50)

Initialising tokenizer
Initialising model


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch


Map:   0%|          | 0/824 [00:00<?, ? examples/s]



Step,Training Loss
50,2.1843


Step,Training Loss
50,2.1843
100,1.1043
150,0.9457
200,0.854
250,0.7662
300,0.7294
350,0.6819
400,0.6273
450,0.5222
500,0.4925




adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

In [23]:
# Merge the trained adapter into the base model and upload the result
# (the cell output reports the stages: merging, saving, uploading).
data_augmenter.merge_and_upload()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Merging model
Saving model
Uploading model


model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

In [24]:
# Generate new ATIS utterances with the fine-tuned model and preview the
# first 20 rows of the returned frame.
# NOTE(review): the positional 100 is presumably the number of samples to
# generate per intent -- confirm against DataAugmenter.augment.
generated_samples = data_augmenter.augment(atis_intents, atis_descriptions, 100)
print(generated_samples.head(20))

# Tear the augmenter down once generation is done. clean() presumably frees
# the loaded model / GPU memory (confirm); `del` drops the notebook reference
# so the next experiment starts from a fresh DataAugmenter.
data_augmenter.clean()
del data_augmenter

config.json:   0%|          | 0.00/643 [00:00<?, ?B/s]

INFO 07-25 12:03:28 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='ALivshits/Mistral_7B_lambada_plus_ATIS_100-merged', speculative_config=None, tokenizer='mistralai/Mistral-7B-v0.3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=ALivshits/Mistral_7B_lambada_plus_ATIS_100-merged, use_v2_block_manager=False, enable_prefix_caching=False)


generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

INFO 07-25 12:03:29 model_runner.py:680] Starting to load model ALivshits/Mistral_7B_lambada_plus_ATIS_100-merged...
INFO 07-25 12:03:29 weight_utils.py:223] Using model weights format ['*.safetensors']


model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 07-25 12:05:53 model_runner.py:692] Loading model weights took 13.5005 GB
INFO 07-25 12:05:54 gpu_executor.py:102] # GPU blocks: 3204, # CPU blocks: 2048
INFO 07-25 12:05:54 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-25 12:05:54 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-25 12:06:22 model_runner.py:1181] Graph capturing finished in 28 secs.


Processed prompts: 100%|██████████| 18/18 [00:16<00:00,  1.07it/s, est. speed input: 31.17 toks/s, output: 1507.98 toks/s]


             label                                               text
0   ground_service   what kind of limousine service is there in bo...
1   ground_service   what kind of limousine service is there in de...
2   ground_service       what kind of limo service is there in denver
3   ground_service   what kind of limousine service is there in ba...
4   ground_service     what is the type of ground transport in denver
5   ground_service     what is the type of ground transport in denver
6   ground_service    is there limo service at the pittsburgh airport
7   ground_service    is there limo service at the pittsburgh airport
8   ground_service   what kind of ground transport is there in atl...
9   ground_service            what is the limousine service in denver
10  ground_service    what kind of limo service is there in baltimore
11  ground_service             is there a limousine service in dallas
12  ground_service   what kinds of limousine service is there in b...
13  ground_service  

## TREC

### TREC Ver1 - 5 samples

In [None]:
# Set up the augmenter for the TREC 5-sample experiment. Unlike the ATIS
# cells, the experiment name and model_name are spelled out literally here
# instead of coming from EXPERIMENT_NAME / MODEL_TYPE.
data_augmenter = DataAugmenter(
    experiment_name="Llama3_8B_lambada_plus_TREC_5",
    model_name="llama",
    repo_id="ALivshits",
    token=TOKEN,
)

In [None]:
# Fine-tune on the TREC 5-sample training set, reporting the training loss
# every 10 steps.
data_augmenter.train_llm(trec_train_5_dataset, logging_steps=10)

Initialising tokenizer


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Initialising model


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch


Map:   0%|          | 0/250 [00:00<?, ? examples/s]



Step,Training Loss
10,4.5467
20,4.2462
30,3.842
40,3.1232
50,3.0105
60,2.6376
70,2.3283
80,2.2277
90,2.138
100,2.1094




adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

In [None]:
# Merge the trained adapter into the base model and upload the result
# (the cell output reports the stages: merging, saving, uploading).
data_augmenter.merge_and_upload()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Merging model
Saving model
Uploading model


model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

In [None]:
# Generate new TREC questions with the fine-tuned model and preview the first
# 20 rows of the returned frame. The tokenizer is taken from this experiment's
# "-lora" repo via tokenizer_name (the vLLM log in the output confirms that
# tokenizer is the one used).
# NOTE(review): the positional 100 is presumably the number of samples to
# generate per label -- confirm against DataAugmenter.augment.
generated_samples = data_augmenter.augment(trec_intents, trec_descriptions, 100, tokenizer_name="ALivshits/Llama3_8B_lambada_plus_TREC_5-lora")
print(generated_samples.head(20))

# Tear the augmenter down once generation is done. clean() presumably frees
# the loaded model / GPU memory (confirm); `del` drops the notebook reference
# so the next experiment starts from a fresh DataAugmenter.
data_augmenter.clean()
del data_augmenter

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

INFO 07-23 13:04:04 llm_engine.py:176] Initializing an LLM engine (v0.5.3) with config: model='ALivshits/Llama3_8B_lambada_plus_TREC_5-merged', speculative_config=None, tokenizer='ALivshits/Llama3_8B_lambada_plus_TREC_5-lora', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=ALivshits/Llama3_8B_lambada_plus_TREC_5-merged, use_v2_block_manager=False, enable_prefix_caching=False)


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

INFO 07-23 13:04:08 model_runner.py:680] Starting to load model ALivshits/Llama3_8B_lambada_plus_TREC_5-merged...
INFO 07-23 13:04:09 weight_utils.py:223] Using model weights format ['*.safetensors']


model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 07-23 13:09:01 model_runner.py:692] Loading model weights took 14.9595 GB
INFO 07-23 13:09:03 gpu_executor.py:102] # GPU blocks: 1951, # CPU blocks: 2048
INFO 07-23 13:09:05 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-23 13:09:05 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-23 13:09:32 model_runner.py:1181] Graph capturing finished in 27 secs.


Processed prompts: 100%|██████████| 50/50 [01:02<00:00,  1.26s/it, est. speed input: 18.54 toks/s, output: 1260.39 toks/s]


         label                                               text
0   ENTY:sport   What sport was invented by James Naismith in ...
1   ENTY:sport          What 's the most popular sport in China ?
2   ENTY:sport   What sport did Bill Clinton play in high scho...
3   ENTY:sport   What sport did Jim Thorpe compete in at the 1...
4   ENTY:sport   What was the most popular sport in ancient Ro...
5   ENTY:sport      What sport is played at the Stade de France ?
6   ENTY:sport   What sport 's governing body has its headquar...
7   ENTY:sport    What is the name of Wimbledon 's tennis court ?
8   ENTY:sport   Who holds the record for most home runs in a ...
9   ENTY:sport       What sport does Tiger Woods play ? [] Golf .
10  ENTY:sport   What NFL team has appeared in the most Super ...
11  ENTY:sport   Which NFL team has won the most Super Bowl ch...
12  ENTY:sport   What are some of the more unusual Olympic rec...
13  ENTY:sport        What 's the national sport of New Zealand ?
14  ENTY:s

In [None]:
# Run a garbage-collection pass first so unreachable Python objects are freed,
# then ask PyTorch to release its cached CUDA memory before the next experiment.
gc.collect()
torch.cuda.empty_cache()

### TREC Ver1 - 10 samples

In [None]:
# TREC 10-sample experiment.
# The Hub namespace and experiment name are defined once so that the augmenter
# and the tokenizer repo passed to augment() cannot drift apart.
trec_10_repo_id = "ALivshits"
trec_10_experiment_name = "Llama3_8B_lambada_plus_TREC_10"

data_augmenter = DataAugmenter(
    experiment_name=trec_10_experiment_name,
    model_name="llama",
    repo_id=trec_10_repo_id,
    token=TOKEN,
)

# Training and merging are disabled in this cell; generation below loads the
# "-merged" model by name (see the vLLM log in the cell output).
# NOTE(review): presumably the model was already trained and uploaded by an
# earlier run -- uncomment the two calls below to retrain from scratch.
# data_augmenter.train_llm(trec_train_10_dataset, logging_steps=10)
# data_augmenter.merge_and_upload()

# Generate new TREC questions and preview the first 20 rows. The tokenizer is
# taken from this experiment's "-lora" repo.
# NOTE(review): the positional 100 is presumably the number of samples to
# generate per label -- confirm against DataAugmenter.augment.
generated_samples = data_augmenter.augment(
    trec_intents,
    trec_descriptions,
    100,
    tokenizer_name=f"{trec_10_repo_id}/{trec_10_experiment_name}-lora",
)
print(generated_samples.head(20))

# Tear the augmenter down once generation is done. clean() presumably frees
# the loaded model / GPU memory (confirm); `del` drops the notebook reference.
data_augmenter.clean()
del data_augmenter

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

INFO 07-23 18:29:46 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='ALivshits/Llama3_8B_lambada_plus_TREC_10-merged', speculative_config=None, tokenizer='ALivshits/Llama3_8B_lambada_plus_TREC_10-lora', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=ALivshits/Llama3_8B_lambada_plus_TREC_10-merged, use_v2_block_manager=False, enable_prefix_caching=False)


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

INFO 07-23 18:29:50 model_runner.py:680] Starting to load model ALivshits/Llama3_8B_lambada_plus_TREC_10-merged...
INFO 07-23 18:29:51 weight_utils.py:223] Using model weights format ['*.safetensors']


model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 07-23 18:30:38 model_runner.py:692] Loading model weights took 14.9595 GB
INFO 07-23 18:30:40 gpu_executor.py:102] # GPU blocks: 9944, # CPU blocks: 2048
INFO 07-23 18:30:42 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-23 18:30:42 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-23 18:31:06 model_runner.py:1181] Graph capturing finished in 24 secs.


Processed prompts: 100%|██████████| 50/50 [00:19<00:00,  2.53it/s, est. speed input: 58.96 toks/s, output: 4114.90 toks/s]


          label                                               text
0   ENTY:dismed                  What is a fear of the number 13 ?
1   ENTY:dismed              What is a fear of long words ? [/sep]
2   ENTY:dismed    What is the second most common form of cancer ?
3   ENTY:dismed                       What is a fear of food ? 's?
4   ENTY:dismed                What is a phobia of the number 13 ?
5   ENTY:dismed         What is a fear of germs ? --- Mysophobia .
6   ENTY:dismed         What 's a disease that affects the lungs ?
7   ENTY:dismed           What is the term for a fear of mirrors ?
8   ENTY:dismed                 What is a fear of dirt ? ___phobia
9   ENTY:dismed   What is a fear of snakes called ? Copyright R...
10  ENTY:dismed   What is a fear of snakes called ? Copyright R...
11  ENTY:dismed                  What is anorexia nervosa ? [/sep]
12  ENTY:dismed   What is the most common sexually transmitted ...
13  ENTY:dismed   What is a fear of public speaking ? --- glos

In [None]:
# Free memory left over from the preceding experiment before the next one starts.
# Order matters: run Python's garbage collector first so dropped objects are
# actually destroyed, then ask PyTorch to release its cached, now-unused CUDA memory.
gc.collect()
torch.cuda.empty_cache()

### TREC Ver1 - 20 samples

In [None]:
# "TREC Ver1 - 20 samples" experiment: fine-tune on trec_train_20_dataset, merge and
# upload the result, then generate augmented TREC questions and preview the first rows.
data_augmenter = DataAugmenter(
    experiment_name = "Llama3_8B_lambada_plus_TREC_20",
    model_name = "llama",
    repo_id="ALivshits",
    token=TOKEN
)

try:
    # Train
    data_augmenter.train_llm(trec_train_20_dataset, logging_steps=20)

    data_augmenter.merge_and_upload()

    # Generate
    # NOTE(review): the positional 100 is presumably the number of samples requested;
    # confirm against DataAugmenter.augment.
    generated_samples = data_augmenter.augment(trec_intents, trec_descriptions, 100, tokenizer_name="ALivshits/Llama3_8B_lambada_plus_TREC_20-lora")
    print(generated_samples.head(20))
finally:
    # Run the cleanup even when training, merge/upload or generation raises. Without
    # this, a failed run leaves the augmenter referenced in the notebook namespace
    # (and presumably its model on the GPU), which the following cells cannot reclaim.
    data_augmenter.clean()
    del data_augmenter

Initialising tokenizer


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Initialising model


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch


Map:   0%|          | 0/962 [00:00<?, ? examples/s]



Step,Training Loss
20,4.2902
40,3.4621
60,2.7913
80,2.2629
100,2.1677
120,2.0486
140,1.729
160,1.8319
180,1.6697
200,1.4992




adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/696 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Merging model
Saving model
Uploading model


model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

INFO 07-23 18:53:22 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='ALivshits/Llama3_8B_lambada_plus_TREC_20-merged', speculative_config=None, tokenizer='ALivshits/Llama3_8B_lambada_plus_TREC_20-lora', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=ALivshits/Llama3_8B_lambada_plus_TREC_20-merged, use_v2_block_manager=False, enable_prefix_caching=False)


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

INFO 07-23 18:53:27 model_runner.py:680] Starting to load model ALivshits/Llama3_8B_lambada_plus_TREC_20-merged...
INFO 07-23 18:53:27 weight_utils.py:223] Using model weights format ['*.safetensors']


model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 07-23 19:02:02 model_runner.py:692] Loading model weights took 14.9575 GB
INFO 07-23 19:02:03 gpu_executor.py:102] # GPU blocks: 10499, # CPU blocks: 2048
INFO 07-23 19:02:03 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-23 19:02:03 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-23 19:02:28 model_runner.py:1181] Graph capturing finished in 25 secs.


Processed prompts: 100%|██████████| 50/50 [00:19<00:00,  2.52it/s, est. speed input: 58.84 toks/s, output: 4198.93 toks/s]


          label                                               text
0   ENTY:dismed                  What is a fear of the number 13 ?
1   ENTY:dismed        What is a fear of thunderstorms ? ___phobia
2   ENTY:dismed               What is a fear of glass ? ___ phobia
3   ENTY:dismed                What is a fear of snow ? ___ phobia
4   ENTY:dismed    What is the fear of long-term monogamy called ?
5   ENTY:dismed          What is a fear of cold ? ________________
6   ENTY:dismed              What is the fear of cold ? ___ phobia
7   ENTY:dismed              What is the term for a fear of snow ?
8   ENTY:dismed              What is a fear of cold ? ____________
9   ENTY:dismed           What is the term for a fear of heights ?
10  ENTY:dismed   What is the fear of long-term monogamous rela...
11  ENTY:dismed        What is the fear of long-term illness ? ---
12  ENTY:dismed   What is the fear of long-term illness ? ___ph...
13  ENTY:dismed     Which is the most common type of skin canc

In [None]:
# Free memory left over from the preceding experiment before the next one starts.
# Order matters: run Python's garbage collector first so dropped objects are
# actually destroyed, then ask PyTorch to release its cached, now-unused CUDA memory.
gc.collect()
torch.cuda.empty_cache()

### TREC Ver1 - 50 samples

In [None]:
# "TREC Ver1 - 50 samples" experiment: fine-tune on trec_train_50_dataset for 4 epochs,
# merge and upload the result, then generate augmented TREC questions and preview them.
data_augmenter = DataAugmenter(
    experiment_name = "Llama3_8B_lambada_plus_TREC_50",
    model_name = "llama",
    repo_id="ALivshits",
    token=TOKEN
)

try:
    # Train
    data_augmenter.train_llm(trec_train_50_dataset, num_train_epochs=4, save_steps=200, logging_steps=50)

    data_augmenter.merge_and_upload()

    # Generate
    # NOTE(review): the positional 100 is presumably the number of samples requested;
    # confirm against DataAugmenter.augment.
    generated_samples = data_augmenter.augment(trec_intents, trec_descriptions, 100, tokenizer_name="ALivshits/Llama3_8B_lambada_plus_TREC_50-lora")
    print(generated_samples.head(20))
finally:
    # Run the cleanup even when training, merge/upload or generation raises. Without
    # this, a failed run leaves the augmenter referenced in the notebook namespace
    # (and presumably its model on the GPU), which the following cells cannot reclaim.
    data_augmenter.clean()
    del data_augmenter

Initialising tokenizer


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Initialising model


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch


Map:   0%|          | 0/2138 [00:00<?, ? examples/s]



Step,Training Loss
50,3.6843
100,2.2794
150,1.886
200,1.6327
250,1.472
300,1.4171
350,1.3124
400,1.1354
450,0.932
500,0.914




Step,Training Loss
50,3.6843
100,2.2794
150,1.886
200,1.6327
250,1.472
300,1.4171
350,1.3124
400,1.1354
450,0.932
500,0.914




adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/696 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Merging model
Saving model
Uploading model


model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

INFO 07-23 19:46:06 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='ALivshits/Llama3_8B_lambada_plus_TREC_50-merged', speculative_config=None, tokenizer='ALivshits/Llama3_8B_lambada_plus_TREC_50-lora', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=ALivshits/Llama3_8B_lambada_plus_TREC_50-merged, use_v2_block_manager=False, enable_prefix_caching=False)


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

INFO 07-23 19:46:11 model_runner.py:680] Starting to load model ALivshits/Llama3_8B_lambada_plus_TREC_50-merged...
INFO 07-23 19:46:12 weight_utils.py:223] Using model weights format ['*.safetensors']


model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 07-23 19:52:55 model_runner.py:692] Loading model weights took 14.9575 GB
INFO 07-23 19:52:56 gpu_executor.py:102] # GPU blocks: 10499, # CPU blocks: 2048
INFO 07-23 19:52:56 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-23 19:52:56 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-23 19:53:21 model_runner.py:1181] Graph capturing finished in 25 secs.


Processed prompts: 100%|██████████| 50/50 [00:18<00:00,  2.67it/s, est. speed input: 62.33 toks/s, output: 4162.82 toks/s]


          label                                               text
0   ENTY:dismed             What is a fear of cockroaches called ?
1   ENTY:dismed             What is a fear of cockroaches called ?
2   ENTY:dismed             What is a fear of cockroaches called ?
3   ENTY:dismed   What is a fear of insanity ? , or being insane .
4   ENTY:dismed           What is the fear of cockroaches called ?
5   ENTY:dismed     What is a fear of glass ? , or broken objects?
6   ENTY:dismed     What is a fear of insanity ? , or being crazy?
7   ENTY:dismed                What is a fear of insanity ? ` . ``
8   ENTY:dismed         What is a fear of thunder ? , or lightning
9   ENTY:dismed          What is a fear of insanity ? ____________
10  ENTY:dismed          What is the term for a fear of insanity ?
11  ENTY:dismed     What is a fear of insanity ? , : phobophobia .
12  ENTY:dismed                   What is a fear of mirrors ? .?.?
13  ENTY:dismed                What is a fear of insanity ? . 

### TREC Ver1 - 100 samples

In [14]:
# "TREC Ver1 - 100 samples" experiment: fine-tune on trec_train_100_dataset for 4 epochs,
# merge and upload the result, then generate augmented TREC questions and preview them.
data_augmenter = DataAugmenter(
    experiment_name = "Llama3_8B_lambada_plus_TREC_100",
    model_name = "llama",
    repo_id="ALivshits",
    token=TOKEN
)

try:
    # Train
    data_augmenter.train_llm(trec_train_100_dataset, num_train_epochs=4, save_steps=200, logging_steps=50)

    data_augmenter.merge_and_upload()

    # Generate
    # NOTE(review): the positional 100 is presumably the number of samples requested;
    # confirm against DataAugmenter.augment.
    generated_samples = data_augmenter.augment(trec_intents, trec_descriptions, 100, tokenizer_name="ALivshits/Llama3_8B_lambada_plus_TREC_100-lora")
    print(generated_samples.head(20))
finally:
    # Run the cleanup even when training, merge/upload or generation raises. Without
    # this, a failed run leaves the augmenter referenced in the notebook namespace
    # (and presumably its model on the GPU), which a later cell cannot reclaim.
    data_augmenter.clean()
    del data_augmenter

Initialising tokenizer


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Initialising model


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/3629 [00:00<?, ? examples/s]



Step,Training Loss
50,3.6815
100,2.2994
150,1.8435
200,1.6751
250,1.5167
300,1.4104
350,1.3352
400,1.0895
450,0.9554
500,0.9257




Step,Training Loss
50,3.6815
100,2.2994
150,1.8435
200,1.6751
250,1.5167
300,1.4104
350,1.3352
400,1.0895
450,0.9554
500,0.9257




README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/696 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Merging model
Saving model
Uploading model


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

INFO 07-25 08:25:53 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='ALivshits/Llama3_8B_lambada_plus_TREC_100-merged', speculative_config=None, tokenizer='ALivshits/Llama3_8B_lambada_plus_TREC_100-lora', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=ALivshits/Llama3_8B_lambada_plus_TREC_100-merged, use_v2_block_manager=False, enable_prefix_caching=False)


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

INFO 07-25 08:25:56 model_runner.py:680] Starting to load model ALivshits/Llama3_8B_lambada_plus_TREC_100-merged...
INFO 07-25 08:25:57 weight_utils.py:223] Using model weights format ['*.safetensors']


model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 07-25 08:33:32 model_runner.py:692] Loading model weights took 14.9598 GB
INFO 07-25 08:33:33 gpu_executor.py:102] # GPU blocks: 10479, # CPU blocks: 2048
INFO 07-25 08:33:35 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-25 08:33:35 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-25 08:33:58 model_runner.py:1181] Graph capturing finished in 23 secs.


Processed prompts: 100%|██████████| 50/50 [00:18<00:00,  2.72it/s, est. speed input: 63.49 toks/s, output: 4126.39 toks/s]


          label                                               text
0   ENTY:termeq               What 's another name for aspartame ?
1   ENTY:termeq               What 's another name for aspartame ?
2   ENTY:termeq               What 's another name for aspartame ?
3   ENTY:termeq               What 's another name for aspartame ?
4   ENTY:termeq               What 's another name for aspartame ?
5   ENTY:termeq               What 's another name for aspartame ?
6   ENTY:termeq               What 's another name for aspartame ?
7   ENTY:termeq                 What 's the term for a young fox ?
8   ENTY:termeq   What other name were the `` Little Rascals ''...
9   ENTY:termeq   What other name were the `` Little Rascals ''...
10  ENTY:termeq    What is the name for clouds that produce rain ?
11  ENTY:termeq                      How do you say 2 in Latin ? ,
12  ENTY:termeq                      How do you say 2 in Latin ? ,
13  ENTY:termeq                      How do you say 2 in Latin

# Download generated data

In [None]:
from google.colab import files

# Hand each generated CSV to the browser as a download, one file at a time and in
# the listed order.
# NOTE(review): every path here is a Mistral/ATIS output; none of the Llama3 TREC
# experiments is downloaded by this cell. Confirm that is intended.
for csv_path in (
    "/content/generated_data/Mistral_7B_4bit_raw_ATIS_augmented_data.csv",
    "/content/generated_data/Mistral_7B_4bit_tuned_ATIS_100_augmented_data.csv",
    "/content/generated_data/Mistral_7B_4bit_tuned_ATIS_10_augmented_data.csv",
    "/content/generated_data/Mistral_7B_4bit_tuned_ATIS_20_augmented_data.csv",
    "/content/generated_data/Mistral_7B_4bit_tuned_ATIS_5_augmented_data.csv",
    "/content/generated_data/Mistral_7B_4bit_tuned_ATIS_50_augmented_data.csv",
):
    files.download(csv_path)