<a href="https://colab.research.google.com/github/elephanti/NLPProject2024/blob/main/LAMBADA_Data_Augmentation_Mistral_7B_v0_3_Full.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LAMBADA - Fine-tuning Mistral 7B for data augmentation

## Installations & Imports

In [1]:
!pip install -U bitsandbytes
!pip install -U transformers
!pip install -U accelerate
!pip install -U peft
!pip install -U datasets
!pip install -U evaluate
!pip install flash-attn --no-build-isolation
!pip install -U trl
!pip install wandb

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (4

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch, wandb
from datasets import Dataset, load_dataset
from trl import SFTTrainer
import pandas as pd
from sklearn.utils import shuffle
from huggingface_hub import notebook_login

## Login

In [3]:
# Log in to the Hugging Face Hub from the notebook; later cells that download the
# base model and push the fine-tuned artifacts request authentication.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Authenticate with Weights & Biases WITHOUT embedding the API key in the notebook.
# The key is taken from the WANDB_API_KEY environment variable; when it is unset,
# `key=None` lets wandb fall back to its stored credentials / interactive prompt.
# NOTE(review): a key was previously hard-coded in this cell and is therefore
# exposed in the notebook history - it should be revoked and regenerated.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Start the run that the trainer will log to.
run = wandb.init(
    project='LAMBADA Data Augmentation - Mistral 7B v0.3 Quantified',
    job_type="training",
    anonymous="allow"
)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33melephanti12345[0m ([33melephanti12345-runi[0m). Use [1m`wandb login --relogin`[0m to force relogin


## Dataset loading & preprocessing

In [5]:
!wget https://github.com/elephanti/NLPProject2024/raw/main/datasets/ATIS/sampled_subsets/ver1/atis_10_subset.csv -P /content/datasets/ATIS/VER1

--2024-07-18 11:43:12--  https://github.com/elephanti/NLPProject2024/raw/main/datasets/ATIS/sampled_subsets/ver1/atis_10_subset.csv
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/elephanti/NLPProject2024/main/datasets/ATIS/sampled_subsets/ver1/atis_10_subset.csv [following]
--2024-07-18 11:43:12--  https://raw.githubusercontent.com/elephanti/NLPProject2024/main/datasets/ATIS/sampled_subsets/ver1/atis_10_subset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9956 (9.7K) [text/plain]
Saving to: ‘/content/datasets/ATIS/VER1/atis_10_subset.csv’


2024-07-18 11:43:13 (109 MB/s) - ‘/content/d

In [6]:
# Markers used to build the training strings and the generation prompts.
BOS_TOKEN = '<s>'    # prepended to every formatted example / prompt
EOS_TOKEN = '</s>'   # appended to every formatted training example
SEP_TOKEN = '[SEP]'  # placed between the label and the utterance

In [7]:
def load_and_preprocess_dataset(filename:str, seed=None) -> Dataset:
    """Load a labelled CSV and turn every row into a LAMBADA training string.

    The ``text`` column of each row is replaced by
    ``"{BOS_TOKEN}{label} {SEP_TOKEN} {text}{EOS_TOKEN}"`` and the rows are then
    shuffled.

    Args:
        filename: Path to a CSV file that has ``label`` and ``text`` columns.
        seed: Optional random state forwarded to ``sklearn.utils.shuffle``.
            ``None`` (the default) keeps the previous behaviour, i.e. a different
            row order on every call; pass an int to make the order reproducible.

    Returns:
        A ``Dataset`` built from the shuffled DataFrame.
    """
    df = pd.read_csv(filename)
    # Build the formatted strings column-wise; unlike a row-wise ``apply`` this
    # also works for a CSV that contains only the header row.
    df['text'] = [
        f"{BOS_TOKEN}{label} {SEP_TOKEN} {text}{EOS_TOKEN}"
        for label, text in zip(df['label'], df['text'])
    ]
    df = shuffle(df, random_state=seed)
    return Dataset.from_pandas(df)

In [8]:
# Build the training set from the downloaded ATIS subset (path is relative to the
# notebook's working directory).
train_dataset = load_and_preprocess_dataset('datasets/ATIS/VER1/atis_10_subset.csv')

In [9]:
# Sanity check: show the first five formatted training strings.
train_dataset['text'][:5]


['<s>aircraft [SEP] what kind of aircraft does delta use before 8 am on august second from boston to denver</s>',
 '<s>airfare [SEP] round trip fares from denver to philadelphia under 1000 dollars</s>',
 '<s>ground_service [SEP] what about a car rental in denver</s>',
 '<s>meal [SEP] are snacks served on tower air</s>',
 '<s>quantity [SEP] how many first class flights does united have leaving from all airports today</s>']

## Fine-tune Mistral 7B v0.3

In [17]:
# Hub id of the checkpoint that is fine-tuned.
BASE_MODEL = 'mistralai/Mistral-7B-v0.3'
# Local directory / Hub repository name used for the fine-tuned adapter and tokenizer.
NEW_MODEL = 'Mistral-7B-v0.3-atis-10-lambada-aug-full'

In [11]:
# Load the base model in bfloat16 with FlashAttention-2, letting accelerate place
# the weights on the available device(s).
model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        attn_implementation="flash_attention_2",
        trust_remote_code=True,
        # `token=True` reuses the credentials stored by the Hub login; it replaces
        # the deprecated `use_auth_token` argument.
        token=True
)



config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [12]:
# Load the tokenizer of the base model. `token=True` reuses the credentials stored
# by the Hub login; it replaces the deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True, token=True)
# Right padding is used while training.
tokenizer.padding_side = 'right'
# Reuse the unknown token as padding token.
tokenizer.pad_token = tokenizer.unk_token
# BOS/EOS are already written into every training string, so the tokenizer must
# not add them a second time.
tokenizer.add_eos_token = False
tokenizer.add_bos_token = False
tokenizer.model_max_length = 1024



tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [13]:
# LoRA adapter settings: rank-64 adapters on the attention projections and the MLP
# gate projection; bias terms are left untouched.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model so that the adapters are injected into the target modules.
model = get_peft_model(model, config)

In [14]:
# Training-time model setup. The statements mutate the model in place, in this order.
model.config.use_cache = False # disabled for training to silence the warnings; re-enabled after training
model.config.pretraining_tp = 1
# Turn on gradient checkpointing and make the input embeddings require gradients.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
# Make the embedding matrix size agree with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

Embedding(32768, 4096)

In [15]:
# Hyper-parameters for the supervised fine-tuning run.
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    # 1 sample per device x 4 accumulation steps = 4 samples per optimizer step.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    optim="adamw_bnb_8bit",
    save_steps=50,
    logging_steps=1,
    learning_rate=2.5e-5,
    weight_decay=0.001,
    # No mixed-precision mode is requested from the trainer.
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    # -1: no explicit step cap, the run length comes from num_train_epochs.
    max_steps=-1,
    # NOTE(review): with lr_scheduler_type="constant" this warmup setting appears to
    # have no effect (the logged learning rate is flat for the whole run); use
    # "constant_with_warmup" if a warmup phase is intended - confirm.
    warmup_ratio=0.03,
    group_by_length=True,
    gradient_checkpointing=True,
    lr_scheduler_type="constant",
)

In [16]:
# Supervised fine-tuning on the formatted 'text' column, one example per sequence
# (no packing).
# NOTE(review): the run emits a deprecation warning asking for these arguments to be
# supplied through SFTConfig - revisit when upgrading trl.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    dataset_text_field='text',
    max_seq_length=tokenizer.model_max_length,
    tokenizer=tokenizer,
    packing=False
)

# Run the training loop; the returned TrainOutput is displayed as the cell output.
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/157 [00:00<?, ? examples/s]



Step,Training Loss
1,4.399
2,4.2161
3,4.3432
4,5.1465
5,5.3807
6,4.5177
7,5.4061
8,5.5346
9,5.8088
10,3.8817




TrainOutput(global_step=156, training_loss=2.49944890386019, metrics={'train_runtime': 205.1045, 'train_samples_per_second': 3.062, 'train_steps_per_second': 0.761, 'total_flos': 549492463362048.0, 'train_loss': 2.49944890386019, 'epoch': 3.9745222929936306})

In [18]:
# Save the trained model under NEW_MODEL and close the W&B run.
trainer.model.save_pretrained(NEW_MODEL)
wandb.finish()
# Re-enable the cache that was switched off for training and put the model in
# evaluation mode before generating.
model.config.use_cache = True
model.eval()

VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▁▂▃▂▄▃▁▃▁▃▂▂▄▁▃█▁▁▁▄▁▁▂▁▂▁▂▂▂▁▁▁▂▂▄▂▃▁▂▂
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,▆▇█▆▆▄▄▃▃▄▄▄▄▃▄▃▃▃▃▂▃▂▂▂▂▁▂▂▂▂▂▂▂▁▂▁▂▁▂▂

0,1
total_flos,549492463362048.0
train/epoch,3.97452
train/global_step,156.0
train/grad_norm,2.89062
train/learning_rate,3e-05
train/loss,1.7544
train_loss,2.49945
train_runtime,205.1045
train_samples_per_second,3.062
train_steps_per_second,0.761


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralFlashAttention2(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Linear(in_

In [19]:
# Store the tokenizer files in the same directory as the saved model.
tokenizer.save_pretrained(NEW_MODEL)

('Mistral-7B-v0.3-atis-10-lambada-aug-full/tokenizer_config.json',
 'Mistral-7B-v0.3-atis-10-lambada-aug-full/special_tokens_map.json',
 'Mistral-7B-v0.3-atis-10-lambada-aug-full/tokenizer.model',
 'Mistral-7B-v0.3-atis-10-lambada-aug-full/added_tokens.json',
 'Mistral-7B-v0.3-atis-10-lambada-aug-full/tokenizer.json')

In [20]:
# Publish the fine-tuned model and its tokenizer to the Hub repository NEW_MODEL.
# `token=True` reuses the credentials stored by the Hub login; it replaces the
# deprecated `use_auth_token` argument.
model.push_to_hub(NEW_MODEL, token=True)
tokenizer.push_to_hub(NEW_MODEL, token=True)



adapter_model.safetensors:   0%|          | 0.00/185M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ALivshits/Mistral-7B-v0.3-atis-10-lambada-aug-full/commit/9665435004fda551c75aaba9b5c1fe731e9cd979', commit_message='Upload tokenizer', commit_description='', oid='9665435004fda551c75aaba9b5c1fe731e9cd979', pr_url=None, pr_revision=None, pr_num=None)

## Generate synthetic data

In [21]:
def generate_samples(intents, model, tokenizer, num_samples=10):
    """Sample synthetic utterances for every intent label.

    Each intent is turned into the prompt ``"{BOS_TOKEN}{intent} {SEP_TOKEN}"`` and
    the model samples ``num_samples`` continuations per prompt. Every decoded
    sequence is split at the first ``SEP_TOKEN`` into ``(label, text)``.

    Args:
        intents: Iterable of intent labels (list or array of strings).
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``; it must have a padding token
            because the prompts are padded into one batch.
        num_samples: Number of sequences sampled per intent.

    Returns:
        A ``pandas.DataFrame`` with the columns ``label`` and ``text``. Decoded
        sequences that do not contain ``SEP_TOKEN`` cannot be attributed to a label
        and are left out.
    """
    columns = ['label', 'text']
    if len(intents) == 0:
        # Nothing to prompt with; avoid calling the tokenizer on an empty batch.
        return pd.DataFrame(columns=columns)

    inputs = [f'{BOS_TOKEN}{intent} {SEP_TOKEN}' for intent in intents]
    tokenized_prompts = tokenizer(inputs, padding=True, return_tensors='pt')
    # Follow the model's own device; fall back to 'cuda' when it is not exposed.
    device = getattr(model, 'device', 'cuda')
    model_op = model.generate(input_ids=tokenized_prompts['input_ids'].to(device),
                              attention_mask=tokenized_prompts['attention_mask'].to(device),
                              min_length=10,
                              max_length=100,
                              temperature=1,
                              top_k=30,
                              top_p=0.90,
                              repetition_penalty=1.5,
                              do_sample=True,
                              # Previously hard-coded to 10, which silently ignored
                              # the num_samples argument.
                              num_return_sequences=num_samples,
                              use_cache=True)
    generated_text = tokenizer.batch_decode(model_op, skip_special_tokens=True)

    samples = []
    for item in generated_text:
        # Split at the FIRST separator only: a generation that repeats the
        # separator, or lacks it, must not abort the whole batch.
        label, separator, text = item.partition(SEP_TOKEN)
        if not separator:
            continue
        samples.append((label.strip(), text.strip()))
    return pd.DataFrame(samples, columns=columns)

In [22]:
# Distinct intent labels present in the training set; these become the prompts.
intents = train_dataset.to_pandas()['label'].unique()
intents

array(['aircraft', 'airfare', 'ground_service', 'meal', 'quantity',
       'airport', 'ground_fare', 'flight_no', 'capacity', 'flight',
       'airline', 'distance', 'city', 'abbreviation', 'cheapest',
       'flight_time', 'restriction', 'day_name'], dtype=object)

In [24]:
# Switch from the right padding used in training to left padding before the prompts
# are padded into one batch for generation.
tokenizer.padding_side = "left"

In [25]:
# Recompute the label list and sample synthetic utterances for every intent
# (generate_samples defaults to 10 per intent); the resulting frame is displayed.
intents = train_dataset.to_pandas()['label'].unique()
generated_samples = generate_samples(intents, model, tokenizer)
generated_samples

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Unnamed: 0,label,text
0,aircraft,airplane type
1,aircraft,i want to know what type of aircraft boeing 73...
2,aircraft,what kind of aircraft is an sa
3,aircraft,airplanes with the most legroom
4,aircraft,what type of aircraft does continental use
...,...,...
175,day_name,list airline flights for saturday
176,day_name,what is the day of arrival and departure
177,day_name,what day of the week does us airways flight 70...
178,day_name,what is the day of a flight from new york to b...


In [26]:
# Check whether any generated utterance is a verbatim copy of a training utterance.
#
# The training 'text' column holds the fully formatted string
# ("<s>label [SEP] utterance</s>"), whereas generated_samples['text'] holds the bare
# utterance. Comparing the two directly can never match, so the markup is stripped
# from the training strings first.
def _bare_utterance(formatted):
    """Return the utterance part of a formatted training string."""
    utterance = formatted.split(SEP_TOKEN, 1)[-1]
    if utterance.endswith(EOS_TOKEN):
        utterance = utterance[:-len(EOS_TOKEN)]
    return utterance.strip()

train_texts = {_bare_utterance(text) for text in train_dataset.to_pandas()['text']}
generated_texts = generated_samples['text']

# Find duplicates
duplicates = train_texts.intersection(generated_texts)

if duplicates:
  print("Duplicates found:")
  for duplicate in duplicates:
    print(duplicate)
else:
  print("No duplicates found between generated samples and training dataset.")


No duplicates found between generated samples and training dataset.


In [27]:
# Persist the generated (label, text) pairs; the DataFrame index is not written.
generated_samples.to_csv('generated_samples_A100_mistral_full.csv', index=False)
