In [None]:
!pip install bitsandbytes peft trl loralib datasets

# LORA

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig, DataCollatorWithPadding, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch


# load Model and dataset
model_checkpoint = 'distilbert-base-uncased'
dataset = load_dataset("glue", "sst2")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding=True, truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["idx", "sentence"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print(f"The memory size of this model is : {round(model.get_memory_footprint()/1024**3, 2)}")

lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["distilbert.transformer.layer.1.attention.q_lin",
                    "distilbert.transformer.layer.2.attention.q_lin",
                    "distilbert.transformer.layer.3.attention.q_lin",
                    "distilbert.transformer.layer.4.attention.q_lin",
                    "distilbert.transformer.layer.5.attention.q_lin"],
    lora_dropout=0.05,
    task_type="SEQ_CLS"
)

lora_model = get_peft_model(model, lora_config)
print(f"Trainable parameters are : {lora_model.print_trainable_parameters()}")
print(f"The memory size of Lora model is : {round(lora_model.get_memory_footprint()/1024**3, 2)}")

train_args = TrainingArguments("Lora_Training", per_device_train_batch_size=16, num_train_epochs=3, gradient_accumulation_steps=4)
trainer = Trainer(model=lora_model, args=train_args, train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["validation"], data_collator=data_collator)
trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

The memory size of this model is : 0.25
trainable params: 715,010 || all params: 67,670,020 || trainable%: 1.0566
Trainable parameters are : None
The memory size of Lora model is : 0.25


Step,Training Loss
500,1.7013
1000,1.3089
1500,1.2532
2000,1.2127
2500,1.1975
3000,1.1891


TrainOutput(global_step=3156, training_loss=1.305305287563, metrics={'train_runtime': 727.8902, 'train_samples_per_second': 277.579, 'train_steps_per_second': 4.336, 'total_flos': 3430765087217664.0, 'train_loss': 1.305305287563, 'epoch': 2.9990498812351545})

In [8]:
text_list = ["this one is a pass.", "not a fan, don't recommend.", "it 's just incredibly dull.",
             "impresses you with its open-endedness and surprises."]

id2label = {0: 'Negative', 1:'Positive'}
label2id = {'Negative':0, 'Positive':1}

device = next(lora_model.parameters()).device

# Lets infer this for 5 time to see if its consistant or not

for i in range(5):
  for i in text_list:
    tokenized_text = tokenizer(i, return_tensors="pt")
    tokenized_text = tokenized_text.to(device)
    print(f"{i}: {id2label[torch.argmax(lora_model(**tokenized_text).logits).tolist()]}")
  print("            ")

print(f"The size of the model is {round(lora_model.get_memory_footprint()/1024**3, 2)}")

this one is a pass.: Positive
not a fan, don't recommend.: Negative
it 's just incredibly dull.: Negative
impresses you with its open-endedness and surprises.: Positive
            
this one is a pass.: Negative
not a fan, don't recommend.: Negative
it 's just incredibly dull.: Negative
impresses you with its open-endedness and surprises.: Positive
            
this one is a pass.: Positive
not a fan, don't recommend.: Negative
it 's just incredibly dull.: Negative
impresses you with its open-endedness and surprises.: Positive
            
this one is a pass.: Positive
not a fan, don't recommend.: Negative
it 's just incredibly dull.: Negative
impresses you with its open-endedness and surprises.: Positive
            
this one is a pass.: Negative
not a fan, don't recommend.: Negative
it 's just incredibly dull.: Negative
impresses you with its open-endedness and surprises.: Positive
            
The size of the model is 0.25


In [28]:
for name, module in model.named_modules():
  print(f"{name}: the module is : {module}")

: the module is : DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.

In [None]:
lora_model.config

# QLORA

In [14]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
import torch


# load Model and dataset
model_checkpoint = 'distilbert-base-uncased'
dataset = load_dataset("glue", "sst2")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

bnb_config = BitsAndBytesConfig(
    # load_in_4bit=False,
    # bnb_4bit_use_double_quant=True,
    # bnb_4bit_quant_type="nf4",
    # bnb_4bit_compute_dtype=torch.float16
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, quantization_config=bnb_config)

def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding=True, truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["idx", "sentence"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print(f"The memory size of this quantized model is : {round(model.get_memory_footprint()/1024**3, 2)}")

lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.1, target_modules=["distilbert.transformer.layer.0.attention.q_lin",
                                      "distilbert.transformer.layer.1.attention.q_lin",
                                      "distilbert.transformer.layer.2.attention.q_lin",
                                      "distilbert.transformer.layer.3.attention.q_lin",
                                      "distilbert.transformer.layer.4.attention.q_lin"])

lora_model = get_peft_model(model, lora_config)
print(f"Trainable parameters are : {lora_model.print_trainable_parameters()}")
print(f"The memory size of Lora model is : {round(lora_model.get_memory_footprint()/1024**3, 2)}")

train_args = TrainingArguments("Lora_Training", per_device_train_batch_size=16, num_train_epochs=3, gradient_accumulation_steps=4)
trainer = Trainer(model=lora_model, args=train_args, train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["validation"], data_collator=data_collator)
trainer.train()


`low_cpu_mem_usage` was None, now default to True since model is quantized.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The memory size of this quantized model is : 0.06
trainable params: 122,880 || all params: 67,077,890 || trainable%: 0.1832
Trainable parameters are : None
The memory size of Lora model is : 0.07


Step,Training Loss
500,2.564
1000,2.526
1500,2.46
2000,2.42
2500,2.38
3000,2.366


TrainOutput(global_step=3156, training_loss=2.4554816223067175, metrics={'train_runtime': 437.384, 'train_samples_per_second': 461.944, 'train_steps_per_second': 7.216, 'total_flos': 3384422738586624.0, 'train_loss': 2.4554816223067175, 'epoch': 2.9990498812351545})

In [16]:
text_list = ["this one is a pass.", "not a fan, don't recommend.", "it 's just incredibly dull.",
             "impresses you with its open-endedness and surprises."]

id2label = {0: 'Negative', 1:'Positive'}
label2id = {'Negative':0, 'Positive':1}

device = next(lora_model.parameters()).device

# Lets infer this for 5 time to see if its consistant or not

for i in range(5):
  for i in text_list:
    tokenized_text = tokenizer(i, return_tensors="pt")
    tokenized_text = tokenized_text.to(device)
    print(f"{i}: {id2label[torch.argmax(lora_model(**tokenized_text).logits).tolist()]}")
  print("            ")

print(f"The size of the model is {round(lora_model.get_memory_footprint()/1024**3, 2)}")

this one is a pass.: Positive
not a fan, don't recommend.: Negative
it 's just incredibly dull.: Positive
impresses you with its open-endedness and surprises.: Positive
            
this one is a pass.: Negative
not a fan, don't recommend.: Positive
it 's just incredibly dull.: Positive
impresses you with its open-endedness and surprises.: Positive
            
this one is a pass.: Negative
not a fan, don't recommend.: Positive
it 's just incredibly dull.: Positive
impresses you with its open-endedness and surprises.: Positive
            
this one is a pass.: Positive
not a fan, don't recommend.: Positive
it 's just incredibly dull.: Positive
impresses you with its open-endedness and surprises.: Positive
            
this one is a pass.: Negative
not a fan, don't recommend.: Positive
it 's just incredibly dull.: Positive
impresses you with its open-endedness and surprises.: Positive
            
The size of the model is 0.07


In [9]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [11]:
dataset['train'][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}