In [1]:
from pathlib import Path
from sklearn.model_selection import train_test_split

from datasets import load_dataset, Dataset, DatasetDict
from transformers import DistilBertTokenizerFast, DistilBertModel
import numpy as np

from sklearn.metrics import accuracy_score
import torch
import gc
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
from transformers import DistilBertForSequenceClassification, DistilBertForMaskedLM, Trainer, TrainingArguments

from utils import read_imdb_split

  from .autonotebook import tqdm as notebook_tqdm


### Read data and create a dataset
Use Imdb dataset for this exercise. The dataset contains movie reviews and review sentiment (positive or negative).

# TODO: Download data

In [2]:
dataset = load_dataset("stanfordnlp/imdb")

In [6]:
print(dataset["train"][0].keys())
print(dataset["train"][0]["text"])
print(dataset["train"][0]["label"])

dict_keys(['text', 'label'])
I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes

Split the train data into training set and evaluation set. Dataset class has a method for that directly.

In [16]:
dataset_train_eval = dataset["train"].train_test_split(test_size=0.2)
dataset_train_eval

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

### Fetch tokenizer and encode the data

In [18]:
model_name='distilbert-base-uncased'

In [19]:
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

In [20]:
def tokenize_function(examples):
    tokenizer.truncation_side = "left"
    return tokenizer(
        text = examples["text"],
        return_tensors="np",
        truncation=True,
        padding=True,
        max_length=512
    )

In [21]:
dataset_encoded = dataset_train_eval.map(tokenize_function, batched=True)

Map: 100%|██████████| 20000/20000 [00:02<00:00, 7489.14 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 7664.09 examples/s]


## TODO
What did the tokenizer function add to the dataset and how does it look now?

In [22]:
dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [23]:
print(dataset_encoded["train"]["label"][:3])
print(dataset_encoded["train"]["text"][:3])
print(dataset_encoded["train"]["input_ids"][:3])
print(dataset_encoded["train"]["attention_mask"][:3])


[0, 0, 1]
['When I first saw this film it was not an impressive one. Now that I have seen it again with some friends on DVD ( they had not viewed it on the silver screen ), my opinion remains the same. The subject matter is puerile and the performances are weak.', 'i wont go and give them my 10 bucks i went and bought the fourth season of the original and the best. At least my kids enjoy it and can watch it without me worrying about what they are seeing. I have a teenager and she thinks the previews are ridiculous and would rather watch the original. And she thinks Jessica Simpson is a horrible daisy in fact she thinks she looks more like a slut than daisy duke. Those shorts she might as well not be wearing anything at all. And since when is American Pie have anything to do with the Dukes SHAME ON them for putting that nasty line in there about having sex with a car. That in itself should have gotten the movie a R rating. The only good thing that might come out of this is a reunion mov

We use the model distilbert-base-uncased as a base model.

In [25]:
base_model = DistilBertModel.from_pretrained(model_name)

In [26]:
base_model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [31]:
test_base_model_input = "Hello I'm a [MASK] model"
encoded_input = tokenizer(test_base_model_input, return_tensors="pt")
output = base_model(**encoded_input)
output

BaseModelOutput(last_hidden_state=tensor([[[-0.1777,  0.0843,  0.1978,  ..., -0.0331,  0.0377,  0.2354],
         [-0.4424,  0.4954,  0.3510,  ..., -0.1129,  0.0776,  0.1919],
         [ 0.0464, -0.0689,  0.1267,  ..., -0.2017,  0.0292,  0.7273],
         ...,
         [ 0.1660, -0.4356,  0.4011,  ..., -0.1673,  0.3813, -0.1179],
         [-0.0399, -0.0672,  0.1509,  ..., -0.0883,  0.0239, -0.3008],
         [ 0.1154,  0.4180,  0.0893,  ...,  0.5059, -0.5427, -0.0266]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [29]:
base_model_classification = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Question:
What is the difference between these models?

Can use use base_model_classification directly for a classification task?

### Fine-tune model
Test first with smaller number of samples (num_train_spochs = 1, number of train samples = 500)

In [33]:

n_train_samples = 500
n_eval_samples = 100

training_args = TrainingArguments(
    output_dir='./distilbert_classification_results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./distilbert_classification_logs',
    logging_steps=10,
    #use_mps_device=True
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=Dataset.from_dict(dataset_encoded["train"][:n_train_samples]),
    eval_dataset=Dataset.from_dict(dataset_encoded["test"][:n_eval_samples])
)
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 10%|█         | 10/96 [00:11<01:30,  1.05s/it]

{'loss': 0.6933, 'grad_norm': 1.022898554801941, 'learning_rate': 1e-05, 'epoch': 0.31}


 21%|██        | 20/96 [00:21<01:19,  1.04s/it]

{'loss': 0.6974, 'grad_norm': 1.1077734231948853, 'learning_rate': 2e-05, 'epoch': 0.62}


 31%|███▏      | 30/96 [00:32<01:09,  1.05s/it]

{'loss': 0.6772, 'grad_norm': 1.0888217687606812, 'learning_rate': 3e-05, 'epoch': 0.94}


 42%|████▏     | 40/96 [00:42<00:58,  1.04s/it]

{'loss': 0.6464, 'grad_norm': 6.09039831161499, 'learning_rate': 4e-05, 'epoch': 1.25}


 52%|█████▏    | 50/96 [00:52<00:48,  1.05s/it]

{'loss': 0.5769, 'grad_norm': 3.251227617263794, 'learning_rate': 5e-05, 'epoch': 1.56}


 62%|██████▎   | 60/96 [01:03<00:38,  1.06s/it]

{'loss': 0.3713, 'grad_norm': 4.515497207641602, 'learning_rate': 3.91304347826087e-05, 'epoch': 1.88}


 73%|███████▎  | 70/96 [01:12<00:26,  1.03s/it]

{'loss': 0.2293, 'grad_norm': 4.927664756774902, 'learning_rate': 2.826086956521739e-05, 'epoch': 2.19}


 83%|████████▎ | 80/96 [01:23<00:16,  1.05s/it]

{'loss': 0.189, 'grad_norm': 6.698153972625732, 'learning_rate': 1.739130434782609e-05, 'epoch': 2.5}


 94%|█████████▍| 90/96 [01:33<00:06,  1.05s/it]

{'loss': 0.2123, 'grad_norm': 2.632554769515991, 'learning_rate': 6.521739130434783e-06, 'epoch': 2.81}


100%|██████████| 96/96 [01:39<00:00,  1.04s/it]

{'train_runtime': 99.549, 'train_samples_per_second': 15.068, 'train_steps_per_second': 0.964, 'train_loss': 0.45547864586114883, 'epoch': 3.0}





TrainOutput(global_step=96, training_loss=0.45547864586114883, metrics={'train_runtime': 99.549, 'train_samples_per_second': 15.068, 'train_steps_per_second': 0.964, 'train_loss': 0.45547864586114883, 'epoch': 3.0})

## TODO

Test the fine-tuned model with some test reviews.

Hint: If you encounter an error regarding memory not allocated in some device, allocate the tokens in the proper device first.
Example:

`tokenizer(test_review, return_tensors="pt).to("mps")`

In [34]:
test_review = dataset["test"]["text"][0]
test_review

'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as they have

In [37]:
token_ids = tokenizer(test_review, return_tensors="pt").to("mps")
model(**token_ids)

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.1251, -1.4822]], device='mps:0', grad_fn=<LinearBackward0>), hidden_states=None, attentions=None)

In [39]:
test_review = "I really liked the movie."
token_ids = tokenizer(test_review, return_tensors="pt").to("mps")
model(**token_ids)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.1088,  1.1103]], device='mps:0', grad_fn=<LinearBackward0>), hidden_states=None, attentions=None)

## TODO

Is the true base model also changed when fine-tuning as above? Can you check that somehow?

Hint: Check how to access the model weights

`base_model_classification._modules["embeddings"].word_embeddings.weight`

In [47]:
(base_model._modules["embeddings"].word_embeddings.weight-model._modules["distilbert"].embeddings.word_embeddings.weight.to("cpu")).detach().numpy().max()

0.0018000305

## Model accuracy

In [48]:
# Here is a method for doing inference in batches, you can use this to evaluate the model accuracy
def inference(model, dataset_encoded, batch_size: int, n_samples: int, device: str="mps"):
    print(f"Inference on {device}...")
    model.to(device)

    for i in range(0,n_samples//batch_size):
        print("Batch: ", i)
        input_ids = torch.LongTensor(dataset_encoded["input_ids"][i*batch_size:(i+1)*batch_size]).to(device)
        attention_mask = torch.LongTensor(dataset_encoded["attention_mask"][i*batch_size:(i+1)*batch_size]).to(device)

        with torch.no_grad():   
            logits = model(input_ids = input_ids,
                        attention_mask = attention_mask).logits
        if i == 0:
            logits_all = logits
        else:
            logits_all = torch.cat((logits_all, logits),0)

    if device == "cpu":
        predicted_labels = np.argmax(logits_all.detach().numpy(), axis=1)
    else:
        predicted_labels = np.argmax(logits_all.cpu().detach().numpy(), axis=1)
    test_set_labels = dataset_encoded["label"][:(n_samples//batch_size*batch_size)]
    
    print(predicted_labels[:20])
    print(test_set_labels[:20])
    print(f"Model accuracy is: {accuracy_score(test_set_labels, predicted_labels)}")

In [40]:
test_set_encoded = dataset["test"].map(tokenize_function, batched=True)

Map: 100%|██████████| 25000/25000 [00:03<00:00, 7946.37 examples/s]


In [49]:
inference(model=model, dataset_encoded=test_set_encoded, batch_size=200, n_samples=1000, device="mps")

Inference on mps...
Batch:  0
Batch:  1
Batch:  2
Batch:  3
Batch:  4
[0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Model accuracy is: 0.841


### Test Freezing layers

In [51]:
model_freezed = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

for name, param in model_freezed.named_parameters():
     if "distilbert." in name:
        param.requires_grad = False

trainer_freezed = Trainer(
    model=model_freezed,
    args=training_args,
    train_dataset=Dataset.from_dict(dataset_encoded["train"][:n_train_samples]),
    eval_dataset=Dataset.from_dict(dataset_encoded["test"][:n_eval_samples])
)
trainer_freezed.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 10%|█         | 10/96 [00:04<00:36,  2.38it/s]

{'loss': 0.6996, 'grad_norm': 0.8149346709251404, 'learning_rate': 1e-05, 'epoch': 0.31}


 21%|██        | 20/96 [00:08<00:32,  2.37it/s]

{'loss': 0.6939, 'grad_norm': 0.8579943776130676, 'learning_rate': 2e-05, 'epoch': 0.62}


 31%|███▏      | 30/96 [00:12<00:28,  2.36it/s]

{'loss': 0.6951, 'grad_norm': 0.9352133274078369, 'learning_rate': 3e-05, 'epoch': 0.94}


 42%|████▏     | 40/96 [00:16<00:23,  2.38it/s]

{'loss': 0.69, 'grad_norm': 1.6305299997329712, 'learning_rate': 4e-05, 'epoch': 1.25}


 52%|█████▏    | 50/96 [00:20<00:19,  2.37it/s]

{'loss': 0.6744, 'grad_norm': 1.1356620788574219, 'learning_rate': 5e-05, 'epoch': 1.56}


 62%|██████▎   | 60/96 [00:25<00:15,  2.34it/s]

{'loss': 0.6895, 'grad_norm': 0.8405741453170776, 'learning_rate': 3.91304347826087e-05, 'epoch': 1.88}


 73%|███████▎  | 70/96 [00:29<00:10,  2.42it/s]

{'loss': 0.6863, 'grad_norm': 0.8541117906570435, 'learning_rate': 2.826086956521739e-05, 'epoch': 2.19}


 83%|████████▎ | 80/96 [00:33<00:06,  2.36it/s]

{'loss': 0.6705, 'grad_norm': 0.9103043675422668, 'learning_rate': 1.739130434782609e-05, 'epoch': 2.5}


 94%|█████████▍| 90/96 [00:37<00:02,  2.37it/s]

{'loss': 0.6685, 'grad_norm': 0.9460690021514893, 'learning_rate': 6.521739130434783e-06, 'epoch': 2.81}


100%|██████████| 96/96 [00:39<00:00,  2.41it/s]

{'train_runtime': 39.8307, 'train_samples_per_second': 37.659, 'train_steps_per_second': 2.41, 'train_loss': 0.6835849533478419, 'epoch': 3.0}





TrainOutput(global_step=96, training_loss=0.6835849533478419, metrics={'train_runtime': 39.8307, 'train_samples_per_second': 37.659, 'train_steps_per_second': 2.41, 'train_loss': 0.6835849533478419, 'epoch': 3.0})

In [19]:
inference(model=model_freezed, dataset_encoded=dataset_encoded, batch_size=200, n_samples=1000, device="mps")

Inference on mps...
Batch:  0
Batch:  1
Batch:  2
Batch:  3
Batch:  4
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
[1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0]
0.573


## PEFT training

In [21]:
peft_config = LoraConfig(target_modules = ['word_embeddings', 'q_lin', 'k_lin', 'v_lin', 'out_lin','pre_classifier','classifier'])
model_lora = get_peft_model(model, peft_config)
model_lora.print_trainable_parameters()

trainable params: 563,680 || all params: 67,518,690 || trainable%: 0.8348503207037933


In [22]:
trainer_lora= Trainer(
    model=model_lora,
    args=training_args,
    train_dataset=Dataset.from_dict(dataset_encoded["train"][:n_train_samples]),
    eval_dataset=Dataset.from_dict(dataset_encoded["eval"][:n_eval_samples])
)
trainer_lora.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
                                                 
 44%|████▎     | 164/375 [32:19<03:48,  1.08s/it]

{'loss': 0.0609, 'grad_norm': 1.6377030611038208, 'learning_rate': 1e-05, 'epoch': 0.31}


                                                 
 44%|████▎     | 164/375 [32:28<03:48,  1.08s/it]

{'loss': 0.1007, 'grad_norm': 0.39237526059150696, 'learning_rate': 2e-05, 'epoch': 0.62}


                                                 
 44%|████▎     | 164/375 [32:38<03:48,  1.08s/it]

{'loss': 0.0702, 'grad_norm': 0.587874710559845, 'learning_rate': 3e-05, 'epoch': 0.94}


                                                 
 44%|████▎     | 164/375 [32:48<03:48,  1.08s/it]

{'loss': 0.0378, 'grad_norm': 0.280877947807312, 'learning_rate': 4e-05, 'epoch': 1.25}


                                                 
 44%|████▎     | 164/375 [32:57<03:48,  1.08s/it]

{'loss': 0.0488, 'grad_norm': 0.08956748247146606, 'learning_rate': 5e-05, 'epoch': 1.56}


                                                 
 44%|████▎     | 164/375 [33:07<03:48,  1.08s/it]

{'loss': 0.1093, 'grad_norm': 0.2687084376811981, 'learning_rate': 3.91304347826087e-05, 'epoch': 1.88}


                                                 
 44%|████▎     | 164/375 [33:15<03:48,  1.08s/it]

{'loss': 0.043, 'grad_norm': 0.04600665718317032, 'learning_rate': 2.826086956521739e-05, 'epoch': 2.19}


                                                 
 44%|████▎     | 164/375 [33:25<03:48,  1.08s/it]

{'loss': 0.031, 'grad_norm': 0.0911879688501358, 'learning_rate': 1.739130434782609e-05, 'epoch': 2.5}


                                                 
 44%|████▎     | 164/375 [33:34<03:48,  1.08s/it]

{'loss': 0.0717, 'grad_norm': 0.6064560413360596, 'learning_rate': 6.521739130434783e-06, 'epoch': 2.81}


                                                 
100%|██████████| 96/96 [01:31<00:00,  1.05it/s]t]

{'train_runtime': 91.6019, 'train_samples_per_second': 16.375, 'train_steps_per_second': 1.048, 'train_loss': 0.06575289865334828, 'epoch': 3.0}





TrainOutput(global_step=96, training_loss=0.06575289865334828, metrics={'train_runtime': 91.6019, 'train_samples_per_second': 16.375, 'train_steps_per_second': 1.048, 'train_loss': 0.06575289865334828, 'epoch': 3.0})

In [23]:
inference(model=model_lora, dataset_encoded=dataset_encoded, batch_size=200, n_samples=1000, device="mps")

Inference on mps...
Batch:  0
Batch:  1
Batch:  2
Batch:  3
Batch:  4
[1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0]
[1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0]
0.873


## TODO

1. What do you notice about the accuracy and training time with full fine-tuning, with freezing layers or with LoRa fine-tuning?
2. Test different LoRa parameters. How does the rank (parameter r) affect the number of trainable parameters. Hint: `print_trainable_parameters`?
3. How is the model performance affected when using a lower rank for instance?


### Test lora parameters

In [52]:
peft_config = LoraConfig(target_modules = ['word_embeddings', 'q_lin', 'k_lin', 'v_lin', 'out_lin','pre_classifier','classifier'],
                         r=4,
                         lora_alpha=32,
                         lora_dropout=0.01)

model_lora_2 = get_peft_model(model, peft_config)
model_lora_2.print_trainable_parameters()

trainable params: 281,840 || all params: 67,236,850 || trainable%: 0.41917490185813283


In [53]:
trainer_lora_2= Trainer(
    model=model_lora_2,
    args=training_args,
    train_dataset=Dataset.from_dict(dataset_encoded["train"][:n_train_samples]),
    eval_dataset=Dataset.from_dict(dataset_encoded["test"][:n_eval_samples])
)
trainer_lora_2.train()

 10%|█         | 10/96 [00:10<01:26,  1.00s/it]

{'loss': 0.1733, 'grad_norm': 28.433147430419922, 'learning_rate': 1e-05, 'epoch': 0.31}


 21%|██        | 20/96 [00:20<01:15,  1.00it/s]

{'loss': 0.0792, 'grad_norm': 0.6897999048233032, 'learning_rate': 2e-05, 'epoch': 0.62}


 31%|███▏      | 30/96 [00:30<01:05,  1.01it/s]

{'loss': 0.0535, 'grad_norm': 2.6509721279144287, 'learning_rate': 3e-05, 'epoch': 0.94}


 42%|████▏     | 40/96 [00:39<00:55,  1.01it/s]

{'loss': 0.118, 'grad_norm': 27.801658630371094, 'learning_rate': 4e-05, 'epoch': 1.25}


 52%|█████▏    | 50/96 [00:49<00:46,  1.00s/it]

{'loss': 0.1213, 'grad_norm': 0.2676115930080414, 'learning_rate': 5e-05, 'epoch': 1.56}


 62%|██████▎   | 60/96 [00:59<00:36,  1.00s/it]

{'loss': 0.0539, 'grad_norm': 0.32369083166122437, 'learning_rate': 3.91304347826087e-05, 'epoch': 1.88}


 73%|███████▎  | 70/96 [01:08<00:25,  1.02it/s]

{'loss': 0.0397, 'grad_norm': 2.26847767829895, 'learning_rate': 2.826086956521739e-05, 'epoch': 2.19}


 83%|████████▎ | 80/96 [01:18<00:16,  1.00s/it]

{'loss': 0.038, 'grad_norm': 2.1450226306915283, 'learning_rate': 1.739130434782609e-05, 'epoch': 2.5}


 94%|█████████▍| 90/96 [01:28<00:06,  1.00s/it]

{'loss': 0.1353, 'grad_norm': 8.075260162353516, 'learning_rate': 6.521739130434783e-06, 'epoch': 2.81}


100%|██████████| 96/96 [01:34<00:00,  1.02it/s]

{'train_runtime': 94.0094, 'train_samples_per_second': 15.956, 'train_steps_per_second': 1.021, 'train_loss': 0.08920533799876769, 'epoch': 3.0}





TrainOutput(global_step=96, training_loss=0.08920533799876769, metrics={'train_runtime': 94.0094, 'train_samples_per_second': 15.956, 'train_steps_per_second': 1.021, 'train_loss': 0.08920533799876769, 'epoch': 3.0})

In [55]:
inference(model=model_lora_2, dataset_encoded=test_set_encoded, batch_size=200, n_samples=1000, device="mps")

Inference on mps...
Batch:  0
Batch:  1
Batch:  2
Batch:  3
Batch:  4
[0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Model accuracy is: 0.834
