**Import**

In [1]:
!pip install transformers datasets
!pip install evaluate

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [2]:
import datasets
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoConfig

from peft import LoraConfig, get_peft_model, PeftModel,PeftConfig
import torch
import evaluate
import numpy as np

**Base model**

model: Tiny LLAMA

parameters: 1.1B

size: 22 layers ,32 heads

In [3]:
model_checkpoint="PY007/TinyLlama-1.1B-step-50K-105b"
#defining labels
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

#creating classification model
model=AutoModelForSequenceClassification.from_pretrained(model_checkpoint,num_labels=2,id2label=id2label,label2id=label2id)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at PY007/TinyLlama-1.1B-step-50K-105b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Data set**

In [4]:
dataset=load_dataset("shawhin/imdb-truncated")
dataset

README.md:   0%|          | 0.00/592 [00:00<?, ?B/s]

(…)-00000-of-00001-5a744bf76a1d84b2.parquet:   0%|          | 0.00/836k [00:00<?, ?B/s]

(…)-00000-of-00001-a3a52fabb70c739f.parquet:   0%|          | 0.00/853k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

**Data preprocessing**

In [5]:
#tokenization
tokenizer=AutoTokenizer.from_pretrained(model_checkpoint,add_prefix_space=True)
# Add padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Ensure the model is aware of the padding token id
if tokenizer.pad_token is not None:
    model.config.pad_token_id = tokenizer.pad_token_id
#tokenize function
def tok_func(examples):
     text=examples["text"]
     tokenizer.truncation_side="left"
     tokenized_inputs=tokenizer(text,padding="max_length",truncation=True,max_length=512)
     return tokenized_inputs
#adding pad tokens if req
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

#tokenizing dataset
tokenized_dataset=dataset.map(tok_func,batched=True)
tokenized_dataset

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [6]:
data_collator=DataCollatorWithPadding(tokenizer=tokenizer) #pads the whole batch sequence acc to largest seq present in batch

**Evaluation metrics**

In [7]:
accuracy=evaluate.load("accuracy")

#passing this func in trainer
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy":accuracy.compute(predictions=predictions, references=labels)}


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

**untrained model performance**

In [8]:
text_list=["It was good","not a fan, Don't recommend.","Better than the first one.","this is not woth watch not even once.","this one is a pass","The performances were outstanding, and I enjoyed every minute.","The pacing was slow, making it a struggle to sit through."]
print("untrained model predictions:")
for text in text_list:
    inputs=tokenizer.encode(text,return_tensors="pt")
    logits=model(inputs).logits
    predictions=torch.argmax(logits)
    print(text + " - " + id2label[predictions.tolist()])

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


untrained model predictions:
It was good - POSITIVE
not a fan, Don't recommend. - POSITIVE
Better than the first one. - POSITIVE
this is not woth watch not even once. - POSITIVE
this one is a pass - POSITIVE
The performances were outstanding, and I enjoyed every minute. - POSITIVE
The pacing was slow, making it a struggle to sit through. - POSITIVE


**fine tuning with loRA**

In [9]:
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
   
    target_modules=["q_proj", "k_proj", "v_proj"], 
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 770,048 || all params: 1,035,286,528 || trainable%: 0.0744


In [10]:
#hyperparameters
lr=1e-3
batch_size=4
num_epochs=10
#setting training arguments
training_args=TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    gradient_accumulation_steps=4,
    fp16=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [11]:
#creating trainer object
trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

#training
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.163084,{'accuracy': 0.944}
2,No log,0.359696,{'accuracy': 0.939}
4,No log,0.357875,{'accuracy': 0.937}
6,No log,0.405427,{'accuracy': 0.94}
8,0.062100,0.40699,{'accuracy': 0.94}
9,0.062100,0.407275,{'accuracy': 0.94}


Trainer is attempting to log a value of "{'accuracy': 0.944}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.916}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.939}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.942}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.937}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This i

TrainOutput(global_step=620, training_loss=0.05008520047751165, metrics={'train_runtime': 2510.8879, 'train_samples_per_second': 3.983, 'train_steps_per_second': 0.247, 'total_flos': 2.955241033039872e+16, 'train_loss': 0.05008520047751165, 'epoch': 9.92})

**trained model performance**

In [16]:
import torch

print("trained model predictions:")
for text in text_list:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inputs = tokenizer.encode(text, return_tensors="pt").to(device)
    logits = model(inputs).logits
    predictions = torch.argmax(logits)

    # predictions is already an integer
    print(text + " - " + id2label[predictions.item()])

trained model predictions:
It was good - POSITIVE
not a fan, Don't recommend. - NEGATIVE
Better than the first one. - POSITIVE
this is not woth watch not even once. - NEGATIVE
this one is a pass - POSITIVE
The performances were outstanding, and I enjoyed every minute. - POSITIVE
The pacing was slow, making it a struggle to sit through. - NEGATIVE
