In [28]:
!pip install datasets transformers peft evaluate torch numpy wandb -qqq 




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
import transformers

from transformers import (
      AutoTokenizer,
      AutoConfig,
      AutoModelForSequenceClassification,
      DataCollatorWithPadding,
      TrainingArguments,
      Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [11]:
model_id = "albert/albert-xxlarge-v2"

In [12]:
id2label = {0: 'Negative', 1:'Neutral', 2:'Positive'}
label2id = {'Negative':0, 'Neutral':1, 'Positive':2}

#generate classification model from model_checkpoints
model = AutoModelForSequenceClassification.from_pretrained(
           model_id, num_labels = 3, id2label = id2label, label2id = label2id)

config.json:   0%|          | 0.00/710 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/893M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-xxlarge-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
dataset_1 = load_dataset("mteb/tweet_sentiment_extraction")
dataset_1

Downloading readme:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.63M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/465k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/27481 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3534 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 27481
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 3534
    })
})

In [14]:
dataset_2 = dataset_1['train']
dataset_3 = dataset_1['test']

dataset = concatenate_datasets([dataset_2, dataset_3])

dataset = dataset.shuffle()

In [15]:
dataset

Dataset({
    features: ['id', 'text', 'label', 'label_text'],
    num_rows: 31015
})

In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

In [17]:
print("Untrained model predictions:")
print("----------------------------")
x =0
while x <=30:
    # tokenize text
    inputs = tokenizer.encode(dataset['text'][x], return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)
    print( (dataset['label'][x]),  id2label[predictions.tolist()])
    x = x+1

Untrained model predictions:
----------------------------
1 Neutral
2 Neutral
0 Neutral
1 Neutral
0 Neutral
2 Positive
0 Negative
0 Neutral
2 Neutral
0 Positive
1 Neutral
2 Neutral
2 Positive
1 Neutral
2 Neutral
1 Negative
1 Neutral
1 Neutral
0 Neutral
1 Neutral
0 Neutral
2 Neutral
0 Negative
1 Neutral
1 Neutral
1 Neutral
2 Neutral
0 Neutral
0 Neutral
2 Neutral
1 Negative


In [18]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [19]:
# apply it to all texts in the dataset
dataset = dataset.map(tokenize_function, batched=True)
dataset

Map:   0%|          | 0/31015 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 31015
})

In [20]:
dataset = dataset.train_test_split(test_size=0.1, seed=1234)

In [21]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27913
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3102
    })
})

In [22]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [23]:
peft_config = LoraConfig(task_type="SEQ_CLS", # Sequence Classification.
                        r=64,  # Intrinsic rank of trainable weight matrix.
                        lora_alpha=64,  # similar to Learning rate.
                        lora_dropout=0.01, # probability of dropout nodes.
                        target_modules = ["query","key","value","dense"]) # LoRA is applied to the query layer.

In [24]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 2,109,443 || all params: 224,717,318 || trainable%: 0.9387


In [25]:
args = TrainingArguments(
        output_dir="./albert-xxlarge-v2-Adapters",
        evaluation_strategy="steps",
        do_eval=True,
        optim="adamw_torch",
        per_device_train_batch_size=16,
        gradient_accumulation_steps=4,
        per_device_eval_batch_size=16,
        log_level="debug",
        save_strategy="epoch",
        logging_steps=10,
        learning_rate=5e-5,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        eval_steps=50,
        num_train_epochs=3,
        warmup_ratio=0.1,
        lr_scheduler_type="inverse_sqrt",
        report_to="wandb",
        seed=42,
)



In [29]:


trainer = Trainer(
        model=model,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'],
        tokenizer=tokenizer,
        args=args,
        data_collator=data_collator

)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Using auto half precision backend


In [30]:
trainer.train()

Currently training with a batch size of: 16
The following columns in the training set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: label_text, text, id. If label_text, text, id are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27,913
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 1,308
  Number of trainable parameters = 2,109,443
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your pro

 ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/user/.netrc


Step,Training Loss,Validation Loss
50,1.121,1.099198
100,1.1183,1.091835
150,1.1155,1.089858
200,1.0405,0.974963
250,0.9,0.900837
300,0.8362,0.851133
350,0.7714,0.803935
400,0.7385,0.761706
450,0.7422,0.726467
500,0.6513,0.715767


The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: label_text, text, id. If label_text, text, id are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 3102
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: label_text, text, id. If label_text, text, id are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 3102
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: label_text, text, id. If label_text, text, id are not expected by `PeftModelForSequenceClassification.for

TrainOutput(global_step=1308, training_loss=0.721492462566504, metrics={'train_runtime': 1729.2572, 'train_samples_per_second': 48.425, 'train_steps_per_second': 0.756, 'total_flos': 4395507799907952.0, 'train_loss': 0.721492462566504, 'epoch': 2.9982808022922636})

In [32]:
trainer.push_to_hub( )

Saving model checkpoint to ./albert-xxlarge-v2-Adapters
loading configuration file config.json from cache at /home/user/.cache/huggingface/hub/models--albert--albert-xxlarge-v2/snapshots/97d3e58863d3a41dc581882f73b34d110b18f1f8/config.json
Model config AlbertConfig {
  "architectures": [
    "AlbertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 16384,
  "layer_norm_eps": 1e-12,
  "layers_to_keep": [],
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 64,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.42

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/8.44M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/dhanishetty/albert-xxlarge-v2-Adapters/commit/d90aa204a5b4899d10d76470a27ace570bddd2fc', commit_message='End of training', commit_description='', oid='d90aa204a5b4899d10d76470a27ace570bddd2fc', pr_url=None, pr_revision=None, pr_num=None)