In [1]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
)

from datasets import load_dataset, load_metric

from torch import nn

### The plan

We won't distill the model any further, since the provided checkpoint is already
a distilled one. As a baseline, we'll prune the model and then quantize it.

In [2]:
model_id = "huawei-noah/TinyBERT_General_6L_768D"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Two-way classification head. The task tag is kept in sync with the GLUE
# subset this notebook actually fine-tunes on (SST-2).
config = AutoConfig.from_pretrained(model_id, num_labels=2, finetuning_task="sst2")
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [3]:
# Pass the config built above so the classification head is sized from
# `num_labels` explicitly instead of relying on the library default.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    config=config,
).to(device)

2023-06-13 02:20:45.895305: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading pytorch_model.bin:   0%|          | 0.00/287M [00:00<?, ?B/s]

Some weights of the model checkpoint at huawei-noah/TinyBERT_General_6L_768D were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'fit_denses.1.bias', 'cls.predictions.transform.LayerNorm.bias', 'fit_denses.4.bias', 'fit_denses.2.bias', 'fit_denses.3.weight', 'fit_denses.0.bias', 'fit_denses.3.bias', 'fit_denses.4.weight', 'fit_denses.6.bias', 'cls.predictions.transform.dense.weight', 'fit_denses.0.weight', 'fit_denses.1.weight', 'fit_denses.5.weight', 'fit_denses.2.weight', 'fit_denses.5.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'fit_denses.6.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT

### Training

We will fine-tune the provided model on the SST-2 task from GLUE.

In [3]:
from transformers import DataCollatorWithPadding


def process_dataset(dataset, max_length=512):
    """Tokenize the ``sentence`` column and expose the model inputs as torch tensors.

    No padding is applied here: batches are padded later by the data collator
    that is handed to the loaders/trainers. (The previous
    ``padding="max_length"`` request had no maximum length to pad to and was
    dropped by the tokenizer with a warning.)

    Args:
        dataset: object offering ``map`` and ``set_format`` with a ``sentence``
            column (here: a GLUE SST-2 split).
        max_length: longest token sequence to keep; longer inputs are truncated.
            NOTE(review): 512 is assumed to be the model's position-embedding
            limit — confirm against ``config.max_position_embeddings``.

    Returns:
        The tokenized dataset, formatted to yield ``input_ids``,
        ``token_type_ids``, ``attention_mask`` and ``label`` as torch tensors.
    """
    dataset = dataset.map(
        lambda x: tokenizer(x["sentence"], truncation=True, max_length=max_length),
        batched=True,
    )

    dataset.set_format(
        type="torch",
        columns=["input_ids", "token_type_ids", "attention_mask", "label"],
    )

    return dataset

# GLUE ships labels for the validation split, so it doubles as the test set here.
raw_train, raw_validation = load_dataset(
    "glue",
    "sst2",
    split=["train", "validation"],
)

# NOTE(review): `load_metric` is deprecated in recent `datasets` releases —
# consider migrating to the `evaluate` package when dependencies are revisited.
metric = load_metric("glue", "sst2")

train_dataset = process_dataset(raw_train)
test_dataset = process_dataset(raw_validation)

2023-06-13 03:10:17.246013: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Found cached dataset glue (/home/lovv66/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/2 [00:00<?, ?it/s]

  metric = load_metric("glue", "sst2")
Loading cached processed dataset at /home/lovv66/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-0f506ed5072f826f.arrow


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.


In [4]:
from torch.utils.data import DataLoader


def get_predictions(model, dataset, tokenizer, device, batch_size=32):
    """Run ``model`` over ``dataset`` and return the predicted class index per example.

    Inference is done in eval mode and without autograd bookkeeping, so
    dropout cannot perturb the predictions and no activation graph is kept in
    memory. The model's previous train/eval mode is restored afterwards.

    Args:
        model: callable module returning a mapping with a ``"logits"`` entry.
        dataset: map-style dataset of tokenized examples.
        tokenizer: tokenizer handed to ``DataCollatorWithPadding`` to pad each batch.
        device: device the batches are moved to before the forward pass.
        batch_size: number of examples per forward pass.

    Returns:
        1-D tensor of predicted label ids, in dataset order (empty for an
        empty dataset).
    """
    loader = DataLoader(
        dataset,
        collate_fn=DataCollatorWithPadding(tokenizer),
        batch_size=batch_size,
    )

    was_training = model.training
    model.eval()
    results = []
    try:
        with torch.no_grad():
            for batch in loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                # Labels are not needed for prediction and may be absent.
                batch.pop("labels", None)
                results.append(model(**batch)["logits"].argmax(dim=1))
    finally:
        model.train(was_training)

    if not results:
        # torch.cat refuses an empty list; return an empty prediction vector.
        return torch.empty(0, dtype=torch.long, device=device)
    return torch.cat(results)

Fine-tuning could be done more cheaply, for example by tuning only the classification head.
But within the scope of this task we'll tune all the weights of the model.

In [6]:
from transformers import Trainer, TrainingArguments


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def compute_metrics(eval_pred):
    """Turn the trainer's (logits, labels) pair into the GLUE SST-2 metric dict."""
    logits, labels = eval_pred
    return metric.compute(predictions=logits.argmax(axis=-1), references=labels)


training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    # Trainer invokes this as a function on its evaluation output; the metric
    # object itself is not callable, so it is wrapped above.
    compute_metrics=compute_metrics,
)

trainer.train()

# Persist the fine-tuned weights; later cells reload the baseline from here.
model.save_pretrained("./models/baseline")

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


In [5]:
# Reload the fine-tuned baseline from disk so the cells below do not depend on
# the training cell having run in this kernel session.
baseline_dir = "models/baseline/"
model = AutoModelForSequenceClassification.from_pretrained(baseline_dir)
model = model.to(device)

2023-06-13 02:17:11.352887: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Validation accuracy of the fine-tuned baseline.
baseline_predictions = get_predictions(model, test_dataset, tokenizer, device)
metric.compute(
    predictions=baseline_predictions,
    references=test_dataset["label"],
)

{'accuracy': 0.893348623853211}

Now we'll be trying to shrink the model while preserving the score above as much
as we can.

I first tried one-shot global unstructured magnitude pruning, but the resulting quality was poor, so I switched
to a gradient-based technique (SNIP). We will also prune the model iteratively, meaning that
training continues between pruning steps.

To make things easier we'll use Hugging Face's Optimum Intel API (`INCTrainer`).

In [11]:
from optimum.intel import INCTrainer
from neural_compressor import WeightPruningConfig
from transformers import TrainingArguments

from neural_compressor.config import AccuracyCriterion, TuningCriterion

def eval_fn(model):
    """Return the accuracy of ``model`` on the notebook's validation split."""
    scores = metric.compute(
        predictions=get_predictions(model, test_dataset, tokenizer, device),
        references=test_dataset["label"],
    )
    return scores["accuracy"]

    
pruning_config = WeightPruningConfig(
    pruning_type="snip_momentum",
    start_step=0,
    end_step=15,
    target_sparsity=0.3,
    pruning_scope="local",
)

# NOTE(review): these two criteria are created but not passed to anything in
# this cell — confirm whether they are still needed.
accuracy_criterion = AccuracyCriterion(tolerable_loss=0.05)
tuning_criterion = TuningCriterion(max_trials=10)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = INCTrainer(
    model=model,
    pruning_config=pruning_config,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # The trainer calls this on its evaluation output; the metric object is
    # not callable by itself, so adapt it to (logits, labels) -> metric dict.
    compute_metrics=lambda eval_pred: metric.compute(
        predictions=eval_pred.predictions.argmax(axis=-1),
        references=eval_pred.label_ids,
    ),
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Run the pruning-aware fine-tuning and persist the result: the next cell
# loads the pruned model from this directory, so it must exist on a fresh run.
trainer.train()
trainer.save_model("./models/pruned")

In [14]:
# Load the pruned checkpoint from disk.
pruned_dir = "models/pruned/"
model = AutoModelForSequenceClassification.from_pretrained(pruned_dir)
model = model.to(device)

In [7]:
# Validation accuracy of the pruned model.
pruned_predictions = get_predictions(model, test_dataset, tokenizer, device)
eval_result = metric.compute(
    predictions=pruned_predictions,
    references=test_dataset["label"],
)

eval_result

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'accuracy': 0.8876146788990825}

Quality didn't degrade too much (accuracy went from about 0.893 to about 0.888). Good.

Now let's quantize the pruned model. We'll be using bitsandbytes' 4-bit
quantization API.

In [10]:
import bitsandbytes as bnb


def convert_linear(linear, device):
    """Build a ``bnb.nn.Linear4bit`` counterpart of ``linear`` and move it to ``device``.

    The new layer has the same in/out feature counts and a bias only if
    ``linear`` has one. The source layer's state dict is loaded into it before
    it is sent to the GPU with ``.cuda(device)``.

    NOTE(review): per the bitsandbytes docs the actual 4-bit packing is
    triggered by the move to the GPU — confirm for the installed version.
    """
    has_bias = linear.bias is not None
    layer_4bit = bnb.nn.Linear4bit(
        linear.in_features,
        linear.out_features,
        bias=has_bias,
    )
    layer_4bit.load_state_dict(linear.state_dict())
    layer_4bit = layer_4bit.cuda(device)
    return layer_4bit


def convert_to_int4_(model, device):
    """Replace, in place, every ``nn.Linear`` child in ``model`` with a 4-bit copy.

    Children that are already ``bnb.nn.Linear4bit`` are left alone. That class
    derives from ``nn.Linear``, so without this check a second call (for
    example re-running the cell) would try to convert quantized layers again.

    Args:
        model: module whose submodules are rewritten in place.
        device: CUDA device forwarded to ``convert_linear``.
    """
    for module in list(model.modules()):
        # Snapshot the children: attributes of `module` are rebound below.
        for name, child in list(module.named_children()):
            if not isinstance(child, nn.Linear):
                continue
            if isinstance(child, bnb.nn.Linear4bit):
                continue
            setattr(module, name, convert_linear(child, device))

# Quantize the pruned model in place: its nn.Linear children are swapped for
# 4-bit layers placed on `device`.
convert_to_int4_(model, device)


In [9]:
# Validation accuracy of the pruned and quantized model.
quantized_predictions = get_predictions(model, test_dataset, tokenizer, device)
eval_result = metric.compute(
    predictions=quantized_predictions,
    references=test_dataset["label"],
)
eval_result

{'accuracy': 0.8841743119266054}

In [118]:
# Persist the pruned + quantized model.
# NOTE(review): the 4-bit layers were swapped in by hand rather than through
# `from_pretrained` — confirm that a checkpoint written this way can be
# reloaded with its quantized weights intact.
model.save_pretrained("./models/pruned+quantized")