### Shrinking distiliBERT in 3 ways

In [1]:
pip install transformers



In [2]:
pip install neural_compressor



In [3]:
pip install optimum



In [4]:
! pip install datasets transformers optimum[intel]



In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
from transformers import DistilBertConfig

checkpoint_dir = '/content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/distilibert_full/20230628-11-22-26/checkpoint-21000'

# Load the model configuration
config = DistilBertConfig.from_pretrained(checkpoint_dir)

# Print the model configuration
#print(config)


In [7]:
from transformers import DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer
import torch
from torch.quantization import quantize_dynamic

# Load the model from a directory
model_path = "/content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/distilibert_full/20230628-11-22-26/checkpoint-21000"
config = DistilBertConfig.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path, config=config)


#### Pruning the Model

In [40]:
# Method 1: Pruning
from transformers import DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer
import torch
import torch.nn.utils.prune as prune


# Prune the model
parameters_to_prune = []
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):
        parameters_to_prune.append((module, 'weight'))

for layer, parameter_name in parameters_to_prune:
    prune.l1_unstructured(layer, name=parameter_name, amount=0.0001)


# Save the pruned model
pruned_model_directory = "/content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/pruned_model"
model.save_pretrained(pruned_model_directory)


#### Quantized Model

In [9]:
!pip install onnx



In [10]:
from functools import partial
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from neural_compressor.config import PostTrainingQuantConfig
from optimum.intel import INCQuantizer

model_name = "distilbert-base-uncased"
checkpoint_path = "/content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/distilibert_full/20230628-11-22-26/checkpoint-21000"

model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The directory where the quantized model will be saved
save_dir = "/content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/static_quantization"

def preprocess_function(examples, tokenizer):
    return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True)

# Load the quantization configuration detailing the quantization we wish to apply
quantization_config = PostTrainingQuantConfig(approach="static")
quantizer = INCQuantizer.from_pretrained(model)
# Generate the calibration dataset needed for the calibration step
calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=partial(preprocess_function, tokenizer=tokenizer),
    num_samples=100,
    dataset_split="train",
)
quantizer = INCQuantizer.from_pretrained(model)
# Apply static quantization and save the resulting model
quantizer.quantize(
    quantization_config=quantization_config,
    calibration_dataset=calibration_dataset,
    save_directory=save_dir,
)

The task could not be automatically inferred and will be set to default. Please provide the task argument with the relevant task from conversational, feature-extraction, fill-mask, text-generation, text2text-generation, text-classification, token-classification, multiple-choice, object-detection, question-answering, image-classification, image-segmentation, masked-im, semantic-segmentation, automatic-speech-recognition, audio-classification, audio-frame-classification, audio-xvector, image-to-text, stable-diffusion, zero-shot-image-classification, zero-shot-object-detection. Detailed error: Cannot infer the task from a local directory yet, please specify the task manually.
2023-06-28 19:33:33 [INFO] Start auto tuning.
2023-06-28 19:33:33 [INFO] Execute the tuning process due to detect the evaluation function.
2023-06-28 19:33:33 [INFO] Adaptor has 4 recipes.
2023-06-28 19:33:33 [INFO] 0 recipes specified by user.
2023-06-28 19:33:33 [INFO] 3 recipes require future tuning.
2023-06-28 19

#### Knowledge Distiliation

In [11]:
import pandas as pd

data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/processed_data.csv')
data.dropna(inplace = True)

# 1) Choose 30% of the data (balanced by categories)
undersampled_data = data.groupby('headline_main_category').apply(lambda x: x.sample(frac=0.0005, random_state=42)).reset_index(drop=True)


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1) Split the data into train and test (balanced by categories)
train_data, test_data = train_test_split(undersampled_data, test_size=0.2, stratify=undersampled_data['headline_main_category'], random_state=42)

# 2) Encode the 'headline_main_category' column as integer labels
label_encoder = LabelEncoder()
train_data['encoded_category'] = label_encoder.fit_transform(train_data['headline_main_category'])
test_data['encoded_category'] = label_encoder.transform(test_data['headline_main_category'])

from transformers import DistilBertTokenizer, AlbertTokenizer

# Initialize the tokenizers
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# 4) Tokenize the 'headline_text_processed' column
train_distilbert_tokenized = train_data['headline_text_processed'].apply(distilbert_tokenizer.encode)
test_distilbert_tokenized = test_data['headline_text_processed'].apply(distilbert_tokenizer.encode)


from tensorflow.keras.preprocessing.sequence import pad_sequences

# Get the maximum length of tokenized inputs for DistilBERT
max_length_distilbert = max(len(tokens) for tokens in train_distilbert_tokenized)
max_length_distilbert += 2  # Add extra buffer

# Pad the tokenized inputs
train_distilbert_padded = pad_sequences(train_distilbert_tokenized, maxlen=max_length_distilbert, padding='post', truncating='post')
test_distilbert_padded = pad_sequences(test_distilbert_tokenized, maxlen=max_length_distilbert, padding='post', truncating='post')

from torch.utils.data import DataLoader, Dataset

# Define a custom dataset class for PyTorch
class Dataset(Dataset):
    def __init__(self, tokenized_inputs, labels):
      self.tokenized_inputs = tokenized_inputs  # Tokenized and padded input sequences
      self.labels = labels  # Encoded labels (categories)

    def __len__(self):
      return len(self.labels)  # Return the number of samples

    def __getitem__(self, idx):
      return {'input_ids': torch.tensor(self.tokenized_inputs[idx], dtype=torch.long),  # Convert input sequence to tensor
              'labels': torch.tensor(self.labels[idx], dtype=torch.long),  # Convert label to tensor
              }

distilibert_train_dataset = Dataset(train_distilbert_padded, train_data['encoded_category'].values)
distilibert_test_dataset = Dataset(test_distilbert_padded, test_data['encoded_category'].values)

In [14]:
from transformers import AutoModelForSequenceClassification, TrainingArguments
from optimum.intel import INCTrainer#, DistillationConfig
from neural_compressor import DistillationConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments
import datetime
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, Dataset
from transformers import DefaultFlowCallback
from transformers import EarlyStoppingCallback
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

teacher_model_path = 'distilbert-base-uncased'
checkpoint_path = "/content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/distilibert_full/20230628-11-22-26/checkpoint-21000"  # Path to the student model checkpoint
num_classes = len(train_data['encoded_category'].unique())


# Load the teacher model
teacher_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = num_classes)

# Load the student model
student_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

# Define the distillation configuration with the teacher model
distillation_config = DistillationConfig(teacher_model=teacher_model)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/knowledge_distillation",  # Directory to save the model and logs
    num_train_epochs=1.0,
    do_train=True,
    do_eval=False,
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


# Define the INCTrainer with distillation
trainer = INCTrainer(
    model=student_model,
    distillation_config=distillation_config,
    args=training_args,
    train_dataset=distilibert_train_dataset,  # Replace with your training dataset
    eval_dataset=distilibert_test_dataset,  # Replace with your evaluation dataset
    compute_metrics=compute_metrics,  # Replace with your metric function
)

# Train the student model with knowledge distillation
train_result = trainer.train()
metrics = trainer.evaluate()
trainer.save_model()

# Load the trained student model
#model = AutoModelForSequenceClassification.from_pretrained("/path/to/save_dir")


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


2023-06-28 19:34:42 [INFO] Training finished!


Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/knowledge_distillation
Configuration saved in /content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/knowledge_distillation/inc_config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/knowledge_distillation/pytorch_model.bin


### Evaluating the 3 compressed Models

In [22]:
import torch

# Check if a GPU is available, and if not, use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Alternatively, you can specify a specific GPU by its index
# device = torch.device('cuda:0')  # Use the first GPU

# Print the device
print(f"Device: {device}")

Device: cuda


In [19]:
import pandas as pd

data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/processed_data.csv')
data.dropna(inplace = True)

# 1) Choose 30% of the data (balanced by categories)
test_data = data.groupby('headline_main_category').apply(lambda x: x.sample(frac=0.0005, random_state=103)).reset_index(drop=True)


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 2) Encode the 'headline_main_category' column as integer labels
label_encoder = LabelEncoder()
test_data['encoded_category'] = label_encoder.fit_transform(test_data['headline_main_category'])

from transformers import DistilBertTokenizer, AlbertTokenizer

# Initialize the tokenizers
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# 4) Tokenize the 'headline_text_processed' column
test_distilbert_tokenized = test_data['headline_text_processed'].apply(distilbert_tokenizer.encode)


from tensorflow.keras.preprocessing.sequence import pad_sequences

# Get the maximum length of tokenized inputs for DistilBERT
max_length_distilbert = max(len(tokens) for tokens in train_distilbert_tokenized)
max_length_distilbert += 2  # Add extra buffer

# Pad the tokenized inputs
eval_distilbert_padded = pad_sequences(test_distilbert_tokenized, maxlen=max_length_distilbert, padding='post', truncating='post')


In [41]:
import torch

# Load the compressed models
model1 = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/pruned_model")
model2 = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/static_quantization")
model3 = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/knowledge_distillation")

# Move the model to the device
model1 = model1.to(device)
model2 = model2.to(device)
model3 = model3.to(device)

# Convert eval_distilbert_padded to a PyTorch tensor
eval_distilbert_padded = torch.tensor(eval_distilbert_padded, dtype=torch.long).to(device)
test_data_labels = test_data['encoded_category'].values
eval_dataset = Dataset(eval_distilbert_padded, test_data_labels)

# Define a dataloader for batch processing
eval_dataloader = DataLoader(eval_dataset, batch_size=16)

# Define a function to calculate evaluation metrics
def calculate_metrics(pred_labels, true_labels):
    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, average='weighted')
    recall = recall_score(true_labels, pred_labels, average='weighted')
    f1 = f1_score(true_labels, pred_labels, average='weighted')
    return accuracy, precision, recall, f1

# Evaluate model 1
model1.eval()
model1_predictions = []
true_labels = []
for batch in eval_dataloader:
    input_ids = batch['input_ids'].to(device)
    with torch.no_grad():
        logits = model1(input_ids)[0]
        predictions = logits.argmax(dim=1)
    model1_predictions.extend(predictions.tolist())
    true_labels.extend(batch['labels'].tolist())

model1_accuracy, model1_precision, model1_recall, model1_f1 = calculate_metrics(model1_predictions, true_labels)

# Evaluate model 2
model2.eval()
model2_predictions = []
true_labels = []
for batch in eval_dataloader:
    input_ids = batch['input_ids'].to(device)
    with torch.no_grad():
        logits = model2(input_ids)[0]
        predictions = logits.argmax(dim=1)
    model2_predictions.extend(predictions.tolist())
    true_labels.extend(batch['labels'].tolist())

model2_accuracy, model2_precision, model2_recall, model2_f1 = calculate_metrics(model2_predictions, true_labels)


# Evaluate model 3
model3.eval()
model3_predictions = []
true_labels = []
for batch in eval_dataloader:
    input_ids = batch['input_ids'].to(device)
    with torch.no_grad():
        logits = model3(input_ids)[0]
        predictions = logits.argmax(dim=1)
    model3_predictions.extend(predictions.tolist())
    true_labels.extend(batch['labels'].tolist())

model3_accuracy, model3_precision, model3_recall, model3_f1 = calculate_metrics(model3_predictions, true_labels)


Some weights of the model checkpoint at /content/drive/MyDrive/Colab Notebooks/advanced_deep_learning_group12/pruned_model were not used when initializing DistilBertForSequenceClassification: ['distilbert.transformer.layer.2.ffn.lin2.weight_orig', 'distilbert.transformer.layer.4.ffn.lin2.weight_mask', 'distilbert.transformer.layer.3.attention.k_lin.weight_orig', 'distilbert.transformer.layer.3.attention.v_lin.weight_orig', 'distilbert.transformer.layer.0.attention.q_lin.weight_orig', 'distilbert.transformer.layer.1.attention.out_lin.weight_mask', 'distilbert.transformer.layer.2.attention.v_lin.weight_mask', 'distilbert.transformer.layer.1.attention.q_lin.weight_orig', 'distilbert.transformer.layer.0.ffn.lin2.weight_mask', 'pre_classifier.weight_orig', 'distilbert.transformer.layer.3.attention.v_lin.weight_mask', 'distilbert.transformer.layer.0.ffn.lin2.weight_orig', 'distilbert.transformer.layer.4.attention.out_lin.weight_orig', 'distilbert.transformer.layer.2.ffn.lin1.weight_mask', 'd

In [42]:
from tabulate import tabulate

# Calculate metrics for model 1
model1_accuracy, model1_precision, model1_recall, model1_f1 = calculate_metrics(model1_predictions, true_labels)

# Calculate metrics for model 2
model2_accuracy, model2_precision, model2_recall, model2_f1 = calculate_metrics(model2_predictions, true_labels)

# Calculate metrics for model 3
model3_accuracy, model3_precision, model3_recall, model3_f1 = calculate_metrics(model3_predictions, true_labels)

# Create a table to display the metrics
table = [["Model Compression", "Accuracy", "Precision", "Recall", "F1 Score"],
         ["Model 1 - Pruning", model1_accuracy, model1_precision, model1_recall, model1_f1],
         ["Model 2 - Quantized Model", model2_accuracy, model2_precision, model2_recall, model2_f1],
         ["Model 3 - Knowledge Distiliation", model3_accuracy, model3_precision, model3_recall, model3_f1]]

# Print the table
print(tabulate(table, headers="firstrow"))


Model Compression                   Accuracy    Precision     Recall    F1 Score
--------------------------------  ----------  -----------  ---------  ----------
Model 1 - Pruning                  0.0670807    0.251578   0.0670807  0.0191864
Model 2 - Quantized Model          0.0608696    0.0037051  0.0608696  0.00698503
Model 3 - Knowledge Distiliation   0.768944     0.772281   0.768944   0.766613


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
