In [1]:
import pandas as pd
from datasets import Dataset
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from torch.nn import KLDivLoss, CrossEntropyLoss
from torch.optim import Adam
import torch

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the teacher classifier and the tokenizer.
# The teacher only supplies soft targets for distillation, so it is put in
# inference mode and its parameters are frozen: otherwise every backward pass
# of the distillation loss would also backpropagate through the teacher and
# fill its .grad buffers, even though nothing ever updates the teacher.
teacher_model = BertForSequenceClassification.from_pretrained('fineBERT').to(device)
teacher_model.eval()
teacher_model.requires_grad_(False)
# NOTE(review): this slow tokenizer is not used by the visible code (the dataset
# is tokenized with a separately loaded BertTokenizerFast) -- confirm whether
# another cell still needs it.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the student (TinyBERT) model that will be trained
student_model = BertForSequenceClassification.from_pretrained('tinybert-gkd').to(device)

# Loss functions used by the custom trainer's compute_loss
distill_loss_function = KLDivLoss(reduction='batchmean').to(device)
mlm_loss_function = CrossEntropyLoss(ignore_index=-1).to(device)
task_loss_function = CrossEntropyLoss().to(device)
# NOTE(review): this optimizer is never passed to the Trainer in the visible
# code, so the lr=1e-4 setting here has no effect on training -- confirm intent.
optimizer = Adam(student_model.parameters(), lr=1e-4)

# Read the tab-separated training file; it has no header row, so the two
# columns are named explicitly as label and text
df = pd.read_csv(
    'augmented_data.txt',
    sep='\t',
    header=None,
    names=['label', 'text'],
)

# Wrap the DataFrame in a datasets.Dataset
dataset = Dataset.from_pandas(df)

from transformers import BertTokenizerFast

# Fast tokenizer used for the dataset-wide tokenization pass
tokeniz = BertTokenizerFast.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch.

    Every sequence is padded to the tokenizer's maximum length and truncated
    if it is longer.
    """
    texts = examples["text"]
    return tokeniz(texts, padding="max_length", truncation=True)


# Apply the tokenizer to the whole dataset in batches
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Hyperparameters and output locations handed to the Trainer
training_args = TrainingArguments(
    # Where things are written
    output_dir='./general/results',   # output directory
    logging_dir='./general/logs',     # directory for storing logs
    # Schedule
    num_train_epochs=3,               # total number of training epochs
    warmup_steps=1000,                # warmup steps for the learning rate scheduler
    # Batch sizes
    per_device_train_batch_size=40,   # per-device batch size during training
    per_device_eval_batch_size=64,    # per-device batch size for evaluation
    # Regularisation
    weight_decay=0.01,                # strength of weight decay
)

# Temperature that student and teacher logits are divided by before the
# softmax in the distillation loss
temperature = 2.0

class CustomTrainer(Trainer):
    """Trainer whose loss is a sum of selectable terms for distillation.

    The terms are chosen by ``loss_type`` and rely on module-level objects:
    ``teacher_model``, ``temperature``, ``distill_loss_function``,
    ``mlm_loss_function`` and ``task_loss_function``.

    Args:
        loss_type: Container (or string) tested with ``in`` for the names
            ``'distill'``, ``'mlm'`` and ``'task'``; each name present adds
            the corresponding term to the loss.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, loss_type, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_type = loss_type

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the combined loss for one batch.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch mapping. ``"labels"`` is popped from it (the mapping
                is mutated); the remaining entries are passed as keyword
                arguments to both the student and the teacher.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just ``loss``.
            num_items_in_batch: Accepted because newer ``transformers``
                releases pass it as a keyword argument; it is not used here.

        Returns:
            The summed loss, or ``(loss, outputs)`` if ``return_outputs``.

        Raises:
            ValueError: If ``loss_type`` selects none of the known terms.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        flat_logits = logits.view(-1, logits.shape[-1])
        flat_labels = labels.view(-1)
        terms = []

        # Distillation loss: KL divergence between temperature-softened
        # student and teacher distributions.
        if 'distill' in self.loss_type:
            # The teacher only provides targets, so no autograd graph is
            # needed for its forward pass.
            with torch.no_grad():
                teacher_logits = teacher_model(**inputs).logits
            # NOTE(review): this term is not rescaled by temperature ** 2, as is
            # commonly done for KD; left as-is so loss values stay comparable.
            terms.append(
                distill_loss_function(
                    torch.log_softmax(logits / temperature, dim=-1),
                    torch.softmax(teacher_logits / temperature, dim=-1),
                )
            )

        # "MLM" loss: applied to the same classification logits and labels as
        # the task loss below (with mlm_loss_function instead).
        if 'mlm' in self.loss_type:
            terms.append(mlm_loss_function(flat_logits, flat_labels))

        # Task loss: cross-entropy between student logits and labels.
        if 'task' in self.loss_type:
            terms.append(task_loss_function(flat_logits, flat_labels))

        if not terms:
            # Without any term the loss would be the plain int 0, which cannot
            # be backpropagated; fail with a clear message instead.
            raise ValueError(
                f"loss_type {self.loss_type!r} selects none of 'distill', 'mlm', 'task'"
            )

        loss = sum(terms)
        return (loss, outputs) if return_outputs else loss

# Build the custom trainer around the student model; the loss is the sum of
# the distillation and task terms
trainer = CustomTrainer(
    loss_type=['distill', 'task'],
    model=student_model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Run training (left as the last expression so the notebook shows its result)
trainer.train()

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 149985/149985 [00:21<00:00, 6987.65 examples/s]


Step,Training Loss
500,0.8019
1000,0.6773
1500,0.6433
2000,0.6224
2500,0.604
3000,0.6039
3500,0.5926
4000,0.5456
4500,0.5158
5000,0.5208


TrainOutput(global_step=11250, training_loss=0.5342082777235243, metrics={'train_runtime': 11525.3458, 'train_samples_per_second': 39.04, 'train_steps_per_second': 0.976, 'total_flos': 6451903045969920.0, 'train_loss': 0.5342082777235243, 'epoch': 3.0})

In [2]:
# Save the trainer's model to the 'tinybert-tkd' directory
trainer.save_model(output_dir='tinybert-tkd')