In [119]:
import torch

In [120]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"GPU is available. Using {torch.cuda.get_device_name(0)}")
else:
    device = torch.device('cpu')
    print("GPU is not available. Using CPU.")

GPU is available. Using Tesla T4


# Load Tokenizer

In [121]:
tokenizer_path = '/kaggle/input/gsg-fine-tuned-epoch-20/transformers/default/1'

In [122]:
from transformers import AutoTokenizer

In [123]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Preprocess Dataset

In [124]:
from datasets import load_dataset, DatasetDict

In [125]:
data_path = '/kaggle/input/merged-clean/merged_clean_data.csv'

In [126]:
dataset = load_dataset('csv', data_files=data_path, split="train")

In [127]:
split_dataset = dataset.train_test_split(test_size=0.2)

In [128]:
test_validation_split = split_dataset['test'].train_test_split(test_size=0.5)

In [129]:
dataset_dict = DatasetDict({
    'train': split_dataset['train'],
    'validation': test_validation_split['train'],
    'test': test_validation_split['test']
})

In [130]:
print(dataset_dict["train"].shape[0])
print(dataset_dict["validation"].shape[0])
print(dataset_dict["test"].shape[0])

100229
12529
12529


## For Student Training

In [131]:
train_dataset = split_dataset['train']

In [132]:
train_dataset = train_dataset.select_columns(
    [
        "Findings",
        "Impression"
    ]
)

In [133]:
train_dataset = train_dataset.filter(
    lambda x: x["Findings"] is not None and x["Impression"] is not None
)

Filter:   0%|          | 0/100229 [00:00<?, ? examples/s]

In [134]:
TEST_DATA = """There is no focal consolidation, pleural effusion or pneumothorax.  Bilateral
 nodular opacities that most likely represent nipple shadows. The
 cardiomediastinal silhouette is normal.  Clips project over the left lung,
 potentially within the breast. The imaged upper abdomen is unremarkable.
 Chronic deformity of the posterior left sixth and seventh ribs are noted."""

In [135]:
def test_model(prefix, model):
    model.eval()

    input_ids = tokenizer.encode(
        TEST_DATA,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
        add_special_tokens=False,
    ).to(device)

    generated_ids = model.generate(input_ids, max_length=120)[0]
    print(
        prefix
        + " "
        + tokenizer.decode(
            generated_ids,
            skip_special_tokens=True,
            remove_invalid_values=True,
        )
    )


In [136]:
def tokenize_function(example):
    inputs = tokenizer(
        example["Findings"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    targets = tokenizer(
        example["Impression"],
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )

    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "labels": targets.input_ids,
    }

In [137]:
train_dataset = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/100229 [00:00<?, ? examples/s]

In [138]:
train_dataset.column_names

['Findings', 'Impression', 'input_ids', 'attention_mask', 'labels']

In [139]:
train_dataset = train_dataset.remove_columns(
    ["Findings", "Impression"]
)

In [140]:
train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)

# Set config for student

## Get config of teacher

In [141]:
teacher_model_path = '/kaggle/input/gsg-fine-tuned-epoch-20/transformers/default/1'

In [142]:
from transformers import AutoModelForSeq2SeqLM

In [143]:
teacher_model = AutoModelForSeq2SeqLM.from_pretrained(teacher_model_path)

In [144]:
teacher_model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [145]:
teacher_config = teacher_model.config

In [146]:
teacher_config

T5Config {
  "_name_or_path": "/kaggle/input/gsg-fine-tuned-epoch-20/transformers/default/1",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "e

In [147]:
student_config = teacher_config

| Scaling Factor | `num_layers` | `d_model` | `d_ff` | `num_heads` |
|----------------|--------------|-----------|--------|-------------|
| **1/8**        | 4            | 181       | 256    | 6           |
| **1/16**       | 3            | 128       | 128    | 4           |
| **1/32**       | 2            | 90        | 64     | 2           |


In [148]:
# Scale 1/32
student_config.num_layers = 3 # encoder layers
student_config.d_model = 128 # dimensionality of the hidden layers
student_config.d_ff = 128 # feed forward dimension
student_config.num_heads = 4 # num attention head

In [149]:
student_config

T5Config {
  "_name_or_path": "/kaggle/input/gsg-fine-tuned-epoch-20/transformers/default/1",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 64,
  "d_kv": 64,
  "d_model": 90,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 2,
  "num_layers": 2,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "earl

# Load Student

In [150]:
student_model = AutoModelForSeq2SeqLM.from_config(student_config)

In [151]:
student_model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 90)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 90)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=90, out_features=128, bias=False)
              (k): Linear(in_features=90, out_features=128, bias=False)
              (v): Linear(in_features=90, out_features=128, bias=False)
              (o): Linear(in_features=128, out_features=90, bias=False)
              (relative_attention_bias): Embedding(32, 2)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=90, out_features=64, bias=False)
              (wo): Linear(in_features=64, out_features=90, bias=False)
              (dropout): Dropout(p=0.1, i

# Hyper parameters

In [152]:
learning_rate = 0.003
batch_size = 32
num_epochs = 20
temperature = 20
alpha = 0.7

In [153]:
from torch.utils.data import DataLoader

In [154]:
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [155]:
optimizer = torch.optim.AdamW(student_model.parameters(), lr=learning_rate)

# Loss

In [156]:
from torch.nn.functional import log_softmax, softmax

In [157]:
def calculate_loss(student_outputs, teacher_outputs, labels):
    s_logits = student_outputs.logits
    t_logits = teacher_outputs.logits

    vocab_size = s_logits.size(-1)
    ce_logits = s_logits.view(-1, vocab_size)
    ce_labels = labels.view(-1)
    ce_loss = torch.nn.functional.cross_entropy(ce_logits, ce_labels)
    student_log_probs = log_softmax(s_logits.view(-1, vocab_size) / temperature, dim=-1)
    teacher_probs = softmax(t_logits.view(-1, vocab_size) / temperature, dim=-1)

    distill_loss = torch.nn.functional.kl_div(
        student_log_probs, teacher_probs, reduction="batchmean"
    )
    loss = (1 - alpha) * ce_loss + (
        alpha * temperature**2 / batch_size**2
    ) * distill_loss

    return loss

# Student Training

In [158]:
from tqdm import tqdm
from torch import nn
from torch import tensor

In [None]:
for epoch in range(num_epochs):
    loss_value = 0

    test_model("Before epoch " + str(epoch), student_model)
    student_model.train()

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch}")

    for batch in progress_bar:
        optimizer.zero_grad()

        batch = dict([(k, v.to(device)) for k, v in batch.items()])

        # Forward pass through the teacher model
        with torch.no_grad():
            teacher_outputs = teacher_model(**batch)

        # Forward pass through the student model
        student_outputs = student_model(**batch)
        assert student_outputs.logits.size() == teacher_outputs.logits.size()
        loss = calculate_loss(student_outputs, teacher_outputs, batch["labels"])
        # Backpropagation
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix(loss_value=loss.item())

Before epoch 0 admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admission admis

Epoch 0:  85%|████████▌ | 2668/3133 [39:23<06:51,  1.13it/s, loss_value=0.08]  

# Save student

In [None]:
import os

In [None]:
output_dir = '/kaggle/working/'

In [None]:
os.mkdir(os.path.join(output_dir, 'gsg_kd_1_16'))

In [None]:
distilled_model_path = os.path.join(output_dir, 'gsg_kd_1_16')

In [None]:
student_model.save_pretrained(distilled_model_path)

# Metrics calculation comparison

In [None]:
%pip install evaluate

In [None]:
%pip install rouge_score

In [None]:
from evaluate import load
rouge_metric = load('rouge')
bleu_metric = load('bleu')

In [None]:
import nltk
import numpy as np
nltk.download('punkt')

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # decoded lebels
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Compute ROUGE scores
    rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    rouge_result = {key: value * 100 for key, value in rouge_result.items()}  # Convert to percentage

    # Compute BLEU score
    bleu_result = bleu_metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    bleu_scores = {
        "bleu": bleu_result['bleu'] * 100,  # Cumulative BLEU score
        "bleu1": bleu_result['precisions'][0] * 100,  # BLEU-1 score
        "bleu2": bleu_result['precisions'][1] * 100,  # BLEU-2 score
        "bleu3": bleu_result['precisions'][2] * 100,  # BLEU-3 score
        "bleu4": bleu_result['precisions'][3] * 100,  # BLEU-4 score
    }
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result = {
        **rouge_result,
        **bleu_scores,
        "gen_len": np.mean(prediction_lens)
    }

    return {k: round(v, 4) for k, v in result.items()}

# More dataset preprocessing

In [None]:
max_input_length = 512
max_target_length = 256  #covers 90 percent of the sample length reduces lost info

def preprocess_function(examples):
    inputs = [doc for doc in examples["Findings"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["Impression"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

# Testing student

In [None]:
os.mkdir(os.path.join(output_dir, 'student_model_output'))

In [None]:
student_model_output = os.path.join(output_dir, 'student_model_output')

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
batch_size = 8
args = Seq2SeqTrainingArguments(
    output_dir=student_model_output,
    eval_strategy = "epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
)

In [None]:
from transformers import DataCollatorForSeq2Seq

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=student_model)

In [None]:
trainer = Seq2SeqTrainer(
    student_model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
test_results_student = trainer.predict(tokenized_datasets["test"])

In [None]:
print(test_results_student.metrics)

| Test Loss | Test ROUGE-1 | Test ROUGE-2 | Test ROUGE-L | Test ROUGE-Lsum | Test BLEU | Test BLEU-1 | Test BLEU-2 | Test BLEU-3 | Test BLEU-4 | Test Gen Len | Test Runtime | Test Samples/Sec | Test Steps/Sec |
|-----------|--------------|--------------|--------------|----------------|-----------|-------------|-------------|-------------|-------------|--------------|---------------|-----------------|----------------|
| 2.7516    | 31.4677      | 23.357       | 31.4736      | 31.4389         | 2.4311    | 50.2623     | 27.2791     | 22.0692     | 19.2737     | 7.0          | 134.6357      | 90.6            | 5.667          |


d_model = 64

# 1/8

{'test_loss': 4.1454644203186035, 'test_rouge1': 32.0684, 'test_rouge2': 23.8177, 'test_rougeL': 32.0561, 'test_rougeLsum': 32.0784, 'test_bleu': 2.5505, 'test_bleu1': 50.5308, 'test_bleu2': 27.5082, 'test_bleu3': 22.1007, 'test_bleu4': 19.1516, 'test_gen_len': 7.0, 'test_runtime': 140.0612, 'test_samples_per_second': 89.454, 'test_steps_per_second': 5.598}

# 1/32