# Showcase of a PEFT technique
- PEFT Technique: LoRA
- Model: BERT trained on clinical data
- Evaluation approach: Determine if patients are correctly diagnosed based on description of symptoms.
- Fine-tuning dataset: Symptom to disease dataset from Huggingface

# LOAD DATA

In [1]:
dataset_name = "duxprajapati/symptom-disease-dataset"
model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

from torch import cuda

device = 'cuda' if cuda.is_available() else 'cpu'
cuda.empty_cache()
print(device)

cuda


In [2]:
from datasets import load_dataset, DatasetDict

[ds_train, ds_test] = load_dataset(dataset_name, split=["train", "test"])
ds_train_valid = ds_train.train_test_split(test_size=0.1)

dataset = DatasetDict({
    'train': ds_train_valid['train'],
    'test': ds_test,
    'valid': ds_train_valid['test']    
})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5070
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1409
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 564
    })
})

In [3]:
import pandas as pd

df = pd.DataFrame(dataset["train"])
df[["label", "text"]].head()

Unnamed: 0,label,text
0,113,Bladder cancer signs and symptoms may include:...
1,105,"Bee stings can produce different reactions, ra..."
2,511,"fatigue,weight_gain,cold_hands_and_feets,mood_..."
3,110,Most people with binge-eating disorder are ove...
4,0,"headache,nausea,spinning_movements,loss_of_bal..."


In [4]:
import json

with open('mapping.json') as file:
  label2id = json.load(file)

df = pd.DataFrame.from_dict(label2id, orient='index', columns=['value'])
print(df.head())

id2label = {id: name for name, id in label2id.items() }
df = pd.DataFrame.from_dict(id2label, orient='index', columns=['value'])
print(df.head())

                                         value
(Vertigo) Paroymsal  Positional Vertigo      0
Abdominal Aortic Aneurysm                    1
Acanthosis Nigricans                         2
Achalasia                                    3
Achilles Tendinitis                          4
                                     value
0  (Vertigo) Paroymsal  Positional Vertigo
1                Abdominal Aortic Aneurysm
2                     Acanthosis Nigricans
3                                Achalasia
4                      Achilles Tendinitis


In [5]:
def label_to_string(example):
    return {
        'label': id2label[example["label"]],
        'text': example["text"]
    }

readable_dataset = dataset.map(label_to_string)
                               
labels = id2label.keys()
print('Total number of diseases: ', len(labels))

df = pd.DataFrame(readable_dataset["train"])
df[["label", "text"]].head()

Map:   0%|          | 0/5070 [00:00<?, ? examples/s]

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

Total number of diseases:  1082


Unnamed: 0,label,text
0,Bladder Cancer,Bladder cancer signs and symptoms may include:...
1,Bee Stings,"Bee stings can produce different reactions, ra..."
2,Hypothyroidism,"fatigue,weight_gain,cold_hands_and_feets,mood_..."
3,Binge Eating Disorder,Most people with binge-eating disorder are ove...
4,(Vertigo) Paroymsal Positional Vertigo,"headache,nausea,spinning_movements,loss_of_bal..."


# LOAD MODEL

In [22]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

def prepare_features(examples):
    # Tokenize the text
    tokenized = tokenizer(examples['text'], truncation=True, padding=True, max_length=512) 
    return tokenized

tokenized_dataset = {}
for split in dataset:
    tokenized_dataset[split] = dataset[split].map(
        prepare_features,
        batched=True
    )

print(tokenized_dataset["train"][0])
tokenized_dataset["train"]

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

{'text': 'Bladder cancer signs and symptoms may include:  Blood in urine (hematuria), which may cause urine to appear bright red or cola colored, though sometimes the urine appears normal and blood is detected on a lab test Frequent urination Painful urination Back pain ', 'label': 113, 'input_ids': [101, 24176, 4456, 5751, 1998, 8030, 2089, 2421, 1024, 2668, 1999, 17996, 1006, 19610, 4017, 27703, 1007, 1010, 2029, 2089, 3426, 17996, 2000, 3711, 4408, 2417, 2030, 15270, 6910, 1010, 2295, 2823, 1996, 17996, 3544, 3671, 1998, 2668, 2003, 11156, 2006, 1037, 6845, 3231, 6976, 24471, 12758, 9145, 24471, 12758, 2067, 3255, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 5070
})

In [27]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

config = AutoConfig.from_pretrained(model_name,
                                   num_labels=len(labels),
                                   label2id=label2id,
                                   id2label=id2label)
model = AutoModelForSequenceClassification.from_config(config)
print(model)

for param in model.base_model.parameters():
    param.requires_grad = False
    
trainable_params = []
for name, param in model.named_parameters():
    if param.requires_grad:
        trainable_params.append(name)
print('Number of trainable params: ', len(trainable_params))
trainable_params[:5]

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


['pre_classifier.weight',
 'pre_classifier.bias',
 'classifier.weight',
 'classifier.bias']

# PREPARE PEFT MODEL

In [28]:
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    # target_modules=["q_lin", "k_lin", "v_lin", "out_lin", "lin1", "lin2"]
    target_modules="all-linear",
    task_type='SEQ_CLS',
    modules_to_save=["pre_classifier", "classifier"]
)

lora_model = get_peft_model(model, config)
lora_model.print_trainable_parameters()

trainable params: 2,749,754 || all params: 70,535,284 || trainable%: 3.8984


In [29]:
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
import evaluate

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": (predictions == labels).mean()}

trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./data/diagnoses",
        learning_rate=2e-3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="steps",
        save_strategy="steps",
        num_train_epochs=15,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        eval_on_start=True
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# TRAIN PEFT MODEL

In [26]:
trainer.train()

post_training_eval = trainer.evaluate(metric_key_prefix='')
post_training_eval

Step,Training Loss,Validation Loss,Accuracy
0,No log,7.024103,0.0


{'_loss': 1.395288109779358,
 '_accuracy': 0.758694109297374,
 '_runtime': 7.7954,
 '_samples_per_second': 180.748,
 '_steps_per_second': 5.773,
 'epoch': 3.0}

# EVALUATE PEFT MODEL

```JSON
{'_loss': 1.395288109779358,
 '_accuracy': 0.758694109297374,
 '_runtime': 7.7954,
 '_samples_per_second': 180.748,
 '_steps_per_second': 5.773,
 'epoch': 3.0}
```