# Showcase of a PEFT technique
- PEFT Technique: LoRA
- Model: BERT trained on clinical data
- Evaluation approach: Determine if patients are correctly diagnosed based on a description of symptoms.
  Using `accuracy` as the metric to evaluate the model.
- Fine-tuning dataset: Symptom to disease dataset from Huggingface

# LOAD DATA

In [2]:
dataset_name = "duxprajapati/symptom-disease-dataset"
model_name = "distilbert/distilbert-base-uncased"

from torch import cuda

device = 'cuda' if cuda.is_available() else 'cpu'
cuda.empty_cache()
print(device)

cuda


In [3]:
from datasets import load_dataset, DatasetDict

[ds_train, ds_test] = load_dataset(dataset_name, split=["train", "test"])
ds_train_valid = ds_train.train_test_split(test_size=0.1, shuffle=False)

dataset = DatasetDict({
    'train': ds_train_valid['train'],
    'test': ds_test,
    'valid': ds_train_valid['test']    
})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5070
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1409
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 564
    })
})

In [4]:
import pandas as pd

df = pd.DataFrame(dataset["train"])
df[["label", "text"]].head()

Unnamed: 0,label,text
0,308,I have been having migraines and headaches. I ...
1,35,I have asthma and I get wheezing and breathing...
2,798,Signs and symptoms of primary ovarian insuffic...
3,149,"cough,high_fever,breathlessness,family_history..."
4,596,"chills,vomiting,high_fever,sweating,headache,n..."


In [5]:
import json

with open('mapping.json') as file:
  label2id = json.load(file)

df = pd.DataFrame.from_dict(label2id, orient='index', columns=['value'])
print(df.head())

id2label = {id: name for name, id in label2id.items() }
df = pd.DataFrame.from_dict(id2label, orient='index', columns=['value'])
print(df.head())

                                         value
(Vertigo) Paroymsal  Positional Vertigo      0
Abdominal Aortic Aneurysm                    1
Acanthosis Nigricans                         2
Achalasia                                    3
Achilles Tendinitis                          4
                                     value
0  (Vertigo) Paroymsal  Positional Vertigo
1                Abdominal Aortic Aneurysm
2                     Acanthosis Nigricans
3                                Achalasia
4                      Achilles Tendinitis


In [6]:
def label_to_string(example):
    return {
        'label': id2label[example["label"]],
        'text': example["text"]
    }

readable_dataset = dataset.map(label_to_string)
                               
labels = id2label.keys()
print('Total number of diseases: ', len(labels))

df = pd.DataFrame(readable_dataset["train"])
df[["label", "text"]].head()

Map:   0%|          | 0/5070 [00:00<?, ? examples/s]

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

Total number of diseases:  1082


Unnamed: 0,label,text
0,Drug Reaction,I have been having migraines and headaches. I ...
1,Allergy,I have asthma and I get wheezing and breathing...
2,Premature Ovarian Failure,Signs and symptoms of primary ovarian insuffic...
3,Bronchial Asthma,"cough,high_fever,breathlessness,family_history..."
4,Malaria,"chills,vomiting,high_fever,sweating,headache,n..."


# LOAD MODEL

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token

def prepare_features(examples):
    # Tokenize the text
    tokenized = tokenizer(examples['text'], truncation=True, padding="max_length", return_tensors="pt") 
    return tokenized

tokenized_dataset = {}
for split in dataset:
    tokenized_dataset[split] = dataset[split].map(
        prepare_features,
        batched=True
    )

print(tokenized_dataset["train"][0])
tokenized_dataset["train"]

Map:   0%|          | 0/5070 [00:00<?, ? examples/s]

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

{'text': "I have been having migraines and headaches. I can't sleep. My whole body is shaking and shivering. I feel dizzy sometimes.", 'label': 308, 'input_ids': [101, 1045, 2031, 2042, 2383, 19117, 26456, 2015, 1998, 14978, 2015, 1012, 1045, 2064, 1005, 1056, 3637, 1012, 2026, 2878, 2303, 2003, 5513, 1998, 19197, 1012, 1045, 2514, 14849, 2823, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 5070
})

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

config = AutoConfig.from_pretrained(model_name,
                                   num_labels=len(labels),
                                   label2id=label2id,
                                   id2label=id2label)
model = AutoModelForSequenceClassification.from_config(config)
model.to(device)

# for param in model.parameters():
#     param.requires_grad = False

print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [9]:
import evaluate
metric = evaluate.load("accuracy")

def validate_model(model):
    predictions = []
    labels = []

    for row in tokenized_dataset['valid']:
        expected_label_id = row['label']
        inputs = tokenizer(row['text'], truncation=True, padding="max_length", return_tensors="pt").to(device)
        outputs = model(**inputs)
        predicted_label_id = outputs.logits.argmax().item()

        predictions.append(predicted_label_id)
        labels.append(expected_label_id)
    
    return metric.compute(predictions=predictions, references=labels)

pretrained_accuracy = validate_model(model)
pretrained_accuracy

{'accuracy': 0.0}

In [10]:
def diagnose(model, tokenizer, symptoms):
    inputs = tokenizer(symptoms, return_tensors="pt").to(device)
    outputs = model(**inputs)
    
    diagnosis = outputs.logits.argmax().item()
    return model.config.id2label[diagnosis]

print(diagnose(model, tokenizer, "Headache, flatulence, easily irritated, emotionally unstable")) 

Median Arcuate Ligament Syndrome Mals


# PREPARE PEFT MODEL

In [11]:
from peft import LoraConfig, get_peft_model
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin"],
    task_type='SEQ_CLS',
    modules_to_save=["pre_classifier", "classifier"],
    inference_mode=True
)

lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()
lora_model.peft_config

trainable params: 1,422,650 || all params: 69,798,004 || trainable%: 2.0382


{'default': LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='distilbert/distilbert-base-uncased', revision=None, inference_mode=True, r=16, target_modules={'k_lin', 'q_lin', 'out_lin', 'v_lin'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=['pre_classifier', 'classifier', 'classifier', 'score'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)}

In [12]:
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    #return {"accuracy": (predictions == labels).mean()}
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./model/",
        learning_rate=2e-3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="steps",
        save_strategy="steps",
        num_train_epochs=15,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        eval_on_start=True
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# EVALUATION PRE TRAINING

In [13]:
pre_training_eval = trainer.evaluate(metric_key_prefix='')
pre_training_eval

{'_loss': 7.0007123947143555,
 '_model_preparation_time': 0.0017,
 '_accuracy': 0.0,
 '_runtime': 6.6517,
 '_samples_per_second': 211.826,
 '_steps_per_second': 13.38}

# TRAIN PEFT MODEL

In [14]:
trainer.train()

post_training_eval = trainer.evaluate(metric_key_prefix='')
post_training_eval

Step,Training Loss,Validation Loss,Model Preparation Time,Accuracy
0,No log,7.000712,0.0017,0.0
500,4.141900,3.38167,0.0017,0.320085
1000,3.247800,2.708128,0.0017,0.494677
1500,2.831000,2.370874,0.0017,0.674947
2000,2.611100,2.212404,0.0017,0.657204
2500,2.472600,2.129985,0.0017,0.678495
3000,2.357800,2.110312,0.0017,0.703336
3500,2.332400,2.059547,0.0017,0.721079
4000,2.267500,2.066001,0.0017,0.723918
4500,2.223800,2.034957,0.0017,0.724627


{'_loss': 2.034956932067871,
 '_model_preparation_time': 0.0017,
 '_accuracy': 0.7246273953158269,
 '_runtime': 6.7874,
 '_samples_per_second': 207.589,
 '_steps_per_second': 13.112,
 'epoch': 15.0}

# EVALUATE PEFT MODEL

In [15]:
# Description of my symptoms from the perspective of my significant other, with an accuracy of 0.99
print(diagnose(lora_model, tokenizer, "Headache, flatulence, easily irritated, emotionally unstable")) 

Gastroenteritis


# SAVE TRAINED MODEL

In [16]:
lora_model.peft_config

{'default': LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='distilbert/distilbert-base-uncased', revision=None, inference_mode=True, r=16, target_modules={'k_lin', 'q_lin', 'out_lin', 'v_lin'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=['pre_classifier', 'classifier', 'classifier', 'score'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)}

# LOAD AND USE TRAINED MODEL

In [19]:
# THIS WORKS BUT IS NOT CONSIDERED CORRECT.
# Loading an AutoModel with the merged base+LoRA weights.

# merged_model = lora_model.merge_and_unload()
# merged_model.save_pretrained("dr-bert-merged")

# peft_model = AutoModelForSequenceClassification.from_pretrained("dr-bert-merged")
# peft_model.to(device)

In [24]:
# I CAN'T GET THIS TO WORK. THROWS ERROR THAT classifier, pre_classifier bias and weights mismatch.
## Loading an AutoPeftModel.

# lora_model.save_pretrained("dr-bert")

# from peft import AutoPeftModelForSequenceClassification

# peft_model = AutoPeftModelForSequenceClassification.from_pretrained("dr-bert")

In [18]:
## Loading a checkpoint
lora_model.save_pretrained("dr-bert")

from peft import PeftModel

base_config = AutoConfig.from_pretrained(model_name,
                                   num_labels=len(labels),
                                   label2id=label2id,
                                   id2label=id2label)
base_model = AutoModelForSequenceClassification.from_config(base_config)
peft_model = PeftModel.from_pretrained(base_model, "dr-bert")

peft_model.to(device)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=7

In [20]:
current_accuracy = validate_model(lora_model)
current_accuracy

{'accuracy': 0.6879432624113475}

In [21]:
trained_accuracy = validate_model(peft_model)
trained_accuracy

{'accuracy': 0.6879432624113475}

In [22]:
def diagnose_patients(model, patients):
    for (name, symptoms) in patients:
        inputs = tokenizer(symptoms, return_tensors="pt").to(device)
        outputs = model(**inputs)
        
        diagnosis = outputs.logits.argmax().item()
        
        print(f'Patient: {name}, has: {model.config.id2label[diagnosis]}, for: {symptoms}')

patients = [
    ("1", "headache,flatulence,fatique"),
    ("2", "stomach_pain,acidity,ulcers_on_tongue,vomiting"),
    ("3", "vomiting,headache,weakness_of_one_body_side"),
    ("4", "this is a random string")    
]

diagnose_patients(lora_model, patients)
print('\r')
diagnose_patients(peft_model, patients)

Patient: 1, has: Malaria, for: headache,flatulence,fatique
Patient: 2, has: Gerd, for: stomach_pain,acidity,ulcers_on_tongue,vomiting
Patient: 3, has: Paralysis (Brain Hemorrhage), for: vomiting,headache,weakness_of_one_body_side
Patient: 4, has: Paralysis (Brain Hemorrhage), for: this is a random string

Patient: 1, has: Malaria, for: headache,flatulence,fatique
Patient: 2, has: Gerd, for: stomach_pain,acidity,ulcers_on_tongue,vomiting
Patient: 3, has: Paralysis (Brain Hemorrhage), for: vomiting,headache,weakness_of_one_body_side
Patient: 4, has: Paralysis (Brain Hemorrhage), for: this is a random string


In [23]:
print('Before: ', pretrained_accuracy['accuracy'],'\nCurrent: ', current_accuracy['accuracy'], '\nAfter: ', trained_accuracy['accuracy'])

Before:  0.0 
Current:  0.6879432624113475 
After:  0.6879432624113475
