# Showcase of a PEFT technique
- PEFT Technique: LoRA
- Model: "istilbert/distilbert-base-uncased-finetuned-sst-2-english"
- Evaluation approach: Determine 
- Fine-tuning dataset: "JoaoFassina/pokemon_anotated"

# LOAD DATA

In [55]:
dataset_name = "duxprajapati/symptom-disease-dataset"
model_name = "medicalai/ClinicalBERT"

from torch import cuda

device = 'cuda' if cuda.is_available() else 'cpu'
cuda.empty_cache()
print(device)

cpu


In [2]:
from datasets import load_dataset

original_dataset = load_dataset(dataset_name)
original_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5634
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1409
    })
})

In [3]:
import pandas as pd

df = pd.DataFrame(original_dataset["train"])
df[["label", "text"]].head()

Unnamed: 0,label,text
0,308,I have been having migraines and headaches. I ...
1,35,I have asthma and I get wheezing and breathing...
2,798,Signs and symptoms of primary ovarian insuffic...
3,149,"cough,high_fever,breathlessness,family_history..."
4,596,"chills,vomiting,high_fever,sweating,headache,n..."


In [48]:
import json

with open('mapping.json') as file:
  label2id = json.load(file)

df = pd.DataFrame.from_dict(label2id, orient='index', columns=['value'])
print(df.head())

id2label = {id: name for name, id in label2id.items() }
df = pd.DataFrame.from_dict(id2label, orient='index', columns=['value'])
print(df.head())

                                         value
(Vertigo) Paroymsal  Positional Vertigo      0
Abdominal Aortic Aneurysm                    1
Acanthosis Nigricans                         2
Achalasia                                    3
Achilles Tendinitis                          4
                                     value
0  (Vertigo) Paroymsal  Positional Vertigo
1                Abdominal Aortic Aneurysm
2                     Acanthosis Nigricans
3                                Achalasia
4                      Achilles Tendinitis


In [49]:
def label_to_string(example):
    return {
        'label': id2label[example["label"]],
        'text': example["text"]
    }

readable_dataset = original_dataset.map(label_to_string)
dataset = original_dataset
                               
labels = id2label.keys()
print('Total number of diseases: ', len(labels))

df = pd.DataFrame(readable_dataset["train"])
df[["label", "text"]].head()

Total number of diseases:  1082


Unnamed: 0,label,text
0,Drug Reaction,I have been having migraines and headaches. I ...
1,Allergy,I have asthma and I get wheezing and breathing...
2,Premature Ovarian Failure,Signs and symptoms of primary ovarian insuffic...
3,Bronchial Asthma,"cough,high_fever,breathlessness,family_history..."
4,Malaria,"chills,vomiting,high_fever,sweating,headache,n..."


# LOAD MODEL

In [50]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

config = AutoConfig.from_pretrained(model_name,
                                   num_labels=len(labels),
                                   label2id=label2id,
                                   id2label=id2label)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
print(model)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at medicalai/ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)

In [51]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def prepare_features(examples):
    # Tokenize the text
    tokenized = tokenizer(examples['text'], truncation=True, padding=True, max_length=512) 
    return tokenized

tokenized_dataset = {}
for split in dataset:
    tokenized_dataset[split] = dataset[split].map(
        prepare_features,
        batched=True,
        remove_columns=dataset[split].column_names
    )

print(tokenized_dataset["train"][0])
tokenized_dataset["train"]

Map:   0%|          | 0/1409 [00:00<?, ? examples/s]

{'input_ids': [101, 177, 10529, 10590, 13677, 34478, 32782, 10171, 10111, 13578, 106461, 119, 177, 10944, 112, 188, 63658, 119, 15127, 21047, 14333, 10124, 48201, 22471, 10376, 10111, 57667, 52019, 10376, 119, 177, 38008, 36897, 12547, 17611, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 5634
})

# PREPARE PEFT MODEL

In [52]:
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    # target_modules=["q_lin", "k_lin", "v_lin", "out_lin", "lin1", "lin2"]
    target_modules="all-linear"
)

lora_model = get_peft_model(model, config)

lora_model.print_trainable_parameters()

trainable params: 682,000 || all params: 136,008,210 || trainable%: 0.5014


In [53]:
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./data/diagnoses",
        learning_rate=2e-3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [54]:
trainer.evaluate()

KeyboardInterrupt: 

# EVALUATE MODEL

# TRAIN PEFT MODEL

In [None]:
trainer.train()

# EVALUATE PEFT MODEL