**Install necessary libraries**

In [1]:
!pip install datasets
!pip install wandb
!pip install transformers[torch] --upgrade
!pip install accelerate --upgrade
!pip install peft
!pip install sacremoses
!pip install evaluate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[

**Check whether a GPU is available**

In [3]:
import torch

# Report which device this session runs on; when CUDA is present, also print
# the name of the first GPU.
gpu_available = torch.cuda.is_available()
if not gpu_available:
    print("Running on CPU")
else:
    print("Running on GPU")
    print(torch.cuda.get_device_name(0))


Running on GPU
Tesla T4


**Import the necessary modules**

In [4]:
import pandas as pd
import torch
import torch.nn as nn
import os
from transformers import AutoTokenizer, BioGptForSequenceClassification, EvalPrediction, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
import torch.optim as optim
from transformers import DataCollatorWithPadding
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from datasets import Dataset
from sklearn.model_selection import train_test_split
import wandb
import numpy as np
from datasets import load_dataset
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from datasets import Dataset, Features, Array3D, Array2D
import evaluate
from datasets import load_metric
from transformers import Trainer, TrainingArguments,  get_linear_schedule_with_warmup
from transformers.trainer_callback import TrainerCallback
import torch.optim as optim
import matplotlib.pyplot as plt


**Import drive**

In [5]:
from google.colab import drive
# Mount Google Drive so the dataset CSV can be read from the Colab filesystem.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Mounted at /content/drive


**Read the symptom–disease dataset from Drive**

In [8]:
# Load the uploaded symptom/disease dataset from the mounted Drive.
# (A filtered variant, 'LLM_df_stringify_filtered.csv', was used as an alternative source.)
dataset_csv_path = '/content/drive/MyDrive/LLM_df_stringify.csv'
df = pd.read_csv(dataset_csv_path)

In [9]:
# `DataFrame.info()` writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" to the output.
df.info()
# Show a handful of random rows as the cell's rich output.
df.sample(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246945 entries, 0 to 246944
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Unnamed: 0      246945 non-null  int64 
 1   diseases        246945 non-null  object
 2   Symptom_labels  246945 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.7+ MB
None


Unnamed: 0.1,Unnamed: 0,diseases,Symptom_labels
152415,238344,molluscum contagiosum,"acne or pimples, knee lump or mass, itching of..."
5687,58506,acute bronchospasm,"sore throat, cough, nasal congestion, difficul..."
118641,124247,hypokalemia,"shortness of breath, fainting, headache, diarr..."
196441,19747,retinopathy due to high blood pressure,"pus draining from ear, foreign body sensation ..."
70692,167241,diabetic peripheral neuropathy,"leg pain, skin lesion, problems with movement,..."


**Preprocess the data before training the model**

In [10]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Model inputs come from the 'Symptom_labels' column (comma-separated symptom
# text) and the targets from the 'diseases' column.
symptoms = df['Symptom_labels'].tolist()
diseases = df['diseases'].tolist()

# Encode the disease names as integer class ids; `le.classes_` later gives the
# number of output labels for the classifier head.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
disease_labels = le.fit_transform(diseases)

# Hold out 20% of the rows for validation. A fixed random_state makes the split
# (and therefore every reported metric) reproducible across kernel restarts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    symptoms, disease_labels, test_size=0.2, random_state=42
)

# Load BioBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

**Tokenize the dataset with the loaded BioBERT tokenizer**

In [11]:
def tokenize_function(texts):
    """Tokenize `texts` with the module-level `tokenizer`.

    Every sequence is truncated and padded to a fixed length of 200 tokens.
    Returns whatever the tokenizer call returns.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 200,
    }
    return tokenizer(texts, **tokenizer_options)


# Tokenize the training split first, then the validation split.
train_encodings, val_encodings = [
    tokenize_function(split_texts) for split_texts in (train_texts, val_texts)
]


In [None]:
# Summarise the tokenizer output as {field: (n_rows, row_length)} instead of
# echoing `.items()`, which would print every token id of the training set.
{
    field: (len(rows), len(rows[0]) if len(rows) else 0)
    for field, rows in train_encodings.items()
}

**Convert the tokenized data into PyTorch datasets.**


In [12]:
import torch

class DiseaseDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    `encodings` is a mapping from field name to an indexable collection of
    per-example values; `labels` is an indexable collection of targets.
    Indexing returns a dict of tensors: one per encoding field plus 'labels'.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a DiseaseDataset for the Trainer.
train_dataset = DiseaseDataset(encodings=train_encodings, labels=train_labels)
val_dataset = DiseaseDataset(encodings=val_encodings, labels=val_labels)


In [13]:
from transformers import BertForSequenceClassification

# Load BioBERT with a sequence-classification head that has one output per
# disease class known to the label encoder.
num_disease_classes = len(le.classes_)
model = BertForSequenceClassification.from_pretrained(
    'dmis-lab/biobert-base-cased-v1.1',
    num_labels=num_disease_classes,
)


pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Compute accuracy**

In [14]:
def compute_metrics(pred):
    """Return the accuracy of a prediction batch as ``{'accuracy': value}``.

    `pred` must expose `predictions` (scores whose last axis is argmax-ed to
    obtain the predicted class) and `label_ids` (the reference classes).
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}


In [15]:
class LossCallback(TrainerCallback):
    """Trainer callback that records logged losses.

    Each time the trainer logs, a 'loss' entry is appended to `train_losses`
    and an 'eval_loss' entry is appended to `eval_losses`.
    """

    def __init__(self):
        self.train_losses = []
        self.eval_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Nothing to record when the trainer passes no log payload.
        if logs is None:
            return
        tracked = (('loss', self.train_losses), ('eval_loss', self.eval_losses))
        for log_key, history in tracked:
            if log_key in logs:
                history.append(logs[log_key])

****

**Set up the training arguments and trainer.**

In [23]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters for a 3-epoch fine-tuning run.
# No evaluation strategy is configured, so the Trainer does not evaluate during
# training (an `eval_steps` value on its own has no effect and was dropped);
# validation metrics come from an explicit `trainer.evaluate()` call. To evaluate
# periodically, enable a step-based evaluation strategy together with `eval_steps`.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    fp16=True,                       # mixed-precision training
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log every 10 steps
    learning_rate=2e-5,              # learning rate
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics      # compute metrics function
)

# SECURITY: an API key used to be pasted here as a comment. Never store
# credentials in the notebook; supply them at runtime (environment variable,
# Colab secret, or an interactive login prompt) and revoke the exposed key.

In [26]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration for a 4-epoch run. Evaluation during training and
# `load_best_model_at_end` are left disabled; the learning rate is a tunable
# starting point (e.g. try 3e-5 or 1e-5).
training_config = dict(
    output_dir='./results',          # where checkpoints are written
    logging_dir='./logs',            # where logs are written
    logging_steps=10,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    warmup_steps=500,                # warmup steps for the learning-rate scheduler
    weight_decay=0.01,
    fp16=True,
)
training_args = TrainingArguments(**training_config)

# Bundle the model, the configuration, both dataset splits and the metric function.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


**Train the model**

In [27]:
# Fine-tune the model with the most recently built `trainer`. As the last
# expression of the cell, the value returned by `train()` is displayed.
trainer.train()

Step,Training Loss
10,5.6646
20,5.7834
30,5.7884
40,5.7581
50,5.6644
60,5.7454
70,5.7155
80,5.6574
90,5.6486
100,5.7357


KeyboardInterrupt: 

**Reporting results including accuracy**

In [None]:
# Evaluate the current model on the trainer's evaluation dataset and print the
# resulting metrics dictionary (loss, accuracy from `compute_metrics`, timings).
# NOTE(review): the saved output reports epoch 3.0 while the recorded training
# cell was interrupted — it appears to come from an earlier run; re-run to refresh.
results = trainer.evaluate()
print(results)

{'eval_loss': 2.2555477619171143, 'eval_accuracy': 0.7307400379506641, 'eval_runtime': 15.9082, 'eval_samples_per_second': 331.275, 'eval_steps_per_second': 41.425, 'epoch': 3.0}


In [None]:
# Plot the losses
# The curves are read from the Trainer's own log history, so this cell does not
# depend on a separately registered callback object (the previous version used
# `loss_callback`, which no executed cell creates). Every history entry carries
# the global step at which it was logged, so both curves share a real step axis.
log_history = trainer.state.log_history
train_steps = [entry['step'] for entry in log_history if 'loss' in entry]
train_losses = [entry['loss'] for entry in log_history if 'loss' in entry]
eval_steps_logged = [entry['step'] for entry in log_history if 'eval_loss' in entry]
eval_losses = [entry['eval_loss'] for entry in log_history if 'eval_loss' in entry]

plt.figure(figsize=(10, 5))
plt.plot(train_steps, train_losses, label='Training Loss')
# Markers keep the validation curve visible even when it has a single point.
plt.plot(eval_steps_logged, eval_losses, marker='o', label='Validation Loss')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Training and Validation Losses')
plt.legend()
plt.grid(True)
plt.show()
