# 1. Install and Import libraries

https://github.com/ghourimarti/LLMs-Fine-Tuning-Training-Small-Language-Model

https://huggingface.co/datasets/QuyenAnhDE/Diseases_Symptoms

https://huggingface.co/distilbert/distilgpt2

In [50]:
# !pip install torch torchtext transformers sentencepiece pandas tqdm datasets

In [51]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel

import pandas as pd
import ast
import datasets
from tqdm import tqdm
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [52]:
# If you have an NVIDIA GPU attached, use 'cuda'
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    # If Apple Silicon, set to 'mps' - otherwise 'cpu' (not advised)
    try:
        device = torch.device('mps')
    except Exception:
        device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


# 2. Load Dataset

In [53]:
# Load data set from huggingface
data_sample = load_dataset("QuyenAnhDE/Diseases_Symptoms")

# Convert to a pandas dataframe
updated_data = [{'Name': item['Name'], 'Symptoms': item['Symptoms']} for item in data_sample['train']]
df = pd.DataFrame(updated_data)
display("df.head(5)" , df.head(5))
display("df.describe()" , df.describe())
display("data_sample" , data_sample)



# Just extract the Symptoms
df['Symptoms'] = df['Symptoms'].apply(lambda x: ', '.join(x.split(', ')))
display(df.head())

Repo card metadata block was not found. Setting CardData to empty.


'df.head(5)'

Unnamed: 0,Name,Symptoms
0,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o..."
1,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue"
2,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck..."
3,Cryptorchidism,"Absence or undescended testicle(s), empty scro..."
4,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala..."


'df.describe()'

Unnamed: 0,Name,Symptoms
count,400,400
unique,392,395
top,Sciatica,"Swelling, pain, dry mouth, bad taste"
freq,3,3


'data_sample'

DatasetDict({
    train: Dataset({
        features: ['Code', 'Name', 'Symptoms', 'Treatments'],
        num_rows: 400
    })
})

Unnamed: 0,Name,Symptoms
0,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o..."
1,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue"
2,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck..."
3,Cryptorchidism,"Absence or undescended testicle(s), empty scro..."
4,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala..."


# 3. Tokenizer

In [54]:
# The tokenizer turns texts to numbers (and vice-versa)
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

# The transformer
model = GPT2LMHeadModel.from_pretrained('distilgpt2').to(device)
# model

# 4. Dataset Preparation

In [55]:
# Dataset Prep
class LanguageDataset(Dataset):
    """
    An extension of the Dataset object to:
      - Make training loop cleaner
      - Make ingestion easier from pandas df's
    """
    def __init__(self, df, tokenizer):
        self.labels = df.columns
        self.data = df.to_dict(orient='records')
        self.tokenizer = tokenizer
        x = self.fittest_max_length(df)  # Fix here
        self.max_length = x

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx][self.labels[0]]
        y = self.data[idx][self.labels[1]]
        text = f"{x} | {y}"
        tokens = self.tokenizer.encode_plus(text, return_tensors='pt', max_length=128, padding='max_length', truncation=True)
        return tokens

    def fittest_max_length(self, df):  # Fix here
        """
        Smallest power of two larger than the longest term in the data set.
        Important to set up max length to speed training time.
        """
        max_length = max(len(max(df[self.labels[0]], key=len)), len(max(df[self.labels[1]], key=len)))
        x = 2
        while x < max_length: x = x * 2
        return x

# Cast the Huggingface data set as a LanguageDataset we defined above
data_sample = LanguageDataset(df, tokenizer)
data_sample


<__main__.LanguageDataset at 0x1e5ece970e0>

# 5. Training Parameters

In [56]:
# Model params
BATCH_SIZE = 32

# Create train, valid
train_size = int(0.8 * len(data_sample))
valid_size = len(data_sample) - train_size
train_data, valid_data = random_split(data_sample, [train_size, valid_size])

# Make the iterators
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE)

# Set the number of epochs
num_epochs = 10

# Training parameters
batch_size = BATCH_SIZE
model_name = 'distilgpt2'
gpu = 0

# Set the learning rate and loss function
## CrossEntropyLoss measures how close answers to the truth.
## More punishing for high confidence wrong answers
criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
tokenizer.pad_token = tokenizer.eos_token

# Init a results dataframe
results = pd.DataFrame(columns=['epoch', 'transformer', 'batch_size', 'gpu',
                                'training_loss', 'validation_loss', 'epoch_duration_sec'])

# 6. Training

In [57]:
# The training loop
for epoch in range(num_epochs):
    start_time = time.time()  # Start the timer for the epoch

    # Training
    ## This line tells the model we're in 'learning mode'
    model.train()
    epoch_training_loss = 0
    train_iterator = tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs} Batch Size: {batch_size}, Transformer: {model_name}")
    for batch in train_iterator:
        optimizer.zero_grad()
        inputs = batch['input_ids'].squeeze(1).to(device)
        targets = inputs.clone()
        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_iterator.set_postfix({'Training Loss': loss.item()})
        epoch_training_loss += loss.item()
    avg_epoch_training_loss = epoch_training_loss / len(train_iterator)

    # Validation
    ## This line below tells the model to 'stop learning'
    model.eval()
    epoch_validation_loss = 0
    total_loss = 0
    valid_iterator = tqdm(valid_loader, desc=f"Validation Epoch {epoch+1}/{num_epochs}")
    with torch.no_grad():
        for batch in valid_iterator:
            inputs = batch['input_ids'].squeeze(1).to(device)
            targets = inputs.clone()
            outputs = model(input_ids=inputs, labels=targets)
            loss = outputs.loss
            total_loss += loss
            valid_iterator.set_postfix({'Validation Loss': loss.item()})
            epoch_validation_loss += loss.item()

    avg_epoch_validation_loss = epoch_validation_loss / len(valid_loader)

    end_time = time.time()  # End the timer for the epoch
    epoch_duration_sec = end_time - start_time  # Calculate the duration in seconds

    new_row = {'transformer': model_name,
               'batch_size': batch_size,
               'gpu': gpu,
               'epoch': epoch+1,
               'training_loss': avg_epoch_training_loss,
               'validation_loss': avg_epoch_validation_loss,
               'epoch_duration_sec': epoch_duration_sec}  # Add epoch_duration to the dataframe

    results.loc[len(results)] = new_row
    print(f"Epoch: {epoch+1}, Validation Loss: {total_loss/len(valid_loader)}")

Training Epoch 1/10 Batch Size: 32, Transformer: distilgpt2: 100%|██████████| 10/10 [00:22<00:00,  2.25s/it, Training Loss=0.939]
Validation Epoch 1/10: 100%|██████████| 3/3 [00:00<00:00,  6.42it/s, Validation Loss=0.795]


Epoch: 1, Validation Loss: 0.8818838000297546


Training Epoch 2/10 Batch Size: 32, Transformer: distilgpt2: 100%|██████████| 10/10 [00:09<00:00,  1.01it/s, Training Loss=0.854]
Validation Epoch 2/10: 100%|██████████| 3/3 [00:00<00:00,  5.67it/s, Validation Loss=0.691]


Epoch: 2, Validation Loss: 0.7750835418701172


Training Epoch 3/10 Batch Size: 32, Transformer: distilgpt2: 100%|██████████| 10/10 [00:20<00:00,  2.09s/it, Training Loss=0.555]
Validation Epoch 3/10: 100%|██████████| 3/3 [00:00<00:00,  5.71it/s, Validation Loss=0.61] 


Epoch: 3, Validation Loss: 0.6968891024589539


Training Epoch 4/10 Batch Size: 32, Transformer: distilgpt2: 100%|██████████| 10/10 [00:20<00:00,  2.07s/it, Training Loss=0.573]
Validation Epoch 4/10: 100%|██████████| 3/3 [00:00<00:00,  5.63it/s, Validation Loss=0.561]


Epoch: 4, Validation Loss: 0.6551836729049683


Training Epoch 5/10 Batch Size: 32, Transformer: distilgpt2: 100%|██████████| 10/10 [00:21<00:00,  2.10s/it, Training Loss=0.46]
Validation Epoch 5/10: 100%|██████████| 3/3 [00:00<00:00,  5.54it/s, Validation Loss=0.553]


Epoch: 5, Validation Loss: 0.6576871871948242


Training Epoch 6/10 Batch Size: 32, Transformer: distilgpt2: 100%|██████████| 10/10 [00:20<00:00,  2.07s/it, Training Loss=0.419]
Validation Epoch 6/10: 100%|██████████| 3/3 [00:00<00:00,  5.94it/s, Validation Loss=0.567]


Epoch: 6, Validation Loss: 0.6752836108207703


Training Epoch 7/10 Batch Size: 32, Transformer: distilgpt2: 100%|██████████| 10/10 [00:20<00:00,  2.01s/it, Training Loss=0.368]
Validation Epoch 7/10: 100%|██████████| 3/3 [00:00<00:00,  5.75it/s, Validation Loss=0.577]


Epoch: 7, Validation Loss: 0.689461350440979


Training Epoch 8/10 Batch Size: 32, Transformer: distilgpt2: 100%|██████████| 10/10 [00:20<00:00,  2.06s/it, Training Loss=0.279]
Validation Epoch 8/10: 100%|██████████| 3/3 [00:00<00:00,  5.64it/s, Validation Loss=0.599]


Epoch: 8, Validation Loss: 0.7189571857452393


Training Epoch 9/10 Batch Size: 32, Transformer: distilgpt2: 100%|██████████| 10/10 [00:20<00:00,  2.07s/it, Training Loss=0.278]
Validation Epoch 9/10: 100%|██████████| 3/3 [00:00<00:00,  5.94it/s, Validation Loss=0.62] 


Epoch: 9, Validation Loss: 0.7462502717971802


Training Epoch 10/10 Batch Size: 32, Transformer: distilgpt2: 100%|██████████| 10/10 [00:20<00:00,  2.01s/it, Training Loss=0.261]
Validation Epoch 10/10: 100%|██████████| 3/3 [00:00<00:00,  5.97it/s, Validation Loss=0.647]

Epoch: 10, Validation Loss: 0.775989294052124





# 7. Inference

In [58]:
input_str = "Kidney Failure"
input_ids = tokenizer.encode(input_str, return_tensors='pt').to(device)

output = model.generate(
    input_ids,
    max_length=20,
    num_return_sequences=1,
    do_sample=True,
    top_k=8,
    top_p=0.95,
    temperature=0.5,
    repetition_penalty=1.2
)

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Kidney Failure | Abdominal pain, frequent urination


# 8. Save Model

In [59]:
torch.save(model, 'SmallMedLM.pt')
# torch.save(model, 'drive/My Drive/SmallMedLM.pt')