In [None]:
import time

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset, random_split
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

In [263]:
# Load data set from huggingface
data_sample = load_dataset("QuyenAnhDE/Diseases_Symptoms")

Repo card metadata block was not found. Setting CardData to empty.


In [264]:
data_sample

DatasetDict({
    train: Dataset({
        features: ['Code', 'Name', 'Symptoms', 'Treatments'],
        num_rows: 400
    })
})

In [265]:
# Convert to a pandas dataframe
updated_data = [
    {"Name": item["Name"], "Symptoms": item["Symptoms"]}
    for item in data_sample["train"]
]
df = pd.DataFrame(updated_data)

In [266]:
df.head(5)

Unnamed: 0,Name,Symptoms
0,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o..."
1,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue"
2,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck..."
3,Cryptorchidism,"Absence or undescended testicle(s), empty scro..."
4,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala..."


In [267]:
# Just extract the Symptoms
df["Symptoms"] = df["Symptoms"].apply(lambda x: ", ".join(x.split(", ")))
display(df.head())

Unnamed: 0,Name,Symptoms
0,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o..."
1,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue"
2,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck..."
3,Cryptorchidism,"Absence or undescended testicle(s), empty scro..."
4,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala..."


In [268]:
# If you have an NVIDIA GPU attached, use 'cuda'
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    # If Apple Silicon, set to 'mps' - otherwise 'cpu' (not advised)
    try:
        device = torch.device("mps")
    except Exception:
        device = torch.device("cpu")
print(device)

mps


In [269]:
# model_name = "distilgpt2"
model_name = "distilbert/distilgpt2"
# The tokenizer turns texts to numbers (and vice-versa)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [270]:
# The transformer model
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [271]:
df.describe()

Unnamed: 0,Name,Symptoms
count,400,400
unique,392,395
top,Sciatica,"Swelling, pain, dry mouth, bad taste"
freq,3,3


In [272]:
df.head(5)

Unnamed: 0,Name,Symptoms
0,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o..."
1,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue"
2,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck..."
3,Cryptorchidism,"Absence or undescended testicle(s), empty scro..."
4,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala..."


In [273]:
# Dataset Prep
class LanguageDataset(Dataset):
    """
    An extension of the Dataset object to:
      - Make training loop cleaner
      - Make ingestion easier from pandas df's
    """

    def __init__(self, df, tokenizer):
        self.columns = df.columns
        self.df = df
        self.tokenizer = tokenizer
        # x = self.fittest_max_length(df)  # Fix here
        # self.max_length = x

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if isinstance(idx, int):
            idx = [idx]
        x = self.df.iloc[idx, 0].tolist()
        y = self.df.iloc[idx, 1].tolist()
        text = f"{' '.join(x)} | {' '.join(y)}"
        tokens = self.tokenizer.encode_plus(
            text,
            return_tensors="pt",
            max_length=128,
            padding="max_length",
            truncation=True,
        )
        return tokens

    @property
    def fittest_max_length(df):  # Fix here
        """
        Smallest power of two larger than the longest term in the data set.
        Important to set up max length to speed training time.
        """
        max_length = max(
            len(max(df[df.columns[0]].astype(str).tolist(), key=len)),
            len(max(df[df.columns[1]].astype(str).tolist(), key=len)),
        )
        x = 2
        while x < max_length:
            x = x * 2
        return x

In [274]:
df.iloc[[2], 0].tolist()

['Turner syndrome']

In [275]:
# Cast the Huggingface data set as a LanguageDataset we defined above
data_sample = LanguageDataset(df, tokenizer)

In [276]:
data_sample[1]

{'input_ids': tensor([[   53,  4374, 15050,  7514,    79,   930,  9544,   945,  9449,    11,
           569,  4374, 19179,    11,   569,  4374, 12301, 15212, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 5

In [277]:
# Training Params
BATCH_SIZE = 8
num_epochs = 3
learning_rate = 1e-4

In [278]:
# Create train, valid
train_size = int(0.8 * len(data_sample))
valid_size = len(data_sample) - train_size
train_data, valid_data = random_split(data_sample, [train_size, valid_size])

In [279]:
# Make the iterators
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE)

In [280]:
# Set the learning rate and loss function
## CrossEntropyLoss measures how close answers to the truth.
## More punishing for high confidence wrong answers
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
# criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [281]:
# Init a results dataframe
results = pd.DataFrame(
    columns=[
        "epoch",
        "transformer",
        "batch_size",
        "gpu",
        "training_loss",
        "validation_loss",
        "epoch_duration_sec",
    ]
)
results

Unnamed: 0,epoch,transformer,batch_size,gpu,training_loss,validation_loss,epoch_duration_sec


In [282]:
# The training loop
for epoch in range(num_epochs):
    start_time = time.time()  # Start the timer for the epoch

    # Training
    ## This line tells the model we're in 'learning mode'
    model.train()
    epoch_training_loss = 0
    train_iterator = tqdm(
        train_loader,
        desc=f"Training Epoch {epoch + 1}/{num_epochs} Batch Size: {BATCH_SIZE}, Transformer: {model_name}",
    )
    for batch in train_iterator:
        optimizer.zero_grad()
        inputs = batch["input_ids"].squeeze(1).to(device)
        targets = inputs.clone().to(device)
        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_iterator.set_postfix({"Training Loss": loss.item()})
        epoch_training_loss += loss.item()
    avg_epoch_training_loss = epoch_training_loss / len(train_iterator)

    # Validation
    ## This line below tells the model to 'stop learning'
    model.eval()
    epoch_validation_loss = 0
    total_loss = 0
    valid_iterator = tqdm(
        valid_loader, desc=f"Validation Epoch {epoch + 1}/{num_epochs}"
    )
    with torch.no_grad():
        for batch in valid_iterator:
            inputs = batch["input_ids"].squeeze(1).to(device)
            targets = inputs.clone().to(device)
            outputs = model(input_ids=inputs, labels=targets)
            loss = outputs.loss
            total_loss += loss
            valid_iterator.set_postfix({"Validation Loss": loss.item()})
            epoch_validation_loss += loss.item()

    avg_epoch_validation_loss = epoch_validation_loss / len(valid_loader)

    end_time = time.time()  # End the timer for the epoch
    epoch_duration_sec = end_time - start_time  # Calculate the duration in seconds

    new_row = {
        "transformer": model_name,
        "batch_size": BATCH_SIZE,
        "gpu": "mps",
        "epoch": epoch + 1,
        "training_loss": avg_epoch_training_loss,
        "validation_loss": avg_epoch_validation_loss,
        "epoch_duration_sec": epoch_duration_sec,
    }  # Add epoch_duration to the dataframe

    results.loc[len(results)] = new_row
    print(f"Epoch: {epoch + 1}, Validation Loss: {total_loss / len(valid_loader)}")
    print(f"Epoch: {epoch + 1}, Training Loss: {avg_epoch_training_loss}")

Training Epoch 1/3 Batch Size: 8, Transformer: distilbert/distilgpt2: 100%|██████████| 40/40 [00:11<00:00,  3.36it/s, Training Loss=0.663]
Validation Epoch 1/3: 100%|██████████| 10/10 [00:00<00:00, 12.62it/s, Validation Loss=0.692]


Epoch: 1, Validation Loss: 0.6852001547813416
Epoch: 1, Training Loss: 1.5383217334747314


Training Epoch 2/3 Batch Size: 8, Transformer: distilbert/distilgpt2: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s, Training Loss=0.578]
Validation Epoch 2/3: 100%|██████████| 10/10 [00:00<00:00, 12.85it/s, Validation Loss=0.625]


Epoch: 2, Validation Loss: 0.6081731915473938
Epoch: 2, Training Loss: 0.6298708029091358


Training Epoch 3/3 Batch Size: 8, Transformer: distilbert/distilgpt2: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s, Training Loss=0.51] 
Validation Epoch 3/3: 100%|██████████| 10/10 [00:00<00:00, 12.95it/s, Validation Loss=0.607]

Epoch: 3, Validation Loss: 0.5843923687934875
Epoch: 3, Training Loss: 0.5338263027369976





In [283]:
input_str = "Kidney Failure"
encoded_inputs = tokenizer(
    input_str,
    return_tensors="pt",  # Return PyTorch tensors
    padding=True,  # Ensures padding if needed
    truncation=True,  # Truncate if input exceeds max length
    max_length=64,  # Set a maximum length (adjust as needed)
    return_attention_mask=True,  # Ensures attention mask is included
).to(device)  # Move tensors to the same device as the model

# Extract input_ids and attention_mask
input_ids = encoded_inputs["input_ids"]
attention_mask = encoded_inputs["attention_mask"]

In [284]:
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=32,
    num_return_sequences=1,
    do_sample=True,
    top_k=8,
    top_p=0.95,
    temperature=0.1,
    repetition_penalty=1.2,
)

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print("-" * 20)
print(decoded_output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


--------------------
Kidney Failure | Fatigue, fatigue, weakness
