In [2]:
%pip install torch torchtext transformers sentencepiece pandas tqdm datasets

Note: you may need to restart the kernel to use updated packages.


In [3]:
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
import ast
import datasets
from tqdm import tqdm
import time

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Fetch the disease/symptom dataset by its repository id; the result is bound to `db`.
dataset_id = "QuyenAnhDE/Diseases_Symptoms"
db = load_dataset(dataset_id)

Repo card metadata block was not found. Setting CardData to empty.


In [5]:
db

DatasetDict({
    train: Dataset({
        features: ['Code', 'Name', 'Symptoms', 'Treatments'],
        num_rows: 400
    })
})

In [6]:
# Keep only the disease name and its symptom text from every training record.
kept_columns = ('Name', 'Symptoms')
updated_db = [
    {column: record[column] for column in kept_columns}
    for record in db['train']
]

In [7]:
# Tabulate the filtered records: one row per record, with the 'Name' and 'Symptoms' keys as columns.
df = pd.DataFrame(updated_db)

In [8]:
df.head(5)

Unnamed: 0,Name,Symptoms
0,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o..."
1,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue"
2,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck..."
3,Cryptorchidism,"Absence or undescended testicle(s), empty scro..."
4,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala..."


In [9]:
def _normalize_symptoms(symptoms):
    """Return a comma-separated symptom string with uniform ", " separators.

    Each entry is stripped of surrounding whitespace and empty entries are
    dropped. The split is on ',' rather than ', ' on purpose: splitting and
    re-joining on the identical ', ' separator returns the input unchanged.
    """
    entries = (entry.strip() for entry in symptoms.split(','))
    return ', '.join(entry for entry in entries if entry)


# Normalise the symptom text so every row uses the same "a, b, c" layout.
df['Symptoms'] = df['Symptoms'].apply(_normalize_symptoms)

In [10]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split

In [11]:
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
device

device(type='cpu')

In [15]:
# Load the pretrained tokenizer and language-model head from the same checkpoint,
# then move the model onto the selected device.
# (force_download=True was noted here as an optional from_pretrained argument.)
checkpoint = 'distilgpt2'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = GPT2LMHeadModel.from_pretrained(checkpoint)
model = model.to(device)

In [16]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [17]:
# Number of examples per batch; passed to both DataLoaders below.
BATCH_SIZE = 8

In [18]:
df.describe()

Unnamed: 0,Name,Symptoms
count,400,400
unique,392,395
top,Sciatica,"Swelling, pain, dry mouth, bad taste"
freq,3,3


# Dataset Preparation

In [19]:
class LanguageDataset(Dataset):
    """Map-style dataset that tokenizes one "<col0> | <col1>" string per DataFrame row.

    The first two columns of ``df`` are used positionally: column 0 is the
    text before the " | " separator and column 1 the text after it.
    Tokenization is lazy and happens on every ``__getitem__`` call.

    Attributes:
        labels: the DataFrame's column index.
        data: the rows as a list of ``{column: value}`` dicts.
        tokenizer: the tokenizer called on each formatted string.
        token_length: length every example is padded/truncated to.
        max_length: power-of-two bound on the longest cell, measured in
            characters (not tokens); it is not used for tokenization.
    """

    def __init__(self, df, tokenizer, token_length=128):
        """Store the rows and tokenizer.

        Args:
            df: DataFrame whose first two columns hold strings.
            tokenizer: callable accepting ``text`` plus the ``return_tensors``,
                ``max_length``, ``padding`` and ``truncation`` keyword
                arguments (e.g. a Hugging Face tokenizer).
            token_length: number of tokens each example is padded or truncated
                to. Defaults to 128, the value that used to be hard-coded in
                ``__getitem__``.
        """
        self.labels = df.columns
        self.data = df.to_dict(orient='records')
        self.tokenizer = tokenizer
        self.token_length = token_length
        self.max_length = self.fittest_max_length(df)

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return the tokenizer's output unchanged.

        Because a single string is encoded with ``return_tensors='pt'``, a
        Hugging Face tokenizer yields tensors with a leading axis of size 1
        (shape ``(1, token_length)``); callers are expected to squeeze it.
        """
        record = self.data[idx]
        first, second = record[self.labels[0]], record[self.labels[1]]
        text = f"{first} | {second}"
        # Calling the tokenizer directly is the documented replacement for encode_plus.
        return self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.token_length,
            padding='max_length',
            truncation=True,
        )

    def fittest_max_length(self, df):
        """Return the smallest power of two (at least 2) >= the longest cell.

        Length is measured in characters over the first two columns. An empty
        frame yields 2 instead of raising ``ValueError`` from ``max()``.
        """
        longest = max(
            (len(cell) for column in (self.labels[0], self.labels[1]) for cell in df[column]),
            default=0,
        )
        bound = 2
        while bound < longest:
            bound *= 2
        return bound

In [20]:
# Wrap the DataFrame in the dataset class; rows are tokenized lazily in __getitem__, not here.
data_sample = LanguageDataset(df, tokenizer)

In [21]:
data_sample

<__main__.LanguageDataset at 0x1553451d190>

# Train/Validation Split and Training Setup

In [23]:
# Hold out 20% of the examples for validation.
train_size = int(0.8 * len(data_sample))
val_size = len(data_sample) - train_size

# Seed the split so that Restart & Run All assigns the same rows to each subset;
# an unseeded random_split produces a different partition on every run.
SPLIT_SEED = 42
train_data, val_data = random_split(
    data_sample,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [24]:
# Both loaders share the batch size; only the training loader reshuffles its data.
loader_options = {'batch_size': BATCH_SIZE}
train_loader = DataLoader(train_data, shuffle=True, **loader_options)
val_loader = DataLoader(val_data, **loader_options)

In [25]:
# Number of passes over the training data made by the training loop.
num_epochs = 8

In [26]:
# Run metadata; batch_size and model_name are shown in the training progress-bar label.
# NOTE(review): presumably these also fill the 'batch_size', 'transformer' and 'gpu'
# columns of the `results` table — confirm where rows are appended.
batch_size = BATCH_SIZE
model_name = 'distilgpt2'  # same checkpoint name that is passed to from_pretrained
gpu = 0  # NOTE(review): hard-coded rather than derived from `device` — confirm what this value denotes

In [27]:
# Reuse EOS as the padding token. This must happen before pad_token_id is read
# below; otherwise, on a fresh kernel, the loss is built with the tokenizer's
# still-unset pad id instead of a real token id.
tokenizer.pad_token = tokenizer.eos_token

# Cross-entropy that skips padded positions, and the optimizer over all model weights.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=5e-4)

In [28]:
# Empty metrics table with one column per quantity tracked for each epoch.
result_columns = [
    'epoch',
    'transformer',
    'batch_size',
    'gpu',
    'training_loss',
    'validation_loss',
    'epoch_duration_sec',
]
results = pd.DataFrame(columns=result_columns)

In [None]:
#training loop
for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    epoch_training_loss = 0
    train_iterator = tqdm(train_loader, desc=f"Training epoch {epoch+1}/{num_epochs} Batch Size: {batch_size}, Transformer:{model_name}")
    
    for batch in train_iterator:
        optimizer.zero_grad()
        inputs = batch['input_ids'].squeeze(1).to(device)
        targets = inputs.clone()
        outputs = model(input_ids = inputs, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_iterator.set_postfix({'Training Loss': loss.item()})
        epoch_training_loss += loss.item()