In [4]:
# Put the parent directory on sys.path so the project module medVec_main can be imported
import sys
import os
sys.path.append(os.path.abspath('..'))


from medVec_main import mvPreproc, cBERTbase, BERTdataset, cBertMCChead, cBERT_train, Bert_test, sftTune



# Data Preprocessing

In [5]:
# Load the two frames exposed by the project preprocessor. Only patdf (which
# carries the 'text' and 'label' columns used below) is processed in this cell.
dataproc = mvPreproc()
patdf, meddf = dataproc.ds_load()

# Run every entry of the 'text' column through the preprocessor's cleaner and
# write the result back into the same column.
cleaned_text = patdf['text'].apply(dataproc.clean_text)
patdf['text'] = cleaned_text

# Class names in the order that fixes their integer ids: position in this list
# is the id, so the list must not be reordered.
condition_names = [
    'allergy',
    'arthritis',
    'bronchial asthma',
    'cervical spondylosis',
    'chicken pox',
    'common cold',
    'dengue',
    'diabetes',
    'drug reaction',
    'fungal infection',
    'gastroesophageal reflux disease',
    'hypertension',
    'impetigo',
    'jaundice',
    'malaria',
    'migraine',
    'peptic ulcer disease',
    'pneumonia',
    'psoriasis',
    'typhoid',
    'urinary tract infection',
    'varicose veins',
]

# name -> integer id (0..21), handed to feat_map as the label map.
labels = {name: idx for idx, name in enumerate(condition_names)}

# Apply the label map to the 'label' column; the preview printed below shows
# integer ids in that column afterwards.
patdf = dataproc.feat_map(dataset=patdf, label_map=labels, col='label')

print(patdf.head())


   label                                               text
0      3  I've been having a lot of pain in my neck and ...
1     12  I have a rash on my face that is getting worse...
2     20  I have been urinating blood. I sometimes feel ...
3      1  I have been having trouble with my muscles and...
4      6  I have been feeling really sick. My body hurts...


# Embedding Creation to Dataset Splits

In [6]:
# Create the project's ClinicalBERT wrapper; no inputs are passed at construction.
bert_instance = cBERTbase(inputs=None)
print('ClinicalBERT Instance Created!')

# Split the labelled frame into train / dev / test features ('text') and
# targets ('label').
X_train, X_dev, X_test, y_train, y_dev, y_test = bert_instance.ttd_splits(
    dataset=patdf, x_col='text', y_col='label'
)
print('TTD Splits Created!')

# Tokenize and embed each split. The result is indexed by split name
# ('train' / 'dev' / 'test') and then by 'embeddings' / 'labels'.
# NOTE(review): repr='pooled' presumably selects a pooled sentence-level
# representation -- confirm against mv_tokenizer.
embd_splits = bert_instance.mv_tokenizer(
    X_train, X_dev, X_test, y_train, y_dev, y_test, repr='pooled'
)
print('Tokenization and Sentence Embeddings Complete!')


def _split_to_dataset(split_name):
    """Wrap the embeddings and labels of one entry of embd_splits in a BERTdataset."""
    part = embd_splits[split_name]
    return BERTdataset(part['embeddings'], part['labels'])


train_data = _split_to_dataset('train')
print('Train Dataset Ready!')

dev_data = _split_to_dataset('dev')
print('DEV Dataset Ready!')

test_data = _split_to_dataset('test')
print('Test Dataset Ready!')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ClinicalBERT Instance Created!
TTD Splits Created!
Tokenization and Sentence Embeddings Complete!
Train Dataset Ready!
DEV Dataset Ready!
Test Dataset Ready!


# Classification Head Instantiation/Training/Tuning

In [7]:
from torch.utils.data import DataLoader

# --- Run configuration -----------------------------------------------------
# Every tunable value of this cell lives here so a run can be re-tuned without
# editing the body. The values are the ones the notebook was originally run with.
BATCH_SIZE = 1          # samples per DataLoader batch
NUM_EPOCHS = 60         # number of passes over train_loader
LEARN_RATE = 1e-4       # handed to sftTune
DEVICE = 'cpu'          # handed to trainer.train_loop
EMBED_DIM = 768         # input_dim of the head; assumed to equal the embedding width -- TODO confirm
HIDDEN_DIM = 256        # hidden_dim of the head
DROPOUT = 0.3           # dropout of the head
NUM_HIDDEN_LAYERS = 1   # num_layers of the head
# Derived from the label map instead of a hand-typed 22, so the head cannot
# drift out of sync if a class is added to or removed from `labels`.
NUM_CLASSES = len(labels)

# --- Data loaders ----------------------------------------------------------
# Only the training loader is shuffled; dev and test keep their original order.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

print('Train data loaded!')

dev_loader   = DataLoader(dev_data, batch_size=BATCH_SIZE, shuffle=False)

print('Dev data loaded!')

test_loader  = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

print('Test data loaded!')


# --- Model, optimizer and loss ---------------------------------------------
classification_head = cBertMCChead(
    input_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    dropout=DROPOUT,
    num_layers=NUM_HIDDEN_LAYERS,
    num_classes=NUM_CLASSES,
)
# The optimizer and the loss function are both obtained from the same tuner.
tuner = sftTune(model=classification_head, learn_rate=LEARN_RATE)
optimizer = tuner.adam()
loss_fn = tuner.xc_entropy()

print('Classification Head Instantiated!')


# --- Training ----------------------------------------------------------------
trainer = cBERT_train()

print('Training loop instantiated!')


for epoch in range(NUM_EPOCHS):
    # Epochs are reported 1-based.
    print(f"Epoch {epoch+1}")
    trainer.train_loop(train_loader, classification_head, loss_fn, optimizer, device=DEVICE)


print('Training cycle complete!')


# --- Evaluation on the dev split ---------------------------------------------
tuning = Bert_test()

print('Testing loop instantiated!')

# Left as the cell's final expression so the tuple returned by test_loop is
# rendered as the cell output.
tuning.test_loop(dev_loader, classification_head, loss_fn)

Train data loaded!
Dev data loaded!
Test data loaded!
Classification Head Instantiated!
Training loop instantiated!
Epoch 1
loss: 3.285174  [    0/  596]
loss: 2.860443  [  100/  596]
loss: 3.092063  [  200/  596]
loss: 3.011103  [  300/  596]
loss: 2.954723  [  400/  596]
loss: 2.933386  [  500/  596]
Epoch 2
loss: 2.827223  [    0/  596]
loss: 2.918288  [  100/  596]
loss: 2.657739  [  200/  596]
loss: 2.396403  [  300/  596]
loss: 2.813665  [  400/  596]
loss: 2.250852  [  500/  596]
Epoch 3
loss: 2.487681  [    0/  596]
loss: 2.253762  [  100/  596]
loss: 2.455234  [  200/  596]
loss: 2.794484  [  300/  596]
loss: 2.047603  [  400/  596]
loss: 2.704161  [  500/  596]
Epoch 4
loss: 2.601577  [    0/  596]
loss: 2.652456  [  100/  596]
loss: 2.780193  [  200/  596]
loss: 1.929157  [  300/  596]
loss: 3.105184  [  400/  596]
loss: 1.451292  [  500/  596]
Epoch 5
loss: 2.636462  [    0/  596]
loss: 2.523971  [  100/  596]
loss: 2.244311  [  200/  596]
loss: 1.429069  [  300/  596]
loss

(0.4280597131766722,
 0.8713450292397661,
 0.8837063196712319,
 0.8713450292397661,
 0.8703522377152036)