In [1]:
#Adding the parent directory to sys.path so the project module imported below can be found
import sys
import os
sys.path.append(os.path.abspath('..'))


from medVec_main import mvPreproc, cBERTbase, BERTdataset, cBertMCChead, cBERT_train, Bert_test, sftTune



  from .autonotebook import tqdm as notebook_tqdm


# Data Preprocessing

In [None]:
#Instantiate the project preprocessor and load the two frames it returns
dataproc = mvPreproc()
patdf, meddf = dataproc.ds_load()

#Run the preprocessor's text cleaner over every entry of the 'text' column
patdf['text'] = patdf['text'].apply(dataproc.clean_text)

#Category names in alphabetical order; a name's position in this tuple is its numeric id (0-21)
condition_names = (
    'allergy',
    'arthritis',
    'bronchial asthma',
    'cervical spondylosis',
    'chicken pox',
    'common cold',
    'dengue',
    'diabetes',
    'drug reaction',
    'fungal infection',
    'gastroesophageal reflux disease',
    'hypertension',
    'impetigo',
    'jaundice',
    'malaria',
    'migraine',
    'peptic ulcer disease',
    'pneumonia',
    'psoriasis',
    'typhoid',
    'urinary tract infection',
    'varicose veins',
)
labels = {name: idx for idx, name in enumerate(condition_names)}

#Map the categories in the 'label' column to their numeric values
patdf = dataproc.feat_map(dataset=patdf, label_map=labels, col='label')

print(patdf.head())


   label                                               text
0      3  I've been having a lot of pain in my neck and ...
1     12  I have a rash on my face that is getting worse...
2     20  I have been urinating blood. I sometimes feel ...
3      1  I have been having trouble with my muscles and...
4      6  I have been feeling really sick. My body hurts...


# Dataset Splits and Embedding Creation

In [None]:
#Instantiating the BERT wrapper and splitting the data into the six datasets
#(train/dev/test features plus train/dev/test labels)

bert_instance = cBERTbase(inputs=None)
print('ClinicalBERT Instance Created!')

split_columns = {'x_col': 'text', 'y_col': 'label'}
X_train, X_dev, X_test, y_train, y_dev, y_test = bert_instance.ttd_splits(
    dataset=patdf,
    **split_columns
)
print('TTD Splits Created!')

#Tokenizing the splits and creating the embeddings (features first, then labels)
feature_splits = (X_train, X_dev, X_test)
label_splits = (y_train, y_dev, y_test)
embd_splits = bert_instance.mv_tokenizer(*feature_splits, *label_splits, repr='pooled')
print('Tokenization and Sentence Embeddings Complete!')


#Preparing the data for classification
def _to_dataset(split_name):
    """Wrap one split's embeddings and labels from embd_splits in a BERTdataset."""
    part = embd_splits[split_name]
    return BERTdataset(part['embeddings'], part['labels'])


train_data = _to_dataset('train')
print('Train Dataset Ready!')

dev_data = _to_dataset('dev')
print('DEV Dataset Ready!')

test_data = _to_dataset('test')
print('Test Dataset Ready!')

2025-05-07 17:37:05.755539: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ClinicalBERT Instance Created!
TTD Splits Created!
Tokenization and Sentence Embeddings Complete!
Train Dataset Ready!
DEV Dataset Ready!
Test Dataset Ready!


# Classification Head Instantiation/Training/Tuning

In [12]:
from torch.utils.data import DataLoader
import copy
import torch

#Tunable settings for this cell, gathered in one place
SEED = 42          #fixed seed so a fresh Restart & Run All repeats the same run
BATCH_SIZE = 1     #samples per batch for all three loaders
MAX_EPOCHS = 60    #hard upper bound on training epochs

#Seeding torch's global RNG before the loaders and the model are created, so the
#shuffled batch order (and any torch-based weight initialisation) is repeatable
torch.manual_seed(SEED)

#Loading the data for training, tuning, and testing
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

print('Train data loaded!')

dev_loader   = DataLoader(dev_data, batch_size=BATCH_SIZE, shuffle=False)

print('Dev data loaded!')

test_loader  = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

print('Test data loaded!')

#Instantiating classifier model, training, and testing loops
classification_head = cBertMCChead(input_dim=768, hidden_dim=256, dropout=0.3, num_layers=1, num_classes=22)
tuner = sftTune(model=classification_head, learn_rate=1e-4)
optimizer = tuner.adamW()
loss_fn = tuner.xc_entropy()

print('Classification Head Instantiated!')


trainer = cBERT_train()
print('Training loop instantiated!')

#The same Bert_test instance is used for per-epoch dev evaluation here and for
#the final test-set evaluation in the next cell
testing = Bert_test()
print('Tuning loop instantiated!')

top_model = None                 #snapshot of the head at its lowest dev loss
lowest_tune_ls = float('inf')    #lowest dev loss seen so far
no_improve = 0                   #consecutive epochs without a new lowest dev loss
epoch_limit = 5                  #patience: stop after this many such epochs

#Early stopping: break out of the training loop once the dev loss has not
#improved for epoch_limit consecutive epochs, to avoid overfitting
for epoch in range(MAX_EPOCHS):
    print(f"Epoch {epoch+1}")
    trainer.train_loop(train_loader, classification_head, loss_fn, optimizer, device='cpu')
    #Only the first returned value (the dev loss) is needed for early stopping
    tune_loss, *_ = testing.test_loop(dev_loader, classification_head, loss_fn)

    if tune_loss < lowest_tune_ls:
        lowest_tune_ls = tune_loss
        #Deep copy, because classification_head keeps being updated in later epochs
        top_model = copy.deepcopy(classification_head)
        no_improve = 0
        print('No improvement count reset!')
    else:
        no_improve += 1
        print('No improvement detected!')

        if no_improve >= epoch_limit:
            print('Epoch limit reached!')
            break

#If the dev loss never compared below the initial infinity (for example it was
#NaN every epoch), no snapshot was taken; fall back to the final weights so
#top_model is never None for the cells that use it afterwards
if top_model is None:
    print('Dev loss never improved; using the final model state!')
    top_model = copy.deepcopy(classification_head)

print('Training cycle complete!')



Train data loaded!
Dev data loaded!
Test data loaded!
Classification Head Instantiated!
Training loop instantiated!
Tuning loop instantiated!
Epoch 1
loss: 3.199614  [    0/  596]
loss: 3.086557  [  100/  596]
loss: 3.191272  [  200/  596]
loss: 2.900566  [  300/  596]
loss: 2.988469  [  400/  596]
loss: 3.025452  [  500/  596]
Test Error: 
 Accuracy: 24.6%, Avg loss: 2.906895 

 Precision: 0.2477, Recall: 0.2456, F1 Score: 0.2005

No improvement count reset!
Epoch 2
loss: 2.917278  [    0/  596]
loss: 2.918213  [  100/  596]
loss: 3.168327  [  200/  596]
loss: 2.839109  [  300/  596]
loss: 3.199832  [  400/  596]
loss: 2.968313  [  500/  596]
Test Error: 
 Accuracy: 33.3%, Avg loss: 2.636689 

 Precision: 0.3931, Recall: 0.3333, F1 Score: 0.2956

No improvement count reset!
Epoch 3
loss: 2.955577  [    0/  596]
loss: 2.573903  [  100/  596]
loss: 2.892143  [  200/  596]
loss: 1.670786  [  300/  596]
loss: 2.103541  [  400/  596]
loss: 2.578471  [  500/  596]
Test Error: 
 Accuracy: 55

# Classification Head Testing

In [13]:
#Evaluate the best snapshot kept by early stopping on the held-out test set

print('Testing loop instantiated for best model!')

#Keep the returned metrics under a name and leave them as the cell's last
#expression so the notebook still displays them
final_test_metrics = testing.test_loop(test_loader, top_model, loss_fn)
final_test_metrics

Testing loop instantiated for best model!
Test Error: 
 Accuracy: 80.2%, Avg loss: 0.522161 

 Precision: 0.7984, Recall: 0.8023, F1 Score: 0.7811



(0.5221605012355762,
 0.8023255813953488,
 0.7984496124031009,
 0.8023255813953488,
 0.7810723514211886)