In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the gzipped NOTEEVENTS export and use its ROW_ID column as the index.
notes = pd.read_csv(
    '/content/drive/MyDrive/NOTEEVENTS.csv.gz',
    compression='gzip',
    low_memory=False,
    index_col='ROW_ID',
)

In [3]:
#Loading diagnoses data
# ICD-9 codes are identifiers, not numbers. Parsing them explicitly as strings keeps
# any leading zeros and guarantees that the string comparisons used throughout this
# notebook (e.g. == '7862') never depend on pandas' dtype inference.
diagnoses = pd.read_csv(
    '/content/drive/MyDrive/DIAGNOSES_ICD.csv.gz',
    compression='gzip',
    dtype={'ICD9_CODE': str},
).set_index('ROW_ID')

In [4]:
# Inner-join every note with every diagnosis row of the same patient/admission.
merged_df = notes.merge(
    diagnoses,
    how='inner',
    on=['SUBJECT_ID', 'HADM_ID'],
)

In [5]:
# Preview the first five rows of the joined table.
merged_df.head(n=5)

Unnamed: 0,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,SEQ_NUM,ICD9_CODE
0,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...,1.0,1193
1,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...,2.0,4254
2,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...,3.0,42731
3,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...,4.0,2639
4,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...,5.0,2762


In [6]:
# Diagnosis rows coded 7862 (cough)
diagnoses[diagnoses['ICD9_CODE'].eq('7862')]

Unnamed: 0_level_0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
ROW_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
45003,4006,169222,8.0,7862
41162,3718,171172,7.0,7862
71887,6437,181008,9.0,7862
64440,5740,107188,8.0,7862
101271,9052,152259,24.0,7862
...,...,...,...,...
645770,98864,141667,22.0,7862
613444,91765,188657,16.0,7862
614367,91929,100463,7.0,7862
646344,99005,192840,4.0,7862


In [7]:
# Diagnosis rows coded 7840 (headache)
diagnoses[diagnoses['ICD9_CODE'].eq('7840')]

Unnamed: 0_level_0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
ROW_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1254,109,161950,6.0,7840
6640,572,193283,2.0,7840
11907,1018,135732,17.0,7840
33198,2977,199810,5.0,7840
28663,2557,177403,2.0,7840
...,...,...,...,...
636484,96777,114995,27.0,7840
643846,98416,153260,14.0,7840
641439,97843,163471,3.0,7840
640258,97567,186941,12.0,7840


In [8]:
# Diagnosis rows coded 99591 (sepsis)
diagnoses[diagnoses['ICD9_CODE'].eq('99591')]

Unnamed: 0_level_0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
ROW_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
505,64,172056,3.0,99591
679,85,112077,18.0,99591
477,61,189535,8.0,99591
490,62,116009,4.0,99591
2383,191,136614,4.0,99591
...,...,...,...,...
632154,95895,160501,10.0,99591
633127,96145,137544,3.0,99591
638716,97229,191765,7.0,99591
640418,97598,148929,11.0,99591


In [9]:
# Diagnosis rows coded 1330 (scabies)
diagnoses[diagnoses['ICD9_CODE'].eq('1330')]

Unnamed: 0_level_0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
ROW_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
56564,5060,143525,4.0,1330
56691,5060,193317,9.0,1330
163113,14585,107708,14.0,1330
291521,26027,127906,7.0,1330
348304,30335,128950,6.0,1330
412452,47522,192407,5.0,1330
477354,61223,130044,25.0,1330
463426,58269,123569,7.0,1330
574788,82765,101333,9.0,1330


In [10]:
# Diagnosis rows coded 78791 (diarrhea)
diagnoses[diagnoses['ICD9_CODE'].eq('78791')]

Unnamed: 0_level_0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
ROW_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1165,109,151240,9.0,78791
2028,161,121804,11.0,78791
994,109,126055,11.0,78791
4534,378,105908,5.0,78791
4307,357,122609,24.0,78791
...,...,...,...,...
650275,99822,195871,17.0,78791
650446,99864,111512,3.0,78791
650866,99944,185654,15.0,78791
639053,97327,121397,19.0,78791


In [11]:
# Keep only the notes whose CATEGORY is 'Discharge summary', and only the
# identifier and free-text columns.
discharge_filter = notes.loc[
    notes['CATEGORY'].eq('Discharge summary'),
    ['SUBJECT_ID', 'HADM_ID', 'TEXT'],
]

In [12]:
# ICD-9 code -> human-readable diagnosis name. This table is the single source of
# truth: the list of codes to keep is taken from its keys (same order as the table).
diagnoses_types = {
    '78791': 'Diarrhea',
    '1330': 'Scabies',
    '99591': 'Sepsis',
    '7840': 'Headache',
    '7862': 'Cough',
}
diagnoses_codes = list(diagnoses_types)

# Restrict the diagnoses table to those codes and attach the readable label.
diagnoses_disease = diagnoses.loc[diagnoses['ICD9_CODE'].isin(diagnoses_codes)].copy()
diagnoses_disease['DISEASE TYPE'] = diagnoses_disease['ICD9_CODE'].map(diagnoses_types)

In [13]:
# Attach each selected diagnosis to the discharge summaries of the same
# patient/admission.
dd_df = discharge_filter.merge(
    diagnoses_disease,
    how='inner',
    on=['SUBJECT_ID', 'HADM_ID'],
)

In [14]:
#Relaying dataframe to csv
# The merge result only carries a positional index; writing it out would come back
# as a meaningless 'Unnamed: 0' column when the file is read again, so it is omitted.
dd_df.to_csv('dd_df.csv', index=False)

In [15]:
# Reload the persisted note/diagnosis table. ICD-9 codes are kept as strings so they
# stay comparable with the string keys of `diagnoses_types` instead of being
# re-inferred as integers.
noted_df = pd.read_csv('dd_df.csv', dtype={'ICD9_CODE': str})

In [16]:
# Number of rows per diagnosis label (shows the class imbalance).
noted_df.value_counts(subset='DISEASE TYPE')

Unnamed: 0_level_0,count
DISEASE TYPE,Unnamed: 1_level_1
Sepsis,1364
Diarrhea,1118
Headache,297
Cough,81
Scabies,9


In [17]:
!pip install spacy==3.7.1

Collecting spacy==3.7.1
  Downloading spacy-3.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting weasel<0.4.0,>=0.1.0 (from spacy==3.7.1)
  Downloading weasel-0.3.4-py3-none-any.whl.metadata (4.7 kB)
Collecting typer<0.10.0,>=0.3.0 (from spacy==3.7.1)
  Downloading typer-0.9.4-py3-none-any.whl.metadata (14 kB)
Collecting pathy>=0.10.0 (from spacy==3.7.1)
  Downloading pathy-0.11.0-py3-none-any.whl.metadata (16 kB)
Collecting smart-open<7.0.0,>=5.2.1 (from spacy==3.7.1)
  Downloading smart_open-6.4.0-py3-none-any.whl.metadata (21 kB)
Collecting pathlib-abc==0.1.1 (from pathy>=0.10.0->spacy==3.7.1)
  Downloading pathlib_abc-0.1.1-py3-none-any.whl.metadata (18 kB)
Collecting cloudpathlib<0.17.0,>=0.7.0 (from weasel<0.4.0,>=0.1.0->spacy==3.7.1)
  Downloading cloudpathlib-0.16.0-py3-none-any.whl.metadata (14 kB)
Downloading spacy-3.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [18]:
# Download the en_core_web_sm pipeline that spacy.load() uses in the next cell.
# The model wheel declares its own spaCy version requirement, so pip may change the
# installed spaCy version to satisfy it.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m102.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy<3.8.0,>=3.7.2 (from en-core-web-sm==3.7.1)
  Downloading spacy-3.7.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Downloading spacy-3.7.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.7.1
    Uninstalling spacy-3.7.1:
      Successfully uninstalled spacy-3.7.1
Successfully installed spacy-3.7.5
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en

In [19]:
#importing spacy and nlp model
import spacy
# `nlp` is the shared pipeline object that extract_tokens() reads as a global.
nlp = spacy.load('en_core_web_sm')

In [20]:
#function that will clean up the notes taken
def extract_tokens(text):
  """Reduce a note to a space-separated string of lower-cased content tokens.

  A token is kept when it is purely alphabetic or purely numeric and its
  lower-cased form is not in the pipeline's stop-word list.

  Args:
    text: Raw note text. Anything that is not a ``str`` (for example the NaN
      pandas uses for a missing cell) is treated as an empty note.

  Returns:
    The surviving tokens, lower-cased and joined by single spaces; ``''`` when
    nothing survives.
  """
  if not isinstance(text, str):
    return ''

  stop_words = nlp.Defaults.stop_words
  kept = []
  # Only tokenizer-level attributes are used below, so run the tokenizer alone
  # (make_doc) instead of the full pipeline; the tokens are the same.
  for token in nlp.make_doc(text):
    # Alphabetic or numeric tokens can never be punctuation, so no separate
    # punctuation check is needed.
    if not (token.is_alpha or token.is_digit):
      continue
    lowered = token.text.lower()
    if lowered not in stop_words:
      kept.append(lowered)
  return ' '.join(kept)

In [21]:
# Store the cleaned version of every note next to the raw text.
noted_df['Created Text'] = noted_df['TEXT'].map(extract_tokens)

In [22]:
# Bag-of-words counts over the cleaned notes, capped at 3000 vocabulary entries.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(max_features=3000)
X = vect.fit_transform(raw_documents=noted_df['Created Text'])

In [23]:
# BERT tokenizer and 5-class sequence classifier
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch

# Both objects come from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
# NOTE(review): `model` is assigned again in the later "Initializing sequential
# model" cell, so this instance is replaced before training starts.
model = BertForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=5)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
#Splitting the data for training
from sklearn.model_selection import train_test_split
#Training accounts for 80% of data and 20% for testing
# NOTE(review): the classes are heavily imbalanced (see the value_counts cell);
# consider passing stratify=... so every split contains every class.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    noted_df['Created Text'], noted_df['DISEASE TYPE'], test_size=0.2, random_state=42
)

#Used during training to monitor our model's performance
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42
)

#Converting labels into a category data type
# Every split must share ONE category list. Calling astype('category') on each split
# separately derives the categories from the values present in that split only, so a
# class missing from a split (easy with the rare ones) shifts the integer codes and
# the same disease ends up with different label ids in train and validation.
label_dtype = pd.CategoricalDtype(
    categories=sorted(noted_df['DISEASE TYPE'].dropna().unique())
)
train_labels = train_labels.astype(label_dtype)
val_labels = val_labels.astype(label_dtype)

#Utilizing tokenizer so our model can understand the data it's fed
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, return_tensors='pt')
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, return_tensors='pt')

#loading data into model in batches. Batch number can be changed during training
train_dataloader = DataLoader(
    TensorDataset(
        train_encodings['input_ids'],
        train_encodings['attention_mask'],
        torch.tensor(train_labels.cat.codes.values, dtype=torch.long)
    ),
    batch_size=32,
    shuffle=True
)
val_dataloader = DataLoader(
    TensorDataset(
        val_encodings['input_ids'],
        val_encodings['attention_mask'],
        torch.tensor(val_labels.cat.codes.values, dtype=torch.long)
    ),
    batch_size=32
)

In [25]:
import torch.nn as nn

In [26]:
#Initializing sequential model
# NOTE(review): the training, validation and test loops in this notebook all call
# `model[0](...)`, i.e. the BERT classifier directly, so the Dropout(0.5) stage
# below is never executed. The wrapper is left as is because the checkpoint is
# saved and restored through this Sequential's state_dict.
model = nn.Sequential(
    BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5),
    nn.Dropout(0.5)
)
#AdamW optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6, weight_decay=0.025)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Disabled experiment (synonym augmentation with nlpaug): the code is wrapped in a
# string literal, so nothing below runs and the cell only echoes the text.
'''
!pip install nlpaug

#Data augmentation
import nlpaug.augmenter.word as naw
import nltk
nltk.download('averaged_perceptron_tagger_eng')

aug = naw.SynonymAug(aug_src='wordnet')

augmented_text = aug.augment(noted_df['Created Text'][0])
'''

"\n!pip install nlpaug\n\n#Data augmentation\nimport nlpaug.augmenter.word as naw\nimport nltk\nnltk.download('averaged_perceptron_tagger_eng')\n\naug = naw.SynonymAug(aug_src='wordnet')  \n\naugmented_text = aug.augment(noted_df['Created Text'][0])\n"

In [30]:
# Disabled helper for the augmentation experiment: wrapped in a string literal, so
# `augment_text` is never actually defined.
'''
#function for data augmentation on the text
def augment_text(text):
    augmented_text = aug.augment(text)
    return augmented_text
'''

'\n#function for data augmentation on the text\ndef augment_text(text):\n    augmented_text = aug.augment(text)\n    return augmented_text\n'

In [31]:
# Disabled experiment (random oversampling of the bag-of-words matrix `X`): wrapped
# in a string literal, so nothing below runs.
'''
!pip install -U imbalanced-learn

from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, noted_df['DISEASE TYPE'])
'''

"\n!pip install -U imbalanced-learn\n\nfrom imblearn.over_sampling import RandomOverSampler\n\nros = RandomOverSampler(random_state=42)\nX_resampled, y_resampled = ros.fit_resample(X, noted_df['DISEASE TYPE'])  \n"

In [32]:
#defining loss function
criterion = nn.CrossEntropyLoss()

#training loop
def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Indexable container whose first element is the classifier. It is
            called as ``model[0](input_ids, attention_mask=...)`` and must return
            an object with a ``logits`` attribute. Later elements of the
            container are not executed here.
        data_loader: Iterable of batches laid out as
            ``(input_ids, attention_mask, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-d float64 tensor: correct
        predictions divided by the number of samples actually seen. ``mean_loss``
        is the mean of the per-batch losses. With no batches, the result is
        accuracy ``0.0`` and loss ``nan``.
    """
    model = model.train() #model in training mode

    losses = []
    # A tensor counter keeps the final `.double()` valid even when the loader
    # yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    total_predictions = 0

    #loops through each batch in the data loader
    for batch in data_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()

        outputs = model[0](input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        loss.backward()
        optimizer.step()

        losses.append(loss.item())

        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_predictions += len(labels)

    if total_predictions == 0:
        return correct_predictions.double(), float('nan')

    # Divide by the samples actually processed: with drop_last=True this is
    # smaller than len(data_loader.dataset).
    return correct_predictions.double() / total_predictions, np.mean(losses)

In [33]:
#evaluating model
def eval_model(model, data_loader, device):
    """Score the classifier on ``data_loader`` without updating any weights.

    Args:
        model: Indexable container whose first element is the classifier. It is
            called as ``model[0](input_ids, attention_mask=...)`` and must return
            an object with a ``logits`` attribute.
        data_loader: Iterable of batches laid out as
            ``(input_ids, attention_mask, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-d float64 tensor over all
        samples seen; ``mean_loss`` is the mean of the per-batch losses computed
        with the module-level ``criterion``. With no batches, the result is
        accuracy ``0.0`` and loss ``nan``.
    """
    model = model.eval()

    losses = []
    # A tensor counter keeps the final `.double()` valid even when the loader
    # yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    total_samples = 0

    #loops through each batch in the data loader
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model[0](input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

            losses.append(loss.item())

            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            total_samples += labels.size(0)

    if total_samples == 0:
        return correct_predictions.double(), float('nan')

    #return overall accuracy and average loss
    return correct_predictions.double() / total_samples, np.mean(losses)

In [34]:
import torch

# Ask PyTorch to release cached GPU memory it is not currently using before the
# training cell runs.
torch.cuda.empty_cache()

In [36]:
# Persist training state so the best epoch can be restored later.
def save_checkpoint(model, optimizer, epoch, filepath):
    """Write the epoch number plus model and optimizer state dicts to ``filepath``.

    The file is a single dict with the keys ``'epoch'``, ``'model_state_dict'``
    and ``'optimizer_state_dict'``, serialised with ``torch.save``.
    """
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        },
        filepath,
    )

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

best_val_acc = 0
epochs_without_improvement = 0
patience = 9
# Up to 15 epochs; stop early after `patience` consecutive epochs in which the
# validation accuracy did not beat the best value seen so far.
for epoch in range(15):

    train_acc, train_loss = train_epoch(model, train_dataloader, optimizer, device)
    val_acc, val_loss = eval_model(model, val_dataloader, device)

    print(f'Epoch {epoch + 1}')
    print(f'Train Loss: {train_loss} | Train Accuracy: {train_acc}')
    print(f'Val Loss: {val_loss} | Val Accuracy: {val_acc}')

    # New best: remember it, reset the counter, checkpoint, move to the next epoch.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        epochs_without_improvement = 0
        save_checkpoint(model, optimizer, epoch, 'best_model_checkpoint.pth')
        print("Checkpoint saved!")
        continue

    # No improvement this epoch.
    epochs_without_improvement += 1
    if epochs_without_improvement >= patience:
        print("Early stopping triggered!")
        break

Epoch 1
Train Loss: 0.7850483848498417 | Train Accuracy: 0.6542372881355932
Val Loss: 2.7714847326278687 | Val Accuracy: 0.4260869565217391
Checkpoint saved!
Epoch 2
Train Loss: 0.7204673629540663 | Train Accuracy: 0.6983050847457627
Val Loss: 2.7235948741436005 | Val Accuracy: 0.32608695652173914
Epoch 3
Train Loss: 0.6866729149451622 | Train Accuracy: 0.7099273607748184
Val Loss: 2.8487130403518677 | Val Accuracy: 0.3869565217391304
Epoch 4
Train Loss: 0.6417011164701902 | Train Accuracy: 0.7394673123486682
Val Loss: 2.866071343421936 | Val Accuracy: 0.3652173913043478
Epoch 5
Train Loss: 0.5940979627462534 | Train Accuracy: 0.7665859564164649
Val Loss: 3.013763576745987 | Val Accuracy: 0.3869565217391304
Epoch 6
Train Loss: 0.5385694079674207 | Train Accuracy: 0.8043583535108959
Val Loss: 2.9898224472999573 | Val Accuracy: 0.39565217391304347
Epoch 7
Train Loss: 0.507182797560325 | Train Accuracy: 0.8193704600484262
Val Loss: 3.0400937497615814 | Val Accuracy: 0.3130434782608696
Epo

In [37]:
#data used for testing
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, return_tensors='pt')

# Encode the test labels with the SAME category list as the training labels.
# A bare astype('category') would derive the categories from the test split alone,
# so a class missing from it would shift every integer code away from the ids the
# model was trained on (and from the names used in the final report).
test_labels = test_labels.astype(
    pd.CategoricalDtype(categories=train_labels.cat.categories)
)

test_dataloader = DataLoader(
    TensorDataset(
        test_encodings['input_ids'],
        test_encodings['attention_mask'],
        torch.tensor(test_labels.cat.codes.values, dtype=torch.long)
    ),
    batch_size=32
)

In [41]:
#loading best version of model
# map_location lets a checkpoint written on a GPU be restored on whatever device
# this session is using (including CPU-only).
checkpoint = torch.load('best_model_checkpoint.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
#evaluation mode
model.eval()

all_predictions = []
all_labels = []
#loop that's predicting
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)

        outputs = model[0](input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

        all_predictions.extend(preds.cpu().numpy())
        # Labels are only compared on the host, so they never need to visit the device.
        all_labels.extend(batch[2].cpu().numpy())

from sklearn.metrics import accuracy_score, classification_report
#evaluates model and generates report
diagnoses_name = list(train_labels.cat.categories)
accuracy = accuracy_score(all_labels, all_predictions)
# `labels` pins the report rows to the label ids behind `diagnoses_name`, so the
# names stay aligned even if a class never occurs in the test split or predictions.
# zero_division=0 reports 0.0 for classes that are never predicted, without the
# undefined-metric warnings.
report = classification_report(
    all_labels,
    all_predictions,
    labels=list(range(len(diagnoses_name))),
    target_names=diagnoses_name,
    zero_division=0,
)

print(f"Test Accuracy: {accuracy}")
print(report)

Test Accuracy: 0.5662020905923345
              precision    recall  f1-score   support

       Cough       0.00      0.00      0.00        22
    Diarrhea       0.49      0.76      0.60       227
    Headache       0.58      0.68      0.63        63
     Scabies       0.00      0.00      0.00         5
      Sepsis       0.74      0.42      0.54       257

    accuracy                           0.57       574
   macro avg       0.36      0.37      0.35       574
weighted avg       0.59      0.57      0.55       574



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
