In [1]:
import pandas as pd
from prettytable import PrettyTable

# Reading the dataset

In [2]:
df = pd.read_csv("./data/MIMIC-III-CXR/sections/mimic_cxr_sectioned.csv")
df.head()

Unnamed: 0,study,impression,findings,last_paragraph,comparison
0,s56699142,No acute cardiopulmonary process.,"The lungs are clear of focal consolidation, pl...",,"Radiographs from ___, ___ and ___."
1,s50414267,No acute cardiopulmonary process.,"There is no focal consolidation, pleural effus...",,None.
2,s53911762,No acute intrathoracic process.,Single frontal view of the chest provided.\n \...,,Chest radiograph ___
3,s53189527,No acute cardiopulmonary abnormality.,"The cardiac, mediastinal and hilar contours ar...",,___
4,s57375967,"Focal consolidation at the left lung base, pos...",PA and lateral views of the chest provided. ...,,


In [3]:
len(df)

227781

In [21]:
df_split = pd.read_csv("./data/MIMIC-III-CXR-JPG/mimic-cxr-2.0.0-split.csv.gz")
print(len(df_split))
df_split.head()

377110


Unnamed: 0,dicom_id,study_id,subject_id,split
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,10000032,train
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,10000032,train
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,10000032,train
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,10000032,train


In [22]:
# remove the duplicates
df_split.drop_duplicates('study_id', inplace=True)
print(len(df_split))

227835


In [23]:
df_split['split'].value_counts()

train       222758
test          3269
validate      1808
Name: split, dtype: int64

In [5]:
df_chexpert = pd.read_csv("./data/MIMIC-III-CXR-JPG/mimic-cxr-2.0.0-chexpert.csv.gz")
df_chexpert.head()

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,


In [6]:
df_negbio = pd.read_csv("./data/MIMIC-III-CXR-JPG/mimic-cxr-2.0.0-negbio.csv.gz")
df_negbio.head()

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,


# Stats of labels in CheXpert labels

In [7]:
column_names = [c for c in df_chexpert.columns if c not in ('subject_id', 'study_id')]
df_chexpert_converted = df_chexpert[column_names].copy()
df_chexpert_converted=df_chexpert_converted[(df_chexpert_converted.T != 0).any()]
total = len(df_chexpert_converted)
if total < len(df):
    index_list = df_chexpert_converted.index
    df_chexpert = df_chexpert.iloc[index_list,:]
    df_chexpert.head()
else:
    print("All the rows have at least one label")

All the rows have at least one label


In [9]:
def count_percent(pathology,value):
    str_count = ""
    count = 0
    if value == 1:
        try:
            count = df_chexpert[pathology].value_counts()[1]
        except:
            print("An exception occurred")
    if value == 0:
        try:
            count = df_chexpert[pathology].value_counts()[0]
        except:
            print("An exception occurred")
    if value == -1:
        try:
            count = df_chexpert[pathology].value_counts()[-1]
        except:
            print("An exception occurred")
    str_count = str(count)+" ("+str(round((count/total)*100,2))+ ")"
    return str_count

# create a summary table
labeldistro_df = pd.DataFrame( columns =['Pathology', 'Positive(%)','Negative(%)', 'uncertain(%)'])


pretty=PrettyTable()
pretty.field_names = ['Pathology', 'Positive(%)','Negative(%)', 'uncertain(%)']
for pathology in column_names:
    pretty.add_row([pathology, count_percent(pathology,1), count_percent(pathology,-1), count_percent(pathology,0)])
    labeldistro_df = labeldistro_df.append({'Pathology':pathology, 'Positive(%)':count_percent(pathology,1), 'Negative(%)':count_percent(pathology,-1), 'uncertain(%)':count_percent(pathology,0)} , ignore_index=True)
print(pretty)
labeldistro_df.to_csv("data/MIMIC-III-CXR/CheXpertLabelsdistro.csv")
print("Total No. of reports:", total)
                                            

An exception occurred
An exception occurred
An exception occurred
An exception occurred
+----------------------------+---------------+--------------+---------------+
|         Pathology          |  Positive(%)  | Negative(%)  |  uncertain(%) |
+----------------------------+---------------+--------------+---------------+
|        Atelectasis         | 45808 (20.11) | 10327 (4.53) |  1531 (0.67)  |
|        Cardiomegaly        | 44845 (19.68) | 6043 (2.65)  |  15911 (6.98) |
|       Consolidation        |  10778 (4.73) |  4331 (1.9)  |   7967 (3.5)  |
|           Edema            | 27018 (11.86) | 13174 (5.78) | 25641 (11.25) |
| Enlarged Cardiomediastinum |  7179 (3.15)  | 9375 (4.11)  |  5283 (2.32)  |
|          Fracture          |  4390 (1.93)  |  555 (0.24)  |   886 (0.39)  |
|        Lung Lesion         |  6284 (2.76)  |  1141 (0.5)  |   862 (0.38)  |
|        Lung Opacity        | 51525 (22.62) | 3831 (1.68)  |  3069 (1.35)  |
|         No Finding         | 75455 (33.12) |   0 (0.

# Stats of labels in Negbio Labels

In [10]:
column_names = [c for c in df_negbio.columns if c not in ('subject_id', 'study_id')]
df_negbio_converted = df_negbio[column_names].copy()
df_negbio_converted = df_negbio_converted[(df_negbio_converted.T != 0).any()]
total = len(df_negbio_converted)
if total < len(df):
    index_list = df_negbio_converted.index
    df_negbio = df_negbio.iloc[index_list,:]
    df_negbio.head()
else:
    print("All the rows have at least one label")

All the rows have at least one label


In [11]:
def count_percent(pathology,value):
    str_count = ""
    count = 0
    if value == 1:
        try:
            count = df_negbio[pathology].value_counts()[1]
        except:
            print("An exception occurred")
    if value == 0:
        try:
            count = df_negbio[pathology].value_counts()[0]
        except:
            print("An exception occurred")
    if value == -1:
        try:
            count = df_negbio[pathology].value_counts()[-1]
        except:
            print("An exception occurred")
    str_count = str(count)+" ("+str(round((count/total)*100,2))+ ")"
    return str_count

pretty=PrettyTable()
pretty.field_names = ['Pathology', 'Positive(%)','Negative(%)', 'uncertain(%)']
for pathology in column_names:
    pretty.add_row([pathology, count_percent(pathology,1), count_percent(pathology,-1), count_percent(pathology,0)])

print(pretty)

print("Total No. of reports:", total)

An exception occurred
An exception occurred
+----------------------------+---------------+--------------+---------------+
|         Pathology          |  Positive(%)  | Negative(%)  |  uncertain(%) |
+----------------------------+---------------+--------------+---------------+
|        Atelectasis         | 46000 (20.19) | 10630 (4.67) |  1036 (0.45)  |
|        Cardiomegaly        | 39112 (17.17) | 11577 (5.08) |  16071 (7.05) |
|       Consolidation        |  10641 (4.67) | 3251 (1.43)  |  9184 (4.03)  |
|           Edema            | 26609 (11.68) | 12575 (5.52) |  26649 (11.7) |
| Enlarged Cardiomediastinum |  7040 (3.09)  | 9456 (4.15)  |  5311 (2.33)  |
|          Fracture          |  3782 (1.66)  |  361 (0.16)  |  1688 (0.74)  |
|        Lung Lesion         |  6176 (2.71)  | 1157 (0.51)  |   954 (0.42)  |
|        Lung Opacity        | 51348 (22.54) | 2656 (1.17)  |  4421 (1.94)  |
|         No Finding         | 78777 (34.58) |   0 (0.0)    |    0 (0.0)    |
|      Pleural Effus

# Agreement of negbio and chexpert

In [12]:
nb = pd.read_csv("./data/MIMIC-III-CXR-JPG/mimic-cxr-2.0.0-negbio.csv.gz")
cx = pd.read_csv("./data/MIMIC-III-CXR-JPG/mimic-cxr-2.0.0-chexpert.csv.gz")

# merge these findings to create a table
# both agree -> output label
# disagree -> output -9

# drop subject_id from cx - we have it in nb
df = nb.merge(
    cx.drop('subject_id', axis=1),
    how='left',
    left_on='study_id', right_on='study_id',
    suffixes=('', '_cx')
)

# subselect to training set
# study_ids = set(df_split.loc[df_split['split']=='train', 'study_id'])
# df = df.loc[df['study_id'].isin(study_ids)]

# replace numeric labels with meaningful labels
# also annotate disagreements between the two labelers
labels = {0: 'Negative', 1: 'Positive', -1: 'Uncertain', -9: 'Disagreement'}
for c in df.columns:
    if c in ('subject_id', 'study_id'):
        continue
    elif c.endswith('_cx'):
        continue
    
    # chexpert column
    c_cx = f'{c}_cx'
    
    # annotate disagreement
    for val in labels.keys():
        if val == -9:
            continue
        
        # check one is null and the other isn't
        idx = df[c].isnull() & df[c_cx].notnull()
        df.loc[idx, c] = -9
        
        idx = df[c].notnull() & df[c_cx].isnull()
        df.loc[idx, c] = -9
        
        # check both non-null, but different value
        idx = df[c].notnull() & df[c_cx].notnull() & (df[c] != df[c_cx])
        df.loc[idx, c] = -9
        
    # now for those missing in negbio
    idx = df[c].isnull() & df[f'{c}_cx'].notnull()
    df.loc[idx, c] = -9
    
    df[c] = df[c].map(labels)
    
# drop chexpert columns
cols_drop = [c for c in df.columns if c.endswith('_cx')]
df.drop(cols_drop, axis=1, inplace=True)

# display a few example cases
# display(df.head(n=10))

# create a summary table of the findings
grp_cols = [c for c in df.columns if c not in ('subject_id', 'study_id')]
tbl = {}
for c in grp_cols:
    tbl[c] = df[c].value_counts().to_dict()
tbl = pd.DataFrame.from_dict(tbl, orient='index')


# pretty format the labels
N = df.shape[0]
for c in tbl.columns:
    tbl[c] = tbl[c].apply(lambda x: f'{x:,} ({100.0*x/N:3.1f}%)')

# sort columns
print(f'Frequency of labels in MIMIC-CXR-JPG on the {df.shape[0]:,} unique radiologic studies.')
tbl = tbl[['Positive', 'Negative', 'Uncertain', 'Disagreement']]
tbl.to_latex('findings_frequency.tex')
tbl.to_csv("data/MIMIC-III-CXR/agreementsdistro.csv")
tbl

Frequency of labels in MIMIC-CXR-JPG on the 227,827 unique radiologic studies.


Unnamed: 0,Positive,Negative,Uncertain,Disagreement
Atelectasis,"45,088 (19.8%)",937.0 (0.4%),"9,897.0 (4.3%)","1,744 (0.8%)"
Cardiomegaly,"39,094 (17.2%)","15,860.0 (7.0%)","5,924.0 (2.6%)","5,924 (2.6%)"
Consolidation,"10,487 (4.6%)","7,939.0 (3.5%)","3,022.0 (1.3%)","1,628 (0.7%)"
Edema,"26,455 (11.6%)","25,246.0 (11.1%)","11,781.0 (5.2%)","2,351 (1.0%)"
Enlarged Cardiomediastinum,"7,004 (3.1%)","5,271.0 (2.3%)","9,307.0 (4.1%)",255 (0.1%)
Fracture,"3,768 (1.7%)",880.0 (0.4%),299.0 (0.1%),884 (0.4%)
Lung Lesion,"6,129 (2.7%)",842.0 (0.4%),"1,020.0 (0.4%)",296 (0.1%)
Lung Opacity,"50,916 (22.3%)","2,868.0 (1.3%)","2,110.0 (0.9%)","2,531 (1.1%)"
No Finding,"75,163 (33.0%)",nan (nan%),nan (nan%),"3,906 (1.7%)"
Pleural Effusion,"53,188 (23.3%)","27,072.0 (11.9%)","5,345.0 (2.3%)","1,667 (0.7%)"


# Working on the Architecture

In [13]:
from transformers import BertTokenizer, BertForTokenClassification
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased')

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, labels=labels)

loss, scores = outputs[:2]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [14]:
labels

tensor([[1, 1, 1, 1, 1, 1, 1, 1]])