**Goal:**     
This notebook converts the OpenI expert labels to the union of the label sets used by the papers that cited OpenI [1][2][3][4].

**References:**      
[1] Wang, Xiaosong, et al. "Chestx-ray8: Hospital-scale chest x-ray database and benchmarks on weakly-supervised classification and localization of common thorax diseases." Proceedings of the IEEE conference on computer vision and pattern recognition. 2017.            
[2] Zech, John R., et al. "Variable generalization performance of a deep learning model to detect pneumonia in chest radiographs: a cross-sectional study." PLoS medicine 15.11 (2018).        
[3] Wang, Xiaosong, et al. "Tienet: Text-image embedding network for common thorax disease classification and reporting in chest x-rays." Proceedings of the IEEE conference on computer vision and pattern recognition. 2018.    
[4] Shin, Hoo-Chang, et al. "Learning to read chest x-rays: Recurrent neural cascade model for automated image annotation." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.


In [1]:
# imports
import pandas as pd
import ast
from collections import Counter
from collections import OrderedDict
from itertools import islice
import re
from prettytable import PrettyTable

# cited papers

In [2]:
# Label vocabularies taken from the cited papers
# (paperN presumably corresponds to reference [N] in the notebook header - confirm).
paper1 = [
    'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltrate',
    'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax',
]

paper2 = [
    'Pneumonia', 'Emphysema', 'Effusion', 'Consolidation', 'Nodule',
    'Atelectasis', 'Edema', 'Cardiomegaly', 'Hernia',
]

paper3 = [
    'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltrate', 'Mass',
    'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema',
    'Emphysema', 'Fibrosis', 'NoFinding',
]

paper4 = [
    'opacity', 'Cardiomegaly', 'Calcinosis', 'lung/hypoinflation',
    'Calcified granuloma', 'thoracic vertebrae/degenerative',
    'lung/hyperdistention', 'spine/degenerative', 'catheters, indwelling',
    'granulomatous disease', 'nodule', 'surgical instrument',
]

# Extra labels that appear in none of the four lists above.
added = ['Cicatrix', 'Deformity', "Medical Device", "Airspace Disease"]

# De-duplicate across all lists and sort. Both steps are case-sensitive, so
# capitalised labels sort before lower-case ones.
# NOTE(review): 'Nodule' and 'nodule' differ only by case and therefore both
# survive the union as separate labels - confirm this is intended.
labels = sorted({*paper1, *paper2, *paper3, *paper4, *added})
for label in labels:
    print(label)

Airspace Disease
Atelectasis
Calcified granuloma
Calcinosis
Cardiomegaly
Cicatrix
Consolidation
Deformity
Edema
Effusion
Emphysema
Fibrosis
Hernia
Infiltrate
Mass
Medical Device
NoFinding
Nodule
Pneumonia
Pneumothorax
catheters, indwelling
granulomatous disease
lung/hyperdistention
lung/hypoinflation
nodule
opacity
spine/degenerative
surgical instrument
thoracic vertebrae/degenerative


# Read OpenI dataset

In [3]:
# Load the OpenI report table and discard the two label columns that this
# conversion does not use.
openI_df = pd.read_csv("data/OpenI/OpenI.csv").drop(
    columns=['NormalLabel', 'manual_labels']
)

# expert_labels is read from the CSV as a string; parse each cell into a list.
openI_df['expert_labels'] = openI_df.apply(
    lambda report: ast.literal_eval(report['expert_labels']),
    axis=1,
)

openI_df.head()

Unnamed: 0,fileNo,COMPARISON,INDICATION,FINDINGS,IMPRESSION,expert_labels
0,881,None available.,XXXX-year-old XXXX with dyspnea.,The lungs are without focal air space opacity....,No acute cardiopulmonary abnormality.,[normal]
1,1734,,Back pain,Heart size and mediastinal contour are normal....,No acute cardiopulmonary process.,[normal]
2,306,,,The lungs are clear. Heart size is normal. No ...,Clear lungs. No acute cardiopulmonary abnormal...,[normal]
3,3188,"Chest x-XXXX, XXXX, XXXX.",This is a XXXX-year-old male patient with vomi...,,The cardiac silhouette appears be at upper lim...,"[Cardiac Shadow/enlarged/borderline, Mediastin..."
4,1951,,,Cardiomediastinal silhouette is normal. Pulmon...,No acute cardiopulmonary disease.,[normal]


# OpenI label Distribution

In [4]:
def plot_Barchart_top_n_labels(n=20, df=None, plot=False):
    """Count the expert labels, print how many distinct ones exist, and return them.

    Args:
        n: Number of top-ranked labels shown in the bar chart. Only used when
            ``plot`` is true.
        df: Object whose ``'expert_labels'`` entry yields one list of labels per
            report. Defaults to the notebook-level ``openI_df`` as it is bound
            at call time.
        plot: When true, draw a bar chart of the ``n`` most frequent labels.
            Off by default, matching the previous behaviour where the chart
            call was commented out.

    Returns:
        A keys view over the distinct labels, ordered from most to least
        frequent; labels with equal counts keep their first-seen order.
    """
    if df is None:
        df = openI_df

    # The inner loop variable is deliberately not called ``labels`` so the
    # notebook-level list of target labels is not shadowed.
    label_counts = Counter(
        label for report_labels in df['expert_labels'] for label in report_labels
    )

    # most_common() is a stable sort by descending count.
    sorted_label_counts = OrderedDict(label_counts.most_common())

    unique_labels = sorted_label_counts.keys()
    print("Total No. of Unique labels:", len(unique_labels))

    if plot:
        # Only build the top-n frame when it is actually going to be drawn.
        top_n = pd.DataFrame.from_dict(
            OrderedDict(islice(sorted_label_counts.items(), n)), orient='index'
        )
        top_n.plot(kind='bar', title="Top " + str(n) + " ranked expert labels")

    return unique_labels


# ---- Original OpenI dataset: label vocabulary before any conversion ----
unique_labels = plot_Barchart_top_n_labels(n=20)

Total No. of Unique labels: 1719


# Convert OpenI labels to cited Labels

In [5]:
def find_similar_disorders_caseInsensitive(disorder,unique_labels):
    """Return the labels that contain ``disorder`` as a substring, ignoring case.

    Args:
        disorder: Literal text to look for (not a regular expression).
        unique_labels: Iterable of label strings to scan.

    Returns:
        A list with the matched text of every label containing ``disorder``,
        in iteration order. For a label without line breaks the matched text
        is the whole label.
    """
    # re.escape keeps characters such as '+', '(' or '.' in a label name from
    # being interpreted as regex syntax.
    pattern = re.compile(".*" + re.escape(disorder) + ".*", flags=re.IGNORECASE)
    matches = (pattern.search(label) for label in unique_labels)
    return [match.group() for match in matches if match is not None]

def find_similar_disorders_caseSensitive(disorder,unique_labels):
    """Return the labels that contain ``disorder`` as a substring, respecting case.

    Args:
        disorder: Literal text to look for (not a regular expression).
        unique_labels: Iterable of label strings to scan.

    Returns:
        A list with the matched text of every label containing ``disorder``
        with exactly the same capitalisation, in iteration order. For a label
        without line breaks the matched text is the whole label.
    """
    # re.escape keeps characters such as '+', '(' or '.' in a label name from
    # being interpreted as regex syntax.
    pattern = re.compile(".*" + re.escape(disorder) + ".*")
    matches = (pattern.search(label) for label in unique_labels)
    return [match.group() for match in matches if match is not None]

def update_row(row,similar_disorders,disorder):
    """Return a copy of ``row`` with every similar label replaced by ``disorder``.

    Args:
        row: List of labels for one report.
        similar_disorders: Labels that should be rewritten.
        disorder: Replacement label.

    Returns:
        A new list of the same length and order as ``row``; items found in
        ``similar_disorders`` become ``disorder``, all others are kept as is.
    """
    return [disorder if item in similar_disorders else item for item in row]

def update_labels(similar_disorders, disorder):
    """Rewrite ``expert_labels`` of the notebook-level ``openI_df`` in place.

    Every label listed in ``similar_disorders`` is replaced by ``disorder`` in
    each report's label list. A progress line is printed first.
    """
    print("Updating similar labels to :", disorder)

    def _relabel(report):
        # ``report`` is one row of openI_df.
        return update_row(report['expert_labels'], similar_disorders, disorder)

    openI_df['expert_labels'] = openI_df.apply(_relabel, axis=1)
    
def create_new_column(column_name):
    """Add a 0/1 indicator column called ``column_name`` to ``openI_df``.

    A row gets 1 when ``column_name`` is one of the items in its
    ``expert_labels`` list and 0 otherwise.
    """
    def _has_label(report):
        # ``report`` is one row of openI_df.
        if column_name in report['expert_labels']:
            return 1
        return 0

    openI_df[column_name] = openI_df.apply(_has_label, axis=1)

In [6]:
# For each target label: collapse every OpenI label that mentions it into the
# target label, then record it as a 0/1 column.
for label in labels:
    # "Pneumothorax" is the one label matched case-sensitively
    # (presumably so that labels such as "Hydropneumothorax" are not merged
    # into it - confirm against the data).
    if label == "Pneumothorax":
        similar_disorders = find_similar_disorders_caseSensitive(label, unique_labels)
    else:
        similar_disorders = find_similar_disorders_caseInsensitive(label, unique_labels)

    if not similar_disorders:
        # Nothing in the current vocabulary mentions this label, so no column
        # is created for it here; just print its name.
        print(label)
        continue

    update_labels(similar_disorders, label)
    create_new_column(label)
    # Refresh the vocabulary so the next label is matched against the already
    # merged labels.
    unique_labels = plot_Barchart_top_n_labels(20)

Updating similar labels to : Airspace Disease
Total No. of Unique labels: 1665
Updating similar labels to : Atelectasis
Total No. of Unique labels: 1601
Updating similar labels to : Calcified granuloma
Total No. of Unique labels: 1531
Updating similar labels to : Calcinosis
Total No. of Unique labels: 1442
Updating similar labels to : Cardiomegaly
Total No. of Unique labels: 1438
Updating similar labels to : Cicatrix
Total No. of Unique labels: 1365
Updating similar labels to : Consolidation
Total No. of Unique labels: 1345
Updating similar labels to : Deformity
Total No. of Unique labels: 1289
Updating similar labels to : Edema
Total No. of Unique labels: 1276
Updating similar labels to : Effusion
Total No. of Unique labels: 1240
Updating similar labels to : Emphysema
Total No. of Unique labels: 1219
Updating similar labels to : Fibrosis
Total No. of Unique labels: 1202
Updating similar labels to : Hernia
Total No. of Unique labels: 1195
Updating similar labels to : Infiltrate
Total N

In [7]:
# Reports labelled 'normal' are mapped onto the target label "NoFinding",
# which then gets its own 0/1 column.
update_labels(similar_disorders=['normal'], disorder="NoFinding")
create_new_column(column_name='NoFinding')
openI_df.head()

Updating similar labels to : NoFinding


Unnamed: 0,fileNo,COMPARISON,INDICATION,FINDINGS,IMPRESSION,expert_labels,Airspace Disease,Atelectasis,Calcified granuloma,Calcinosis,...,"catheters, indwelling",granulomatous disease,lung/hyperdistention,lung/hypoinflation,nodule,opacity,spine/degenerative,surgical instrument,thoracic vertebrae/degenerative,NoFinding
0,881,None available.,XXXX-year-old XXXX with dyspnea.,The lungs are without focal air space opacity....,No acute cardiopulmonary abnormality.,[NoFinding],0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1734,,Back pain,Heart size and mediastinal contour are normal....,No acute cardiopulmonary process.,[NoFinding],0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,306,,,The lungs are clear. Heart size is normal. No ...,Clear lungs. No acute cardiopulmonary abnormal...,[NoFinding],0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3188,"Chest x-XXXX, XXXX, XXXX.",This is a XXXX-year-old male patient with vomi...,,The cardiac silhouette appears be at upper lim...,"[Cardiac Shadow/enlarged/borderline, Mediastin...",0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,1951,,,Cardiomediastinal silhouette is normal. Pulmon...,No acute cardiopulmonary disease.,[NoFinding],0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Summary

In [8]:
# One 0/1 indicator column per target label.
column_names = labels

# Work on a copy of the indicator columns and keep only the reports that have
# a non-zero value in at least one of them.
openI_converted = openI_df[column_names].copy()
has_any_label = (openI_converted != 0).any(axis=1)
openI_converted = openI_converted[has_any_label]

# Number of retained reports; used as the denominator for the percentages.
total = len(openI_converted)
total

3528

In [9]:
# Restrict the full table to the reports kept in openI_converted.
# openI_converted.index holds row *labels*, so select with .loc. The previous
# .iloc treated them as positions, which is only equivalent while openI_df
# still has its untouched 0..N-1 index and breaks when this cell is re-run.
index_list = openI_converted.index
openI_df = openI_df.loc[index_list, :]
openI_df.head()

Unnamed: 0,fileNo,COMPARISON,INDICATION,FINDINGS,IMPRESSION,expert_labels,Airspace Disease,Atelectasis,Calcified granuloma,Calcinosis,...,"catheters, indwelling",granulomatous disease,lung/hyperdistention,lung/hypoinflation,nodule,opacity,spine/degenerative,surgical instrument,thoracic vertebrae/degenerative,NoFinding
0,881,None available.,XXXX-year-old XXXX with dyspnea.,The lungs are without focal air space opacity....,No acute cardiopulmonary abnormality.,[NoFinding],0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1734,,Back pain,Heart size and mediastinal contour are normal....,No acute cardiopulmonary process.,[NoFinding],0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,306,,,The lungs are clear. Heart size is normal. No ...,Clear lungs. No acute cardiopulmonary abnormal...,[NoFinding],0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3188,"Chest x-XXXX, XXXX, XXXX.",This is a XXXX-year-old male patient with vomi...,,The cardiac silhouette appears be at upper lim...,"[Cardiac Shadow/enlarged/borderline, Mediastin...",0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,1951,,,Cardiomediastinal silhouette is normal. Pulmon...,No acute cardiopulmonary disease.,[NoFinding],0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
def count_percent_pathology(pathology):
    """Format the number of positive reports for ``pathology`` as ``"count (percent)"``.

    Args:
        pathology: Name of a 0/1 indicator column in the notebook-level
            ``openI_df``.

    Returns:
        A string such as ``"125 (3.54)"``: the number of rows equal to 1,
        followed by that count as a percentage of the notebook-level ``total``
        rounded to two decimals.
    """
    # Counting the 1s directly (instead of value_counts()[1]) yields 0 rather
    # than a KeyError when no report is positive for this pathology.
    count = (openI_df[pathology] == 1).sum()
    percent = round((count / total) * 100, 2)
    return str(count) + " (" + str(percent) + ")"

# Summary table: one row per target label.
pretty = PrettyTable(
    field_names=['Pathology', 'Positive(%)', 'uncertain(%)', 'Negative(%)']
)
for pathology in column_names:
    # Only positives are tallied; the other two columns are "-" placeholders.
    positive = count_percent_pathology(pathology)
    pretty.add_row([pathology, positive, "-", "-"])
print(pretty)

print("Total No. of reports:", total)

+---------------------------------+--------------+--------------+-------------+
|            Pathology            | Positive(%)  | uncertain(%) | Negative(%) |
+---------------------------------+--------------+--------------+-------------+
|         Airspace Disease        |  125 (3.54)  |      -       |      -      |
|           Atelectasis           |  332 (9.41)  |      -       |      -      |
|       Calcified granuloma       |  274 (7.77)  |      -       |      -      |
|            Calcinosis           |  305 (8.65)  |      -       |      -      |
|           Cardiomegaly          | 375 (10.63)  |      -       |      -      |
|             Cicatrix            |  196 (5.56)  |      -       |      -      |
|          Consolidation          |  30 (0.85)   |      -       |      -      |
|            Deformity            |  117 (3.32)  |      -       |      -      |
|              Edema              |   46 (1.3)   |      -       |      -      |
|             Effusion            |  165

# Export OpenI_with_all_labels.csv

In [11]:
# Write the current openI_df (reports plus their 0/1 label columns) to disk;
# the row index is not written.
openI_df.to_csv(path_or_buf='./data/OpenI/OpenI_with_all_labels.csv', index=False)