In [53]:
import os
import json
import pandas as pd

DATA_PATH = '../data/'

# Load the two Indiana tables from DATA_PATH: the report table (keyed by
# `uid`) and the projections table, which lists image filenames per `uid`.
reports = pd.read_csv(os.path.join(DATA_PATH, 'indiana_reports.csv'))
print(reports.shape)
ids = pd.read_csv(os.path.join(DATA_PATH, 'indiana_projections.csv'))
print(ids.shape)

# Left join: keep every report row, repeated once per image that shares its uid.
df = pd.merge(reports, ids, on=['uid'], how='left')

df['uid'] = df['uid'].astype(int)

# Filenames look like '3999_IM-2049-1001.dcm.png'. Splitting on '-' yields
# ['3999_IM', '2049', '1001.dcm.png']; the 2nd piece becomes `im_1` and the
# first four characters of the 3rd piece become `im_2` (both kept as strings).
# The vectorised `.str` accessor splits once and propagates NaN for a report
# that has no image, instead of raising on a missing filename.
filename_parts = df['filename'].str.split('-')
df['im_1'] = filename_parts.str[1]
df['im_2'] = filename_parts.str[2].str[:4]


# Only the last expression of a cell is rendered, so a single preview is kept.
df.sort_values('uid', ascending=False).head()


(3851, 8)
(7466, 3)


Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression,filename,projection,im_1,im_2
7464,3999,normal,normal,"CHEST PA and LATERAL: on XXXX, XXXX.",This is a XXXX-year-old female patient with shortness of breath.,"Chest x-XXXX, XXXX, XXXX.",,The cardiac silhouette is normal in size and configuration. The mediastinum and perihilar structures appear unremarkable. The lungs appear clear. The osseous structures are within normal limits.,3999_IM-2049-1001.dcm.png,Frontal,2049,1001
7465,3999,normal,normal,"CHEST PA and LATERAL: on XXXX, XXXX.",This is a XXXX-year-old female patient with shortness of breath.,"Chest x-XXXX, XXXX, XXXX.",,The cardiac silhouette is normal in size and configuration. The mediastinum and perihilar structures appear unremarkable. The lungs appear clear. The osseous structures are within normal limits.,3999_IM-2049-2001.dcm.png,Lateral,2049,2001
7463,3998,normal,normal,"PA and lateral chest XXXX, XXXX XXXX comparison from XXXX XXXX",tuberculosis positive PPD,,,Heart size is normal and the lungs are clear.,3998_IM-2048-1002.dcm.png,Lateral,2048,1002
7462,3998,normal,normal,"PA and lateral chest XXXX, XXXX XXXX comparison from XXXX XXXX",tuberculosis positive PPD,,,Heart size is normal and the lungs are clear.,3998_IM-2048-1001.dcm.png,Frontal,2048,1001
7461,3997,Opacity/lung/upper lobe/right/round/small;Granuloma/lung/upper lobe/right/round/small,Opacity;Granuloma,PA and lateral views of the chest.,XXXX-year-old male with positive PPD.,None available.,"Heart size within normal limits. Small, nodular opacity in the right upper lobe. This does not look like an acute infiltrate, and more XXXX represents a granuloma. No pneumothorax or effusions.","No acute findings, no evidence for active TB.",3997_IM-2048-1002.dcm.png,Lateral,2048,1002


In [54]:
# Load the provided annotations. The JSON file holds one list of records per
# split under the keys 'train', 'test' and 'val'.
# A context manager closes the file handle deterministically (the previous
# `json.load(open(...))` left it open); UTF-8 is the standard JSON encoding.
with open(os.path.join(DATA_PATH, 'annotation_quiz_all.json'), encoding='utf-8') as annotations_file:
    annotations_json = json.load(annotations_file)

annotations_train = pd.DataFrame.from_records(annotations_json['train'])
print(annotations_train.shape)
annotations_test = pd.DataFrame.from_records(annotations_json['test'])
print(annotations_test.shape)
annotations_val = pd.DataFrame.from_records(annotations_json['val'])
print(annotations_val.shape)

# Stack the three splits; the per-record 'report' field is renamed to
# 'annotation' so it cannot be confused with the report text columns in `df`.
annotations = pd.concat([annotations_train, annotations_test, annotations_val], axis=0)
annotations = annotations.rename(columns={'report': 'annotation'})
print(annotations.shape)

# Ids look like 'CXR2384_IM-0942'.
#   uid  -> text before the first '_' with the 'CXR' prefix removed, as int
#   im_1 -> the four characters following the last 'IM-' (kept as a string)
# These two columns are the join keys against the image table.
annotations['uid'] = (
    annotations['id']
    .str.split('_').str[0]
    .str.replace('CXR', '', regex=False)
    .astype(int)
)
annotations['im_1'] = annotations['id'].str.split('IM-').str[-1].str[:4]

annotations.head()

(2069, 3)
(590, 3)
(296, 3)
(2955, 4)


Unnamed: 0,id,annotation,split,original_report,uid,im_1
0,CXR2384_IM-0942,"{'bone': 'Degenerative changes are present in the spine.', 'heart': 'Heart size and pulmonary vascularity appear within normal limits.', 'lung': 'Lungs are free of focal airspace disease. No pneum...",train,,2384,942
1,CXR2926_IM-1328,"{'bone': 'Bony structures are intact.', 'heart': 'Cardiac contours are within normal limits.', 'lung': 'Lungs are clear.', 'mediastinal': 'Mediastinal contours are within normal limits.', 'others'...",train,,2926,1328
2,CXR1451_IM-0291,"{'bone': '', 'heart': 'Heart size normal.', 'lung': 'Left lower lobe calcified granuloma. No pleural effusion or pneumothorax. Mild medial right atelectasis. Mild emphysema.', 'mediastinal': '', '...",train,,1451,291
3,CXR2887_IM-1289,"{'bone': 'Displaced fracture of the mid one-third of the right clavicle.', 'heart': 'Cardiac contours are within normal limits.', 'lung': 'There are a few opacities in the lung bases bilaterally. ...",train,,2887,1289
4,CXR1647_IM-0424,"{'bone': 'Visualized osseous structures of the thorax are without acute abnormality.', 'heart': 'Cardiac contours are within normal limits.', 'lung': 'The lungs are clear bilaterally. No evidence ...",train,,1647,424


In [55]:
# Overall dataset: attach the annotation columns to every image row.
# Rows are matched on (uid, im_1); image rows without a match keep NaN in the
# annotation columns. Any clashing column coming from `annotations` gets the
# '_annotated' suffix, while columns already in `df` keep their names.
df = df.merge(
    annotations,
    on=['uid', 'im_1'],
    how='left',
    suffixes=('', '_annotated'),
)
print(df.shape)

(7466, 16)


In [56]:
# Preview the first five rows of the merged frame.
df.head(n=5)

Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression,filename,projection,im_1,im_2,id,annotation,split,original_report
0,1,normal,normal,Xray Chest PA and Lateral,Positive TB test,None.,The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of...,Normal chest x-XXXX.,1_IM-0001-4001.dcm.png,Frontal,1,4001,CXR1_1_IM-0001,"{'bone': '', 'heart': 'Cardiac silhouette within normal limits.', 'lung': 'No pulmonary edema. No focal consolidation. No evidence of pneumothorax.', 'mediastinal': 'Mediastinum size within normal...",train,
1,1,normal,normal,Xray Chest PA and Lateral,Positive TB test,None.,The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of...,Normal chest x-XXXX.,1_IM-0001-3001.dcm.png,Lateral,1,3001,CXR1_1_IM-0001,"{'bone': '', 'heart': 'Cardiac silhouette within normal limits.', 'lung': 'No pulmonary edema. No focal consolidation. No evidence of pneumothorax.', 'mediastinal': 'Mediastinum size within normal...",train,
2,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",Preop bariatric surgery.,None.,Borderline cardiomegaly. Midline sternotomy XXXX. Enlarged pulmonary arteries. Clear lungs. Inferior XXXX XXXX XXXX.,No acute pulmonary findings.,2_IM-0652-1001.dcm.png,Frontal,652,1001,CXR2_IM-0652,"{'bone': 'Midline sternotomy.', 'heart': 'Borderline cardiomegaly.', 'lung': 'Clear lungs. Enlarged pulmonary arteries.', 'mediastinal': '', 'others': 'Inferior XXXX XXXX XXXX.'}",test,
3,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",Preop bariatric surgery.,None.,Borderline cardiomegaly. Midline sternotomy XXXX. Enlarged pulmonary arteries. Clear lungs. Inferior XXXX XXXX XXXX.,No acute pulmonary findings.,2_IM-0652-2001.dcm.png,Lateral,652,2001,CXR2_IM-0652,"{'bone': 'Midline sternotomy.', 'heart': 'Borderline cardiomegaly.', 'lung': 'Clear lungs. Enlarged pulmonary arteries.', 'mediastinal': '', 'others': 'Inferior XXXX XXXX XXXX.'}",test,
4,3,normal,normal,Xray Chest PA and Lateral,"rib pain after a XXXX, XXXX XXXX steps this XXXX. Pain to R back, R elbow and R rib XXXX, no previous heart or lung hx, non-XXXX, no hx ca",,,"No displaced rib fractures, pneumothorax, or pleural effusion identified. Well-expanded and clear lungs. Mediastinal contour within normal limits. No acute cardiopulmonary abnormality identified.",3_IM-1384-1001.dcm.png,Frontal,1384,1001,,,,


In [57]:
# Count missing values per column.
df.isna().sum()

uid                   0
MeSH                  0
Problems              0
image                 0
indication          159
comparison         2260
findings            997
impression           52
filename              0
projection            0
im_1                  0
im_2                  0
id                 1381
annotation         1987
split              1381
original_report    6860
dtype: int64

In [58]:
# Side-by-side view of `findings` and `original_report` for the rows where an
# original report is present (first 100 such rows).
has_original_report = df['original_report'].notna()
df.loc[has_original_report, ['findings', 'original_report']].head(100)

Unnamed: 0,findings,original_report
6,"There are diffuse bilateral interstitial and alveolar opacities consistent with chronic obstructive lung disease and bullous emphysema. There are irregular opacities in the left lung apex, that co...","There are diffuse bilateral interstitial and alveolar opacities consistent with chronic obstructive lung disease and bullous emphysema. There are irregular opacities in the left lung apex, that co..."
7,"There are diffuse bilateral interstitial and alveolar opacities consistent with chronic obstructive lung disease and bullous emphysema. There are irregular opacities in the left lung apex, that co...","There are diffuse bilateral interstitial and alveolar opacities consistent with chronic obstructive lung disease and bullous emphysema. There are irregular opacities in the left lung apex, that co..."
46,Cardiomediastinal silhouette is within normal limits of size and appearance. The pulmonary vascularity is unremarkable. Lungs are expanded and clear of airspace disease. Negative for pneumothorax ...,Cardiomediastinal silhouette is within normal limits of size and appearance. The pulmonary vascularity is unremarkable. Lungs are expanded and clear of airspace disease. Negative for pneumothorax ...
47,Cardiomediastinal silhouette is within normal limits of size and appearance. The pulmonary vascularity is unremarkable. Lungs are expanded and clear of airspace disease. Negative for pneumothorax ...,Cardiomediastinal silhouette is within normal limits of size and appearance. The pulmonary vascularity is unremarkable. Lungs are expanded and clear of airspace disease. Negative for pneumothorax ...
50,The cardiac contours are normal. The lungs are clear. Thoracic spondylosis.,The cardiac contours are normal. The lungs are clear. Thoracic spondylosis.
...,...,...
1574,Cardiomediastinal silhouette and pulmonary vasculature are within normal limits. Lungs are clear. No pneumothorax or pleural effusion. No radiodense foreign bodies noted. No acute osseous findings.,Cardiomediastinal silhouette and pulmonary vasculature are within normal limits. Lungs are clear. No pneumothorax or pleural effusion. No radiodense foreign bodies noted. No acute osseous findings.
1575,Cardiomediastinal silhouette and pulmonary vasculature are within normal limits. Lungs are clear. No pneumothorax or pleural effusion. No radiodense foreign bodies noted. No acute osseous findings.,Cardiomediastinal silhouette and pulmonary vasculature are within normal limits. Lungs are clear. No pneumothorax or pleural effusion. No radiodense foreign bodies noted. No acute osseous findings.
1600,"Limited evaluation of the lateral view due to rotation and frontal view due to motion artifact. Stable mild cardiomegaly. Normal pulmonary vascularity. The lungs are clear. No focal consolidation,...","Limited evaluation of the lateral view due to rotation and frontal view due to motion artifact. Stable mild cardiomegaly. Normal pulmonary vascularity. The lungs are clear. No focal consolidation,..."
1601,"Limited evaluation of the lateral view due to rotation and frontal view due to motion artifact. Stable mild cardiomegaly. Normal pulmonary vascularity. The lungs are clear. No focal consolidation,...","Limited evaluation of the lateral view due to rotation and frontal view due to motion artifact. Stable mild cardiomegaly. Normal pulmonary vascularity. The lungs are clear. No focal consolidation,..."


In [61]:
# Make 'original_report' uniform: wherever it is missing, fall back to the
# text in 'findings' for the same row.
findings_fallback = df['findings']
df['original_report'] = df['original_report'].fillna(findings_fallback)

# Persist the prepared table next to the raw inputs, without the index column.
# NOTE(review): `annotation` appears to hold dict objects, which to_csv writes
# as their string repr -- confirm downstream readers parse that as expected.
prepared_csv_path = os.path.join(DATA_PATH, 'data_prep.csv')
df.to_csv(prepared_csv_path, index=False)

In [60]:
# Side-by-side view of the annotation and the report text for every row that
# has an annotation.
has_annotation = df['annotation'].notna()
df.loc[has_annotation, ['annotation', 'original_report']]

Unnamed: 0,annotation,original_report
0,"{'bone': '', 'heart': 'Cardiac silhouette within normal limits.', 'lung': 'No pulmonary edema. No focal consolidation. No evidence of pneumothorax.', 'mediastinal': 'Mediastinum size within normal...",The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of...
1,"{'bone': '', 'heart': 'Cardiac silhouette within normal limits.', 'lung': 'No pulmonary edema. No focal consolidation. No evidence of pneumothorax.', 'mediastinal': 'Mediastinum size within normal...",The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of...
2,"{'bone': 'Midline sternotomy.', 'heart': 'Borderline cardiomegaly.', 'lung': 'Clear lungs. Enlarged pulmonary arteries.', 'mediastinal': '', 'others': 'Inferior XXXX XXXX XXXX.'}",Borderline cardiomegaly. Midline sternotomy XXXX. Enlarged pulmonary arteries. Clear lungs. Inferior XXXX XXXX XXXX.
3,"{'bone': 'Midline sternotomy.', 'heart': 'Borderline cardiomegaly.', 'lung': 'Clear lungs. Enlarged pulmonary arteries.', 'mediastinal': '', 'others': 'Inferior XXXX XXXX XXXX.'}",Borderline cardiomegaly. Midline sternotomy XXXX. Enlarged pulmonary arteries. Clear lungs. Inferior XXXX XXXX XXXX.
8,"{'bone': 'Small T-spine osteophytes.', 'heart': 'Cardiac contours are within normal limits.', 'lung': 'There is no pneumothorax or pleural effusion. There are no focal areas of consolidation. Ther...",The cardiomediastinal silhouette and pulmonary vasculature are within normal limits. There is no pneumothorax or pleural effusion. There are no focal areas of consolidation. Cholecystectomy clips ...
...,...,...
7457,"{'bone': 'No acute bony findings.', 'heart': 'Cardiomediastinal silhouette are within normal limits in size.', 'lung': 'Pulmonary vasculature are within normal limits in size. The lungs are mildly...",The cardiomediastinal silhouette and pulmonary vasculature are within normal limits in size. The lungs are mildly hyperinflated with flattening of the diaphragms and expansion of the retrosternal ...
7458,"{'bone': 'Endplate changes in the spine.', 'heart': 'Heart size is normal.', 'lung': 'Lungs are clear. No pneumothorax.', 'mediastinal': '', 'others': ''}",The lungs are clear. Heart size is normal. No pneumothorax. There are endplate changes in the spine.
7459,"{'bone': 'Endplate changes in the spine.', 'heart': 'Heart size is normal.', 'lung': 'Lungs are clear. No pneumothorax.', 'mediastinal': '', 'others': ''}",The lungs are clear. Heart size is normal. No pneumothorax. There are endplate changes in the spine.
7460,"{'bone': '', 'heart': 'Heart size within normal limits.', 'lung': 'Small, nodular opacity in the right upper lobe. No pneumothorax or effusions.', 'mediastinal': '', 'others': ''}","Heart size within normal limits. Small, nodular opacity in the right upper lobe. This does not look like an acute infiltrate, and more XXXX represents a granuloma. No pneumothorax or effusions."
