In [1]:
import os
import json
import pandas as pd

DATA_PATH = '../data/'

In [2]:

# reads data from reports in train and test in storage
reports = pd.read_csv(os.path.join(DATA_PATH, 'indiana_reports.csv'))
print(reports.shape)
ids = pd.read_csv(os.path.join(DATA_PATH, 'indiana_projections.csv'))
print(ids.shape)

# merging 
df = pd.merge(reports, ids, on=['uid'], how='left')

df['uid'] = df['uid'].astype(int)
df['im_1'] = df['filename'].apply(lambda x: x.split('-')[1])
df['im_2'] = df['filename'].apply(lambda x: x.split('-')[2][:4])


df.sort_values('filename', ascending=False).head()
df.sort_values('uid', ascending=False).head()


(3851, 8)
(7466, 3)


Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression,filename,projection,im_1,im_2
7464,3999,normal,normal,"CHEST PA and LATERAL: on XXXX, XXXX.",This is a XXXX-year-old female patient with sh...,"Chest x-XXXX, XXXX, XXXX.",,The cardiac silhouette is normal in size and c...,3999_IM-2049-1001.dcm.png,Frontal,2049,1001
7465,3999,normal,normal,"CHEST PA and LATERAL: on XXXX, XXXX.",This is a XXXX-year-old female patient with sh...,"Chest x-XXXX, XXXX, XXXX.",,The cardiac silhouette is normal in size and c...,3999_IM-2049-2001.dcm.png,Lateral,2049,2001
7463,3998,normal,normal,"PA and lateral chest XXXX, XXXX XXXX comparis...",tuberculosis positive PPD,,,Heart size is normal and the lungs are clear.,3998_IM-2048-1002.dcm.png,Lateral,2048,1002
7462,3998,normal,normal,"PA and lateral chest XXXX, XXXX XXXX comparis...",tuberculosis positive PPD,,,Heart size is normal and the lungs are clear.,3998_IM-2048-1001.dcm.png,Frontal,2048,1001
7461,3997,Opacity/lung/upper lobe/right/round/small;Gran...,Opacity;Granuloma,PA and lateral views of the chest.,XXXX-year-old male with positive PPD.,None available.,"Heart size within normal limits. Small, nodula...","No acute findings, no evidence for active TB.",3997_IM-2048-1002.dcm.png,Lateral,2048,1002


In [54]:
# adding annotations provided
annotations_json = json.load(open(os.path.join(DATA_PATH, 'annotation_quiz_all.json')))
annotations_train = pd.DataFrame.from_records(annotations_json['train'])
print(annotations_train.shape)
annotations_test = pd.DataFrame.from_records(annotations_json['test'])
print(annotations_test.shape)
annotations_val = pd.DataFrame.from_records(annotations_json['val'])
print(annotations_val.shape)
annotations = pd.concat([annotations_train, annotations_test, annotations_val], axis=0)
annotations = annotations.rename(columns={'report':'annotation'})
print(annotations.shape)

annotations['uid'] = annotations['id'].apply(lambda x: x.split('_')[0].replace('CXR','')).astype(int)
annotations['im_1'] = annotations['id'].apply(lambda x: x.split("IM-")[-1][:4])

annotations.head()

(2069, 3)
(590, 3)
(296, 3)
(2955, 4)


Unnamed: 0,id,annotation,split,original_report,uid,im_1
0,CXR2384_IM-0942,"{'bone': 'Degenerative changes are present in the spine.', 'heart': 'Heart size and pulmonary vascularity appear within normal limits.', 'lung': 'Lungs are free of focal airspace disease. No pneum...",train,,2384,942
1,CXR2926_IM-1328,"{'bone': 'Bony structures are intact.', 'heart': 'Cardiac contours are within normal limits.', 'lung': 'Lungs are clear.', 'mediastinal': 'Mediastinal contours are within normal limits.', 'others'...",train,,2926,1328
2,CXR1451_IM-0291,"{'bone': '', 'heart': 'Heart size normal.', 'lung': 'Left lower lobe calcified granuloma. No pleural effusion or pneumothorax. Mild medial right atelectasis. Mild emphysema.', 'mediastinal': '', '...",train,,1451,291
3,CXR2887_IM-1289,"{'bone': 'Displaced fracture of the mid one-third of the right clavicle.', 'heart': 'Cardiac contours are within normal limits.', 'lung': 'There are a few opacities in the lung bases bilaterally. ...",train,,2887,1289
4,CXR1647_IM-0424,"{'bone': 'Visualized osseous structures of the thorax are without acute abnormality.', 'heart': 'Cardiac contours are within normal limits.', 'lung': 'The lungs are clear bilaterally. No evidence ...",train,,1647,424


In [55]:
# overall dataset
df = pd.merge(df, annotations, how='left', on=['uid', 'im_1'], suffixes=('', '_annotated'))
print(df.shape)

(7466, 16)


In [56]:
df.head()

Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression,filename,projection,im_1,im_2,id,annotation,split,original_report
0,1,normal,normal,Xray Chest PA and Lateral,Positive TB test,None.,The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of...,Normal chest x-XXXX.,1_IM-0001-4001.dcm.png,Frontal,1,4001,CXR1_1_IM-0001,"{'bone': '', 'heart': 'Cardiac silhouette within normal limits.', 'lung': 'No pulmonary edema. No focal consolidation. No evidence of pneumothorax.', 'mediastinal': 'Mediastinum size within normal...",train,
1,1,normal,normal,Xray Chest PA and Lateral,Positive TB test,None.,The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of...,Normal chest x-XXXX.,1_IM-0001-3001.dcm.png,Lateral,1,3001,CXR1_1_IM-0001,"{'bone': '', 'heart': 'Cardiac silhouette within normal limits.', 'lung': 'No pulmonary edema. No focal consolidation. No evidence of pneumothorax.', 'mediastinal': 'Mediastinum size within normal...",train,
2,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",Preop bariatric surgery.,None.,Borderline cardiomegaly. Midline sternotomy XXXX. Enlarged pulmonary arteries. Clear lungs. Inferior XXXX XXXX XXXX.,No acute pulmonary findings.,2_IM-0652-1001.dcm.png,Frontal,652,1001,CXR2_IM-0652,"{'bone': 'Midline sternotomy.', 'heart': 'Borderline cardiomegaly.', 'lung': 'Clear lungs. Enlarged pulmonary arteries.', 'mediastinal': '', 'others': 'Inferior XXXX XXXX XXXX.'}",test,
3,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",Preop bariatric surgery.,None.,Borderline cardiomegaly. Midline sternotomy XXXX. Enlarged pulmonary arteries. Clear lungs. Inferior XXXX XXXX XXXX.,No acute pulmonary findings.,2_IM-0652-2001.dcm.png,Lateral,652,2001,CXR2_IM-0652,"{'bone': 'Midline sternotomy.', 'heart': 'Borderline cardiomegaly.', 'lung': 'Clear lungs. Enlarged pulmonary arteries.', 'mediastinal': '', 'others': 'Inferior XXXX XXXX XXXX.'}",test,
4,3,normal,normal,Xray Chest PA and Lateral,"rib pain after a XXXX, XXXX XXXX steps this XXXX. Pain to R back, R elbow and R rib XXXX, no previous heart or lung hx, non-XXXX, no hx ca",,,"No displaced rib fractures, pneumothorax, or pleural effusion identified. Well-expanded and clear lungs. Mediastinal contour within normal limits. No acute cardiopulmonary abnormality identified.",3_IM-1384-1001.dcm.png,Frontal,1384,1001,,,,


In [57]:
df.isnull().sum()

uid                   0
MeSH                  0
Problems              0
image                 0
indication          159
comparison         2260
findings            997
impression           52
filename              0
projection            0
im_1                  0
im_2                  0
id                 1381
annotation         1987
split              1381
original_report    6860
dtype: int64

In [58]:
df[~df['original_report'].isnull()][['findings','original_report']].head(100)

Unnamed: 0,findings,original_report
6,"There are diffuse bilateral interstitial and alveolar opacities consistent with chronic obstructive lung disease and bullous emphysema. There are irregular opacities in the left lung apex, that co...","There are diffuse bilateral interstitial and alveolar opacities consistent with chronic obstructive lung disease and bullous emphysema. There are irregular opacities in the left lung apex, that co..."
7,"There are diffuse bilateral interstitial and alveolar opacities consistent with chronic obstructive lung disease and bullous emphysema. There are irregular opacities in the left lung apex, that co...","There are diffuse bilateral interstitial and alveolar opacities consistent with chronic obstructive lung disease and bullous emphysema. There are irregular opacities in the left lung apex, that co..."
46,Cardiomediastinal silhouette is within normal limits of size and appearance. The pulmonary vascularity is unremarkable. Lungs are expanded and clear of airspace disease. Negative for pneumothorax ...,Cardiomediastinal silhouette is within normal limits of size and appearance. The pulmonary vascularity is unremarkable. Lungs are expanded and clear of airspace disease. Negative for pneumothorax ...
47,Cardiomediastinal silhouette is within normal limits of size and appearance. The pulmonary vascularity is unremarkable. Lungs are expanded and clear of airspace disease. Negative for pneumothorax ...,Cardiomediastinal silhouette is within normal limits of size and appearance. The pulmonary vascularity is unremarkable. Lungs are expanded and clear of airspace disease. Negative for pneumothorax ...
50,The cardiac contours are normal. The lungs are clear. Thoracic spondylosis.,The cardiac contours are normal. The lungs are clear. Thoracic spondylosis.
...,...,...
1574,Cardiomediastinal silhouette and pulmonary vasculature are within normal limits. Lungs are clear. No pneumothorax or pleural effusion. No radiodense foreign bodies noted. No acute osseous findings.,Cardiomediastinal silhouette and pulmonary vasculature are within normal limits. Lungs are clear. No pneumothorax or pleural effusion. No radiodense foreign bodies noted. No acute osseous findings.
1575,Cardiomediastinal silhouette and pulmonary vasculature are within normal limits. Lungs are clear. No pneumothorax or pleural effusion. No radiodense foreign bodies noted. No acute osseous findings.,Cardiomediastinal silhouette and pulmonary vasculature are within normal limits. Lungs are clear. No pneumothorax or pleural effusion. No radiodense foreign bodies noted. No acute osseous findings.
1600,"Limited evaluation of the lateral view due to rotation and frontal view due to motion artifact. Stable mild cardiomegaly. Normal pulmonary vascularity. The lungs are clear. No focal consolidation,...","Limited evaluation of the lateral view due to rotation and frontal view due to motion artifact. Stable mild cardiomegaly. Normal pulmonary vascularity. The lungs are clear. No focal consolidation,..."
1601,"Limited evaluation of the lateral view due to rotation and frontal view due to motion artifact. Stable mild cardiomegaly. Normal pulmonary vascularity. The lungs are clear. No focal consolidation,...","Limited evaluation of the lateral view due to rotation and frontal view due to motion artifact. Stable mild cardiomegaly. Normal pulmonary vascularity. The lungs are clear. No focal consolidation,..."


In [61]:
# filling nulls to get uniform 'original_report'
df['original_report'] = df['original_report'].fillna(df['findings'])

df.to_csv(os.path.join(DATA_PATH, 'data_prep.csv'), index=False)

In [60]:
df[~df['annotation'].isnull()][['annotation','original_report']]#.head(100)

Unnamed: 0,annotation,original_report
0,"{'bone': '', 'heart': 'Cardiac silhouette within normal limits.', 'lung': 'No pulmonary edema. No focal consolidation. No evidence of pneumothorax.', 'mediastinal': 'Mediastinum size within normal...",The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of...
1,"{'bone': '', 'heart': 'Cardiac silhouette within normal limits.', 'lung': 'No pulmonary edema. No focal consolidation. No evidence of pneumothorax.', 'mediastinal': 'Mediastinum size within normal...",The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of...
2,"{'bone': 'Midline sternotomy.', 'heart': 'Borderline cardiomegaly.', 'lung': 'Clear lungs. Enlarged pulmonary arteries.', 'mediastinal': '', 'others': 'Inferior XXXX XXXX XXXX.'}",Borderline cardiomegaly. Midline sternotomy XXXX. Enlarged pulmonary arteries. Clear lungs. Inferior XXXX XXXX XXXX.
3,"{'bone': 'Midline sternotomy.', 'heart': 'Borderline cardiomegaly.', 'lung': 'Clear lungs. Enlarged pulmonary arteries.', 'mediastinal': '', 'others': 'Inferior XXXX XXXX XXXX.'}",Borderline cardiomegaly. Midline sternotomy XXXX. Enlarged pulmonary arteries. Clear lungs. Inferior XXXX XXXX XXXX.
8,"{'bone': 'Small T-spine osteophytes.', 'heart': 'Cardiac contours are within normal limits.', 'lung': 'There is no pneumothorax or pleural effusion. There are no focal areas of consolidation. Ther...",The cardiomediastinal silhouette and pulmonary vasculature are within normal limits. There is no pneumothorax or pleural effusion. There are no focal areas of consolidation. Cholecystectomy clips ...
...,...,...
7457,"{'bone': 'No acute bony findings.', 'heart': 'Cardiomediastinal silhouette are within normal limits in size.', 'lung': 'Pulmonary vasculature are within normal limits in size. The lungs are mildly...",The cardiomediastinal silhouette and pulmonary vasculature are within normal limits in size. The lungs are mildly hyperinflated with flattening of the diaphragms and expansion of the retrosternal ...
7458,"{'bone': 'Endplate changes in the spine.', 'heart': 'Heart size is normal.', 'lung': 'Lungs are clear. No pneumothorax.', 'mediastinal': '', 'others': ''}",The lungs are clear. Heart size is normal. No pneumothorax. There are endplate changes in the spine.
7459,"{'bone': 'Endplate changes in the spine.', 'heart': 'Heart size is normal.', 'lung': 'Lungs are clear. No pneumothorax.', 'mediastinal': '', 'others': ''}",The lungs are clear. Heart size is normal. No pneumothorax. There are endplate changes in the spine.
7460,"{'bone': '', 'heart': 'Heart size within normal limits.', 'lung': 'Small, nodular opacity in the right upper lobe. No pneumothorax or effusions.', 'mediastinal': '', 'others': ''}","Heart size within normal limits. Small, nodular opacity in the right upper lobe. This does not look like an acute infiltrate, and more XXXX represents a granuloma. No pneumothorax or effusions."


In [3]:
df = pd.read_csv(os.path.join(DATA_PATH, 'data_prep.csv'))

In [5]:
df[df['split']=='train']

Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression,filename,projection,im_1,im_2,id,annotation,split,original_report
0,1,normal,normal,Xray Chest PA and Lateral,Positive TB test,None.,The cardiac silhouette and mediastinum size ar...,Normal chest x-XXXX.,1_IM-0001-4001.dcm.png,Frontal,1,4001,CXR1_1_IM-0001,"{'bone': '', 'heart': 'Cardiac silhouette with...",train,The cardiac silhouette and mediastinum size ar...
1,1,normal,normal,Xray Chest PA and Lateral,Positive TB test,None.,The cardiac silhouette and mediastinum size ar...,Normal chest x-XXXX.,1_IM-0001-3001.dcm.png,Lateral,1,3001,CXR1_1_IM-0001,"{'bone': '', 'heart': 'Cardiac silhouette with...",train,The cardiac silhouette and mediastinum size ar...
8,5,Osteophyte/thoracic vertebrae/multiple/small;T...,Osteophyte;Thickening;Lung,Xray Chest PA and Lateral,Chest and nasal congestion.,,The cardiomediastinal silhouette and pulmonary...,No acute cardiopulmonary abnormality.,5_IM-2117-1003002.dcm.png,Frontal,2117,1003,CXR5_IM-2117,"{'bone': 'Small T-spine osteophytes.', 'heart'...",train,The cardiomediastinal silhouette and pulmonary...
9,5,Osteophyte/thoracic vertebrae/multiple/small;T...,Osteophyte;Thickening;Lung,Xray Chest PA and Lateral,Chest and nasal congestion.,,The cardiomediastinal silhouette and pulmonary...,No acute cardiopulmonary abnormality.,5_IM-2117-1004003.dcm.png,Lateral,2117,1004,CXR5_IM-2117,"{'bone': 'Small T-spine osteophytes.', 'heart'...",train,The cardiomediastinal silhouette and pulmonary...
10,6,normal,normal,"PA and Lateral Chest. XXXX, XXXX at XXXX",Evaluate for infection,"XXXX, XXXX",Heart size and mediastinal contour are within ...,No acute cardiopulmonary findings.,6_IM-2192-1001.dcm.png,Frontal,2192,1001,CXR6_IM-2192,{'bone': 'Mild degenerative change of the thor...,train,Heart size and mediastinal contour are within ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7455,3994,Cardiomegaly/mild;Pulmonary Congestion;Heart F...,Cardiomegaly;Pulmonary Congestion;Heart Failure,2 view ( PA and lateral) chest radiograph date...,"XXXX-year-old male with chest pain, positive t...","Portable chest x-XXXX XXXX, XXXX",Similar mild cardiomegaly. Of the pulmonary va...,Mild cardiomegaly with XXXX of early failure.,3994_IM-2045-1001.dcm.png,Lateral,2045,1001,CXR3994_IM-2045,"{'bone': 'No acute bony abnormality.', 'heart'...",train,Similar mild cardiomegaly. Of the pulmonary va...
7456,3995,Lung/hyperdistention/mild;Diaphragm/bilateral/...,Lung;Diaphragm;Cicatrix;Pulmonary Atelectasis;...,Xray Chest PA and Lateral,"Nausea, vomiting x2 weeks. Dialysis patient.","XXXX, XXXX.",The cardiomediastinal silhouette and pulmonary...,1. Interval resolution of bibasilar airspace d...,3995_IM-2046-1001.dcm.png,Frontal,2046,1001,CXR3995_IM-2046,"{'bone': 'No acute bony findings.', 'heart': '...",train,The cardiomediastinal silhouette and pulmonary...
7457,3995,Lung/hyperdistention/mild;Diaphragm/bilateral/...,Lung;Diaphragm;Cicatrix;Pulmonary Atelectasis;...,Xray Chest PA and Lateral,"Nausea, vomiting x2 weeks. Dialysis patient.","XXXX, XXXX.",The cardiomediastinal silhouette and pulmonary...,1. Interval resolution of bibasilar airspace d...,3995_IM-2046-2001.dcm.png,Lateral,2046,2001,CXR3995_IM-2046,"{'bone': 'No acute bony findings.', 'heart': '...",train,The cardiomediastinal silhouette and pulmonary...
7458,3996,Spine/degenerative,Spine,Xray Chest PA and Lateral,,None.,The lungs are clear. Heart size is normal. No ...,Clear lungs. No acute cardiopulmonary abnormal...,3996_IM-2047-1001.dcm.png,Frontal,2047,1001,CXR3996_IM-2047,"{'bone': 'Endplate changes in the spine.', 'he...",train,The lungs are clear. Heart size is normal. No ...


In [4]:
df[df['split']=='test']

Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression,filename,projection,im_1,im_2,id,annotation,split,original_report
2,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",Preop bariatric surgery.,None.,Borderline cardiomegaly. Midline sternotomy XX...,No acute pulmonary findings.,2_IM-0652-1001.dcm.png,Frontal,652,1001,CXR2_IM-0652,"{'bone': 'Midline sternotomy.', 'heart': 'Bord...",test,Borderline cardiomegaly. Midline sternotomy XX...
3,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",Preop bariatric surgery.,None.,Borderline cardiomegaly. Midline sternotomy XX...,No acute pulmonary findings.,2_IM-0652-2001.dcm.png,Lateral,652,2001,CXR2_IM-0652,"{'bone': 'Midline sternotomy.', 'heart': 'Bord...",test,Borderline cardiomegaly. Midline sternotomy XX...
22,12,normal,normal,PA and lateral chest radiograph (2 views) (2 i...,XXXX,,Lungs are clear bilaterally. Cardiac and media...,No acute cardiopulmonary abnormality.,12_IM-0133-1001.dcm.png,Frontal,133,1001,CXR12_IM-0133,"{'bone': 'No acute bony abnormality.', 'heart'...",test,Lungs are clear bilaterally. Cardiac and media...
23,12,normal,normal,PA and lateral chest radiograph (2 views) (2 i...,XXXX,,Lungs are clear bilaterally. Cardiac and media...,No acute cardiopulmonary abnormality.,12_IM-0133-2001.dcm.png,Lateral,133,2001,CXR12_IM-0133,"{'bone': 'No acute bony abnormality.', 'heart'...",test,Lungs are clear bilaterally. Cardiac and media...
24,13,Cardiac Shadow/borderline,Cardiac Shadow,PA and lateral chest radiograph (2 views) (2 i...,Chest pain.,"Chest radiograph from XXXX, XXXX.",The cardiac silhouette is borderline enlarged....,Borderline enlargement of the cardiac silhouet...,13_IM-0198-1001.dcm.png,Frontal,198,1001,CXR13_IM-0198,"{'bone': '', 'heart': 'Cardiac silhouette is b...",test,The cardiac silhouette is borderline enlarged....
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7449,3991,Spondylosis/thoracic vertebrae,Spondylosis,Xray Chest PA and Lateral,Preop bariatric surgery,None.,The cardiac contours are normal. The lungs are...,No acute preoperative findings.,3991_IM-2044-2001.dcm.png,Lateral,2044,2001,CXR3991_IM-2044,"{'bone': 'Thoracic spondylosis.', 'heart': 'Th...",test,The cardiac contours are normal. The lungs are...
7452,3993,Cardiomegaly/mild;Diaphragm/left/elevated,Cardiomegaly;Diaphragm,CHEST 2V FRONTAL/LATERAL,Edema,,The heart is mildly enlarged. Left hemidiaphra...,Borderline cardiomegaly without acute disease.,3993_IM-2044-1001.dcm.png,Frontal,2044,1001,CXR3993_IM-2044,"{'bone': '', 'heart': 'Mildly enlarged.', 'lun...",test,The heart is mildly enlarged. Left hemidiaphra...
7453,3993,Cardiomegaly/mild;Diaphragm/left/elevated,Cardiomegaly;Diaphragm,CHEST 2V FRONTAL/LATERAL,Edema,,The heart is mildly enlarged. Left hemidiaphra...,Borderline cardiomegaly without acute disease.,3993_IM-2044-1002.dcm.png,Lateral,2044,1002,CXR3993_IM-2044,"{'bone': '', 'heart': 'Mildly enlarged.', 'lun...",test,The heart is mildly enlarged. Left hemidiaphra...
7460,3997,Opacity/lung/upper lobe/right/round/small;Gran...,Opacity;Granuloma,PA and lateral views of the chest.,XXXX-year-old male with positive PPD.,None available.,"Heart size within normal limits. Small, nodula...","No acute findings, no evidence for active TB.",3997_IM-2048-1001.dcm.png,Frontal,2048,1001,CXR3997_IM-2048,"{'bone': '', 'heart': 'Heart size within norma...",test,"Heart size within normal limits. Small, nodula..."


In [6]:
df[df['split']=='val']

Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression,filename,projection,im_1,im_2,id,annotation,split,original_report
6,4,"Pulmonary Disease, Chronic Obstructive;Bullous...","Pulmonary Disease, Chronic Obstructive;Bullous...","PA and lateral views of the chest XXXX, XXXX a...",XXXX-year-old XXXX with XXXX.,None available,There are diffuse bilateral interstitial and a...,1. Bullous emphysema and interstitial fibrosis...,4_IM-2050-1001.dcm.png,Frontal,2050,1001,CXR4_IM-2050,,val,There are diffuse bilateral interstitial and a...
7,4,"Pulmonary Disease, Chronic Obstructive;Bullous...","Pulmonary Disease, Chronic Obstructive;Bullous...","PA and lateral views of the chest XXXX, XXXX a...",XXXX-year-old XXXX with XXXX.,None available,There are diffuse bilateral interstitial and a...,1. Bullous emphysema and interstitial fibrosis...,4_IM-2050-2001.dcm.png,Lateral,2050,2001,CXR4_IM-2050,,val,There are diffuse bilateral interstitial and a...
46,24,normal,normal,PA AND LATERAL VIEWS OF THE CHEST dated XXXX X...,"XXXX, dyspnea",None.,Cardiomediastinal silhouette is within normal ...,No acute cardiopulmonary abnormality.,24_IM-0949-1001.dcm.png,Frontal,949,1001,CXR24_IM-0949,,val,Cardiomediastinal silhouette is within normal ...
47,24,normal,normal,PA AND LATERAL VIEWS OF THE CHEST dated XXXX X...,"XXXX, dyspnea",None.,Cardiomediastinal silhouette is within normal ...,No acute cardiopulmonary abnormality.,24_IM-0949-2001.dcm.png,Lateral,949,2001,CXR24_IM-0949,,val,Cardiomediastinal silhouette is within normal ...
50,26,Spondylosis/thoracic vertebrae,Spondylosis,"Chest, 2 views, frontal and lateral",XXXX,None.,The cardiac contours are normal. The lungs are...,No acute process.,26_IM-1090-1001.dcm.png,Frontal,1090,1001,CXR26_IM-1090,,val,The cardiac contours are normal. The lungs are...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7360,3947,"Catheters, Indwelling/left","Catheters, Indwelling",Xray Chest PA and Lateral,Preprocedure evaluation prior to bone marrow t...,,The lungs appear clear. There are no suspiciou...,No acute cardiopulmonary disease.,3947_IM-2016-2001.dcm.png,Lateral,2016,2001,CXR3947_IM-2016,,val,The lungs appear clear. There are no suspiciou...
7383,3959,No Indexing,No Indexing,Xray Chest PA and Lateral,XXXX S/P XXXX XXXX. Aortic stenosis,Chest 2 views. XXXX.,The lungs are clear. Heart size is normal. No ...,Clear lungs. No acute cardiopulmonary abnormal...,3959_IM-2023-1001.dcm.png,Frontal,2023,1001,CXR3959_IM-2023,,val,The lungs are clear. Heart size is normal. No ...
7384,3959,No Indexing,No Indexing,Xray Chest PA and Lateral,XXXX S/P XXXX XXXX. Aortic stenosis,Chest 2 views. XXXX.,The lungs are clear. Heart size is normal. No ...,Clear lungs. No acute cardiopulmonary abnormal...,3959_IM-2023-3001.dcm.png,Lateral,2023,3001,CXR3959_IM-2023,,val,The lungs are clear. Heart size is normal. No ...
7415,3974,Lung/hyperdistention;Pulmonary Emphysema;Airsp...,Lung;Pulmonary Emphysema;Airspace Disease;Spine,PA and lateral chest x-XXXX XXXX,XXXX,None available for review,The lungs are hyperexpanded consistent with em...,1. Hyperexpanded lungs suggesting emphysema. 2...,3974_IM-2034-2002.dcm.png,Frontal,2034,2002,CXR3974_IM-2034,,val,The lungs are hyperexpanded consistent with em...


In [4]:
df_train = pd.read_csv(os.path.join(DATA_PATH, 'reports_annotations_train.csv'))

In [6]:
df_train

Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression,filename,projection,im_1,im_2,id,annotation,split,original_report,annotations_1,annotations_2
0,2349,normal,normal,"PA and LAT view CHEST XXXX, XXXX XXXX PM",Positive TB test,None.,Heart size and vascularity normal. These conto...,Normal chest.,2349_IM-0914-2001.dcm.png,Lateral,914,2001,CXR2349_IM-0914,"{'bone': '', 'heart': 'Heart size and vascular...",train,Heart size and vascularity normal. These conto...,"{\n ""lung"": ""Lungs clear. No pleural effusion...","{\n ""lung"": ""Lungs clear. No pleural effusion..."
1,2949,Calcified Granuloma/lung/scattered/multiple;No...,Calcified Granuloma;Nodule;Calcinosis;Calcinos...,"Chest 2 views PA and lateral XXXX, XXXX XXXX a...","Malignant neoplasm, evaluate for metastases/re...","Chest 2 views PA and lateral XXXX, XXXX p.m., ...","Scattered calcified pulmonary nodules, XXXX re...",No acute cardiopulmonary abnormality.,2949_IM-1348-1002.dcm.png,Lateral,1348,1002,CXR2949_IM-1348,"{'bone': '', 'heart': 'Heart size is normal.',...",train,"Scattered calcified pulmonary nodules, XXXX re...","{\n ""lung"": ""Scattered calcified pulmonary no...","{\n ""lung"": ""Scattered calcified pulmonary no..."
2,2511,Sutures/lung/upper lobe/left;Cicatrix/costophr...,Sutures;Cicatrix;Calcified Granuloma,"Chest radiographs, 2 XXXX and lateral","XXXX-year-old XXXX, chest pain.","XXXX, XXXX.",Sternotomy XXXX noted. Suture material overlie...,No acute abnormality.,2511_IM-1034-2001.dcm.png,Lateral,1034,2001,CXR2511_IM-1034,{'bone': 'Sternotomy XXXX noted. Scarring left...,train,Sternotomy XXXX noted. Suture material overlie...,"{\n ""lung"": ""Negative for focal pulmonary con...","{\n ""lung"": ""Negative for focal pulmonary con..."
3,3363,Opacity/lung/base/bilateral/multiple;Foreign B...,Opacity;Foreign Bodies;Foreign Bodies;Foreign ...,CHEST 2V FRONTAL/LATERAL,"Shielded, PDP.",XXXX,The heart is normal in size. The mediastinum i...,"XXXX bullet fragments, as described above. No ...",3363_IM-1616-1002.dcm.png,Lateral,1616,1002,CXR3363_IM-1616,{'bone': 'Fragments overlying the posterior le...,train,The heart is normal in size. The mediastinum i...,"{\n ""lung"": ""The lungs are grossly clear. XXX...","{\n ""lung"": ""The lungs are grossly clear. XXX..."
4,2677,"Aorta, Thoracic/tortuous/mild;Foreign Bodies/t...","Aorta, Thoracic;Foreign Bodies","PA and lateral chest x-XXXX dated XXXX, XXXX a...",XXXX-year-old XXXX with chest pain.,None.,Heart size normal. Mild tortuosity of the thor...,No acute cardiopulmonary abnormality.,2677_IM-1151-2001.dcm.png,Lateral,1151,2001,CXR2677_IM-1151,"{'bone': 'No acute bony abnormality.', 'heart'...",train,Heart size normal. Mild tortuosity of the thor...,"{\n ""lung"": ""There is no focal consolidation,...","{\n ""lung"": ""There is no focal consolidation,..."


In [52]:
IMAGES_DIR = '../data/images'

df = pd.read_csv(os.path.join(DATA_PATH, 'data_prep.csv'))
df['image_folder'] = df['filename'].apply(lambda x: 'CXR' + '-'.join(x.split('-')[:2]) )
#df['image_number'] = df['filename'].apply(lambda x: x.split('-')[-1].split('0')[0].split('.')[0] )
# TODO can't reconnect image specific to row
df['image_number'] = df.groupby('image_folder').cumcount().astype(str)
df['image_filename'] = df.apply(lambda x: os.path.join(IMAGES_DIR, x['image_folder'], x['image_number']+'.png'), axis=1 )


In [56]:
df['image_filename'].apply(lambda x: os.path.exists(x) ).sum()

np.int64(6066)

In [57]:
from pathlib import Path

sum(1 for _ in Path(IMAGES_DIR).rglob('*.png'))

6067

In [51]:
df

Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression,filename,projection,im_1,im_2,id,annotation,split,original_report,image_folder,image_number,image_filename
0,1,normal,normal,Xray Chest PA and Lateral,Positive TB test,None.,The cardiac silhouette and mediastinum size ar...,Normal chest x-XXXX.,1_IM-0001-4001.dcm.png,Frontal,1,4001,CXR1_1_IM-0001,"{'bone': '', 'heart': 'Cardiac silhouette with...",train,The cardiac silhouette and mediastinum size ar...,CXR1_IM-0001,0,../data/images/CXR1_IM-0001/0.png
1,1,normal,normal,Xray Chest PA and Lateral,Positive TB test,None.,The cardiac silhouette and mediastinum size ar...,Normal chest x-XXXX.,1_IM-0001-3001.dcm.png,Lateral,1,3001,CXR1_1_IM-0001,"{'bone': '', 'heart': 'Cardiac silhouette with...",train,The cardiac silhouette and mediastinum size ar...,CXR1_IM-0001,1,../data/images/CXR1_IM-0001/1.png
2,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",Preop bariatric surgery.,None.,Borderline cardiomegaly. Midline sternotomy XX...,No acute pulmonary findings.,2_IM-0652-1001.dcm.png,Frontal,652,1001,CXR2_IM-0652,"{'bone': 'Midline sternotomy.', 'heart': 'Bord...",test,Borderline cardiomegaly. Midline sternotomy XX...,CXR2_IM-0652,0,../data/images/CXR2_IM-0652/0.png
3,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",Preop bariatric surgery.,None.,Borderline cardiomegaly. Midline sternotomy XX...,No acute pulmonary findings.,2_IM-0652-2001.dcm.png,Lateral,652,2001,CXR2_IM-0652,"{'bone': 'Midline sternotomy.', 'heart': 'Bord...",test,Borderline cardiomegaly. Midline sternotomy XX...,CXR2_IM-0652,1,../data/images/CXR2_IM-0652/1.png
4,3,normal,normal,Xray Chest PA and Lateral,"rib pain after a XXXX, XXXX XXXX steps this XX...",,,"No displaced rib fractures, pneumothorax, or p...",3_IM-1384-1001.dcm.png,Frontal,1384,1001,,,,,CXR3_IM-1384,0,../data/images/CXR3_IM-1384/0.png
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7461,3997,Opacity/lung/upper lobe/right/round/small;Gran...,Opacity;Granuloma,PA and lateral views of the chest.,XXXX-year-old male with positive PPD.,None available.,"Heart size within normal limits. Small, nodula...","No acute findings, no evidence for active TB.",3997_IM-2048-1002.dcm.png,Lateral,2048,1002,CXR3997_IM-2048,"{'bone': '', 'heart': 'Heart size within norma...",test,"Heart size within normal limits. Small, nodula...",CXR3997_IM-2048,1,../data/images/CXR3997_IM-2048/1.png
7462,3998,normal,normal,"PA and lateral chest XXXX, XXXX XXXX comparis...",tuberculosis positive PPD,,,Heart size is normal and the lungs are clear.,3998_IM-2048-1001.dcm.png,Frontal,2048,1001,,,,,CXR3998_IM-2048,0,../data/images/CXR3998_IM-2048/0.png
7463,3998,normal,normal,"PA and lateral chest XXXX, XXXX XXXX comparis...",tuberculosis positive PPD,,,Heart size is normal and the lungs are clear.,3998_IM-2048-1002.dcm.png,Lateral,2048,1002,,,,,CXR3998_IM-2048,1,../data/images/CXR3998_IM-2048/1.png
7464,3999,normal,normal,"CHEST PA and LATERAL: on XXXX, XXXX.",This is a XXXX-year-old female patient with sh...,"Chest x-XXXX, XXXX, XXXX.",,The cardiac silhouette is normal in size and c...,3999_IM-2049-1001.dcm.png,Frontal,2049,1001,,,,,CXR3999_IM-2049,0,../data/images/CXR3999_IM-2049/0.png


In [71]:
IMAGES_DIR = '../data/images'
from PIL import Image

df = pd.read_csv(os.path.join(DATA_PATH, 'data_prep.csv'))

df = df[df['image_found']]

In [69]:
def load_images(image_filenames):
    """Load images from a directory."""
    images = []
    for filename in image_filenames:
        images.append(Image.open(filename))
    return images

In [70]:
load_images(df['image_filename'][:10])

[<PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x624>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x420>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x512>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x512>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x624>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x624>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x621>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x621>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x624>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x624>]

In [79]:
df = pd.read_csv(os.path.join(DATA_PATH, 'data_prep.csv'))

df['image_path'] = df['filename']
df['original_report'] = df['original_report']

df_train = df[ (df['split']=='train') & df['image_found'] ][['image_path', 'original_report']]

data = df_train.to_dict(orient='records')

# Write the list of dictionaries to a JSON file
output_file = os.path.join(DATA_PATH, 'finetune_data_train.json')
with open(output_file, 'w') as f:
    import json
    json.dump(data, f, indent=4)


In [69]:
df = pd.read_csv(os.path.join(DATA_PATH, 'data_prep.csv'))

df['image_path'] = df['image_filename']
df['original_report'] = df['original_report']
df[('annotation')] = df['annotation']

df_train = df[(df['split'] == 'train') & df['image_found']][['id', 'image_path', 'original_report', 'annotation']]

images = df_train.groupby('id').apply(lambda df: list(df['image_path']) )
reports = df_train.groupby('id').apply(lambda df: df['original_report'].mode()[0] )
annotations = df_train.groupby('id').apply(lambda df: df['annotation'].mode()[0] )

  images = df_train.groupby('id').apply(lambda df: list(df['image_path']) )
  reports = df_train.groupby('id').apply(lambda df: df['original_report'].mode()[0] )
  annotations = df_train.groupby('id').apply(lambda df: df['annotation'].mode()[0] )


In [70]:
df_train.head()

Unnamed: 0,id,image_path,original_report,annotation
0,CXR1_1_IM-0001,/xray_report_gen/data/images/CXR1_IM-0001/0.png,The cardiac silhouette and mediastinum size ar...,"{'bone': '', 'heart': 'Cardiac silhouette with..."
1,CXR1_1_IM-0001,/xray_report_gen/data/images/CXR1_IM-0001/1.png,The cardiac silhouette and mediastinum size ar...,"{'bone': '', 'heart': 'Cardiac silhouette with..."
8,CXR5_IM-2117,/xray_report_gen/data/images/CXR5_IM-2117/0.png,The cardiomediastinal silhouette and pulmonary...,"{'bone': 'Small T-spine osteophytes.', 'heart'..."
9,CXR5_IM-2117,/xray_report_gen/data/images/CXR5_IM-2117/1.png,The cardiomediastinal silhouette and pulmonary...,"{'bone': 'Small T-spine osteophytes.', 'heart'..."
10,CXR6_IM-2192,/xray_report_gen/data/images/CXR6_IM-2192/0.png,Heart size and mediastinal contour are within ...,{'bone': 'Mild degenerative change of the thor...


In [73]:
d = []
for image, report, annotation in zip(images, reports, annotations):
    d.append({
        "messages": [
            {"role": "system", "content": "system_prompt"},
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": im} for im in image
                ]
                    +
                    [
                        {"type": "text", "text": "original_report"}
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": "annotation"}
                ]
            }
        ]
    })

In [74]:
with open('../data/fake_data_2.json', 'w') as fp:
    json.dump(d, fp)

In [75]:
        with open('../data/fake_data_2.json', "r") as f:
            data = json.load(f)
            for p in data:
                p['messages'][0]['content'] = 'system_prompt'
                p['messages'][1]['content'][-1]['text'] = 'original_report'
                p['messages'][2]['content'] = 'annotation'

In [76]:
data

[{'messages': [{'role': 'system', 'content': 'system_prompt'},
   {'role': 'user',
    'content': [{'type': 'image',
      'image': '/xray_report_gen/data/images/CXR1000_IM-0003/0.png'},
     {'type': 'image',
      'image': '/xray_report_gen/data/images/CXR1000_IM-0003/1.png'},
     {'type': 'image',
      'image': '/xray_report_gen/data/images/CXR1000_IM-0003/2.png'},
     {'type': 'text', 'text': 'original_report'}]},
   {'role': 'assistant', 'content': 'annotation'}]},
 {'messages': [{'role': 'system', 'content': 'system_prompt'},
   {'role': 'user',
    'content': [{'type': 'image',
      'image': '/xray_report_gen/data/images/CXR1001_IM-0004/0.png'},
     {'type': 'image',
      'image': '/xray_report_gen/data/images/CXR1001_IM-0004/1.png'},
     {'type': 'text', 'text': 'original_report'}]},
   {'role': 'assistant', 'content': 'annotation'}]},
 {'messages': [{'role': 'system', 'content': 'system_prompt'},
   {'role': 'user',
    'content': [{'type': 'image',
      'image': '/xra