# PadChest-GR Dataset Analysis

This notebook analyzes the PadChest-GR dataset, specifically the master table and grounded reports.

In [4]:
import pandas as pd
import json
import os
import matplotlib.pyplot as plt

# Dataset file locations, all resolved relative to the dataset directory
dataset_dir = 'dataset'
csv_path, json_path = (
    os.path.join(dataset_dir, file_name)
    for file_name in ('master_table.csv', 'grounded_reports_20240819.json')
)


## 1. Load Master Table (CSV)

In [5]:
# Read the plain CSV by default; the zipped copy is used only when the CSV
# itself is absent and the archive sits next to it.
if os.path.exists(csv_path) or not os.path.exists(csv_path + '.zip'):
    print(f"Loading {csv_path}...")
    df = pd.read_csv(csv_path)
else:
    print(f"Loading {csv_path}.zip...")
    df = pd.read_csv(csv_path + '.zip')

# Report the table dimensions, then preview the first rows
print(f"Shape: {df.shape}")
df.head()

Loading dataset/master_table.csv...
Shape: (8787, 23)


Unnamed: 0,StudyID,ImageID,label,boxes_count,extra_boxes_count,locations,prior_study,progression_status,prior_imageID,sentence_en,...,split,PatientID,patient_is_benchmark,PatientBirth,PatientSex_DICOM,StudyDate_DICOM,StudyDate,PatientAge,label_group,Year
0,251488034557732338959601580328898734705,251488034557732338959601580328898734705_wigcpj...,apical pleural thickening,2,2,['pleural'],False,,,Minimal biapical pleural thickening.,...,test,243089588168270594079245995953840813067,True,1950,F,20141114,2014,64,pleural thickening,2014
1,251488034557732338959601580328898734705,251488034557732338959601580328898734705_wigcpj...,costophrenic angle blunting,0,0,"['left costophrenic angle', 'costophrenic angle']",False,,,Slight blunting of the posterior left costophr...,...,test,243089588168270594079245995953840813067,True,1950,F,20141114,2014,64,pleural effusion,2014
2,109876241481532572619807710001909166420,109876241481532572619807710001909166420_dln48o...,Normal,0,0,[],True,,216840111366964013217898866992011328080029653_...,,...,train,202199914138297148429337408346379454019,True,1947,M,20140819,2014,67,Normal,2014
3,312447841912476392836878255221324752437,312447841912476392836878255221324752437_m4dphn...,mediastinic lipomatosis,1,0,"['cardiac', 'right', 'cardiophrenic angle']",False,,,Occupation of the right cardiophrenic angle pr...,...,validation,241659208169457881488090557648675833268,True,1955,M,20141230,2014,59,Other Entities,2014
4,312447841912476392836878255221324752437,312447841912476392836878255221324752437_m4dphn...,superior mediastinal enlargement,1,0,"['mediastinum', 'superior mediastinum']",False,,,Slight widening of the superior mediastinum.,...,validation,241659208169457881488090557648675833268,True,1955,M,20141230,2014,59,Other Entities,2014


In [6]:
# Summarize each column's dtype and non-null count to spot missing values
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 8787 entries, 0 to 8786
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   StudyID               8787 non-null   str   
 1   ImageID               8787 non-null   str   
 2   label                 8787 non-null   str   
 3   boxes_count           8787 non-null   int64 
 4   extra_boxes_count     8787 non-null   int64 
 5   locations             8787 non-null   str   
 6   prior_study           8787 non-null   bool  
 7   progression_status    664 non-null    str   
 8   prior_imageID         3186 non-null   str   
 9   sentence_en           7331 non-null   str   
 10  sentence_es           7331 non-null   str   
 11  study_is_benchmark    8787 non-null   bool  
 12  study_is_validation   8787 non-null   bool  
 13  split                 8787 non-null   str   
 14  PatientID             8787 non-null   object
 15  patient_is_benchmark  8787 non-null   bool  
 16 

## 2. Load Grounded Reports (JSON)

In [7]:
# Read the grounded reports. The encoding is pinned to UTF-8 (the standard
# encoding for JSON) so the Spanish sentences decode the same way on every
# platform instead of depending on the locale's default encoding.
print(f"Loading {json_path}...")
with open(json_path, 'r', encoding='utf-8') as f:
    grounded_data = json.load(f)

print(f"Number of records in JSON: {len(grounded_data)}")

Loading dataset/grounded_reports_20240819.json...
Number of records in JSON: 4555


In [8]:
# Inspect the first item.
# ensure_ascii=False keeps accented characters readable in the printed
# record (the default would print them as \uXXXX escapes).
if len(grounded_data) > 0:
    print(json.dumps(grounded_data[0], indent=2, ensure_ascii=False))

{
  "StudyID": "251488034557732338959601580328898734705",
  "ImageID": "251488034557732338959601580328898734705_wigcpj.png",
  "PreviousStudyID": null,
  "PreviousImageID": null,
  "findings": [
    {
      "sentence_en": "Minimal biapical pleural thickening.",
      "sentence_es": "M\u00ednimo engrosamiento pleural biapical.",
      "abnormal": true,
      "boxes": [
        [
          0.27670749,
          0.10440252,
          0.48518978,
          0.18993711
        ],
        [
          0.55240538,
          0.08930818,
          0.76772316,
          0.20251572
        ]
      ],
      "extra_boxes": [
        [
          0.21181802,
          0.07398137,
          0.50205083,
          0.2139724
        ],
        [
          0.54713554,
          0.08486956,
          0.77537688,
          0.2139724
        ]
      ],
      "labels": [
        "apical pleural thickening"
      ],
      "locations": [
        "pleural"
      ],
      "progression": null
    },
    {
      "sen

## 3. Basic Analysis
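Further exploration goes here — for example, joining the master table with the grounded reports on the `StudyID` and `ImageID` fields that both sources expose.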