# Get the data and clone the CheXzero GitHub repository

In [1]:
!pip install ftfy
!pip install gdown
import os
import glob
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from tqdm import tqdm

from PIL import Image
import h5py
import cv2
from typing import *
from pathlib import Path

import torch
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize

Collecting ftfy
  Downloading ftfy-6.2.0-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.2.0
Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0


In [2]:
# Make the dataset's training directory the current working directory.
os.chdir('/kaggle/input/adip-mimiccxr/data/train')

In [3]:
# Load the dataset index by absolute path so this cell gives the same result
# regardless of the current working directory (e.g. when it is re-run after a
# later cell has changed directory).
df = pd.read_csv('/kaggle/input/adip-mimiccxr/data/train/all.csv')

In [4]:
# Preview the first five rows (five is the default for DataFrame.head).
df.head()

Unnamed: 0,id,split,label,img,text
0,s59735352,train,"['Atelectasis', 'Pleural Effusion']",data/train/image/s59735352.jpg,"In comparison with the study of ___, there is ..."
1,s55468481,train,['Atelectasis'],data/train/image/s55468481.jpg,PA and lateral views of the chest. There is m...
2,s58538557,train,"['Atelectasis', 'Pleural Effusion']",data/train/image/s58538557.jpg,PA and lateral views of the chest were provide...
3,s57010157,train,"['Atelectasis', 'Lung_Lesion', 'Pleural Effusi...",data/train/image/s57010157.jpg,Frontal and lateral chest radiographs demonstr...
4,s59111318,train,"['Atelectasis', 'Pleural Effusion']",data/train/image/s59111318.jpg,Frontal and lateral views of the chest were ob...


In [5]:
os.chdir('/kaggle/working/')
!git clone https://github.com/rajpurkarlab/CheXzero.git

  pid, fd = os.forkpty()


Cloning into 'CheXzero'...
remote: Enumerating objects: 251, done.[K
remote: Counting objects: 100% (118/118), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 251 (delta 104), reused 88 (delta 88), pack-reused 133[K
Receiving objects: 100% (251/251), 1.39 MiB | 17.11 MiB/s, done.
Resolving deltas: 100% (141/141), done.


## Resize the images and store them in HDF5

In [6]:
def preprocess(img, desired_size=320):
    """
    Resize `img` so that its longer side equals `desired_size`, keeping the
    aspect ratio, and paste it centred onto a new square mode-'L' canvas.

    Args:
        img: PIL image to resize (only `.size` and `.resize` are used).
        desired_size: side length, in pixels, of the square image returned.

    Returns:
        A new `desired_size` x `desired_size` PIL image in mode 'L' holding
        the resized input in its centre.
    """
    old_size = img.size
    ratio = float(desired_size) / max(old_size)
    # Truncate as before, but never let a side collapse to 0 px: for a very
    # elongated input the shorter side would otherwise become an empty
    # resize target.
    new_size = tuple(max(1, int(x * ratio)) for x in old_size)
    img = img.resize(new_size, Image.Resampling.LANCZOS)

    # Create the square canvas and paste the resized image so that the
    # leftover margin is split evenly on both sides of each axis.
    new_img = Image.new('L', (desired_size, desired_size))
    new_img.paste(img, ((desired_size - new_size[0]) // 2,
                        (desired_size - new_size[1]) // 2))
    return new_img

In [7]:
def img_to_hdf5(cxr_paths: List[Union[str, Path]], out_filepath: str, resolution=320): 
    """
    Convert a list of image files into a single .h5 file.

    Each image is read with OpenCV, converted to a PIL image, passed through
    `preprocess(..., desired_size=resolution)` and stored at its list position
    in a dataset named 'cxr' of shape (len(cxr_paths), resolution, resolution).

    Args:
        cxr_paths: paths of the images to convert, in output order.
        out_filepath: destination .h5 file; it is opened in 'w' mode.
        resolution: side length of the square images stored in the dataset.

    Images that cannot be read or processed are skipped (their row in the
    dataset is never written) and reported in the summary printed at the end.
    """
    dset_size = len(cxr_paths)
    failed_images = []
    with h5py.File(out_filepath, 'w') as h5f:
        img_dset = h5f.create_dataset('cxr', shape=(dset_size, resolution, resolution))
        for idx, path in enumerate(tqdm(cxr_paths)):
            try:
                # read image using cv2
                img = cv2.imread(str(path))
                if img is None:
                    # cv2.imread signals a missing/unreadable file by returning
                    # None instead of raising; fail here with a clear message
                    # rather than with an opaque error from cvtColor below.
                    raise OSError(f"cv2.imread could not read image: {path}")
                # convert to PIL Image object
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img_pil = Image.fromarray(img)
                # preprocess
                img = preprocess(img_pil, desired_size=resolution)
                img_dset[idx] = img
            except Exception as e:
                # Best effort by design: remember the failure and keep going.
                failed_images.append((path, e))
    print(f"{len(failed_images)} / {len(cxr_paths)} images failed to be added to h5.", failed_images)

In [8]:
# One absolute image path per row of the index, built from the study id.
input_data_paths = [
    '/kaggle/input/adip-mimiccxr/data/train/images/' + study_id + '.jpg'
    for study_id in df['id']
]

# Destination HDF5 file; create its parent folder before writing into it.
output_h5_path = '/kaggle/working/CheXzero/data/test_cxr.h5'
os.makedirs(os.path.dirname(output_h5_path), exist_ok=True)
img_to_hdf5(input_data_paths, output_h5_path)

100%|██████████| 2360/2360 [00:40<00:00, 57.75it/s]

0 / 2360 images failed to be added to h5. []





# Download Model

In [9]:
import gdown

In [10]:
# Folder that receives the pretrained checkpoint; it also becomes the working
# directory, so `os.getcwd()` below resolves to it.
weights_dir = '/kaggle/working/CheXzero/checkpoints/chexzero_weights'
os.makedirs(weights_dir, exist_ok=True)
os.chdir(weights_dir)

# Google Drive source of the checkpoint and the local file it is written to.
url = 'https://drive.google.com/uc?id=1S1qkcx0XDW0CRUDvZvpt_kChCNZ3VM2E'
output = os.path.join(os.getcwd(), 'model.pt')
# NOTE(review): the download runs on every execution of this cell, even when
# `output` already exists.
gdown.download(url, output, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1S1qkcx0XDW0CRUDvZvpt_kChCNZ3VM2E
From (redirected): https://drive.google.com/uc?id=1S1qkcx0XDW0CRUDvZvpt_kChCNZ3VM2E&confirm=t&uuid=e1e6530e-0bc6-4d80-b653-621b6b1dc6f2
To: /kaggle/working/CheXzero/checkpoints/chexzero_weights/model.pt
100%|██████████| 354M/354M [00:05<00:00, 66.6MB/s] 


'/kaggle/working/CheXzero/checkpoints/chexzero_weights/model.pt'

# Zero-shot

In [11]:
# `zero_shot` and `eval` are not defined in this notebook; presumably they are
# modules of the cloned CheXzero repository, which is why that checkout is made
# the working directory before importing them.
os.chdir('/kaggle/working/CheXzero')
import zero_shot
from eval import evaluate, bootstrap
from zero_shot import make, make_true_labels, run_softmax_eval

# Enable IPython's autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2

In [12]:
# ------- PATHS ------  #
cxr_filepath: str = '/kaggle/working/CheXzero/data/test_cxr.h5'  # filepath of chest x-ray images (.h5)
# (optional for evaluation) if labels are provided, provide path
# NOTE(review): relative path, and no cell shown in this notebook reads it.
cxr_true_labels_path: Optional[str] = '../data/groundtruth.csv'
# NOTE(review): these resolve to the filesystem root, i.e. outside
# /kaggle/working — confirm that this is where predictions should be cached.
predictions_dir: str = '/predictions'
cache_dir: str = f"{predictions_dir}/cached"  # derived so the two cannot drift apart
model_paths: List[str] = ['/kaggle/working/CheXzero/checkpoints/chexzero_weights/model.pt']
context_length: int = 77

# ------- LABELS ------  #
# Define labels to query each image | will return a prediction for each label
# NOTE(review): the dataset's `label` column spells one class 'Lung_Lesion'
# (underscore) while this list uses 'Lung Lesion'; normalise before comparing.
cxr_labels: List[str] = [
    'Atelectasis', 'Cardiomegaly',
    'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion',
    'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia',
    'Pneumothorax', 'Support Devices',
]

# ---- TEMPLATES ----- #
# Define set of templates | see Figure 1 for more details
# A (positive, negative) pair of prompt templates; "{}" is the label placeholder.
cxr_pair_template: Tuple[str, str] = ("{}", "no {}")

In [13]:
## Run the model on the data set using ensembled models
def ensemble_models(
    model_paths: List[str], 
    cxr_filepath: str, 
    cxr_labels: List[str], 
    cxr_pair_template: Tuple[str, str], 
    cache_dir: Optional[str] = None, 
    save_name: Optional[str] = None,
) -> Tuple[List[np.ndarray], np.ndarray]: 
    """
    Run every checkpoint in `model_paths` on the images in `cxr_filepath` and
    average their predictions.

    Args:
        model_paths: checkpoint files to ensemble; processed in sorted order.
        cxr_filepath: .h5 file with the images, passed on to `make`.
        cxr_labels: labels to score, passed on to `run_softmax_eval`.
        cxr_pair_template: prompt templates, passed on to `run_softmax_eval`.
        cache_dir: if given, each model's predictions are stored in / loaded
            from `<cache_dir>/[<save_name>_]<model file stem>.npy`.
        save_name: optional prefix for the cache file names.

    Returns:
        A list with each model's predictions (in sorted `model_paths` order)
        and the element-wise mean of those predictions.

    NOTE(review): the cache file name depends only on `cache_dir`, `save_name`
    and the checkpoint's file stem — not on `cxr_filepath`, `cxr_labels` or the
    template. Use a distinct `save_name` per dataset/label set, otherwise
    predictions cached by an earlier run are returned.
    """
    predictions = []
    # Sort so the per-model results come back in the same order no matter how
    # the caller ordered the checkpoints.
    for path in sorted(model_paths):
        model_name = Path(path).stem

        # Location of this model's cached prediction (None when caching is off).
        cache_path = None
        if cache_dir is not None:
            prefix = f"{save_name}_" if save_name is not None else ""
            cache_path = Path(cache_dir) / f"{prefix}{model_name}.npy"

        if cache_path is not None and os.path.exists(cache_path):
            # Cache hit: no need to load the model or build the data loader.
            print("Loading cached prediction for {}".format(model_name))
            y_pred = np.load(cache_path)
        else:
            # Cache miss: only now pay for loading the model and the
            # `torch.DataLoader`.
            model, loader = make(
                model_path=path,
                cxr_filepath=cxr_filepath,
            )
            print("Inferring model {}".format(path))
            y_pred = run_softmax_eval(model, loader, cxr_labels, cxr_pair_template)
            if cache_path is not None:
                Path(cache_dir).mkdir(exist_ok=True, parents=True)
                np.save(file=cache_path, arr=y_pred)
        predictions.append(y_pred)

    # compute average predictions
    y_pred_avg = np.mean(predictions, axis=0)

    return predictions, y_pred_avg

## Run on the entire test set

In [14]:
# Score the full-dataset HDF5 file with every checkpoint in `model_paths`.
# No `save_name` is passed, so each cache file is named after the checkpoint's
# file stem only.
predictions, y_pred_avg = ensemble_models(
    model_paths=model_paths, 
    cxr_filepath=cxr_filepath, 
    cxr_labels=cxr_labels, 
    cxr_pair_template=cxr_pair_template, 
    cache_dir=cache_dir,
)

100%|███████████████████████████████████████| 354M/354M [00:05<00:00, 66.5MiB/s]


Inferring model /kaggle/working/CheXzero/checkpoints/chexzero_weights/model.pt


  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/2360 [00:00<?, ?it/s]



  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/2360 [00:00<?, ?it/s]

## Run on the first 10 test samples (pass a different `save_name` for each run; otherwise predictions cached under the same name by an earlier run are reloaded instead of being recomputed)

In [15]:
# Same path construction as for the full set, restricted to the first ten ids.
input_data_paths = [
    '/kaggle/input/adip-mimiccxr/data/train/images/' + study_id + '.jpg'
    for study_id in df['id'].iloc[:10]
]

# Separate HDF5 file for the small sample; create its folder before writing.
output_h5_path = '/kaggle/working/CheXzero/data/test_cxr_10sample.h5'
os.makedirs(os.path.dirname(output_h5_path), exist_ok=True)
img_to_hdf5(input_data_paths, output_h5_path)

100%|██████████| 10/10 [00:00<00:00, 24.38it/s]

0 / 10 images failed to be added to h5. []





In [16]:
# Score only the 10-image HDF5 file. `save_name` gives this run its own cache
# file name, so the predictions cached for the full-dataset run are not reused.
# Note: this rebinds `predictions` and `y_pred_avg` to the 10-sample results.
predictions, y_pred_avg = ensemble_models(
    model_paths=model_paths, 
    cxr_filepath='/kaggle/working/CheXzero/data/test_cxr_10sample.h5', 
    cxr_labels=cxr_labels, 
    cxr_pair_template=cxr_pair_template, 
    cache_dir=cache_dir,
    save_name = '1st'
)

Inferring model /kaggle/working/CheXzero/checkpoints/chexzero_weights/model.pt


  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [17]:
def get_true_labels(cxr_labels, predictions, threshold=0.501):
    """
    Return the labels whose predicted probability exceeds `threshold`.

    Despite the name, the result is derived from model predictions, not from
    ground-truth annotations.

    Args:
        cxr_labels: label names, aligned position by position with `predictions`.
        predictions: one probability per label for a single image.
        threshold: a label is kept only when its probability is strictly
            greater than this value (defaults to the previously hard-coded
            0.501).

    Returns:
        The selected label names, in `cxr_labels` order.
    """
    return [
        label
        for label, probability in zip(cxr_labels, predictions)
        if probability > threshold
    ]

# One list of selected labels per image, taken from the first entry of
# `predictions` (the first model's scores).
true_labels = [
    get_true_labels(cxr_labels, image_scores) for image_scores in predictions[0]
]

In [18]:
# Labels selected for the image at position 1 (shown as a one-element list).
true_labels[1:2]

[['Atelectasis']]

In [19]:
# Dataset row at position 1, for comparison with the labels selected above.
df.iloc[1:2]

Unnamed: 0,id,split,label,img,text
1,s55468481,train,['Atelectasis'],data/train/image/s55468481.jpg,PA and lateral views of the chest. There is m...
