In [1]:
!pip3 install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import sys
sys.path.append('/home/jovyan/ChestXray-14')

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa

from modules.utils import get_dataset
from modules.dataset import LABELS

2023-04-09 20:31:47.859473: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


ROOT_PATH: /home/jovyan/ChestXray-14/experiments/Under_sampling-multilabel_classification


In [4]:
import os
import pprint

In [5]:
from pathlib import Path

In [6]:
CURRENT_PATH = os.path.abspath("")
CURRENT_PATH

'/home/jovyan/ChestXray-14/experiments/Under_sampling-multilabel_classification'

In [7]:
num_class = 15 # TODO: change to 15 for multi-labels
ROOT_PATH = "/home/jovyan/ChestXray-14"
INPUT_PATH = f"{ROOT_PATH}/dataset/ChestXray NIH"
EXPERIMENT_NAME = "under_sampling_5_folds_with_cross_entropy_loss_with_model_under_sampling_dropout"

In [8]:
class Dataset:
    INPUT_PATH = INPUT_PATH
    
    def __init__(self, fold_num):
        self.fold_num = fold_num
        self.DATA_PATH = "under_sampling_5_folds_dataset_train_valid_test"
    
    def get_train(self):
        filenames = tf.io.gfile.glob(f'{self.INPUT_PATH}/data/{self.DATA_PATH}/folds/fold{self.fold_num}/train/*.tfrec')
        dataset = get_dataset(filenames)
        return dataset

    def get_valid(self):
        filenames = tf.io.gfile.glob(f'{self.INPUT_PATH}/data/{self.DATA_PATH}/folds/fold{self.fold_num}/valid/*.tfrec')
        dataset = get_dataset(filenames)
        return dataset
    
    def get_test(self):
        filenames = tf.io.gfile.glob(f'{self.INPUT_PATH}/data/{self.DATA_PATH}/test/*.tfrec')
        dataset = get_dataset(filenames)
        return dataset

## Experiment

In [9]:
class Evaluate:
    def __init__(self, model_path):
        self.y_true = None
        self.y_preds = None
        self.model_path = model_path
        self.model = self.get_model(model_path)
        self.best_thresholds = None
        self.thresholds_200 = None
    
    def get_model(self, path):
        return tf.keras.models.load_model(path)
    
    def get_y_true(self, data):
        y_true=[]
        for X,y in data:
            for label in y:
                y_true.append(label)
        y_true = tf.Variable(y_true)
        self.y_true = y_true
        return y_true

    def get_confusion_metrics(self, y_true, y_preds):
        m = tf.keras.metrics.AUC(multi_label=True)
        m.update_state(y_true, y_preds)

        thresholds = m.thresholds
        variables = m.variables
        TP = variables[0]
        TN = variables[1]
        FP = variables[2]
        FN = variables[3]
        return thresholds, TP, TN, FP, FN

    def model_predict(self, valid_dataset):
        return self.model.predict(valid_dataset)

    def get_f1_scores_200_thresholds(self, valid_dataset):
        self.y_true = self.get_y_true(valid_dataset)
        self.y_preds = self.model_predict(valid_dataset)
        
        confusion_metrics = self.get_confusion_metrics(self.y_true, self.y_preds)
        thresholds, TP, TN, FP, FN = confusion_metrics
        self.thresholds_200 = thresholds
        f1_class_dict = dict()
        for i in range(len(thresholds)):
            tp, tn, fp, fn = TP[i], TN[i], FP[i], FN[i]
            for label_index in range(num_class):
                f1_score = 2*tp[label_index] / (2*tp[label_index] + fp[label_index] + fn[label_index])
                try:
                    f1_class_dict[LABELS[label_index]].append(f1_score)
                except KeyError:
                    f1_class_dict[LABELS[label_index]] = [f1_score]
        print(LABELS)
        return f1_class_dict
    
    def get_f1_scores(self, valid_dataset):
        self.y_true = self.get_y_true(valid_dataset)
        self.y_preds = self.model_predict(valid_dataset)
        metric = tfa.metrics.MultiLabelConfusionMatrix(num_classes=num_class)
        metric.update_state(self.y_true,
                            np.greater_equal(self.y_preds, self.best_thresholds).astype('int8'))
        result = metric.result()
        
        f1_class_dict = dict()
        for idx, confusion in enumerate(result):
            label = LABELS[idx]
            TP, TN, FP, FN = (confusion[1, 1],
                              confusion[0, 0],
                              confusion[0, 1],
                              confusion[1, 0])
            f1_score = 2*TP / (2*TP + FP + FN)
            f1_class_dict[label] = [f1_score.numpy()]
        return f1_class_dict
    
    def get_precision_scores(self, valid_dataset, new_calculate=True):
        if new_calculate is True:
            self.y_true = self.get_y_true(valid_dataset)
            self.y_preds = self.model_predict(valid_dataset)
        metric = tfa.metrics.MultiLabelConfusionMatrix(num_classes=num_class)
        metric.update_state(self.y_true,
                            np.greater_equal(self.y_preds, self.best_thresholds).astype('int8'))
        result = metric.result()
        
        precision_class_dict = dict()
        for idx, confusion in enumerate(result):
            label = LABELS[idx]
            TP, TN, FP, FN = (confusion[1, 1],
                              confusion[0, 0],
                              confusion[0, 1],
                              confusion[1, 0])
            precision = TP / (TP + FP)
            precision_class_dict[label] = [precision.numpy()]
        return precision_class_dict
    
    def get_recall_scores(self, valid_dataset, new_calculate=True):
        if new_calculate is True:
            self.y_true = self.get_y_true(valid_dataset)
            self.y_preds = self.model_predict(valid_dataset)
        metric = tfa.metrics.MultiLabelConfusionMatrix(num_classes=num_class)
        metric.update_state(self.y_true,
                            np.greater_equal(self.y_preds, self.best_thresholds).astype('int8'))
        result = metric.result()
        
        recall_class_dict = dict()
        for idx, confusion in enumerate(result):
            label = LABELS[idx]
            TP, TN, FP, FN = (confusion[1, 1],
                              confusion[0, 0],
                              confusion[0, 1],
                              confusion[1, 0])
            recall = TP / (TP + FN)
            recall_class_dict[label] = [recall.numpy()]
        return recall_class_dict
    
    def get_best_threshold(self,
                           valid_dataset=None,
                           save_best_thresholds=f"{ROOT_PATH}/results/paper/table3_1/best_thresholds.csv",
                           save_200_thresholds=f"{ROOT_PATH}/results/paper/table3_1/f1_per_thresholds.csv"):
        if valid_dataset is None:
            assert ValueError("test dataset is None")
        
        f1_scores_dict = self.get_f1_scores_200_thresholds(valid_dataset)
        best_thresholds_dict = {"thresholds": [], "f1_most": [], "label": []}
        for key, value in f1_scores_dict.items():
            f1_arg_max = np.argmax(value)
            best_thresholds_dict["f1_most"].append(value[f1_arg_max].numpy())
            best_thresholds_dict["label"].append(key)
            best_thresholds_dict["thresholds"].append(self.thresholds_200[f1_arg_max])
        
        df = pd.DataFrame(best_thresholds_dict)
        df = df.set_index("label")
        df.to_csv(save_best_thresholds, index=True)
        print(f"{save_best_thresholds} was success!")
        # print(df)
        
        df_200_thresholds = pd.DataFrame(f1_scores_dict)
        df_200_thresholds.to_csv(save_200_thresholds, index=True)
        print(f"{save_200_thresholds} was success!")
        self.best_thresholds = df.copy()["thresholds"].values

    def __enter__(self):
        print("Doing ...!")
        return self

    def __exit__(self, *arg):
        self.y_true = None
        self.y_preds = None
        print("Done!")

## Using

In [10]:
from tqdm.notebook import tqdm

In [11]:
import time

class TimeUsed:
    def start(self):
        self.start = time.time()
     
    def stop_and_report(self):
        self.end = time.time()
        print("===== " * 5,
              "Model used for: {:.2f} second(s)".format(self.end - self.start),
              "===== " * 5,
              "\n\n"
        )

In [13]:
for fold_num in tqdm(range(4, 5+1)):
    time_counter = TimeUsed()
    time_counter.start()
    
    # Record time for training
    MODEL_PATH = f'{CURRENT_PATH}/results/models/{EXPERIMENT_NAME}/EfficientNetB0_None_FOLD_{fold_num}.h5'
    RESULT_EVALUATE_PATH = os.path.join(CURRENT_PATH, "results", "evaluate", EXPERIMENT_NAME, "EfficientNetB0_None", "Folds", f"fold_{fold_num}")
    Path(RESULT_EVALUATE_PATH).mkdir(parents=True, exist_ok=True)
    
    test_dataset = Dataset(fold_num).get_test()
    
    best_model = Evaluate(MODEL_PATH)
    best_model.get_best_threshold(
        valid_dataset=test_dataset,
        save_best_thresholds=f"{RESULT_EVALUATE_PATH}/best_thresholds.csv",
        save_200_thresholds=f"{RESULT_EVALUATE_PATH}/f1_per_thresholds.csv"
    )
    
    with best_model:
        f1_each_class = best_model.get_f1_scores(test_dataset)
        pprint.pprint(f1_each_class)
        df = pd.DataFrame(f1_each_class)
        df.to_csv(f"{RESULT_EVALUATE_PATH}/f1_scores.csv", index=False)

        precision_each_class = best_model.get_precision_scores(test_dataset, new_calculate=False)
        pd.DataFrame(precision_each_class)\
            .to_csv(f"{RESULT_EVALUATE_PATH}/precision.csv", index=False)

        recall_each_class = best_model.get_recall_scores(test_dataset, new_calculate=False)
        pd.DataFrame(recall_each_class)\
            .to_csv(f"{RESULT_EVALUATE_PATH}/recall.csv", index=False)

        # print(df)
    
    time_counter.stop_and_report()

  0%|          | 0/2 [00:00<?, ?it/s]

['No Finding', 'Atelectasis', 'Consolidation', 'Infiltration', 'Pneumothorax', 'Edema', 'Emphysema', 'Fibrosis', 'Effusion', 'Pneumonia', 'Pleural_Thickening', 'Cardiomegaly', 'Nodule', 'Mass', 'Hernia']
/home/jovyan/ChestXray-14/experiments/Under_sampling-multilabel_classification/results/evaluate/under_sampling_5_folds_with_cross_entropy_loss_with_model_under_sampling_dropout/EfficientNetB0_None/Folds/fold_4/best_thresholds.csv was success!
/home/jovyan/ChestXray-14/experiments/Under_sampling-multilabel_classification/results/evaluate/under_sampling_5_folds_with_cross_entropy_loss_with_model_under_sampling_dropout/EfficientNetB0_None/Folds/fold_4/f1_per_thresholds.csv was success!
Doing ...!
{'Atelectasis': [0.1805347],
 'Cardiomegaly': [0.048474826],
 'Consolidation': [0.07667896],
 'Edema': [0.03900306],
 'Effusion': [0.2095892],
 'Emphysema': [0.04251233],
 'Fibrosis': [0.028052587],
 'Hernia': [0.004094348],
 'Infiltration': [0.29960188],
 'Mass': [0.096838534],
 'No Finding': [0