In [None]:
# export

import os
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import matplotlib.image as plt_image
import multiprocessing as mp

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import Callback

In [None]:
#export
# Value of the "Finding Labels" column that get_label_dfs skips when reduced=True.
NO_FINDING = "No Finding"
# NOTE(review): presumably the usual ImageNet per-channel mean/std; not referenced
# anywhere in the visible code -- confirm they are still needed.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
# Mean/std that print_image and print_batch use to undo image normalization.
# NOTE(review): presumably produced by calc_stats on a data subset -- confirm.
SUBSET_MEAN = 0.50589985
SUBSET_STD = 0.23221017
# Fraction of rows kept from each split by get_dataframes(small=True).
SMALL_FRAKTION = 0.1

In [None]:
#export
def seed_everything(seed=92):
    """
    Seed the random number generators this project relies on.

    Python's ``random`` module and NumPy are always seeded. TensorFlow and
    PyTorch are seeded only if they have already been imported by the running
    process, so calling this never triggers a heavy import.

    Args:
      seed (int): seed value handed to every generator.
    """
    # Local imports: neither module is imported at the top of the file, which
    # is why the previous version silently skipped `random.seed`.
    import random
    import sys

    random.seed(seed)
    np.random.seed(seed)

    tensorflow = sys.modules.get("tensorflow")
    if tensorflow is not None:
        tensorflow.random.set_seed(seed)

    torch = sys.modules.get("torch")
    if torch is not None:
        torch.manual_seed(seed)

In [None]:
#export
def ignore_warnings():
    """Install a global "ignore" filter so that subsequent warnings are suppressed."""
    warnings.filterwarnings(action="ignore")

In [None]:
#export
def get_working_dir():
    """
    Return the project's working directory, ``<home>/work/crx8``.

    ``<home>`` is the ``HOME`` environment variable when it is set and
    non-empty; otherwise it falls back to ``Path.home()`` instead of building
    the literal path ``None/work/crx8``.

    Returns:
      pathlib.Path: the working directory (not created here).
    """
    home = os.environ.get("HOME") or Path.home()
    return Path(home) / "work" / "crx8"

In [None]:
#export
def get_data_path():
    """
    Return the dataset root directory, ``<home>/.datasets/CRX8``.

    ``<home>`` is the ``HOME`` environment variable when it is set and
    non-empty; otherwise it falls back to ``Path.home()`` instead of building
    the literal path ``None/.datasets/CRX8``.

    Returns:
      pathlib.Path: the dataset directory (not created here).
    """
    home = os.environ.get("HOME") or Path.home()
    return Path(home) / ".datasets" / "CRX8"

In [None]:
#export
def get_image_path():
    """Return the ``images`` directory located inside the dataset root."""
    data_root = get_data_path()
    return data_root / "images"

In [1]:
#export
def get_train_val_list():
    """
    Read ``train_val_list.txt`` from the dataset root.

    Returns:
      list of str: one entry per line of the file, with newline characters removed.
    """
    list_file = get_data_path() / "train_val_list.txt"
    with open(list_file) as handle:
        return [line.replace("\n", "") for line in handle]

In [None]:
#export
def get_dataframes(reduced=True, small=False):
    """
    Build the train, validation and test dataframes.

    The train/validation label frame from ``get_label_dfs`` is split with
    ``get_train_valid`` and the feature and label parts of each split are
    concatenated back into a single frame.

    Args:
      reduced (bool): forwarded to ``get_label_dfs``.
      small (bool): if True, keep only the first ``SMALL_FRAKTION`` share of
        the rows of every returned frame.

    Returns:
      tuple: ``(train_df, valid_df, test_df)``.
    """
    train_val_df, test_df = get_label_dfs(reduced=reduced)
    X_train, X_valid, y_train, y_valid = get_train_valid(train_val_df)
    train_df = pd.concat([X_train, y_train], axis=1)
    valid_df = pd.concat([X_valid, y_valid], axis=1)

    if small:
        # Same head-slice for all three frames.
        train_df, valid_df, test_df = (
            frame.iloc[:int(frame.shape[0] * SMALL_FRAKTION), :]
            for frame in (train_df, valid_df, test_df)
        )

    return train_df, valid_df, test_df

In [None]:
#export
def get_label_dfs(reduced=True):
    """
    Load the label CSV and return multi-hot label frames for train/val and test.

    Each row of ``Data_Entry_2017_v2020.csv`` has a ``Finding Labels`` string
    with labels separated by ``|``. This expands it into one 0/1 float column
    per label returned by ``get_labels(reduced=reduced)`` and keeps every other
    CSV column.

    Args:
      reduced (bool): if True, the ``NO_FINDING`` label gets no column and is
        skipped while filling the matrix.

    Returns:
      tuple: ``(train_label_df, test_label_df)``, the rows whose ``Image Index``
      appears in ``get_train_val_list()`` and ``get_test_list()`` respectively.
    """
    data = pd.read_csv(get_data_path()/"Data_Entry_2017_v2020.csv")
    additional_data = data.drop(columns=["Image Index", "Finding Labels"])
    labels = get_labels(reduced=reduced)
    col2idx = {label: idx for idx, label in enumerate(labels)}
    arr = np.zeros((data.shape[0], len(labels)))

    # One pass per row; the former extra loop over the columns only repeated
    # exactly the same assignments.
    for row, finding_labels in enumerate(data["Finding Labels"].values):
        for lbl in finding_labels.split("|"):
            if reduced and lbl == NO_FINDING:
                continue
            arr[row, col2idx[lbl]] = 1

    new_data = pd.concat(
        [
            pd.DataFrame({"Image Index": data["Image Index"].values}),
            pd.DataFrame(arr, columns=labels),
            additional_data,
        ],
        axis=1,
    )

    train_label_df = new_data[new_data["Image Index"].isin(get_train_val_list())]
    test_label_df = new_data[new_data["Image Index"].isin(get_test_list())]

    return train_label_df, test_label_df

In [None]:
#export
def get_labels(reduced=True):
    """
    Return every distinct label found in the ``Finding Labels`` CSV column.

    The labels are returned sorted. Converting a set of strings straight to a
    list gives an order that can differ between interpreter runs, which would
    silently permute the label columns and the model outputs tied to them.

    Args:
      reduced (bool): if True, ``NO_FINDING`` is left out of the result.

    Returns:
      list of str: alphabetically sorted label names.
    """
    data = pd.read_csv(get_data_path()/"Data_Entry_2017_v2020.csv")
    labels = set()
    for entry in data["Finding Labels"].values:
        labels.update(entry.split("|"))
    if reduced:
        labels.discard(NO_FINDING)
    return sorted(labels)

In [None]:
#export
def get_label_df():
    """
    Deprecated: build one multi-hot label frame for the whole CSV.

    Emits a deprecation warning on every call; use ``get_label_dfs`` instead.
    Unlike ``get_label_dfs`` this keeps a column for every label, including
    "No Finding", and does not split into train/val and test.

    Returns:
      pandas.DataFrame: ``Image Index``, one 0/1 float column per label, then
      the remaining CSV columns.
    """
    warnings.warn("Deprecated! Use 'get_label_dfs'!")

    data = pd.read_csv(get_data_path()/"Data_Entry_2017_v2020.csv")
    additional_data = data.drop(columns=["Image Index", "Finding Labels"])
    # Ask for the full label set: every label is written below, so leaving
    # "No Finding" out (the get_labels default) raised a KeyError.
    labels = get_labels(reduced=False)
    col2idx = {label: idx for idx, label in enumerate(labels)}
    arr = np.zeros((data.shape[0], len(labels)))

    for row, finding_labels in enumerate(data["Finding Labels"].values):
        for lbl in finding_labels.split("|"):
            arr[row, col2idx[lbl]] = 1

    new_data = pd.DataFrame({"Image Index": data["Image Index"].values})
    new_data = pd.concat([new_data, pd.DataFrame(arr, columns=labels), additional_data], axis=1)
    return new_data

In [None]:
#export
def check_for_leakage(df1, df2):
    """
    Tell whether two dataframes share at least one patient.

    Args:
      df1 (dataframe): first frame, must have a "Patient ID" column.
      df2 (dataframe): second frame, must have a "Patient ID" column.

    Returns:
      bool: True if any "Patient ID" value occurs in both frames.
    """
    patients_1, patients_2 = (
        set(frame["Patient ID"].values) for frame in (df1, df2)
    )
    return not patients_1.isdisjoint(patients_2)

In [None]:
#export
def get_train_valid(data, val_size=0.2, seed=92, by_patient=False):
    """
    Split a label dataframe into training and validation parts.

    Columns named in ``get_labels()`` form ``y``; all other columns form ``X``.

    Args:
      data (dataframe): frame holding both feature and label columns.
      val_size (float): validation share. With ``by_patient=True`` this is the
        share of patients, not of rows.
      seed (int): random seed for the split.
      by_patient (bool): if False (default, previous behavior) rows are split
        at random and a warning about patient overlap is emitted. If True, all
        rows with the same "Patient ID" end up on the same side of the split.

    Returns:
      tuple: ``(X_train, X_valid, y_train, y_valid)``.
    """
    labels = get_labels()
    label_set = set(labels)
    X = data[[c for c in data.columns if c not in label_set]]
    y = data[labels]

    if by_patient:
        # Local import: only train_test_split is imported at module level.
        from sklearn.model_selection import GroupShuffleSplit

        splitter = GroupShuffleSplit(n_splits=1, test_size=val_size, random_state=seed)
        train_idx, valid_idx = next(splitter.split(X, y, groups=data["Patient ID"]))
        return X.iloc[train_idx], X.iloc[valid_idx], y.iloc[train_idx], y.iloc[valid_idx]

    # Currently with patient overlap!
    warnings.warn("Train-Val-Split currently with patient overlap!")
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=val_size, random_state=seed
    )
    return X_train, X_valid, y_train, y_valid

In [None]:
#export
def translate2label(arr):
    """
    Turn a 0/1 label vector into a readable label string.

    Args:
      arr (iterable): values aligned position-by-position with ``get_labels()``.

    Returns:
      str: names of the positions equal to 1, joined by "|", or "No Finding"
      when no position equals 1.
    """
    idx2lbl = dict(enumerate(get_labels()))
    positives = []
    for position, value in enumerate(arr):
        if value == 1:
            positives.append(idx2lbl[position])
    if not positives:
        return "No Finding"
    return "|".join(positives)

In [None]:
#export
def print_image(x, y):
    """
    Show one normalized image, titled with its decoded labels.

    Args:
      x: image values normalized with SUBSET_MEAN / SUBSET_STD; the
        normalization is undone before plotting.
      y: label vector passed to ``translate2label`` for the title.
    """
    denormalized = x * SUBSET_STD + SUBSET_MEAN
    plt.title(translate2label(y))
    plt.imshow(denormalized, cmap="bone")

In [None]:
#export
def print_batch(X, y):
    """
    Plot every image of a batch in a grid with two columns.

    Args:
      X: batch of normalized images; its first dimension must be even
        (enforced with an assert). Normalization is undone with
        SUBSET_MEAN / SUBSET_STD before plotting.
      y: per-image label vectors, decoded with ``translate2label`` for titles.
    """
    n_images = X.shape[0]
    assert n_images % 2 == 0
    images = X * SUBSET_STD + SUBSET_MEAN
    x_dim = int(n_images / 2)
    y_dim = int(n_images / x_dim)

    figure = plt.figure(figsize=(2*x_dim, 20*y_dim))
    # The grid has x_dim rows and y_dim columns; cells are filled in order.
    for i in range(y_dim * x_dim):
        ax = figure.add_subplot(x_dim, y_dim, i+1)
        ax.set_title(translate2label(y[i]))
        plt.imshow(images[i])
    figure.tight_layout()
    plt.show()

In [None]:
#export
def get_test_list():
    """
    Read ``test_list.txt`` from the dataset root.

    Returns:
      list of str: one entry per line of the file, with newline characters removed.
    """
    list_file = get_data_path() / "test_list.txt"
    with open(list_file) as handle:
        return [line.replace("\n", "") for line in handle]

In [None]:
#export
def get_train_generator(df, image_dir=None, x_col="Image Index", y_cols=None, shuffle=True, batch_size=8, seed=92, target_w=224, target_h=224):
    """
    Return an augmenting generator for the training set.

    Every image is centered and scaled with its own statistics (the
    ``samplewise_*`` options) and randomly flipped, rotated, shifted and
    brightness-adjusted.

    Args:
      df (dataframe): dataframe specifying training data.
      image_dir (str or Path): directory where image files are held.
        Defaults to ``get_image_path()``, resolved at call time.
      x_col (str): name of column in df that holds filenames.
      y_cols (list): list of strings that hold y labels for images.
        Defaults to ``get_labels()``, resolved at call time.
      shuffle (bool): whether the generator shuffles the rows.
      batch_size (int): images per batch to be fed into model during training.
      seed (int): random seed.
      target_w (int): final width of input images.
      target_h (int): final height of input images.

    Returns:
        train_generator (DataFrameIterator): iterator over training set
    """
    # Resolve defaults here instead of in the signature so that importing the
    # module does not read the label CSV and no list is shared between calls.
    if image_dir is None:
        image_dir = get_image_path()
    if y_cols is None:
        y_cols = get_labels()

    print("getting train generator...")
    # normalize images
    image_generator = ImageDataGenerator(
        samplewise_center=True,
        samplewise_std_normalization=True,
        horizontal_flip=True,
        rotation_range=20,
        width_shift_range=0.05,
        height_shift_range=0.05,
        brightness_range=(0.9, 1.1)
    )

    # flow from directory with specified batch size
    # and target image size; Keras expects target_size as (height, width)
    generator = image_generator.flow_from_dataframe(
            dataframe=df,
            directory=image_dir,
            x_col=x_col,
            y_col=list(y_cols),
            class_mode="raw",
            batch_size=batch_size,
            shuffle=shuffle,
            seed=seed,
            target_size=(target_h, target_w))

    return generator

In [None]:
#export
def get_test_and_valid_generator(valid_df, test_df, train_df, image_dir=None, x_col="Image Index", y_cols=None, sample_size=100, batch_size=8, seed=92, target_w=224, target_h=224):
    """
    Return generators for the validation set and the test set, normalized
    with statistics fitted on a sample of the training set.

    Args:
      valid_df (dataframe): dataframe specifying validation data.
      test_df (dataframe): dataframe specifying test data.
      train_df (dataframe): dataframe specifying training data.
      image_dir (str or Path): directory where image files are held.
        Defaults to ``get_image_path()``, resolved at call time.
      x_col (str): name of column in df that holds filenames.
      y_cols (list): list of strings that hold y labels for images.
        Defaults to ``get_labels()``, resolved at call time.
      sample_size (int): size of sample to use for normalization statistics.
      batch_size (int): images per batch yielded by the returned generators.
      seed (int): random seed; also used to draw the normalization sample.
      target_w (int): final width of input images.
      target_h (int): final height of input images.

    Returns:
        valid_generator, test_generator (DataFrameIterator): iterators over
        the validation set and the test set, in that order.
    """
    # Resolve defaults here instead of in the signature so that importing the
    # module does not read the label CSV and no list is shared between calls.
    if image_dir is None:
        image_dir = get_image_path()
    if y_cols is None:
        y_cols = get_labels()
    y_cols = list(y_cols)
    # Keras expects target_size as (height, width).
    target_size = (target_h, target_w)

    print("getting valid and test generators...")
    # get generator to sample dataset; seeded so the sample is reproducible
    raw_train_generator = ImageDataGenerator().flow_from_dataframe(
        dataframe=train_df,
        directory=image_dir,
        x_col=x_col,
        y_col=y_cols,
        class_mode="raw",
        batch_size=sample_size,
        shuffle=True,
        seed=seed,
        target_size=target_size)

    # get data sample (images only; the labels of the batch are not needed)
    data_sample = next(raw_train_generator)[0]

    # use sample to fit mean and std for the validation/test generators
    image_generator = ImageDataGenerator(
        featurewise_center=True,
        featurewise_std_normalization=True)

    # fit generator to sample from training data
    image_generator.fit(data_sample)

    def _ordered_flow(frame):
        # Unshuffled flow so predictions stay aligned with the frame's rows.
        return image_generator.flow_from_dataframe(
            dataframe=frame,
            directory=image_dir,
            x_col=x_col,
            y_col=y_cols,
            class_mode="raw",
            batch_size=batch_size,
            shuffle=False,
            seed=seed,
            target_size=target_size)

    valid_generator = _ordered_flow(valid_df)
    test_generator = _ordered_flow(test_df)
    return valid_generator, test_generator

In [None]:
#export
def compute_class_freqs(labels):
    """
    Compute the share of positive and negative entries per label column.

    Args:
      labels: 2-D array-like with a ``shape`` attribute; rows are examples and
        the sum is taken along axis 0.

    Returns:
      tuple: ``(positive_frequencies, negative_frequencies)`` where the second
      is ``1 - positive_frequencies``.
    """
    n_examples = labels.shape[0]
    positives_per_class = np.sum(labels, axis=0)
    positive_frequencies = positives_per_class / n_examples
    return positive_frequencies, 1 - positive_frequencies

In [None]:
#export
def get_weighted_loss(pos_weights, neg_weights, epsilon=1e-7):
    """
    Build a class-weighted binary cross-entropy loss.

    Args:
      pos_weights (np.array): weight of the positive term for each class, size (num_classes)
      neg_weights (np.array): weight of the negative term for each class, size (num_classes)
      epsilon (float): small constant added inside each logarithm

    Returns:
      weighted_loss (function): loss taking (y_true, y_pred)
    """
    def weighted_loss(y_true, y_pred):
        """
        Evaluate the weighted loss for one batch.

        Args:
            y_true (Tensor): true labels, size is (num_examples, num_classes)
            y_pred (Tensor): predicted labels, size is (num_examples, num_classes)
        Returns:
            loss (Tensor): scalar, the per-class batch means summed over all classes
        """
        total = 0.0
        n_classes = len(pos_weights)

        for cls in range(n_classes):
            truth = y_true[:, cls]
            pred = y_pred[:, cls]
            # batch mean of the weighted positive and negative log terms
            positive_term = -1 * K.mean(pos_weights[cls] * truth * K.log(pred + epsilon))
            negative_term = -1 * K.mean(neg_weights[cls] * (1 - truth) * K.log(1 - pred + epsilon))
            total += positive_term + negative_term

        return total
    return weighted_loss

In [None]:
#export
def transfer_learn_setup(model, loss, metrics, lr=0.001, n_new_top_layers=1):
    """
    Freeze all but the last layers of a model and compile it with Adam.

    Args:
      model: Keras model; modified in place.
      loss: loss passed to ``model.compile``.
      metrics: metrics passed to ``model.compile``.
      lr (float): Adam learning rate.
      n_new_top_layers (int): number of trailing layers left trainable. With 0
        every layer is frozen (slicing with ``[-0:]`` used to select all
        layers and made the whole model trainable instead).
    """
    n_frozen = max(len(model.layers) - n_new_top_layers, 0)
    for position, layer in enumerate(model.layers):
        layer.trainable = position >= n_frozen
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optim = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optim,
                  loss=loss,
                  metrics=metrics)

In [None]:
#export
def fine_tune_setup(model, loss, metrics, lr=1e-5):
    """
    Make every layer of a model trainable and compile it with Adam.

    Args:
      model: Keras model; modified in place.
      loss: loss passed to ``model.compile``.
      metrics: metrics passed to ``model.compile``.
      lr (float): Adam learning rate.
    """
    for layer in model.layers:
        layer.trainable = True
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optim = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optim,
                  loss=loss,
                  metrics=metrics)

In [None]:
#export
def auroc(y_hat, y, model_name="model_name", with_chexnet=True, with_previous=True):
    """
    Compute the per-label ROC AUC and collect it in a comparison table.

    Args:
      y_hat: 2-D array of predicted scores, columns ordered like ``get_labels()``.
      y: 2-D array of true labels with the same layout.
      model_name (str): name of the result column for this model.
      with_chexnet (bool): if True, add the reference column via ``add_chexnet``.
      with_previous (bool): if True, add columns from the results file read by
        ``load_results``. A missing results file is skipped, so the first
        evaluation does not fail.

    Returns:
      pandas.DataFrame: one row per label, one column per model. A label for
      which ``roc_auc_score`` raises ValueError gets 0 and a warning.
    """
    aurocs = {}
    for l_idx, l in enumerate(get_labels()):
        try:
            v = roc_auc_score(y[:, l_idx], y_hat[:, l_idx])
        except ValueError:
            warnings.warn(f"{l} only has one class. Returning 0!")
            v = 0.
        aurocs[l] = v
    df = pd.DataFrame(list(aurocs.values()), index=list(aurocs.keys()), columns=[model_name])

    if with_previous:
        try:
            prev = load_results()
        except FileNotFoundError:
            # Nothing saved yet.
            prev = None
        if prev is not None:
            for i, c in enumerate(prev.columns):
                if c not in df.columns:
                    df = pd.concat([df, prev.iloc[:, i]], axis=1)

    if with_chexnet:
        df = add_chexnet(df)
    return df

In [None]:
#export
def threshold_predictions(pred, t=0.5):
    """
    Compare predictions against a threshold.

    Args:
      pred: predictions supporting ``>=`` against ``t``.
      t (float): threshold.

    Returns:
      The result of ``pred >= t``.
    """
    is_positive = pred >= t
    return is_positive

In [3]:
#export
def chexnet_df():
    """
    Return the hard-coded "CheXNet" reference column.

    NOTE(review): presumably the per-pathology AUROC values published for
    CheXNet -- confirm against the original source.

    Returns:
      pandas.DataFrame: 14 rows indexed by pathology name, one "CheXNet" column.
    """
    scores = {
        "Atelectasis": 0.8094,
        "Cardiomegaly": 0.9248,
        "Effusion": 0.8638,
        "Infiltration": 0.7345,
        "Mass": 0.8676,
        "Nodule": 0.7802,
        "Pneumonia": 0.7680,
        "Pneumothorax": 0.8887,
        "Consolidation": 0.7901,
        "Edema": 0.8878,
        "Emphysema": 0.9371,
        "Fibrosis": 0.8047,
        "Pleural_Thickening": 0.8062,
        "Hernia": 0.9164,
    }
    return pd.DataFrame(
        list(scores.values()),
        index=list(scores),
        columns=["CheXNet"],
    )

In [None]:
#export
def add_chexnet(df):
    """
    Append the "CheXNet" reference column to a results table.

    Args:
      df (dataframe): results table indexed by label.

    Returns:
      dataframe: ``df`` itself if it already has a "CheXNet" column, otherwise
      a new frame with the ``chexnet_df()`` column concatenated.
    """
    if "CheXNet" in df.columns:
        return df
    return pd.concat([df, chexnet_df()], axis=1)

In [None]:
#export
def save_results(df):
    """Write ``df`` (including its index) to ``AUROC_results.csv`` in the working directory."""
    target = get_working_dir() / "AUROC_results.csv"
    df.to_csv(target, index=True)

In [None]:
#export
def load_results():
    """Read ``AUROC_results.csv`` from the working directory, using the "Unnamed: 0" column as index."""
    source = get_working_dir() / "AUROC_results.csv"
    return pd.read_csv(source, index_col="Unnamed: 0")

In [None]:
#export
def array_info(arr, with_hist=True):
    """
    Print shape, mean, std, max and min of an array, optionally with a histogram.

    Args:
      arr: array providing ``shape``, ``mean``, ``std``, ``max``, ``min`` and ``ravel``.
      with_hist: any truthy value additionally plots a histogram of all values.
    """
    print("Shape:\t", arr.shape)
    print("Mean:\t", arr.mean())
    print("Std:\t", arr.std())
    print("Max:\t", arr.max())
    print("Min:\t", arr.min())
    if with_hist:
        print("Histogram:")
        # ravel avoids the copy that flatten always makes
        plt.hist(arr.ravel())

In [None]:
#export
def show_image(fn):
    """Read the image file ``fn`` and display it with the "bone" colormap."""
    pixels = plt_image.imread(fn)
    plt.imshow(pixels, cmap="bone")

In [None]:
#export
def calc_stat(fn):
    """
    Compute mean and standard deviation of one image file.

    Args:
      fn: path of the image to read.

    Returns:
      np.ndarray: ``[mean, std]`` over all values of the image.
    """
    pixels = plt_image.imread(fn)
    mean_value = pixels.mean()
    std_value = pixels.std()
    return np.array([mean_value, std_value])
def calc_stats(df):
    """
    Average the per-image mean and std over all images listed in ``df``.

    The per-image statistics are computed by ``calc_stat`` in a
    multiprocessing pool.

    Args:
      df (dataframe): frame whose "Image Index" column holds file names
        relative to ``get_image_path()``.

    Returns:
      tuple: (mean of the per-image means, mean of the per-image stds).
    """
    image_dir = get_image_path()
    image_names = [image_dir/fn for fn in df.loc[:, "Image Index"]]
    with mp.Pool() as pool:
        per_image = np.array(pool.map(calc_stat, image_names))
    means, stds = per_image[:, 0], per_image[:, 1]
    return means.mean(), stds.mean()

In [None]:
#export
def scheduler(epoch, lr, hold_epochs=10, decay=0.1):
    """
    Learning-rate schedule: hold the rate, then decay it exponentially.

    Args:
      epoch (int): index of the current epoch.
      lr (float): current learning rate.
      hold_epochs (int): number of initial epochs during which ``lr`` is
        returned unchanged.
      decay (float): afterwards ``lr`` is multiplied by ``exp(-decay)`` on
        every call.

    Returns:
      float: the learning rate to use. A plain Python float is returned after
      the hold period rather than a tensor, which not every Keras version
      accepts from a schedule function.
    """
    if epoch < hold_epochs:
        return lr
    return float(lr * np.exp(-decay))

In [None]:
#export
def lr_finder(model, data, lr_0=1e-8, lr_max=10., beta=0.98):
    """
    Learning-rate range test: train one batch per rate while the rate grows
    exponentially from ``lr_0`` to ``lr_max``.

    Args:
      model: compiled model exposing ``optimizer.lr`` and ``train_on_batch``.
      data: sized iterable of ``(X, y)`` batches; at most ``len(data)`` batches
        are consumed, so iterators that never stop are safe.
      lr_0 (float): first learning rate.
      lr_max (float): learning rate reached on the last batch.
      beta (float): smoothing factor of the exponential moving average of the loss.

    Returns:
      tuple: ``(log_lrs, losses)``, the log10 of every rate tried and the
      bias-corrected smoothed loss observed for it. The search stops early
      when the smoothed loss exceeds four times the best value seen or
      becomes non-finite.
    """
    n_batches = len(data)
    # With a single batch there is no interval to spread the growth over.
    n_intervals = max(n_batches - 1, 1)
    lr_factor = (lr_max / lr_0) ** (1 / n_intervals)
    lr = lr_0
    avg_loss = 0.
    best_loss = 0.
    losses = []
    log_lrs = []
    batch_num = 0
    for X, y in data:
        batch_num += 1
        model.optimizer.lr = lr
        loss = model.train_on_batch(X, y=y)
        # A model compiled with metrics returns [loss, metric, ...].
        if isinstance(loss, (list, tuple)):
            loss = loss[0]

        # Smooth loss (exponential moving average with bias correction)
        avg_loss = beta * avg_loss + (1 - beta) * loss
        smooth_loss = avg_loss / (1 - beta ** batch_num)

        # Diverged: NaN/inf never compares greater than the threshold below.
        if not np.isfinite(smooth_loss):
            return log_lrs, losses

        if batch_num > 1 and smooth_loss > 4 * best_loss:
            return log_lrs, losses

        if smooth_loss < best_loss or batch_num == 1:
            best_loss = smooth_loss

        losses.append(smooth_loss)
        log_lrs.append(np.log10(lr))

        # Stop after one pass even if the iterator itself is endless.
        if batch_num >= n_batches:
            break

        lr *= lr_factor
    return log_lrs, losses

In [None]:
#export
def plot_lr_finder(log_lrs, losses, skip=5):
    """
    Plot the outcome of ``lr_finder``: smoothed loss against log10(learning rate).

    Args:
      log_lrs (list): log10 learning rates returned by ``lr_finder``.
      losses (list): smoothed losses returned by ``lr_finder``.
      skip (int): number of points dropped at each end of the curve. Nothing
        is dropped when fewer than ``2 * skip + 1`` points are available, so
        short runs still produce a plot.
    """
    if skip > 0 and len(log_lrs) > 2 * skip:
        log_lrs, losses = log_lrs[skip:-skip], losses[skip:-skip]
    # The x values are already log10 rates, so a linear axis is correct;
    # `scalex` is only pyplot's boolean autoscale switch and is left at its default.
    plt.plot(log_lrs, losses)
    plt.xlabel("log10(learning rate)")
    plt.ylabel("smoothed loss")

In [None]:
#export
# From: https://www.avanwyk.com/tensorflow-2-super-convergence-with-the-1cycle-policy/
class CosineAnnealer:
    """
    Cosine interpolation from ``start`` to ``end`` over ``steps`` calls.

    Each call to ``step`` advances the internal counter ``n`` by one and
    returns the value for that position; after ``steps`` calls the value
    equals ``end``.
    """

    def __init__(self, start, end, steps):
        self.start, self.end, self.steps = start, end, steps
        self.n = 0

    def step(self):
        """Advance by one step and return the annealed value."""
        self.n += 1
        progress = self.n / self.steps
        cos = np.cos(np.pi * progress) + 1
        return self.end + (self.start - self.end) / 2. * cos

In [None]:
#export
# From: https://www.avanwyk.com/tensorflow-2-super-convergence-with-the-1cycle-policy/
class OneCycleScheduler(Callback):
    """
    Keras callback implementing a two-phase one-cycle schedule.

    Phase 1 anneals the learning rate from ``lr_max / div_factor`` up to
    ``lr_max`` while momentum goes from ``mom_max`` down to ``mom_min``.
    Phase 2 anneals the learning rate down to ``lr_max / (div_factor * 1e4)``
    while momentum returns to ``mom_max``. Once a phase's annealer has used up
    its steps, its end value is held instead of letting the cosine wrap around.

    Args:
      lr_max (float): peak learning rate.
      steps (int): total number of training batches the cycle spans.
      mom_min (float): lowest momentum.
      mom_max (float): highest momentum.
      phase_1_pct (float): share of ``steps`` spent in phase 1.
      div_factor (float): ratio between ``lr_max`` and the starting rate.

    NOTE(review): momentum is read and written through ``optimizer.momentum``
    and a missing attribute is silently ignored; presumably optimizers such
    as Adam do not expose it, so momentum would not be cycled for them --
    confirm.
    """

    def __init__(self, lr_max, steps, mom_min=0.85, mom_max=0.95, phase_1_pct=0.3, div_factor=25.):
        super().__init__()
        lr_min = lr_max / div_factor
        final_lr = lr_max / (div_factor * 1e4)
        phase_1_steps = steps * phase_1_pct
        phase_2_steps = steps - phase_1_steps

        self.phase_1_steps = phase_1_steps
        self.phase_2_steps = phase_2_steps
        self.phase = 0
        self.step = 0

        # phases[p] == [learning-rate annealer, momentum annealer] of phase p
        self.phases = [[CosineAnnealer(lr_min, lr_max, phase_1_steps), CosineAnnealer(mom_max, mom_min, phase_1_steps)],
                 [CosineAnnealer(lr_max, final_lr, phase_2_steps), CosineAnnealer(mom_min, mom_max, phase_2_steps)]]

        # values observed at the start of every batch, kept for plot()
        self.lrs = []
        self.moms = []

    def on_train_begin(self, logs=None):
        """Restart the cycle and apply the initial learning rate and momentum."""
        self.phase = 0
        self.step = 0
        # Rewind the annealers as well; otherwise a callback reused for another
        # fit() call would resume from stale positions while phase and step
        # start again from zero.
        for schedules in self.phases:
            for annealer in schedules:
                annealer.n = 0

        self.set_lr(self.lr_schedule().start)
        self.set_momentum(self.mom_schedule().start)

    def on_train_batch_begin(self, batch, logs=None):
        """Record the learning rate and momentum in effect for this batch."""
        self.lrs.append(self.get_lr())
        self.moms.append(self.get_momentum())

    def on_train_batch_end(self, batch, logs=None):
        """Advance the schedule by one batch and apply the new values."""
        self.step += 1
        if self.step >= self.phase_1_steps:
            self.phase = 1

        self.set_lr(self._advance(self.lr_schedule()))
        self.set_momentum(self._advance(self.mom_schedule()))

    @staticmethod
    def _advance(annealer):
        """Return the annealer's next value, holding its end value once exhausted."""
        if annealer.n + 1 > annealer.steps:
            return annealer.end
        return annealer.step()

    def get_lr(self):
        """Return the optimizer's learning rate, or None if it has no ``lr``."""
        try:
            return tf.keras.backend.get_value(self.model.optimizer.lr)
        except AttributeError:
            return None

    def get_momentum(self):
        """Return the optimizer's momentum, or None if it has no ``momentum``."""
        try:
            return tf.keras.backend.get_value(self.model.optimizer.momentum)
        except AttributeError:
            return None

    def set_lr(self, lr):
        """Set the optimizer's learning rate; a missing attribute is ignored."""
        try:
            tf.keras.backend.set_value(self.model.optimizer.lr, lr)
        except AttributeError:
            pass # ignore

    def set_momentum(self, mom):
        """Set the optimizer's momentum; a missing attribute is ignored."""
        try:
            tf.keras.backend.set_value(self.model.optimizer.momentum, mom)
        except AttributeError:
            pass # ignore

    def lr_schedule(self):
        """Return the learning-rate annealer of the current phase."""
        return self.phases[self.phase][0]

    def mom_schedule(self):
        """Return the momentum annealer of the current phase."""
        return self.phases[self.phase][1]

    def plot(self):
        """Plot the recorded learning rates and momenta side by side."""
        ax = plt.subplot(1, 2, 1)
        ax.plot(self.lrs)
        ax.set_title('Learning Rate')
        ax = plt.subplot(1, 2, 2)
        ax.plot(self.moms)
        ax.set_title('Momentum')

In [None]:
#export
def one_cycle_fits(model, epochs, lr, train_gen, valid_gen, bs,
                   model_name, metric_name, mode="max"):
    """
    Train a model with a one-cycle schedule and checkpoint the best epoch.

    Args:
      model: compiled Keras model.
      epochs (int): number of epochs; the cycle spans all of them.
      lr (float): peak learning rate of the cycle.
      train_gen: training batches; ``len(train_gen)`` must be the number of
        batches per epoch.
      valid_gen: validation batches.
      bs (int): unused, kept for backward compatibility. The batch size is
        already reflected in ``len(train_gen)``.
      model_name (str): checkpoint name under ``get_data_path()/"models"``.
      metric_name (str): metric monitored by the checkpoint callback.
      mode (str): "max" or "min", whether a higher or lower metric is better.

    Returns:
      tuple: ``(model, history)`` where ``history`` maps each metric name to
      the list of its per-epoch values.
    """
    # Total number of batches over the whole run. Dividing the batch count by
    # the batch size again and ignoring `epochs` made the cycle far too short.
    steps = len(train_gen) * epochs
    lr_schedule = OneCycleScheduler(lr, steps)

    checkpoint_filepath = str(get_data_path()/"models"/model_name)
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=False,
        monitor=metric_name,
        mode=mode,
        save_best_only=True)

    callbacks = [lr_schedule, model_checkpoint_callback]

    # A single fit() call: the scheduler resets itself in on_train_begin, so
    # calling fit() once per epoch restarted the cycle every epoch.
    training = model.fit(train_gen,
                         validation_data=valid_gen,
                         epochs=epochs,
                         callbacks=callbacks)
    history = {k: list(v) for k, v in training.history.items()}
    return model, history