imports

In [1]:
import torch
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from tqdm.notebook import tqdm as tqdm
from PIL import Image, ImageDraw 
import math

from HGNN.train.configParser import ConfigParser, getExperimentParamsAndRecord
from HGNN.train import CNN, dataLoader

# File name of the test-split index list; it is joined onto the experiment's
# dataset folder when the test loader is built.
testIndicesFile = "testIndex.csv"

parameters

In [2]:
# Root folder of all experiments; the experiment name is joined onto it to
# locate this experiment's folder, where the outputs below are also written.
experimentsPath="/home/elhamod/HGNN/experiments/"
# Root folder of the data; images are read from <dataPath>/<image_path>/images.
dataPath="/data/BGNN_data/"
# Experiment to analyse.
experimentName="biology_paper_augmentation_effect"
# Passed to getExperimentParamsAndRecord to pick the trial whose parameters
# and record are loaded (presumably a unique hash of that trial -- confirm).
trial_hash="825aa4c30a7ace1285f54f2f26af4f9702b5fec2a2f7edf3b35666f7"

# Device index handed to torch.cuda.set_device when CUDA is available.
cuda=7

# Forwarded to plot_top_n as its numOfRows argument; None lets plot_top_n
# derive the row count from the number of distinct true labels.
numOfRows=None

cuda

In [3]:
# Pin torch to the configured GPU whenever a CUDA device is present;
# without CUDA nothing is changed and nothing is printed.
cuda_is_present = torch.cuda.is_available()
if cuda_is_present:
    torch.cuda.set_device(cuda)
    print("using cuda", cuda)

using cuda 7


Get dataset

In [4]:
# Locate the experiment folder and fetch the stored hyper-parameters and
# record of the selected trial.
experimentPathAndName = os.path.join(experimentsPath, experimentName)
experiment_params, experimentRecord = getExperimentParamsAndRecord(
    experimentsPath,
    experimentName,
    trial_hash,
)
print(experiment_params)

# Build the dataset from the trial's parameters, with 'augmented' forced off.
config_parser = ConfigParser(experimentsPath, dataPath, experimentName)
datasetManager = dataLoader.datasetManager(experimentPathAndName, dataPath, True)
params_without_augmentation = dict(experiment_params)
params_without_augmentation['augmented'] = False
datasetManager.updateParams(config_parser.fixPaths(params_without_augmentation))
dataset = datasetManager.getDataset()

# Needed so we always get the same prediction accuracy:
# augmentation off, normalization left exactly as the dataset has it.
dataset.toggle_image_loading(
    augmentation=False,
    normalization=dataset.normalization_enabled,
)

label_source = dataset.csv_processor
fineList = label_source.getFineList()
coarseList = label_source.getCoarseList()

# Test loader over the stored test indices, one example per batch, in file order.
testIndicesFullPath = os.path.join(
    experimentPathAndName,
    experimentRecord['datasetName'].item(),
    testIndicesFile,
)
test_indices = dataLoader.readFile(testIndicesFullPath)
subset = torch.utils.data.Subset(dataset, test_indices)
test_loader = torch.utils.data.DataLoader(subset, batch_size=1)

{'image_path': 'INHS_cropped', 'suffix': 'biology_paper_200max', 'training_count': 0.64, 'validation_count': 0.16, 'batchSize': 128, 'n_epochs': 500, 'learning_rate': 5e-05, 'numOfTrials': 3, 'patience': 10, 'fc_width': 200, 'fc_layers': 1, 'modelType': 'BB', 'lambda': 0.6, 'unsupervisedOnTest': False, 'tl_model': 'ResNet18', 'augmented': True, 'weight_decay': 0.0001, 'img_res': 448, 'tl_freeze': False, 'cnn_layers': 0, 'cnn_channels': 128, 'pretrained': True, 'two_nets': True, 'link_layer': 'layer3', 'dataset_norm': True, 'aug_profile': 'withRotation_PCA1.5'}
Creating dataset...


Loading images: 100%|██████████| 6522/6522 [00:26<00:00, 246.36it/s, fileName=/data/BGNN_data/INHS_cropped/images/INHS_FISH_63588_448.jpg]  


Creating dataset... Done.
file /home/elhamod/HGNN/experiments/biology_paper_augmentation_effect/datasplits/b4896471167c5599680115e51349929def797e403bead8f0fe554d79/testIndex.csv read


Create the model and load its saved parameters

In [5]:
# Output sizes of the network: one entry per fine and per coarse label.
architecture = dict(fine=len(fineList), coarse=len(coarseList))
model = CNN.create_model(architecture, experiment_params)

# The trial's folder is named after the model recorded for this experiment;
# CNN.loadModel is pointed at that folder to populate the freshly created model.
modelName = experimentRecord.iloc[0]["modelName"]
trialName = os.path.join(experimentPathAndName, modelName)
_ = CNN.loadModel(model, trialName)

sort through predictions

In [6]:
# Column layouts of the two result tables. The misclassified table also
# carries the predicted label and its probability.
df_misclassified_columns = ['file name', 'true label', 'probability of true label',
                            'predicted label', 'probability of predicted label']
df_correctlyclassified_columns = ['file name', 'true label', 'probability of true label']

# get probability of correct prediction and true label
predProblist, lbllist = CNN.getLoaderPredictionProbabilities(test_loader, model, experiment_params)
_, predlist = torch.max(predProblist, 1)
lbllist = lbllist.reshape(lbllist.shape[0], -1)
# Probability the model assigned to the true label of every test example.
correct_predProblist = predProblist.gather(1, lbllist).reshape(-1)
# Probability the model assigned to its own (arg-max) prediction.
predicted_predProblist = predProblist.gather(1, predlist.unsqueeze(1)).reshape(-1)

# Move everything to the host once and convert to plain Python numbers.
# .cpu() is a no-op for tensors that already live on the CPU.
# assumes exactly one label per example, as the original int() conversion required
true_labels = lbllist.reshape(-1).cpu().tolist()
predicted_labels = predlist.cpu().tolist()
true_label_probabilities = correct_predProblist.cpu().tolist()
predicted_label_probabilities = predicted_predProblist.cpu().tolist()

# Collect plain records and build each frame once at the end: DataFrame.append
# was removed in pandas 2.0 and growing a frame row by row is quadratic.
misclassified_records = []
correctlyclassified_records = []
for i, (lbl, prd) in enumerate(zip(true_labels, predicted_labels)):
    fileName = subset[i]['fileName']

    if lbl != prd:
        misclassified_records.append({
            'file name': fileName,
            'true label': lbl,
            'probability of true label': true_label_probabilities[i],
            'predicted label': prd,
            'probability of predicted label': predicted_label_probabilities[i],
        })
    else:
        correctlyclassified_records.append({
            'file name': fileName,
            'true label': lbl,
            'probability of true label': true_label_probabilities[i],
        })

# Passing `columns` keeps the expected layout even when a table has no rows.
df_misclassified = pd.DataFrame(misclassified_records, columns=df_misclassified_columns)
df_correctlyclassified = pd.DataFrame(correctlyclassified_records, columns=df_correctlyclassified_columns)

Transforming images: 100%|██████████| 6522/6522 [00:11<00:00, 577.45it/s]


In [7]:
# Order both tables by true label, then by ascending probability of that label.
sort_keys = ['true label', 'probability of true label']
df_misclassified = df_misclassified.sort_values(by=sort_keys)
df_correctlyclassified = df_correctlyclassified.sort_values(by=sort_keys)

define function to plot top n of a category

In [8]:
images_path = os.path.join(dataPath, experiment_params['image_path'], 'images')


def plot_top_n(df, fig_file_name, numOfRows=None, perRow=5, show_same_class=True):
    """Save a PDF grid, plus a companion CSV, of the first specimens of each true label.

    Each grid row holds up to `perRow` specimens of one true label, taken in
    the order they appear in `df`. Every cell stacks the specimen image on top
    of its closest training image and, optionally, its closest training image
    of the same class. Pages hold 10 rows each. Both files are written to
    <experimentPathAndName>/<modelName>/<fig_file_name>.pdf / .csv.

    Relies on the notebook globals `dataset`, `datasetManager`, `model`,
    `experiment_params`, `fineList`, `images_path`, `experimentPathAndName`
    and `modelName`.

    Args:
        df: data frame with at least the columns 'file name', 'true label' and
            'probability of true label'. When `show_same_class` is True it must
            also have 'predicted label' and 'probability of predicted label'.
        fig_file_name: base name (no extension) of the PDF and CSV outputs.
        numOfRows: number of distinct true labels to plot, lowest label first.
            None plots every label present in `df`.
        perRow: number of specimens shown per label.
        show_same_class: when True, also show the prediction and the closest
            training example of the true class. Intended for misclassified
            specimens only.
    """
    rows_per_page = 10
    thumbnail_size = (448, 448)
    output_stem = os.path.join(experimentPathAndName, modelName, fig_file_name)

    # Two-level column layout of the companion CSV: (group, field).
    result_columns = [
        ('image', 'file name'),
        ('image', 'true label'),
        ('image', 'probability of true label'),
        ('image', 'predicted label'),
        ('image', 'probability of predicted label'),
        ('closest example from training set', 'file name'),
        ('closest example from training set', 'true label'),
        ('closest example from training set', 'cosine similarity'),
    ]
    if show_same_class:
        result_columns += [
            ('closest same class example from training set', 'file name'),
            ('closest same class example from training set', 'cosine similarity'),
        ]
    # Records are collected in a list and turned into a frame when writing:
    # DataFrame.append was removed in pandas 2.0.
    result_records = []

    def load_thumbnail(image_file_name, caption=None):
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        picture = Image.open(os.path.join(images_path, image_file_name))
        picture.thumbnail(thumbnail_size, Image.LANCZOS)
        if caption is not None:
            ImageDraw.Draw(picture).text((0, 0), caption, (0, 0, 0))
        return picture

    def write_results_csv():
        rows = [[record.get(column, np.nan) for column in result_columns]
                for record in result_records]
        results = pd.DataFrame(rows, columns=pd.MultiIndex.from_tuples(result_columns))
        results.to_csv(output_stem + ".csv")

    # Disable augmentation while images are fetched; the previous state is
    # restored in the `finally` below even if plotting fails half-way.
    augmentation, normalization, _ = dataset.toggle_image_loading(
        augmentation=False, normalization=dataset.normalization_enabled)
    try:
        training_indices = datasetManager.get_indices("trainingIndex.csv")
        training_dataset = torch.utils.data.Subset(dataset, training_indices)
        # Same-class training subsets, built lazily and reused for every
        # specimen of a label (building one scans the whole training set).
        class_training_datasets = {}

        # One grid row per label that actually occurs in df. Labels need not
        # be contiguous or start at 0.
        labels = sorted(df['true label'].unique())
        if numOfRows is not None:
            labels = labels[:numOfRows]
        topn = df.groupby('true label').head(perRow)

        # At least one page so that an empty df still yields both output files.
        number_of_pages = max(1, math.ceil(len(labels) / rows_per_page))
        with tqdm(total=perRow * len(labels), desc="figure") as bar:
            with PdfPages(output_stem + ".pdf") as pdf:
                for page in range(number_of_pages):
                    page_labels = labels[page * rows_per_page:(page + 1) * rows_per_page]
                    # squeeze=False keeps `axes` two-dimensional even when perRow == 1.
                    fig, axes = plt.subplots(ncols=perRow, nrows=rows_per_page,
                                             figsize=(15, 4 * rows_per_page), dpi=300,
                                             squeeze=False)

                    for axes_row, label in zip(axes, page_labels):
                        topn_lbl = topn[topn['true label'] == label]
                        for j, ax in enumerate(axes_row):

                            if len(topn_lbl.index) > j:

                                entry = topn_lbl.iloc[j]
                                fileName = entry['file name']
                                trueLabel = entry['true label']
                                correct_prob = entry['probability of true label']
                                if show_same_class:
                                    prediction = entry['predicted label']
                                    predicted_prob = entry['probability of predicted label']

                                img = load_thumbnail(fileName)

                                # get closest training image from dataset training set
                                closest, cosine_score = get_closest_example(
                                    fileName, dataset, training_dataset, experiment_params, model)
                                closest_fileName = closest['fileName']
                                closest_species = fineList[closest['fine']]
                                img2 = load_thumbnail(closest_fileName, "closest: " + closest_species)

                                vis = np.concatenate((img, img2), axis=0)

                                # get closest training image from dataset training set of same class
                                if show_same_class:
                                    if trueLabel not in class_training_datasets:
                                        class_training_indices = [
                                            indx for indx in training_indices
                                            if dataset[indx]['fine'] == trueLabel]
                                        class_training_datasets[trueLabel] = torch.utils.data.Subset(
                                            dataset, class_training_indices)
                                    class_training_dataset = class_training_datasets[trueLabel]

                                    closest_fromClass, cosine_score_fromClass = get_closest_example(
                                        fileName, dataset, class_training_dataset, experiment_params, model)
                                    closest_fromClass_fileName = closest_fromClass['fileName']
                                    img3 = load_thumbnail(closest_fromClass_fileName, "closest same label")
                                    vis = np.concatenate((vis, img3), axis=0)

                                ax.imshow(vis)
                                txt = f"{fileName} \n {fineList[trueLabel]}"
                                if show_same_class:
                                    txt = txt + f" \n as {fineList[prediction]}"
                                ax.set_title(txt)

                                # add to the companion table
                                record = {
                                    ('image', 'file name'): fileName,
                                    ('image', 'true label'): fineList[trueLabel],
                                    ('image', 'probability of true label'): round(correct_prob, 3),
                                    ('closest example from training set', 'file name'): closest_fileName,
                                    ('closest example from training set', 'true label'): closest_species,
                                    ('closest example from training set', 'cosine similarity'): round(cosine_score, 3),
                                }
                                if show_same_class:
                                    record.update({
                                        ('image', 'predicted label'): fineList[prediction],
                                        ('image', 'probability of predicted label'): round(predicted_prob, 3),
                                        ('closest same class example from training set', 'file name'): closest_fromClass_fileName,
                                        ('closest same class example from training set', 'cosine similarity'): round(cosine_score_fromClass, 3),
                                    })
                                result_records.append(record)

                            bar.update()

                    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
                    pdf.savefig(fig)
                    # Rewritten after every page so partial results survive an interruption.
                    write_results_csv()
                    plt.close(fig)
    finally:
        # Re-enable augmentation if it was on before this call.
        dataset.toggle_image_loading(augmentation=augmentation, normalization=normalization)

def get_closest_example(fileName, source_dataset, target_dataset, experiment_params, model):
    """Find the example of `target_dataset` that scores highest against a given image.

    The query image is looked up in `source_dataset` by file name, scored
    against every example of `target_dataset` with
    CNN.get_distance_from_example, and the top-scoring target is returned.

    Returns:
        (closest, cosine_score): the winning `target_dataset` item and its
        score as a Python number.
    """
    query_index = source_dataset.csv_processor.get_index_from_fileName(fileName)
    query_example = source_dataset[query_index]
    scores = CNN.get_distance_from_example(target_dataset, query_example, model, experiment_params)

    # Keep only the single best match.
    best = torch.topk(scores, 1)
    best_position = best.indices[0][0].item()
    return target_dataset[best_position], best.values[0][0].item()

Display and save mispredicted

In [9]:
# Persist the misclassified table, render its figure grid, then display the table.
misclassified_csv_path = os.path.join(experimentPathAndName, modelName, 'misclassified examples.csv')
df_misclassified.to_csv(misclassified_csv_path)
plot_top_n(df_misclassified, "misclassified examples", numOfRows=numOfRows)
df_misclassified

file /home/elhamod/HGNN/experiments/biology_paper_augmentation_effect/datasplits/b4896471167c5599680115e51349929def797e403bead8f0fe554d79/trainingIndex.csv read


HBox(children=(FloatProgress(value=0.0, description='figure', max=155.0, style=ProgressStyle(description_width…




Unnamed: 0,file name,true label,probability of true label,predicted label,probability of predicted label
24,INHS_FISH_51790.jpg,0,0.000484,33,0.984169
67,INHS_FISH_41599.jpg,0,0.004159,33,0.858448
17,INHS_FISH_86024.jpg,0,0.007538,33,0.942587
83,INHS_FISH_55544.jpg,0,0.040605,33,0.786328
69,INHS_FISH_4172.jpg,0,0.191964,33,0.540209
...,...,...,...,...,...
8,INHS_FISH_16124.jpg,33,0.002724,0,0.899437
13,INHS_FISH_85582.jpg,33,0.099280,0,0.564271
6,INHS_FISH_92608.jpg,33,0.104537,0,0.561070
78,INHS_FISH_25130.jpg,33,0.306219,0,0.428838


Display and save correctly predicted

In [10]:
# Persist the correctly classified table, render its figure grid (no prediction
# or same-class panel), then display the table restricted to its base columns.
correctlyclassified_csv_path = os.path.join(experimentPathAndName, modelName, 'correctly classified examples.csv')
df_correctlyclassified.to_csv(correctlyclassified_csv_path)
plot_top_n(
    df_correctlyclassified,
    "correctly classified",
    numOfRows=numOfRows,
    show_same_class=False,
)
df_correctlyclassified[df_correctlyclassified_columns]

file /home/elhamod/HGNN/experiments/biology_paper_augmentation_effect/datasplits/b4896471167c5599680115e51349929def797e403bead8f0fe554d79/trainingIndex.csv read


HBox(children=(FloatProgress(value=0.0, description='figure', max=190.0, style=ProgressStyle(description_width…




Unnamed: 0,file name,true label,probability of true label
1067,INHS_FISH_41181.jpg,0,0.389659
565,INHS_FISH_25715.jpg,0,0.565110
432,INHS_FISH_84473.jpg,0,0.578712
633,INHS_FISH_9758.jpg,0,0.634390
797,INHS_FISH_25694.jpg,0,0.846018
...,...,...,...
760,INHS_FISH_65534.jpg,37,0.984275
803,INHS_FISH_81262.jpg,37,0.986948
25,INHS_FISH_37323.jpg,37,0.986988
84,INHS_FISH_40590.jpg,37,0.988063
