imports

In [1]:
import torch
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from HGNN.train.configParser import ConfigParser, getExperimentParamsAndRecord
from HGNN.train import CNN, dataLoader

# Name of the CSV file, stored inside a dataset-split folder, that lists the test-set indices.
testIndicesFile = 'testIndex.csv'

parameters

In [2]:
# Root folder of all experiments; experimentName is joined onto it to locate this experiment.
experimentsPath = "/home/elhamod/HGNN/experiments/"
# Root folder of the image data; the experiment's 'image_path' is joined onto it when plotting.
dataPath = "/data/BGNN_data/"
experimentName = "BestModelForJeremy"
# Identifies the trial whose parameters and record are fetched.
trial_hash = "4481a198e78a77a9e12c52f8fc3f40c73e1c948617ed7d37fbe4dbcb"

# Index of the CUDA device to select when CUDA is available.
cuda = 6

# Number of rows in the example grids; None means one row per distinct true label.
numOfRows = None

cuda

In [3]:
# Select the configured GPU as the current CUDA device. On hosts without CUDA
# nothing is selected and nothing is printed.
if torch.cuda.is_available():
    # Fail early with a readable message when the configured index does not
    # exist on this machine, instead of an opaque CUDA "invalid device" error.
    # Negative indices are left alone: torch.cuda.set_device treats them as a no-op.
    if cuda >= torch.cuda.device_count():
        raise ValueError(
            f"cuda={cuda} is not a valid device index; "
            f"this machine exposes {torch.cuda.device_count()} CUDA device(s)"
        )
    torch.cuda.set_device(cuda)
    print("using cuda", cuda)

using cuda 6


Get dataset

In [4]:
# Folder of this experiment, plus the parameters and record of the selected trial.
# NOTE(review): trial_hash presumably selects which trial is returned - confirm in configParser.
experimentPathAndName = os.path.join(experimentsPath, experimentName)
experiment_params, experimentRecord = getExperimentParamsAndRecord(experimentsPath, experimentName, trial_hash)
print(experiment_params)

# Build the dataset from the trial's parameters after passing them through fixPaths.
# NOTE(review): the meaning of the second datasetManager argument (True) is not visible here.
config_parser = ConfigParser(experimentsPath, dataPath, experimentName)
datasetManager = dataLoader.datasetManager(experimentName, True)
datasetManager.updateParams(config_parser.fixPaths(experiment_params))
dataset = datasetManager.getDataset()
# Augmentation is switched off so that we always get the same prediction accuracy;
# normalization keeps whatever setting the dataset already has.
dataset.toggle_image_loading(augmentation=False, normalization=dataset.normalization_enabled)
fineList = dataset.csv_processor.getFineList()
coarseList = dataset.csv_processor.getCoarseList()

# Get a test loader without randomization: the test indices are read from the
# saved split file, and the loader yields one sample per batch with no shuffling requested.
testIndicesFullPath = os.path.join(experimentPathAndName, experimentRecord['datasetName'].item(), testIndicesFile)
subset = torch.utils.data.Subset(dataset, dataLoader.readFile(testIndicesFullPath))
test_loader = torch.utils.data.DataLoader(subset, batch_size=1)

{'image_path': 'INHS_cropped', 'suffix': '52', 'training_count': 0.64, 'validation_count': 0.16, 'batchSize': 32, 'n_epochs': 5000, 'learning_rate': 0.01, 'numOfTrials': 5, 'patience': 50, 'fc_width': 200, 'fc_layers': 1, 'modelType': 'DISCO', 'lambda': 0.6, 'unsupervisedOnTest': False, 'tl_model': 'ResNet18', 'augmented': False, 'weight_decay': 0}
Creating dataset...


Loading images: 100%|██████████| 2600/2600 [02:32<00:00, 17.06it/s, fileName=/data/BGNN_data/INHS_cropped/images/INHS_FISH_63588.jpg]  


Creating dataset... Done.
file /home/elhamod/HGNN/experiments/BestModelForJeremy/datasplits/2788d2bc2c355b1c8cf2f3ac00958c413530a8f359f22067b2b8cfbc/testIndex.csv read


Get model and load the saved trial

In [5]:
# Number of fine and coarse labels, handed to the model factory together
# with the trial's parameters.
architecture = dict(fine=len(fineList), coarse=len(coarseList))
model = CNN.create_model(architecture, experiment_params)

# The first experiment record names the model; its folder lives under the
# experiment folder and is what loadModel is pointed at.
modelName = experimentRecord.iloc[0]["modelName"]
trialName = os.path.join(experimentPathAndName, modelName)
_ = CNN.loadModel(model, trialName)  # return value is not needed

Sort test predictions into misclassified and correctly classified examples

In [6]:
# Split the test-set predictions into two tables (misclassified / correctly
# classified), each holding the file name, the true label, the probability the
# model assigned to the true label, and the predicted label.
prediction_columns = ['file name', 'true label', 'probability of true label', 'predicted label']

# get probability of correct prediction and true label
predProblist, lbllist = CNN.getLoaderPredictionProbabilities(test_loader, model, experiment_params)
_, predlist = torch.max(predProblist, 1)
# One column of label indices per sample, the shape gather() needs.
lbllist = lbllist.reshape(lbllist.shape[0], -1)
# Probability assigned to the true label of each sample, flattened to 1-D.
correct_predProblist = predProblist.gather(1, lbllist).reshape(-1)

# Collect plain-Python records and build each DataFrame once: DataFrame.append
# was removed in pandas 2.0, and growing a frame row by row is quadratic.
misclassified_rows = []
correctlyclassified_rows = []
for i, lbl in enumerate(lbllist):
    # .item() yields a Python scalar for both CPU and CUDA tensors, so no
    # explicit .cpu()/.numpy() round trip is needed.
    true_label = int(lbl.item())
    predicted_label = int(predlist[i].item())
    row = {'file name': subset[i]['fileName'],
           'true label': true_label,
           'probability of true label': float(correct_predProblist[i].item()),
           'predicted label': predicted_label}

    if true_label != predicted_label:
        misclassified_rows.append(row)
    else:
        correctlyclassified_rows.append(row)

# Passing `columns` keeps the expected schema even when a list is empty.
df_misclassified = pd.DataFrame(misclassified_rows, columns=prediction_columns)
df_correctlyclassified = pd.DataFrame(correctlyclassified_rows, columns=prediction_columns)

# Within each true label, the lowest probability of the true label comes first.
df_misclassified = df_misclassified.sort_values(by=['true label', 'probability of true label'])
df_correctlyclassified = df_correctlyclassified.sort_values(by=['true label', 'probability of true label'])

Transforming images: 100%|██████████| 2600/2600 [00:54<00:00, 47.34it/s]


define function to plot top n of a category

In [7]:
def plot_top_n(df, fig_file_name, numOfRows=None, perRow=5):
    """Plot the first `perRow` examples of each true label as an image grid and save it.

    Each grid row shows one true label that occurs in `df`, in ascending label
    order; each cell shows one image titled with its file name, true class,
    predicted class and the probability of the true label. Examples are taken
    in `df`'s existing row order, so sort `df` beforehand to control which
    ones are shown.

    Reads the notebook-level names `fineList`, `dataPath`, `experiment_params`,
    `experimentPathAndName` and `modelName`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have the columns 'file name', 'true label',
        'probability of true label' and 'predicted label'.
    fig_file_name : str
        File name of the saved figure, written into the model's folder.
    numOfRows : int, optional
        Number of grid rows. Defaults to the number of distinct true labels
        in `df`. If smaller, only the first labels are shown; if larger, the
        extra rows stay blank.
    perRow : int, optional
        Maximum number of examples shown per label.
    """
    # Labels that actually occur in df. Row i of the grid shows labels[i],
    # rather than assuming the labels are exactly 0..numOfRows-1.
    labels = sorted(df['true label'].unique())
    if numOfRows is None:
        numOfRows = len(labels)
    if numOfRows < 1 or perRow < 1:
        # plt.subplots cannot create an empty grid; there is nothing to draw or save.
        print(f"nothing to plot for {fig_file_name}")
        return

    topn = df.groupby('true label').head(perRow)
    # squeeze=False keeps `axes` two-dimensional even for a single row or column.
    fig, axes = plt.subplots(ncols=perRow, nrows=numOfRows, figsize=(15, 3*numOfRows), dpi=300, squeeze=False)
    for i, row in enumerate(axes):
        if i < len(labels):
            topn_lbl = topn[topn['true label'] == labels[i]]
        else:
            topn_lbl = topn.iloc[0:0]
        for j, ax in enumerate(row):
            if j >= len(topn_lbl.index):
                # Fewer examples than cells: hide the unused axes.
                ax.axis('off')
                continue
            entry = topn_lbl.iloc[j]
            fileName = entry['file name']
            # Cast to int so the labels can index fineList whatever dtype the column has.
            prediction = int(entry['predicted label'])
            trueLabel = int(entry['true label'])
            prob = entry['probability of true label']
            ax.set_title(f"{fileName} \n {fineList[trueLabel]} \n as \n  {fineList[prediction]} \n prob: {round(prob, 3)}")
            img = plt.imread(os.path.join(dataPath, experiment_params['image_path'], 'images', fileName))
            ax.imshow(img)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    # The figure is deliberately left open so the notebook still displays it inline.
    fig.savefig(os.path.join(experimentPathAndName, modelName, fig_file_name))
            

Display and save mispredicted

In [None]:
# Save the misclassified examples next to the model, plot the first few per
# true label, and show the table as the cell output.
df_misclassified.to_csv(
    path_or_buf=os.path.join(experimentPathAndName, modelName, 'misclassified examples.csv')
)
plot_top_n(df=df_misclassified, fig_file_name="misclassified examples.pdf", numOfRows=numOfRows)
df_misclassified

Display and save correctly predicted

In [None]:
# Save the correctly classified examples next to the model, plot the first few
# per true label, and show the table as the cell output.
df_correctlyclassified.to_csv(
    path_or_buf=os.path.join(experimentPathAndName, modelName, 'correctly classified examples.csv')
)
plot_top_n(df=df_correctlyclassified, fig_file_name="correctly classified.pdf", numOfRows=numOfRows)
df_correctlyclassified