imports

In [1]:
import torch
import os
import numpy as np
from tqdm import tqdm

import pandas as pd

from myhelpers import cifar_dataLoader

from HGNN.train.configParser import ConfigParser
from HGNN.train import CNN, dataLoader
from myhelpers.memory import get_cuda_memory
from myhelpers.image_show import showExample, getClosestImageFromDataloader
from myhelpers.images_tsne import get_tsne
from misc import get_classification_df

# File name of the experiments index CSV; it is joined with experimentsPath when the
# experiment record is looked up.
# NOTE(review): the identifier is misspelled ("experimetns"); it is kept as-is because
# the dataset cell references this exact name.
experimetnsFileName = "experiments.csv"

parameters

In [2]:
# --- Locations ---
experimentsPath = "/raid/elhamod/Fish/experiments/"
dataPath = "/raid/elhamod/Fish/"

# --- Experiment / trial selection ---
# Experiment names used in earlier runs:
#   "Fish-L1-experiment2", "CIFAR_phylogeny_HGNN_lambdaExperiment"
experimentName = "Fish30-5run-PhyloNN6"
# Trial hashes used in earlier runs:
#   "27a4da8bf0c3bca7096cddf192ecf3069e035ad3fbba69bbd47bd157"  (MSE + L1)
#   "5922bad3c69f629daa0af24121e292a895f308e7cae3ec4f64536186"  (hier)
trial_hash = "0e8bc6eb6edfb88c5a419e14ab0b445d72ee1945bd474a26a7abcbd4"  # PhyloNN
SEED_INT = 4  # order of the trial_hash when executed

# --- Analysis options ---
legends = ['fine', 'coarse']
use_submodel = False   # when True, the t-SNE cells use model.network_fine if it exists
dataset_name = "test"  # "test" selects the test split; any other value selects the train split

# --- Runtime ---
cuda = 0          # CUDA device index
batch_size = 4    # batch size of the analysis DataLoader
num_workers = 8   # worker count of the analysis DataLoader

cuda

In [3]:
# Select the configured GPU when CUDA is available.
if torch.cuda.is_available():
    torch.cuda.set_device(cuda)
    print("using cuda", cuda)

# Use the 'spawn' start method for multiprocessing.
# force=True keeps this cell re-runnable: without it, executing the cell a second time
# in the same kernel raises RuntimeError("context has already been set").
torch.multiprocessing.set_start_method('spawn', force=True)

using cuda 0


Get dataset

In [4]:
# Get experiment parameters: read the experiments index CSV and pick the row of the selected trial.
experimentsFileNameAndPath = os.path.join(experimentsPath, experimetnsFileName)
if not os.path.exists(experimentsFileNameAndPath):
    # The index file itself is missing (previously reported with a garbled "Experiment not ... found!").
    raise FileNotFoundError("Experiments file " + experimentsFileNameAndPath + " not found!")
experiments_df = pd.read_csv(experimentsFileNameAndPath)
# Replace missing values with None so they show up as None in the parameter dict.
experiments_df = experiments_df.where(pd.notnull(experiments_df), None)

experimentRecord = experiments_df[experiments_df["trialHash"] == trial_hash]
if experimentRecord.empty:
    # Fail with a clear message instead of an IndexError from `[0]` below.
    raise LookupError("Experiment " + trial_hash + " not found!")
experiment_params = experimentRecord.to_dict('records')[0]
print(experiment_params)

# Let the config parser adjust the raw parameters, then build the dataset manager
# that matches the image source (CIFAR-100 vs. everything else).
config_parser = ConfigParser(experimentsPath, dataPath, experimentName)
experiment_params = config_parser.fixExperimentParams(experiment_params)
experimentPathAndName = os.path.join(experimentsPath, experimentName)
if experiment_params['image_path'] == 'cifar-100-python':
    datasetManager = cifar_dataLoader.datasetManager(experimentPathAndName, dataPath)
else:
    datasetManager = dataLoader.datasetManager(experimentPathAndName, dataPath)
datasetManager.updateParams(config_parser.fixPaths(experiment_params))
train_loader, validation_loader, test_loader = datasetManager.getLoaders(SEED_INT)

# Wrap the chosen split in a fresh DataLoader that uses this notebook's batch size / workers.
selected_dataset = test_loader.dataset if dataset_name == "test" else train_loader.dataset
dataloader = torch.utils.data.DataLoader(
    selected_dataset,
    pin_memory=True,
    batch_size=batch_size,
    num_workers=num_workers,
)
dataset = dataloader.dataset

{'experimentName': 'Fish30-5run-PhyloNN6', 'modelName': 'models/0e8bc6eb6edfb88c5a419e14ab0b445d72ee1945bd474a26a7abcbd4', 'datasetName': 'datasplits/a30fade0855f8d4a9e28fdac4e65ae71ab21444a323ee6e3927d97f8', 'experimentHash': '2f4afd9af84811c61585f52e9e69fb2b8fc8e83ae71fa846e602b672', 'trialHash': '0e8bc6eb6edfb88c5a419e14ab0b445d72ee1945bd474a26a7abcbd4', 'image_path': 'Curated4/Easy_30', 'suffix': None, 'img_res': 448, 'augmented': True, 'batchSize': 64, 'learning_rate': 0.001, 'numOfTrials': 5, 'fc_layers': 1, 'modelType': 'PhyloNN', 'lambda': 1.0, 'unsupervisedOnTest': None, 'tl_model': 'ResNet18', 'link_layer': 'avgpool', 'adaptive_smoothing': False, 'adaptive_lambda': 0.1, 'adaptive_alpha': 0.5, 'noSpeciesBackprop': False, 'phylogeny_loss': 'False', 'phylogeny_loss_epsilon': 0.03, 'tripletEnabled': False, 'tripletSamples': 2.0, 'tripletSelector': 'semihard', 'tripletMargin': 2.0, 'displayName': 'Fish30-5run-PhyloNN-smalldistances-intraKorthogonality', 'pretrained': True, 'epochs

TypeError: must be real number, not NoneType

Get model (create the architecture and load the saved trial)

In [None]:
%%capture

# Derive the architecture from the experiment parameters and the training set's
# csv_processor, then create the model with the configured cuda index.
architecture = CNN.get_architecture(experiment_params, train_loader.dataset.csv_processor)
model = CNN.create_model(architecture, experiment_params, cuda)

# Resolve the saved trial's path under the experiment folder, hand it to CNN.loadModel
# and switch the model to evaluation mode.
modelName = experimentRecord["modelName"].iloc[0]
trialName = os.path.join(experimentPathAndName, modelName)
_ = CNN.loadModel(model, trialName)
model.eval()

Show example and closest images

Show TSNE

In [None]:
# Activation layer passed to get_tsne (earlier runs used 'layer2' / 'fine').
activation_layer = '01distance'

# Use the model's `network_fine` sub-network only when it exists and use_submodel is set.
model_sub = model.network_fine if (hasattr(model, 'network_fine') and use_submodel == True) else model

# Path and name arguments handed to get_tsne.
tsne_path = os.path.join(experimentPathAndName, modelName)
tsne_name = dataset_name + "_" + activation_layer + ("_submodule" if use_submodel == True else "")
get_tsne(
    dataloader,
    model_sub,
    activation_layer,
    experiment_params['img_res'],
    tsne_path,
    tsne_name,
    legends,
    cuda,
)

In [None]:
# Activation layer passed to get_tsne (earlier runs used 'layer4' / 'fine').
activation_layer = '03distance'

# Path and name arguments handed to get_tsne.
tsne_path = os.path.join(experimentPathAndName, modelName)
tsne_name = dataset_name + "_" + activation_layer + ("_submodule" if use_submodel == True else "")
get_tsne(
    dataloader,
    model_sub,
    activation_layer,
    experiment_params['img_res'],
    tsne_path,
    tsne_name,
    legends,
    cuda,
)

In [None]:
# Activation layer passed to get_tsne.
activation_layer = '05distance'

# Path and name arguments handed to get_tsne.
tsne_path = os.path.join(experimentPathAndName, modelName)
tsne_name = dataset_name + "_" + activation_layer + ("_submodule" if use_submodel == True else "")
get_tsne(
    dataloader,
    model_sub,
    activation_layer,
    experiment_params['img_res'],
    tsne_path,
    tsne_name,
    legends,
    cuda,
)

In [None]:
# Activation layer passed to get_tsne.
activation_layer = 'fine'

# Path and name arguments handed to get_tsne.
tsne_path = os.path.join(experimentPathAndName, modelName)
tsne_name = dataset_name + "_" + activation_layer + ("_submodule" if use_submodel == True else "")
get_tsne(
    dataloader,
    model_sub,
    activation_layer,
    experiment_params['img_res'],
    tsne_path,
    tsne_name,
    legends,
    cuda,
)

In [None]:
# Activation layer passed to get_tsne.
activation_layer = 'gap_features'

# Path and name arguments handed to get_tsne.
tsne_path = os.path.join(experimentPathAndName, modelName)
tsne_name = dataset_name + "_" + activation_layer + ("_submodule" if use_submodel == True else "")
get_tsne(
    dataloader,
    model_sub,
    activation_layer,
    experiment_params['img_res'],
    tsne_path,
    tsne_name,
    legends,
    cuda,
)

In [None]:
# Settings for the distribution and heat-map analysis cells.
activation_layer = 'gap_features'

# 0.25, 0.5, 0.75 or 1.0 selects a quarter-wide window of the embedding ending at that
# fraction; None analyses the whole vector.
sub_vector_ratio = None

# Candidate values: 0.5, 0.7, 0.9, 1.0. Should be changed together with sub_vector_ratio
# when gap_features is used.
phylo_level = 1.0

# Optional colour limits for the heat maps (not wired in):
# vmax=None
# vmin=None
# ,vmax=vmax,vmin=vmin

In [None]:
# Keys of the dict returned by model.activations(...), as recorded by the author:
# dict_keys(['input', 'gap_features', 'layer4_features', 'fine', '05distance', '03distance', '01distance'])
# activation_layer ='gap_features'

# Collect, over the whole dataloader, the flattened activations of `activation_layer`
# together with the fine labels and the model's fine predictions.
feature_batches = []
accumulated_labels = []
accumulated_predictions = []

analysis_dataset = dataloader.dataset
# The first two values returned by toggle_image_loading are passed back to it after the loop
# (presumably the previous augmentation / normalization settings — TODO confirm).
a, n, _ = analysis_dataset.toggle_image_loading(analysis_dataset.augmentation_enabled, True)
try:
    for batch in tqdm(dataloader, total=len(dataloader)):
        image2 = batch['image']
        fine_label = batch['fine']
        if cuda is not None:
            image2 = image2.cuda()
        activations = model.activations(image2)

        # One row per sample: (batch, flattened features), detached and moved to the CPU.
        features2 = activations[activation_layer].detach().cpu()
        features2 = features2.reshape(features2.shape[0], -1)
        pred, _ = CNN.getPredictions(activations['fine'], [])

        feature_batches.append(features2)
        accumulated_labels.extend(fine_label.tolist())
        accumulated_predictions.extend(pred.tolist())
finally:
    # Restore the dataset's loading settings even if the loop fails part-way.
    analysis_dataset.toggle_image_loading(a, n)

# Concatenate once at the end instead of re-copying the growing tensor on every batch.
# Stays None when the dataloader yielded no batches.
accumlated_features = torch.cat(feature_batches) if feature_batches else None




In [None]:
# Shape of the flattened features of the last processed batch: (samples in batch, features per sample).
features2.shape

In [None]:
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.spatial as sp
from sklearn.metrics import confusion_matrix

# Folder of the analysed trial; every figure of this cell is written there.
path_base = os.path.join(experimentPathAndName, 'models', trial_hash)

# Choose the slice of the embedding to analyse and the lower cap applied to phylo distances.
if sub_vector_ratio is not None:
    phylo_distance_cap = 1 - phylo_level
    # Quarter-wide window ending at sub_vector_ratio, on a 512-wide scale.
    sub_vector = slice(int((sub_vector_ratio-0.25)*512), int(sub_vector_ratio*512))
else:
    phylo_distance_cap = 0.
    sub_vector = slice(0, accumlated_features.shape[1])

# NOTE(review): this is a tuple, so file names render as "('test', 'gap_features', ...)_<tag>.png".
# Left unchanged so output names stay the same — confirm whether a joined string was intended.
name_base = dataset_name,activation_layer,str(sub_vector)
accumlated_features_sub = accumlated_features[:, sub_vector]


def _save_figure(figure, tag):
    """Write `figure` into path_base as '<name_base>_<tag>.png'."""
    figure.savefig(os.path.join(path_base, "{}_{}.png".format(name_base, tag)))


def _upper_triangle(matrix):
    """Return the entries of a square matrix on and above its diagonal, flattened."""
    rows, cols = torch.triu_indices(matrix.shape[0], matrix.shape[1])
    return matrix[rows, cols]


# --- Figure 0: distribution of raw activation values ---
fig = plt.figure(0)
# Pass a flat 1-D array: plt.hist treats the columns of a 2-D array as separate datasets.
plt.hist(accumlated_features_sub.reshape(-1).numpy(), bins=100, density=True, label="activations")
plt.ylabel("Probability")
plt.xlabel("Activations")
plt.title("Distribution of values of activations");
_save_figure(fig, 'values')

# --- Figure 1: distribution of embedding magnitudes (L2 norm per example) ---
fig = plt.figure(1)
sqr = torch.sqrt(torch.sum(torch.pow(accumlated_features_sub, 2), 1).reshape(-1))
plt.hist(sqr, density=True, bins=100, label="magnitude")
plt.ylabel("Probability")
plt.xlabel("Magnitude")
plt.title("Magnitude of embeddings");
_save_figure(fig, 'magnitudes')

# --- Figure 2: distribution of pairwise Euclidean distances ---
fig = plt.figure(2)
# ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2; the (N, 1) and (1, N) norm vectors broadcast to (N, N).
squared_norms = torch.sum(torch.pow(accumlated_features_sub, 2), 1, keepdim=True)
gram = torch.mm(accumlated_features_sub, accumlated_features_sub.t())
# NaNs (square roots of slightly negative values) are mapped to 0.
euclidean = torch.nan_to_num(torch.sqrt(squared_norms - 2*gram + squared_norms.t()), 0)
distances_ = _upper_triangle(euclidean)
plt.hist(distances_, density=True, bins=100, label="distances")
plt.ylabel("Probability")
plt.xlabel("Distance")
plt.title("Distribution of distances between embeddings");
_save_figure(fig, 'distances')

# --- Figure 3: heat map of pairwise Euclidean distances ---
fig = plt.figure(3)
ax = sns.heatmap(euclidean)
_save_figure(fig, 'distances_heat_map')
plt.show()

# --- Figure 4: distribution of pairwise cosine distances ---
fig = plt.figure(4)
distances_normalized = sp.distance.cdist(accumlated_features_sub, accumlated_features_sub, 'cosine')
plt.hist(_upper_triangle(distances_normalized), density=True, bins=100, label="cosine distance")
plt.ylabel("Probability")
plt.xlabel("Cosine Distance")
plt.title("Distribution of cosine distance between embeddings");
_save_figure(fig, 'cosine distance')

# --- Figure 5: heat map of pairwise cosine distances ---
fig = plt.figure(5)
ax = sns.heatmap(distances_normalized)
plt.title("Heat map of cosine distance between embeddings");
_save_figure(fig, 'cosine_distance_heat_map')
plt.show()

# --- Figure 6: distribution of pairwise phylogenetic distances between the examples' species ---
csv_processor = dataloader.dataset.csv_processor
fine_list = csv_processor.getFineList()  # hoisted: looked up once instead of twice per pair
fig = plt.figure(6)
num_examples = len(accumulated_labels)
phylo = torch.zeros(num_examples, num_examples)
for indx, i in enumerate(accumulated_labels):
    for offset, j in enumerate(accumulated_labels[indx+1:]):
        other = indx + 1 + offset
        dist = csv_processor.tax.get_distance(fine_list[i], fine_list[j])
        phylo[indx][other] = phylo[other][indx] = dist

# The histogram uses the raw distances; the matrix kept for later is scaled by its maximum.
distances_phylo_ = _upper_triangle(phylo)
phylo = phylo/torch.max(phylo)
plt.hist(distances_phylo_, density=True, bins=100, label="phylo distance")
plt.ylabel("Probability")
plt.xlabel("Phylo distance")
plt.title("Distribution of phylo distance between embeddings");
_save_figure(fig, 'phylo_distance')
# Raise every value below phylo_distance_cap to the cap.
distances_phylo_normalized = torch.clip(phylo, min = phylo_distance_cap)

# --- Figure 7: heat map of the normalized phylo distances ---
fig = plt.figure(7)
ax = sns.heatmap(distances_phylo_normalized)
plt.title("Heat map of phylo distances between fishes");
_save_figure(fig, 'phylo_distance_heat_map')
plt.show()

# --- Figure 8: heat map of |phylo distance - cosine distance| per pair of examples ---
fig = plt.figure(8)
ax = sns.heatmap(abs(distances_phylo_normalized - distances_normalized))
plt.title("Heat map of normalized distance difference");
_save_figure(fig, 'normalized_distance_diff_heat_map')
plt.show()

# --- Figures 9-11: the same distances averaged per pair of species ---
num_species = len(fine_list)
ans_phylo_dist = torch.zeros(num_species, num_species)
ans_embedding_dist = torch.zeros(num_species, num_species)
freq_phylo_dist = torch.zeros(num_species, num_species)
for indx, i in enumerate(accumulated_labels):
    for offset, j in enumerate(accumulated_labels[indx+1:]):
        other = indx + 1 + offset
        # Accumulate symmetric sums and the number of example pairs seen for species (i, j).
        ans_phylo_dist[i][j] = ans_phylo_dist[j][i] = ans_phylo_dist[j][i]+distances_phylo_normalized[other][indx]
        freq_phylo_dist[i][j] = freq_phylo_dist[j][i] = freq_phylo_dist[j][i]+1
        ans_embedding_dist[i][j] = ans_embedding_dist[j][i] = ans_embedding_dist[j][i]+distances_normalized[other][indx]
# Species pairs that never occur have a zero count and therefore become NaN here.
ans_phylo_dist = torch.div(ans_phylo_dist,freq_phylo_dist)
ans_embedding_dist = torch.div(ans_embedding_dist,freq_phylo_dist)
avg_distance_error = abs(ans_phylo_dist - ans_embedding_dist)

fig = plt.figure(9)
ax = sns.heatmap(ans_phylo_dist)
plt.title("Heat map of average phylo distances between fishes per species");
_save_figure(fig, 'avg_phylo_distance_per_species_heat_map')
plt.show()

fig = plt.figure(10)
ax = sns.heatmap(ans_embedding_dist)
plt.title("Heat map of average embedding distances between fishes per species");
_save_figure(fig, 'avg_embedding_distance_per_species_heat_map')
plt.show()

fig = plt.figure(11)
ax = sns.heatmap(avg_distance_error)
plt.title("Heat map of error of average distances between fishes per species");
_save_figure(fig, 'avg_diff_distance_per_species_heat_map')
plt.show()

# --- Figure 12: confusion matrix of fine labels vs. fine predictions ---
fig = plt.figure(12)
cf_matrix = confusion_matrix(accumulated_labels, accumulated_predictions)
ax = sns.heatmap(cf_matrix, cmap='Blues')
            
            


In [None]:
# Shape of the analysed embedding slice: (number of examples, selected feature columns).
accumlated_features_sub.shape

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
from scipy.stats import kstest, pearsonr

# For every example, compare the average cosine distance to its 5 nearest neighbours in
# embedding space with the average phylo distance to those same neighbours.
# NOTE(review): every query is itself part of the fitted set, so its nearest neighbour is
# presumably the example itself at distance 0 — confirm this is intended.
nbrs = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='cosine').fit(accumlated_features_sub)

fig = plt.figure(13,figsize=(15,15))
phylo_distance_per_example = []
embedding_distance_per_example = []
for indx in range(accumlated_features_sub.shape[0]):
    embedding_distances, indices = nbrs.kneighbors(accumlated_features_sub[indx, :].reshape(1, -1))
    embedding_distance_per_example.append(np.mean(embedding_distances))

    # Phylo distances from this example to the neighbours found in embedding space.
    phylo_distances = distances_phylo_normalized[indx, indices]
    phylo_distance_per_example.append(torch.mean(phylo_distances).item())

X_axis = np.arange(accumlated_features_sub.shape[0])

# Draw the two series side by side inside each example's slot. The previous offsets
# (X - 1 and X, both with width 1) put one series exactly on top of the other.
bar_width = 0.5
plt.bar(X_axis - bar_width / 2, phylo_distance_per_example, bar_width, label = 'Phylo')
plt.bar(X_axis + bar_width / 2, embedding_distance_per_example, bar_width, label = 'Embedding')

# plt.xticks(X_axis, X)
plt.xlabel("Example")
plt.ylabel("Distance")
plt.title("Average KNN distances of each example")
plt.legend()
plt.show()

corr, _ = pearsonr(phylo_distance_per_example, embedding_distance_per_example)
print('Pearsons correlation: %.3f' % corr)

# Two-sample Kolmogorov-Smirnov statistic: the largest gap between the two empirical CDFs
# (0 => the samples are distributed identically, 1 => they do not overlap at all).
ks_stat, ks_p = kstest(phylo_distance_per_example, embedding_distance_per_example)
print('KS test: %.3f' % ks_stat)


In [None]:
# Exploration aid: list the metric names sklearn registers for the 'brute' neighbour search.
import sklearn
sorted(sklearn.neighbors.VALID_METRICS['brute'])

In [None]:
import csv
from scipy.stats import pearsonr

# Pearson correlation between the pairwise phylo distances and the pairwise
# embedding distances (both taken from the upper triangle of their matrices).
corr, _p_value = pearsonr(distances_phylo_, distances_)
print('Pearsons correlation: %.3f' % corr)

# Store the coefficient as a one-cell CSV next to the trial's figures.
correlation_csv = os.path.join(path_base, 'phylo_embeddign_correlation_{}.csv'.format(name_base))
with open(correlation_csv, 'w+', newline='') as csv_file:
    csv.writer(csv_file).writerow([corr])

In [None]:
#playing

# --- Phylogenetic distances between the species of every pair of examples ---
csv_processor = dataloader.dataset.csv_processor
fine_list = csv_processor.getFineList()  # hoisted: looked up once instead of twice per pair
fig = plt.figure(6)
num_examples = len(accumulated_labels)
phylo = torch.zeros(num_examples, num_examples)
for indx, i in enumerate(accumulated_labels):
    for offset, j in enumerate(accumulated_labels[indx+1:]):
        other = indx + 1 + offset
        dist = csv_processor.tax.get_distance(fine_list[i], fine_list[j])
        phylo[indx][other] = phylo[other][indx] = dist

# Indices of the entries on and above the diagonal; reused for every matrix of this size.
upper_rows, upper_cols = torch.triu_indices(num_examples, num_examples)

# The histogram uses the raw distances; the matrix kept for later is scaled by its maximum.
distances_phylo_ = phylo[upper_rows, upper_cols]
phylo = phylo/torch.max(phylo)
plt.hist(distances_phylo_, density=True, bins=100, label="phylo distance")
plt.ylabel("Probability")
plt.xlabel("Phylo distance")
plt.title("Distribution of phylo distance between embeddings");
fig.savefig(os.path.join(path_base, "{}_{}.png".format(name_base,'phylo_distance')))
# Raise every value below phylo_distance_cap to the cap.
distances_phylo_normalized = torch.clip(phylo, min = phylo_distance_cap)

fig = plt.figure(7)
ax = sns.heatmap(distances_phylo_normalized)
plt.title("Heat map of phylo distances between fishes");
plt.show()


# --- Cosine similarity for growing prefixes of the embedding (first 1/4, 1/2, 3/4, all of 512) ---
intervals = [0, int(0.25*512), int(0.5*512), int(0.75*512), 512]

fig, axs = plt.subplots(2,4,figsize=(30,15))
for i in range(len(intervals)-1):
    prefix_len = intervals[i+1]
    # Local name on purpose: do not overwrite the accumlated_features_sub used by other cells.
    prefix_features = accumlated_features[:, 0:prefix_len]

    ax1 = axs[0, i]
    ax2 = axs[1, i]

    # 1 - cosine distance is the cosine similarity, so the plots are labelled as similarity.
    similarity = 1-sp.distance.cdist(prefix_features, prefix_features, 'cosine')
    ax1.hist(similarity[upper_rows, upper_cols], density=True, bins=100, label="cosine similarity")
    ax1.set_ylabel("Probability")
    ax1.set_xlabel("Cosine Similarity")
    ax1.set_title("Distribution of cosine similarity between embeddings (first {} dims)".format(prefix_len));

    sns.heatmap(similarity, ax = ax2)
    ax2.set_title("Heat map of cosine similarity between embeddings (first {} dims)".format(prefix_len));

plt.show()