In [None]:
import os, sys
import importlib as impL
import numpy as np
from pandas import DataFrame as pd_df

In [None]:
def get_parent_dir(x):
    """Return the directory component of path ``x`` (everything before the last separator)."""
    parent, _last_component = os.path.split(x)
    return parent
get_parent_dir(sys.path[0])

In [None]:
# Extend the import search path with the parent of sys.path[0] and its
# 'vae_torch' sub-folder. Both are inserted at index 1, so after the second
# insert the parent directory sits ahead of 'vae_torch'.
# NOTE: re-running this cell inserts the same two entries again.
sys.path.insert(1,os.path.join(get_parent_dir(sys.path[0]),'vae_torch'))
sys.path.insert(1,get_parent_dir(sys.path[0]))

In [None]:
import vae_torch_model as vtm
import helperFuncs as funcH
# Directory used below as the folder holding the saved .npz file.
# NOTE(review): presumably resolved per machine by helperFuncs - confirm there.
desktop_dir = funcH.getVariableByComputerName('desktop_dir')

In [None]:
def load_from_saved_corr_file(fn = '/home/doga/Desktop/correspondance_find.npz'):
    """Load the three arrays stored in a correspondence ``.npz`` archive.

    Parameters
    ----------
    fn : str or path-like
        Path of an ``.npz`` file containing the keys ``'predictions'``,
        ``'bottleneck_vec'`` and ``'labels'``.

    Returns
    -------
    tuple
        ``(bottleneck_vec, predictions, labels)`` - note the order differs
        from the order in which the shapes are printed.

    Raises
    ------
    KeyError
        If one of the three expected keys is missing from the archive.
    """
    # np.load on an .npz returns a lazy NpzFile that keeps the file open;
    # the context manager closes it as soon as the arrays have been read.
    with np.load(fn) as archive:
        predictions = archive['predictions']
        bottleneck_vec = archive['bottleneck_vec']
        labels = archive['labels']
    print("loaded from file - ", fn)
    print(predictions.shape)
    print(bottleneck_vec.shape)
    print(labels.shape)
    return bottleneck_vec, predictions, labels

def analyze_corresondance_results(correspondance_tuple, centroid_df, pred_vec, lab_vec):
    """Print how consistent a set of correspondence pairs is.

    ``correspondance_tuple[0]`` and ``correspondance_tuple[1]`` are index
    arrays of equal length; pair ``n`` links sample ``[0][n]`` to sample
    ``[1][n]``. Three things are printed:

    1. per-label sums over the cluster-centre rows of ``centroid_df``
       (columns ``sampleID``, ``klusterID``, ``num_of_samples`` are read),
    2. the share of pairs whose two ends have the same value in ``pred_vec``,
    3. the same share for ``lab_vec``, followed by that confusion matrix.

    Relies on ``confusion_matrix`` being available in the notebook namespace.
    Returns nothing.
    """

    def _diagonal_share(mat):
        # percentage of the matrix mass that lies on the main diagonal
        return 100 * np.trace(mat) / mat.sum()

    centre_rows = np.asarray(centroid_df['sampleID'], dtype=int)
    summary = pd_df({
        'labels': lab_vec[centre_rows],
        'klusterID': np.asarray(centroid_df['klusterID'], dtype=int),
        'sampleCounts': np.asarray(centroid_df['num_of_samples'], dtype=int),
    })
    print('correspondance results({:}):'.format(len(correspondance_tuple[0])))
    print(summary.groupby(['labels'])[['labels', 'sampleCounts']].sum())

    # agreement of the predicted cluster ids at both ends of every pair
    pred_cm = confusion_matrix(pred_vec[correspondance_tuple[0]],
                               pred_vec[correspondance_tuple[1]])
    print("_confMat_corr_preds - acc({:6.4f})".format(_diagonal_share(pred_cm)))

    # agreement of the labels at both ends of every pair
    label_cm = confusion_matrix(lab_vec[correspondance_tuple[0]],
                                lab_vec[correspondance_tuple[1]])
    print("confMat - acc({:6.4f}), correspondance match:\n".format(_diagonal_share(label_cm)), pd_df(label_cm))

# Toy feature matrix: 10 samples x 3 features, uniform in [0, 1).
# No seed is set, so the values differ on every run of this cell.
b_v = np.random.rand(10,3)
#print(b_v)

In [None]:
# The archive read below was written with a call of this form (kept for reference):
# np.savez('/home/doga/Desktop/correspondance_find.npz', bottleneck_vec=bottleneck_vec, predictions=pred_vec, labels=lab_vec)
# Load the saved bottleneck vectors, predictions and labels from desktop_dir.
bottleneck_vec, pred_vec, lab_vec = load_from_saved_corr_file(os.path.join(desktop_dir, 'correspondance_find_epoch1009_conf516.npz'))

In [None]:
# Re-import helperFuncs so that edits made to the module since the last import are used.
impL.reload(funcH)
def get_cluster_correspondance_ids_jupy(X, cluster_ids, correspondance_type="shuffle", verbose=0):
    """Pair up samples that share a cluster and return the pairs as index arrays.

    Parameters
    ----------
    X : ndarray indexed as ``X[rows, :]``
        Feature matrix; handed to ``funcH.get_cluster_centroids`` and, in
        ``knear`` mode, to ``funcH.get_dist_inds``.
    cluster_ids : array-like
        Cluster id of every sample.
    correspondance_type : str
        * ``'shuffle'``   - two independent shuffles of a cluster's indices
          are paired element-wise.
        * ``'knear<k>'``  - (e.g. ``'knear1'``) every sample of a cluster is
          paired with the ``k`` indices ``funcH.get_dist_inds`` returns for it;
          ``k`` is capped at the cluster size.
        * anything else   - "centered": every sample of a cluster except its
          centre sample (``centroid_df['sampleID']``) is paired with that centre.
    verbose : int
        Values above 0, 1 and 2 print increasingly detailed diagnostics.

    Returns
    -------
    (ii_ret, io_ret) : tuple of int ndarrays
        Pair ``n`` is ``(ii_ret[n], io_ret[n])``. Every pair appears in both
        directions (a-b and b-a) and the pair order is randomly permuted.
    centroid_df
        The frame returned by ``funcH.get_cluster_centroids`` with an added
        ``'num_of_samples'`` column (one entry per unique cluster id).
    """
    centroid_df = funcH.get_cluster_centroids(ft=X, predClusters=cluster_ids, verbose=0)
    uq_pr = np.unique(cluster_ids)
    inds_in = []
    inds_out = []
    num_of_samples = []
    for i in range(len(uq_pr)):
        cluster_id = uq_pr[i]
        # NOTE(review): the lookup is done with the loop position ``i`` and not
        # with ``cluster_id``; the two coincide only when the ids are exactly
        # 0..K-1 - confirm against funcH.getInds before changing.
        cluster_inds = funcH.getInds(cluster_ids, i)
        num_of_samples.append(len(cluster_inds))
        if correspondance_type == 'shuffle':
            iin_cur = cluster_inds.copy()
            np.random.shuffle(iin_cur)
            out_cur = cluster_inds.copy()
            np.random.shuffle(out_cur)
        elif 'knear' in correspondance_type:
            if verbose > 0:
                print("\n***\nknear-row{:}\n".format(i), cluster_inds)
            k = int(correspondance_type.replace('knear', ''))
            # look at the closest k samples for each sample
            X_sub = X[cluster_inds, :]
            k = np.minimum(len(cluster_inds), k)
            d_inds, d_vals = funcH.get_dist_inds(X_sub, k=k, metric="euclidean", verbose=0)
            # d_inds run from 0 to len(cluster_inds);
            # they are switched to the real cluster_inds below
            if verbose > 2:
                print("cluster_inds:\n", cluster_inds)
                print("d_inds in:", d_inds)
            # d_inds.shape = [len(cluster_inds), k]
            # each row represents a sample and all columns represent its nearest neighbours,
            # so each sample is repeated k times to line up with its k neighbours
            sidx = np.array([cluster_inds, ] * k).T.flatten()
            if verbose > 1:
                print("i = ", i)
                print("sidx = \n", sidx)
            d_inds = cluster_inds[d_inds.flatten()]
            if verbose > 1:
                print("d_inds = \n", d_inds)
            iin_cur = sidx.copy()
            out_cur = d_inds.copy()
        else:
            center_sample_inds = centroid_df['sampleID'].iloc[i]
            if verbose > 0:
                # int(...) keeps the 'd' format valid even if the id is stored as a float
                print("cluster_id({:-3d}), sampleCount({:-4d}), centerSampleId({:-5d})".format(int(cluster_id),
                                                                                               len(cluster_inds),
                                                                                               int(center_sample_inds)))
            # inds_in  <-- all sample ids except the cluster centre
            # inds_out <-- the centre sample id, repeated to the length of inds_in
            # reshape(-1) keeps the result 1-D: squeeze() would collapse a single
            # remaining sample to a 0-d array, which np.concatenate below rejects.
            iin_cur = np.asarray(cluster_inds[np.where(center_sample_inds != cluster_inds)], dtype=int).reshape(-1)
            out_cur = np.asarray(np.ones(iin_cur.shape) * center_sample_inds, dtype=int)

        if verbose > 0:
            print("iin_cur.shape{:}, out_cur.shape{:}".format(iin_cur.shape, out_cur.shape))
            print("iin=", iin_cur)
            print("out=", out_cur)
        inds_in.append(iin_cur)
        inds_out.append(out_cur)

    # first concatenate the per-cluster lists into flat ndarrays
    inds_in = np.asarray(np.concatenate(inds_in), dtype=int)
    inds_out = np.asarray(np.concatenate(inds_out), dtype=int)

    # emit every pair in both directions: a-b and b-a (applies to all modes, 'knear' included)
    ii_ret = np.asarray(np.concatenate([inds_in, inds_out]), dtype=int)
    io_ret = np.asarray(np.concatenate([inds_out, inds_in]), dtype=int)

    # now shuffle so that the pairs are not ordered by cluster during learning
    print("shuffle all")
    p = np.random.permutation(len(ii_ret))
    ii_ret = ii_ret[p]
    io_ret = io_ret[p]

    if verbose > 0:
        print("inds_in.shape{:}, inds_out.shape{:}".format(inds_in.shape, inds_out.shape))
    centroid_df['num_of_samples'] = num_of_samples
    return (ii_ret, io_ret), centroid_df

In [None]:
# Toy run on the random matrix b_v: 10 samples with hand-written cluster ids
# and class ids, paired by the helperFuncs implementation.
correspondance_type = 'knear1'
impL.reload(funcH)
cluster_ids=np.asarray([0,0,0,0,1,1,2,2,1,1], dtype=int)
classss_ids=np.asarray([1,1,1,1,2,2,2,2,3,3], dtype=int)
# pass the variable set above so that changing it actually changes the run
corr_inds, centroid_df = funcH.get_cluster_correspondance_ids(b_v, cluster_ids=cluster_ids, correspondance_type=correspondance_type, verbose=0)

In [None]:
# analyze_corresondance_results reads `confusion_matrix` from the notebook
# namespace, so this import has to run before the call below.
from sklearn.metrics import confusion_matrix
# cluster_ids is passed as pred_vec and classss_ids as lab_vec.
analyze_corresondance_results(corr_inds, centroid_df, cluster_ids, classss_ids)

In [None]:
# Reload helperFuncs, then build the euclidean distance data for the toy matrix b_v.
# NOTE(review): presumably D is the pairwise distance matrix and sort_inds the
# per-row sorted neighbour indices - confirm in helperFuncs.create_dist_mat.
impL.reload(funcH)
D, sort_inds = funcH.create_dist_mat(x=b_v, metric="euclidean", verbose=0)

In [None]:
# Inspect the first value returned by funcH.create_dist_mat.
print(D)

In [None]:
# Inspect the second value returned by funcH.create_dist_mat.
print(sort_inds)

In [None]:
# Hand-entered list of 10 values with NaN in the first position, used to check argsort below.
# NOTE(review): presumably copied from one row of a printed D - confirm.
a = [np.nan, 0.67650314, 0.42890377, 0.92405289, 0.43106637, 0.58283084, 0.38471857, 0.35347397, 0.56549665, 0.76402288]

In [None]:
# Ascending order of `a`; numpy places NaN last, so index 0 ends up at the end.
print(np.argsort(a))

In [None]:
# First two columns of sort_inds for every row.
print(sort_inds[:,:2])

In [28]:
# 5x5 test matrix holding 0..24 in row-major order.
_X_all = np.reshape(np.arange(25), (5,5))
print("_X_all = \n", _X_all)
# Index grids: _Y_id_i[r, c] == c (column index) ...
_Y_id_i = np.asarray(([np.arange(5),]*5),dtype=int)
print("_Y_id_i = \n", _Y_id_i)
# ... and its transpose, _Y_id_j[r, c] == r (row index).
_Y_id_j = np.asarray(([np.arange(5),]*5),dtype=int).T
print("_Y_id_j = \n", _Y_id_j)
# Strict upper triangle of _X_all: the diagonal and everything below it become 0.
_X_udi = np.triu(_X_all, 1)
print("_X_udi = \n", _X_udi)

_X_all = 
 [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]]
_Y_id_i = 
 [[0 1 2 3 4]
 [0 1 2 3 4]
 [0 1 2 3 4]
 [0 1 2 3 4]
 [0 1 2 3 4]]
_Y_id_j = 
 [[0 0 0 0 0]
 [1 1 1 1 1]
 [2 2 2 2 2]
 [3 3 3 3 3]
 [4 4 4 4 4]]
_X_udi = 
 [[ 0  1  2  3  4]
 [ 0  0  7  8  9]
 [ 0  0  0 13 14]
 [ 0  0  0  0 19]
 [ 0  0  0  0  0]]


In [29]:
# Row (idx_in) and column (idx_ou) index of every strict-upper-triangle entry,
# read off the two index grids.
idx_in = _Y_id_j[np.triu_indices(5, k = 1)]
idx_ou = _Y_id_i[np.triu_indices(5, k = 1)]
print("X = \n", _X_all[np.triu_indices(5, k = 1)])
print("in = \n", idx_in)
print("out = \n", idx_ou)
# Indexing with (idx_in, idx_ou) yields the same values as the triu_indices lookup above.
print("_X_ = \n", _X_all[idx_in,idx_ou])

X = 
 [ 1  2  3  4  7  8  9 13 14 19]
in = 
 [0 0 0 0 1 1 1 2 2 3]
out = 
 [1 2 3 4 2 3 4 3 4 4]
_X_ = 
 [ 1  2  3  4  7  8  9 13 14 19]


In [None]:
# Reload helperFuncs and run get_linearized_distance_matrix on the 5x5 test
# matrix without sorting (sort_dist=None); the return value is discarded and
# only the verbose=1 output is of interest.
impL.reload(funcH)
_ = funcH.get_linearized_distance_matrix(_X_all,verbose=1, sort_dist=None)

In [None]:
# Same call with sort_dist="ascend".
_ = funcH.get_linearized_distance_matrix(_X_all,verbose=1, sort_dist="ascend")

In [None]:
# Same call with sort_dist="descend".
_ = funcH.get_linearized_distance_matrix(_X_all,verbose=1, sort_dist="descend")

In [None]:
# 5*4/2 = 10 is the number of strict-upper-triangle entries of a 5x5 matrix;
# `/` yields the float 10.0, so the resulting array has a float dtype.
np.arange(5*4/2)

In [None]:
# Run the same helper on D (from funcH.create_dist_mat) with ascending sort.
_ = funcH.get_linearized_distance_matrix(D,verbose=1, sort_dist="ascend")

In [16]:
import numpy as np
# Column indices of the strict upper triangle of a 5x5 matrix.
np.triu_indices(5, k = 1)[1]

array([1, 2, 3, 4, 2, 3, 4, 3, 4, 4])

In [6]:
# Print row index i once for each of its (5 - i - 1) strict-upper-triangle entries.
# `i` stays defined at module level after the loop (last value 4).
for i in range(5):
    for j in range(5-i-1):
        print(i,end=',')

0,0,0,0,1,1,1,2,2,3,

In [9]:
# The `for` clauses are in the reverse order of the nested loop above: the
# outermost range(5-i-1) is evaluated first, using whatever `i` already exists
# in the notebook namespace rather than the comprehension's own `i`.
# NOTE(review): presumably that was the 4 left by the earlier loop cell, which
# makes the range empty and the result []. A later cell uses the intended
# order, `for i in range(n) for j in range(n-i-1)`.
[i for j in range(5-i-1) for i in range(5)]

[]

In [25]:
# Matrix size used by the two index-building expressions that follow.
n = 5

In [26]:
# Row index of every strict-upper-triangle entry of an n x n matrix, in row-major order.
np.asarray([i for i in range(n) for j in range(n-i-1)], dtype=np.int32)

array([0, 0, 0, 0, 1, 1, 1, 2, 2, 3], dtype=int32)

In [27]:
# Matching column index (i + j + 1) for each of those entries; for n = 5 this
# equals np.triu_indices(5, k=1)[1].
np.asarray([i+j+1 for i in range(n) for j in range(n-i-1)], dtype=np.int32)

array([1, 2, 3, 4, 2, 3, 4, 3, 4, 4], dtype=int32)