In [None]:
import numpy as np
import os 
import sys 
import matplotlib
from sklearn.manifold import MDS as classic_MDS
import time 
import scipy
import matplotlib.pyplot as mplt 
import pylab as PLT
import plotly
import plotly.tools as tls
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from itertools import product
from sklearn.neighbors import KNeighborsClassifier
from pprint import pprint
from sklearn.metrics.pairwise import euclidean_distances
plotly.offline.init_notebook_mode()
data_dir = './'
sys.path.append('./')
import bs_dev
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Build the nearest-neighbour distance graph over all input samples
def get_nearest_neighbors_edges(x, k=6, distance_metric = 'euclidean'):
    """Build the directed nearest-neighbour edge list of a sample matrix.

    Parameters
    ----------
    x : array of shape (n_dim, n_samples)
        One sample per column (the matrix is transposed before ``pdist``).
    k : int
        The ``k + 1`` smallest entries of every distance row are kept, so the
        zero self-distance is normally among them.
    distance_metric : str
        Forwarded to ``scipy.spatial.distance.pdist``.

    Returns
    -------
    list
        ``[source_id, neighbour_id, distance]`` triples, grouped by source
        sample in increasing order of distance.
    """
    _, n_samples = x.shape
    dist_matrix = scipy.spatial.distance.squareform(
        scipy.spatial.distance.pdist(x.T, metric=distance_metric))

    edges = []
    for s_id in np.arange(n_samples):
        row = dist_matrix[s_id, :]
        nearest_ids = np.argsort(row)[:k + 1]
        edges.extend([s_id, neigh_id, row[neigh_id]]
                     for neigh_id in nearest_ids)
    return edges


def floyd_warshall(x, k=6, distance_metric='euclidean'):
    """All-pairs shortest-path distances over the nearest-neighbour graph of x.

    Parameters
    ----------
    x : array of shape (n_dim, n_samples)
        One sample per column.
    k : int
        Neighbourhood size forwarded to ``get_nearest_neighbors_edges``.
    distance_metric : str
        Metric forwarded to ``get_nearest_neighbors_edges``.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_samples), dtype float32
        Symmetric matrix of shortest-path lengths. Pairs with no connecting
        path hold ``np.inf``.
    """
    # Forward the caller's settings; they used to be ignored in favour of a
    # hard-coded k=6 and the default metric.
    edges = get_nearest_neighbors_edges(x, k=k,
                                        distance_metric=distance_metric)
    _, n_samples = x.shape

    # np.inf marks "no edge". A finite sentinel would silently cap every
    # path longer than the sentinel itself.
    D = np.full((n_samples, n_samples), np.inf, dtype=np.float32)
    for (st_ind, end_id, distance) in edges:
        D[st_ind, end_id] = distance

    # Relax every pair through each intermediate node in turn. The index is
    # named `via` so it no longer shadows the `k` argument.
    for via in range(n_samples):
        D = np.minimum(D, D[:, via, np.newaxis] + D[np.newaxis, via, :])

    # The neighbour graph is directed; keep the shorter direction per pair.
    return np.minimum(D, D.T)
        

In [None]:
# Load the digits and their labels from the .npz archive in data_dir.
# The archive is read through its 'l' (labels) and 'd' (data) entries.
digits_path = os.path.join(data_dir, 'digits-labels.npz')
load_dic = np.load(digits_path)
digits_labels = load_dic['l']
digits_data = load_dic['d']

# Samples are counted along axis 1, i.e. one digit per column.
n_total_digits = digits_data.shape[1]
# Sanity check: show one randomly chosen datapoint and its label.
indx = np.random.randint(n_total_digits)
print(n_total_digits)
print("Showing a digit: {}".format(digits_labels[indx]))
# Each column is reshaped to a 28x28 image and transposed for display.
mplt.imshow(digits_data[:, indx].T.reshape(28, 28).T)

In [None]:
def pairwise_euclidean(X):
    """Return the Euclidean distances between every pair of rows of X."""
    distances = euclidean_distances(X, X)
    return distances

In [None]:
selected_digits = [0, 1, 3, 9]

# Sorting the labels groups the sample indices by digit; the cumulative class
# counts then mark where each digit's run ends inside that sorted order.
groups_counts = np.bincount(digits_labels)
sorted_numberes_indxes = digits_labels.argsort()
sums_of_counts = np.cumsum(groups_counts)

list_of_sel_digits_data = []
for dig in selected_digits:
    # For dig == 0 the index dig-1 wraps around to the grand total, which the
    # modulo folds back into a start offset of 0.
    run_start = sums_of_counts[dig-1] % sums_of_counts[-1]
    run_end = sums_of_counts[dig]
    list_of_sel_digits_data.append(sorted_numberes_indxes[run_start:run_end])

selected_columns = np.concatenate(list_of_sel_digits_data, axis=0)
selected_digits_data = digits_data[:, selected_columns]
selected_digits_labels = digits_labels[selected_columns]

# Draw a random subset (without replacement) of the selected digits.
n_digits = 1000
selected_indexes = np.random.choice(selected_digits_labels.shape[0],
                                    n_digits, replace=False)
sel_digits_labels = selected_digits_labels[selected_indexes]
sel_digits_data = selected_digits_data[:, selected_indexes]

# Target dissimilarities: plain Euclidean distances between the samples.
D_target = pairwise_euclidean(sel_digits_data.T)
# Alternative target kept for reference (graph shortest-path distances):
# D_target = floyd_warshall(sel_digits_data.T, k=6,
#                             distance_metric='euclidean')

### Collect the results from all MDS algorithms in a dictionary

In [None]:
# Results keyed by method name; the cells below store each method's
# wall-clock 'time' and its 'embedding' under that key.
mds_res = {}

In [None]:
# Bootstrapped pattern-search MDS on the precomputed target distances.
# The timer covers both construction and fitting.
before = time.time()
pat_search_MDS_creator = bs_dev.MDS(dissimilarity='precomputed',
                                    n_components=2,
                                    mode='bootstrapped',
                                    max_iter=100,
                                    starting_radius=2.,
                                    initial_prob=.7,
                                    prob_thresh=0.2,
                                    a_bs=0.05,
                                    verbose=0)
x_low_rank, time_logger = pat_search_MDS_creator.fit_transform(D_target)
now = time.time()
mds_res['BS CSMDS'] = {'time': now - before,
                       'embedding': x_low_rank}
print(mds_res['BS CSMDS']['time'])

In [None]:
# Randomized pattern-search MDS on the precomputed target distances.
# The timer covers both construction and fitting.
before = time.time()
pat_search_MDS_creator = bs_dev.MDS(dissimilarity='precomputed',
                                    n_components=2,
                                    mode='randomized',
                                    max_iter=100,
                                    starting_radius=1.,
                                    initial_prob=.7,
                                    prob_thresh=0.3,
                                    a_bs=0.05,
                                    verbose=0)
x_low_rank, time_logger = pat_search_MDS_creator.fit_transform(D_target)
now = time.time()
mds_res['RN CSMDS'] = {'time': now - before,
                       'embedding': x_low_rank}
print(mds_res['RN CSMDS']['time'])

In [None]:
# Full-search pattern-search MDS on the precomputed target distances.
# The timer covers both construction and fitting.
before = time.time()
pat_search_MDS_creator = bs_dev.MDS(dissimilarity='precomputed',
                                    n_components=2,
                                    mode='full_search',
                                    max_iter=100,
                                    starting_radius=1.,
                                    initial_prob=.8,
                                    prob_thresh=0.3,
                                    a_bs=0.05,
                                    verbose=0)
x_low_rank, time_logger = pat_search_MDS_creator.fit_transform(D_target)
now = time.time()
mds_res['FS CSMDS'] = {'time': now - before,
                       'embedding': x_low_rank}
print(mds_res['FS CSMDS']['time'])

In [None]:
# scikit-learn's MDS (stored under the 'SMACOF MDS' key) as the baseline,
# run once (n_init=1) on a single job.
before = time.time()
embedding = classic_MDS(dissimilarity='precomputed', n_components=2,
                        n_init=1, n_jobs=1)
X_transformed = embedding.fit_transform(D_target)
now = time.time()
mds_res['SMACOF MDS'] = {'time': now - before,
                         'embedding': X_transformed}
print(mds_res['SMACOF MDS']['time'])
print(embedding.stress_)

In [None]:
import matplotlib.pyplot as PLT
from matplotlib.offsetbox import AnnotationBbox, OffsetImage

def digits_scatter2D(embedding, digit_labels, digit_true_data, title='',
                     n_shown=200, out_dir='./images'):
    """Scatter a random subset of a 2-D embedding, annotated with digit images.

    Parameters
    ----------
    embedding : array of shape (n_samples, >=2)
        Low-dimensional coordinates; columns 0 and 1 are plotted.
    digit_labels : sequence of int
        Label of every sample, used as an index into the colour palette.
    digit_true_data : array of shape (784, n_samples)
        Original images, one per column; each is reshaped to 28x28.
    title : str
        Figure title. Its first two whitespace-separated words also name the
        saved PDF.
    n_shown : int
        Maximum number of points to draw (capped at the number of samples).
    out_dir : str
        Directory that receives the PDF; created if it does not exist.
    """
    # One distinct colour per digit 0-9. The palette used to list 'sienna'
    # twice, which made digits 2 and 8 indistinguishable.
    color_mapper = ['green', 'blue', 'sienna', 'orange', 'purple',
                    'grey', 'aqua', 'pink', 'olive', 'red']

    n_points = embedding.shape[0]
    # Cap the sample size: drawing more points than exist without
    # replacement raises a ValueError.
    sel_inds = np.random.choice(np.arange(n_points),
                                size=min(n_shown, n_points),
                                replace=False)

    fig = PLT.gcf()
    fig.set_size_inches(18.5, 18.5)
    fig.clf()
    ax = PLT.subplot(111)

    PLT.scatter(embedding[sel_inds, 0], embedding[sel_inds, 1], s=10000,
                c=[color_mapper[digit_labels[ind]] for ind in sel_inds],
                alpha=0.5)

    # Overlay the actual digit image next to every plotted point.
    for ind in sel_inds:
        xy = embedding[ind, :]
        img_to_show = digit_true_data[:, ind].T.reshape(28, 28).T
        imagebox = OffsetImage(img_to_show, zoom=1.1)
        ab = AnnotationBbox(imagebox, xy,
                            xybox=(-5., 5.),
                            xycoords='data',
                            boxcoords="offset points",
                            arrowprops=dict(arrowstyle="->"))
        ax.add_artist(ab)

    ax.grid(True)
    fig.suptitle(title, fontsize=24)
    PLT.tight_layout()

    # savefig does not create missing directories.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    file_name = 'qual_mnist_' + "_".join(title.split()[:2]) + '.pdf'
    PLT.savefig(os.path.join(out_dir, file_name), format='pdf')
    PLT.draw()
    PLT.show()
    

In [None]:
# Render one annotated scatter plot per MDS variant, with its run time
# (rounded to two decimals) in the title.
for k, v in mds_res.items():
    plot_title = '{} ({} seconds)'.format(k, round(v['time'],2))
    digits_scatter2D(v['embedding'], sel_digits_labels, sel_digits_data,
                     title=plot_title)
# digits_scatter2D(X_transformed, sel_digits_labels, sel_digits_data, title='SMACOF MDS')

### Check how amenable the obtained embeddings are to classification

In [None]:
print(digits_data.shape)
print(digits_labels.shape)

# Draw n_total distinct samples (columns) and build their distance matrix.
n_total = 3000
n_available = digits_labels.shape[0]
all_inds = np.random.choice(np.arange(n_available),
                            size=n_total, replace=False)
Y_all = digits_labels[all_inds]
X_all = digits_data[:, all_inds]
D_goal = pairwise_euclidean(X_all.T)

In [None]:
emb_dim = 10
mds_res = {}

# (result key, search mode, initial_prob) of every pattern-search variant,
# in execution order; all other settings are shared.
csmds_runs = [('Bootstrapped CSMDS', 'bootstrapped', .7),
              ('Randomized CSMDS', 'randomized', .7),
              ('Full Search CSMDS', 'full_search', .8)]
for run_name, run_mode, run_initial_prob in csmds_runs:
    before = time.time()
    pat_search_MDS_creator = bs_dev.MDS(n_components=emb_dim,
                                        starting_radius=5.,
                                        max_iter=200,
                                        mode=run_mode,
                                        prob_thresh=0.3,
                                        initial_prob=run_initial_prob,
                                        a_bs=0.05,
                                        verbose=0,
                                        dissimilarity='precomputed')
    x_low_rank, time_logger = pat_search_MDS_creator.fit_transform(D_goal)
    now = time.time()
    mds_res[run_name] = {'time': now - before,
                         'embedding': x_low_rank}
    print(now - before)

# scikit-learn's MDS baseline on the same distances.
before = time.time()
embedding = classic_MDS(n_components=emb_dim, n_init=1,
                        n_jobs=1, dissimilarity='precomputed')
X_transformed = embedding.fit_transform(D_goal)
now = time.time()
mds_res['SMACOF MDS'] = {'time': now - before,
                         'embedding': X_transformed}
print(now - before)
print(embedding.stress_)

In [None]:
# Per-method result rows: 'Dims', 'Time' and one 'K=<k>' accuracy column
# for every neighbourhood size tried below.
class_res = {'Initial':{}, 
             'SMACOF MDS':{}, 
             'Full Search CSMDS':{}, 
             'Randomized CSMDS':{},
             'Bootstrapped CSMDS':{}}

# Random 90/10 train/test split over the sampled points.
n_samples = D_goal.shape[0]
n_train = int(n_samples * 0.9)
train_inds = np.random.choice(np.arange(n_samples),
                              size=n_train, replace=False)
# Everything not drawn for training, in ascending order. setdiff1d replaces
# a quadratic `i not in train_inds` scan over the array.
test_inds = np.setdiff1d(np.arange(n_samples), train_inds)
X_te = X_all[:, test_inds]
Y_te = Y_all[test_inds]
X_tr = X_all[:, train_inds]
Y_tr = Y_all[train_inds]

# These entries do not depend on k, so fill them once.
class_res['Initial']['Dims'] = X_tr.shape[0]
class_res['Initial']['Time'] = 0
for method, res in mds_res.items():
    class_res[method]['Dims'] = res['embedding'].shape[1]
    class_res[method]['Time'] = round(res['time'],2)

neighbor_counts = np.arange(1, 10, 2)
for k in neighbor_counts:
    knn = KNeighborsClassifier(n_neighbors=k, weights='uniform', 
                               algorithm='brute', leaf_size=30, 
                               p=2, metric='minkowski', 
                               metric_params=None, n_jobs=28)
    # Baseline: classify in the original input space.
    knn.fit(X_tr.T, Y_tr)
    Y_pred = knn.predict(X_te.T)
    acc = accuracy_score(Y_te, Y_pred)
    class_res['Initial']['K='+str(k)] = round(100 * acc, 2)

    # Same split, but classify in each method's embedding space.
    for method, res in mds_res.items():
        knn.fit(res['embedding'][train_inds, :], Y_tr)
        Y_pred = knn.predict(res['embedding'][test_inds, :])
        acc = accuracy_score(Y_te, Y_pred)
        class_res[method]['K='+str(k)] = round(100. * acc, 2)

df = pd.DataFrame.from_dict(class_res, orient='index')
df = df[['Dims', 'Time'] + ['K='+str(k) for k in neighbor_counts]]
df = df.reindex(['Initial', 'SMACOF MDS', 'Full Search CSMDS', 'Randomized CSMDS', 'Bootstrapped CSMDS'])
print(df)
print(df.to_latex())

In [None]:
print(digits_data.shape)
print(digits_labels.shape)
n_total = 3000
all_inds = np.random.choice(np.arange(digits_labels.shape[0]), 
                            size=n_total, replace=False)
X_all = digits_data[:, all_inds]
Y_all = digits_labels[all_inds]
D_goal = pairwise_euclidean(X_all.T)

emb_dim = 10 
mds_res = {}
n_samples = D_goal.shape[0]
neighbor_counts = np.arange(1, 10, 2)

# Sweep: starting radius -> initial_prob -> prob_thresh. Full-search CSMDS
# and the SMACOF baseline are refit once per radius, randomized CSMDS once
# per (radius, initial_prob), bootstrapped CSMDS for every combination.
for rad in [1., 2., 5.]:
    before = time.time()
    pat_search_MDS_creator = bs_dev.MDS(n_components=emb_dim,
                                        starting_radius=rad,
                                        max_iter=200,
                                        mode='full_search',
                                        prob_thresh=0.3,
                                        initial_prob=.8,
                                        a_bs=0.05,
                                        verbose=0,
                                        dissimilarity='precomputed')
    (x_low_rank,
     time_logger) = pat_search_MDS_creator.fit_transform(D_goal)
    now = time.time()
    mds_res['Full Search CSMDS'] = {'time': now - before,
                                    'embedding': x_low_rank}
    print(now - before)

    before = time.time()
    embedding = classic_MDS(n_components=emb_dim, n_init=1, 
                            n_jobs=1, dissimilarity='precomputed')
    X_transformed = embedding.fit_transform(D_goal)
    now = time.time()
    mds_res['SMACOF MDS'] = {'time': now - before,
                             'embedding': X_transformed}
    print(now - before)
    print(embedding.stress_ )

    for initial_prob in [0.3, 0.4, 0.5, 0.6, 0.7]:
        before = time.time()
        # Use the swept initial_prob. It used to be hard-coded to .7, so
        # this loop re-ran one identical configuration five times.
        pat_search_MDS_creator = bs_dev.MDS(n_components=emb_dim,
                                            starting_radius=rad,
                                            max_iter=200,
                                            mode='randomized',
                                            prob_thresh=0.3,
                                            initial_prob=initial_prob,
                                            a_bs=0.05,
                                            verbose=0,
                                            dissimilarity='precomputed')
        (x_low_rank,
         time_logger) = pat_search_MDS_creator.fit_transform(D_goal)
        now = time.time()
        mds_res['Randomized CSMDS'] = {'time': now - before,
                                       'embedding': x_low_rank}
        print(now - before)

        for prob_thresh in [0.1, 0.2, 0.3]:
            before = time.time()
            pat_search_MDS_creator = bs_dev.MDS(n_components=emb_dim,
                                                starting_radius=rad,
                                                max_iter=200,
                                                mode='bootstrapped',
                                                prob_thresh=prob_thresh,
                                                initial_prob=initial_prob,
                                                a_bs=0.05,
                                                verbose=0,
                                                dissimilarity='precomputed')
            (x_low_rank,
             time_logger) = pat_search_MDS_creator.fit_transform(D_goal)
            now = time.time()
            mds_res['Bootstrapped CSMDS'] = {'time': now - before,
                                             'embedding': x_low_rank}
            print(now - before)

            class_res = {'Initial':{}, 
                         'SMACOF MDS':{}, 
                         'Full Search CSMDS':{}, 
                         'Randomized CSMDS':{},
                         'Bootstrapped CSMDS':{}}

            # A fresh random 90/10 split is drawn for every configuration.
            n_train = int(n_samples * 0.9)
            train_inds = np.random.choice(np.arange(n_samples),
                                          size=n_train, replace=False)
            # Complement of the training draw, ascending. setdiff1d replaces
            # a quadratic `i not in train_inds` scan over the array.
            test_inds = np.setdiff1d(np.arange(n_samples), train_inds)
            X_te = X_all[:, test_inds]
            Y_te = Y_all[test_inds]
            X_tr = X_all[:, train_inds]
            Y_tr = Y_all[train_inds]

            # These entries do not depend on k, so fill them once.
            class_res['Initial']['Dims'] = X_tr.shape[0]
            class_res['Initial']['Time'] = 0
            for method, res in mds_res.items():
                class_res[method]['Dims'] = res['embedding'].shape[1]
                class_res[method]['Time'] = round(res['time'],2)

            for k in neighbor_counts:
                knn = KNeighborsClassifier(n_neighbors=k, weights='uniform', 
                                           algorithm='brute', leaf_size=30, 
                                           p=2, metric='minkowski', 
                                           metric_params=None, n_jobs=28)
                # Baseline: classify in the original input space.
                knn.fit(X_tr.T, Y_tr)
                Y_pred = knn.predict(X_te.T)
                acc = accuracy_score(Y_te, Y_pred)
                class_res['Initial']['K='+str(k)] = round(100 * acc, 2)

                # Same split, classified in each method's embedding space.
                for method, res in mds_res.items():
                    knn.fit(res['embedding'][train_inds, :], Y_tr)
                    Y_pred = knn.predict(res['embedding'][test_inds, :])
                    acc = accuracy_score(Y_te, Y_pred)
                    class_res[method]['K='+str(k)] = round(100. * acc, 2)

            df = pd.DataFrame.from_dict(class_res, orient='index')
            df = df[['Dims', 'Time'] + ['K='+str(k) for k in neighbor_counts]]
            df = df.reindex(['Initial', 'SMACOF MDS', 'Full Search CSMDS', 'Randomized CSMDS', 'Bootstrapped CSMDS'])
            print(df)
            print(df.to_latex())

            print("==============Above was: r:{} init:{} thresh:{}==================="
                  "".format(rad, initial_prob, prob_thresh))