In [2]:
import os
import re
import time

# ---- Experiment configuration (read as globals by the cells below) ----

# Lens passed through build_mapper_graph to KeplerMapper.fit_transform.
PROJECTION = "knn_distance_2"
# Number of resample -> Mapper graph -> metrics repetitions per sample size.
NUM_RUNS = 100
# Selects the data source: "WORD_VECTOR" or a name containing "MNIST_UMAP_"
# whose last "_"-separated token is parsed as an integer n_components.
DATASET = "WORD_VECTOR"
# Sample sizes swept are range(SAMPLE_STEP, MAX_SAMPLES, SAMPLE_STEP),
# so MAX_SAMPLES itself is excluded.
SAMPLE_STEP = 50
MAX_SAMPLES = 500
# Forwarded to np.random.choice(replace=...) when drawing each sample.
WITH_REPLACEMENT = True
# 1 runs the sweep serially with tqdm progress bars; any other value hands
# the sweep to parallel_process with n_jobs=N_JOBS.
N_JOBS = 1
# Cover settings: km.Cover(n_cubes=N_CUBES, perc_overlap=P_OVERLAP).
N_CUBES = 10
P_OVERLAP = 0.5
# Cluster count given to AgglomerativeClustering, the clusterer used by mapper.map.
N_CLUSTERS = 3

In [3]:
# Identifier for this run: the configuration values joined by underscores,
# in a fixed order (N_JOBS is deliberately not part of the name).
_project_name_fields = (
    PROJECTION,
    NUM_RUNS,
    DATASET,
    SAMPLE_STEP,
    MAX_SAMPLES,
    WITH_REPLACEMENT,
    N_CUBES,
    P_OVERLAP,
    N_CLUSTERS,
)
PROJECT_NAME = "{}_{}_{}_{}_{}_{}_{}_{}_{}".format(*_project_name_fields)
print(PROJECT_NAME)

knn_distance_2_100_WORD_VECTOR_50_500_True_10_0.5_3


In [8]:
"""
https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.estrada_index.html#networkx.algorithms.centrality.estrada_index
https://kepler-mapper.scikit-tda.org/notebooks/Adapters.html
"""

# !pip install seaborn
# !pip install scikit-learn
# !pip install networkx
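# !pip install umap  # NOTE: probably not needed -- `umap.umap_` (imported below) ships in the `umap-learn` distribution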
# !pip install umap-learn
# !pip install kmapper
# !pip install gensim


import json
import time
from collections import Counter
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd
import pytz
import numpy as np
import seaborn as sns
from sklearn.metrics import roc_auc_score
from scipy.stats import spearmanr
from datetime import date, datetime
import calendar
from scipy.optimize import minimize_scalar
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from umap.umap_ import UMAP
import networkx as nx
import os
from sklearn.cluster import AgglomerativeClustering
from networkx.algorithms.approximation.treewidth import treewidth_min_fill_in
import kmapper as km
import os
from sklearn.cluster import AgglomerativeClustering
from networkx.algorithms.approximation.treewidth import treewidth_min_fill_in
from networkx.algorithms.centrality import estrada_index
from networkx.algorithms.cycles import cycle_basis 
import gensim.downloader as api
import tensorflow as tf

import logging

# Silence everything below ERROR, both on the root Python logger and on
# TensorFlow's own (v1-compat) logger.
logging.getLogger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Plot theme and DataFrame display limits used for the rest of the notebook.
sns.set(style="darkgrid")
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 20)


def load_mnist(path, kind='train'):
    """Load MNIST data from `path`.

    Reads the gzip-compressed files ``<kind>-labels-idx1-ubyte.gz`` and
    ``<kind>-images-idx3-ubyte.gz`` located in ``path``.

    Parameters
    ----------
    path : str
        Directory containing the two ``.gz`` files.
    kind : str, default 'train'
        File-name prefix; the notebook uses 'train' and 't10k'.

    Returns
    -------
    images : numpy.ndarray of uint8, shape (len(labels), 784)
        One flattened image per row.
    labels : numpy.ndarray of uint8, shape (n,)
        One label per image.

    Raises
    ------
    OSError
        If either file cannot be opened or is not valid gzip data.
    ValueError
        If the image payload cannot be reshaped to (len(labels), 784).
    """
    # The docstring must be the first statement of the function; it used to
    # sit after these imports, which left ``load_mnist.__doc__`` empty.
    import os
    import gzip
    import numpy as np

    labels_path = os.path.join(path, '%s-labels-idx1-ubyte.gz' % kind)
    images_path = os.path.join(path, '%s-images-idx3-ubyte.gz' % kind)

    with gzip.open(labels_path, 'rb') as lbpath:
        # The first 8 bytes of the label file are header, not labels.
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8, offset=8)

    with gzip.open(images_path, 'rb') as imgpath:
        # The first 16 bytes of the image file are header; the rest is
        # 784 bytes per image.
        images = np.frombuffer(imgpath.read(), dtype=np.uint8,
                               offset=16).reshape(len(labels), 784)

    return images, labels

def get_mnist_dataset(skip=1):
    """Load the train and test splits from the 'fashion' directory.

    Pixel values are divided by 255. Each split is independently shuffled
    with ``np.random.permutation`` and then thinned by keeping every
    ``skip``-th shuffled row.

    Returns
    -------
    tuple
        ``(Xtrain, ytrain, Xtest, ytest)``.
    """
    mnist_path = 'fashion'

    # Read both splits before touching the random number generator.
    train_images, train_labels = load_mnist(mnist_path, kind='train')
    test_images, test_labels = load_mnist(mnist_path, kind='t10k')

    def _scale_and_subsample(images, labels):
        # Shuffle the row indices, keep every `skip`-th one, and apply the
        # same selection to the scaled images and to the labels.
        keep = np.random.permutation(range(images.shape[0]))[::skip]
        scaled = images / 255
        return scaled[keep, :], labels[keep]

    # Train is processed first so the permutation draws happen in the
    # order train, then test.
    Xtrain, ytrain = _scale_and_subsample(train_images, train_labels)
    Xtest, ytest = _scale_and_subsample(test_images, test_labels)
    return Xtrain, ytrain, Xtest, ytest

def get_word_vector_dataset():
    """Load the "glove-wiki-gigaword-50" vectors via gensim's downloader.

    Returns
    -------
    embeddings : numpy.ndarray
        One row per vocabulary word, built from ``model.get_vector(word)``.
    vocab : numpy.ndarray of str
        The words, in the same order as the rows of ``embeddings``.
    """
    model = api.load("glove-wiki-gigaword-50")

    # gensim >= 4 removed ``KeyedVectors.vocab`` (accessing it raises
    # AttributeError) and exposes the ordered word list as ``index_to_key``.
    # Older releases only have the ``vocab`` mapping, so support both.
    if hasattr(model, "index_to_key"):
        vocab = list(model.index_to_key)
    else:
        vocab = list(model.vocab.keys())

    embeddings = np.array([model.get_vector(w) for w in vocab])
    return embeddings, np.array(vocab)



In [9]:


def build_mapper_graph(data, labels, title, visualize, projection, verbose=0,
                       output_dir="/Users/dshiebler/workspace/data/mapper"):
    """Build a KeplerMapper graph for ``data``.

    Parameters
    ----------
    data : array-like
        Points to project and cluster.
    labels : iterable
        Per-point labels; only read when ``visualize`` is true, where they
        are stringified and used as tooltips.
    title : str
        Title of the visualization; also the HTML file name (without
        extension) inside ``output_dir``.
    visualize : bool
        When true, also render the graph with ``mapper.visualize``.
    projection : str or object
        ``"pca_<n>"`` builds ``PCA(n_components=n)`` and ``"umap_<n>"``
        builds ``UMAP(n_components=n)``. Anything else is handed to
        ``KeplerMapper.fit_transform`` unchanged.
    verbose : int, default 0
        Verbosity passed to ``km.KeplerMapper``.
    output_dir : str
        Directory for the HTML file. The default is the notebook author's
        machine-specific location; pass a different directory elsewhere.

    Returns
    -------
    graph, or (graph, visualize_result)
        The Mapper graph alone when ``visualize`` is false; otherwise the
        graph plus whatever ``mapper.visualize`` returned.

    Raises
    ------
    ValueError
        If a ``"pca_"``/``"umap_"`` projection does not end in an integer.

    Notes
    -----
    The cover and clusterer are configured from the module-level
    ``N_CUBES``, ``P_OVERLAP`` and ``N_CLUSTERS``.
    """
    mapper = km.KeplerMapper(verbose=verbose)

    # Resolve the string shorthands. The previous checks compared against
    # the literal template "pca_{}" (which never matches a real name) and
    # against exactly "umap_1", so "pca_2" or "umap_2" were never converted.
    if isinstance(projection, str):
        if projection.startswith("pca_"):
            projection = PCA(n_components=int(projection.split("_")[-1]))
        elif projection.startswith("umap_"):
            projection = UMAP(n_components=int(projection.split("_")[-1]))

    # Fit to and transform the data
    projected_data = mapper.fit_transform(data, projection=projection)

    # Create dictionary called 'graph' with nodes, edges and meta-information
    cover = km.Cover(n_cubes=N_CUBES, perc_overlap=P_OVERLAP)
    graph = mapper.map(
        projected_data, data, cover=cover,
        clusterer=AgglomerativeClustering(N_CLUSTERS))

    if not visualize:
        return graph

    # The output path is only needed on the visualization branch.
    path_html = "{}/{}.html".format(output_dir, title)
    rendered = mapper.visualize(
        graph, path_html=path_html,
        title=title, custom_tooltips=np.array([str(l) for l in labels]))
    return graph, rendered


def nodes_in_component(component, graph):
    """Sum the member counts of every cluster in ``component``.

    ``graph["nodes"]`` maps a cluster id to its collection of members;
    ``component`` is an iterable of cluster ids.
    """
    total = 0
    for cluster_id in component:
        total += len(graph["nodes"][cluster_id])
    return total

def connected_component_node_counts(graph, nx_graph):
    """Return one member count per connected component of ``nx_graph``.

    Each count is the sum of the member-list lengths (looked up in
    ``graph["nodes"]``) of the clusters in that component.
    """
    counts = []
    for component in nx.connected_components(nx_graph):
        counts.append(nodes_in_component(component, graph))
    return counts

def resample(X, n_samples, with_replacement):
    """Draw ``n_samples`` rows of ``X`` at random via ``np.random.choice``.

    ``with_replacement`` is forwarded as ``replace``; the draw uses NumPy's
    global random state.
    """
    row_ids = np.arange(0, X.shape[0])
    chosen = np.random.choice(row_ids, size=n_samples, replace=with_replacement)
    return X[chosen]


def get_metrics(n_samples):
    """Return the graph metrics as a ``name -> callable`` dict.

    Every callable takes ``(graph, nx_graph)``: the Mapper graph dict and
    its networkx counterpart. ``n_samples`` is accepted but not used.
    """
    def _num_components(graph, nx_graph):
        # Number of connected components.
        return len(connected_component_node_counts(graph, nx_graph))

    def _density(graph, nx_graph):
        return nx.density(nx_graph)

    def _estrada(graph, nx_graph):
        return estrada_index(nx_graph)

    def _num_basis_cycles(graph, nx_graph):
        return len(cycle_basis(nx_graph))

    # "treewidth" (treewidth_min_fill_in) is intentionally left out.
    metrics = {}
    metrics["num_cc_p0"] = _num_components
    metrics["density"] = _density
    metrics["estrada_index"] = _estrada
    metrics["num_basis_cycles"] = _num_basis_cycles
    return metrics


def get_metric_values(n_samples):
    """Run ``NUM_RUNS`` resample/Mapper rounds at one sample size.

    Each round draws ``n_samples`` rows from the global ``X``, builds a
    Mapper graph with the global ``PROJECTION``, converts it to networkx
    and evaluates every metric from ``get_metrics``.

    Returns
    -------
    dict
        ``{metric_name: {n_samples: [value for each run]}}``.
    """
    serial = N_JOBS == 1
    if serial:
        print("calling get_graph with n_samples={}".format(n_samples))

    results = {name: {n_samples: []} for name in get_metrics(1).keys()}
    # The metric table is the same for every run, so build it once.
    metric_fns = get_metrics(n_samples)

    # Progress bar only in serial mode.
    run_ids = range(NUM_RUNS)
    if serial:
        run_ids = tqdm(run_ids)

    for run_id in run_ids:
        sample = resample(X, n_samples, with_replacement=WITH_REPLACEMENT)
        # Y is passed whole (not resampled); labels are only read by
        # build_mapper_graph when visualize=True, which is off here.
        graph = build_mapper_graph(
            sample,
            labels=Y,
            projection=PROJECTION,
            title="my_html-{}".format(run_id),
            visualize=False,
            verbose=0)
        nx_graph = km.adapter.to_nx(graph)
        for name, fn in metric_fns.items():
            results[name][n_samples].append(fn(graph, nx_graph))

    return results



In [10]:
# Populate the globals X (features) and Y (labels) according to DATASET.
if DATASET == "WORD_VECTOR":
    X, Y = get_word_vector_dataset()
elif "MNIST_UMAP_" in DATASET:
    # The trailing "_<n>" of the dataset name is the UMAP output dimension.
    n_components = int(DATASET.split("_")[-1])
    rawX, Y, _, _ = get_mnist_dataset(skip=1)
    # Supervised UMAP: the labels are passed to fit_transform as y.
    X = UMAP(
        n_components=n_components,
        n_neighbors=15,
    ).fit_transform(rawX, y=Y)
else:
    raise ValueError("Dataset not recognized")

print("{} X.shape: {} Y.shape: {}".format(DATASET, X.shape, Y.shape))


WORD_VECTOR X.shape: (400000, 50) Y.shape: (400000,)


In [11]:
from parallel_process import parallel_process


# Sweep the sample sizes, collecting one result dict per size, serially or
# through parallel_process depending on N_JOBS.
print("Computing metric_to_sample_values_list with N_JOBS={}...".format(N_JOBS))
if N_JOBS != 1:
    metric_to_sample_values_list = parallel_process(
        array=range(SAMPLE_STEP, MAX_SAMPLES, SAMPLE_STEP),
        function=get_metric_values,
        n_jobs=N_JOBS)
else:
    metric_to_sample_values_list = []
    for n_samples in tqdm(range(SAMPLE_STEP, MAX_SAMPLES, SAMPLE_STEP)):
        metric_to_sample_values_list += [get_metric_values(n_samples)]
print("metric_to_sample_values_list computed!")


# Merge the per-size dicts into out["metric_to_sample_values"]:
# {metric_name: {n_samples: [values]}}.
out = {
    "metric_to_sample_values": {
        metric_name: {} for metric_name in get_metrics(1).keys()
    }
}
for metric_to_sample_values in metric_to_sample_values_list:
    for metric in out["metric_to_sample_values"]:
        per_size_values = metric_to_sample_values[metric]
        out["metric_to_sample_values"][metric].update(per_size_values)


  0%|          | 0/9 [00:00<?, ?it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 14%|█▍        | 14/100 [00:00<00:00, 136.10it/s][A

Computing metric_to_sample_values_list with N_JOBS=1...
calling get_graph with n_samples=50



 30%|███       | 30/100 [00:00<00:00, 142.34it/s][A
 47%|████▋     | 47/100 [00:00<00:00, 147.62it/s][A
 63%|██████▎   | 63/100 [00:00<00:00, 150.30it/s][A
 79%|███████▉  | 79/100 [00:00<00:00, 150.87it/s][A
100%|██████████| 100/100 [00:00<00:00, 152.79it/s][A
 11%|█         | 1/9 [00:00<00:05,  1.52it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 13%|█▎        | 13/100 [00:00<00:00, 124.10it/s][A

calling get_graph with n_samples=100



 27%|██▋       | 27/100 [00:00<00:00, 127.54it/s][A
 40%|████      | 40/100 [00:00<00:00, 126.97it/s][A
 54%|█████▍    | 54/100 [00:00<00:00, 128.90it/s][A
 66%|██████▌   | 66/100 [00:00<00:00, 121.55it/s][A
 77%|███████▋  | 77/100 [00:00<00:00, 109.20it/s][A
100%|██████████| 100/100 [00:00<00:00, 117.17it/s][A
 22%|██▏       | 2/9 [00:01<00:05,  1.39it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 11%|█         | 11/100 [00:00<00:00, 102.82it/s][A

calling get_graph with n_samples=150



 21%|██        | 21/100 [00:00<00:00, 101.29it/s][A
 32%|███▏      | 32/100 [00:00<00:00, 101.85it/s][A
 43%|████▎     | 43/100 [00:00<00:00, 103.14it/s][A
 54%|█████▍    | 54/100 [00:00<00:00, 103.89it/s][A
 64%|██████▍   | 64/100 [00:00<00:00, 100.95it/s][A
 75%|███████▌  | 75/100 [00:00<00:00, 101.64it/s][A
 87%|████████▋ | 87/100 [00:00<00:00, 106.00it/s][A
100%|██████████| 100/100 [00:00<00:00, 104.93it/s][A
 33%|███▎      | 3/9 [00:02<00:04,  1.27it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 11%|█         | 11/100 [00:00<00:00, 100.00it/s][A

calling get_graph with n_samples=200



 20%|██        | 20/100 [00:00<00:00, 93.91it/s] [A
 29%|██▉       | 29/100 [00:00<00:00, 92.11it/s][A
 38%|███▊      | 38/100 [00:00<00:00, 91.00it/s][A
 48%|████▊     | 48/100 [00:00<00:00, 91.72it/s][A
 57%|█████▋    | 57/100 [00:00<00:00, 90.46it/s][A
 66%|██████▌   | 66/100 [00:00<00:00, 89.72it/s][A
 76%|███████▌  | 76/100 [00:00<00:00, 89.88it/s][A
 85%|████████▌ | 85/100 [00:00<00:00, 89.69it/s][A
100%|██████████| 100/100 [00:01<00:00, 89.62it/s][A
 44%|████▍     | 4/9 [00:03<00:04,  1.13it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  8%|▊         | 8/100 [00:00<00:01, 71.47it/s][A

calling get_graph with n_samples=250



 15%|█▌        | 15/100 [00:00<00:01, 70.94it/s][A
 22%|██▏       | 22/100 [00:00<00:01, 70.58it/s][A
 30%|███       | 30/100 [00:00<00:00, 70.94it/s][A
 38%|███▊      | 38/100 [00:00<00:00, 71.32it/s][A
 46%|████▌     | 46/100 [00:00<00:00, 72.04it/s][A
 55%|█████▌    | 55/100 [00:00<00:00, 74.48it/s][A
 63%|██████▎   | 63/100 [00:00<00:00, 72.97it/s][A
 71%|███████   | 71/100 [00:00<00:00, 74.38it/s][A
 79%|███████▉  | 79/100 [00:01<00:00, 75.20it/s][A
 87%|████████▋ | 87/100 [00:01<00:00, 75.50it/s][A
100%|██████████| 100/100 [00:01<00:00, 73.44it/s][A
 56%|█████▌    | 5/9 [00:04<00:04,  1.03s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  7%|▋         | 7/100 [00:00<00:01, 62.02it/s][A

calling get_graph with n_samples=300



 14%|█▍        | 14/100 [00:00<00:01, 61.82it/s][A
 21%|██        | 21/100 [00:00<00:01, 61.77it/s][A
 28%|██▊       | 28/100 [00:00<00:01, 61.90it/s][A
 35%|███▌      | 35/100 [00:00<00:01, 62.15it/s][A
 42%|████▏     | 42/100 [00:00<00:00, 62.73it/s][A
 49%|████▉     | 49/100 [00:00<00:00, 62.75it/s][A
 56%|█████▌    | 56/100 [00:00<00:00, 63.21it/s][A
 63%|██████▎   | 63/100 [00:01<00:00, 62.70it/s][A
 70%|███████   | 70/100 [00:01<00:00, 62.61it/s][A
 77%|███████▋  | 77/100 [00:01<00:00, 62.54it/s][A
 84%|████████▍ | 84/100 [00:01<00:00, 62.53it/s][A
 91%|█████████ | 91/100 [00:01<00:00, 63.23it/s][A
100%|██████████| 100/100 [00:01<00:00, 63.03it/s][A
 67%|██████▋   | 6/9 [00:06<00:03,  1.20s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  6%|▌         | 6/100 [00:00<00:01, 55.54it/s][A

calling get_graph with n_samples=350



 12%|█▏        | 12/100 [00:00<00:01, 55.34it/s][A
 18%|█▊        | 18/100 [00:00<00:01, 54.73it/s][A
 24%|██▍       | 24/100 [00:00<00:01, 54.36it/s][A
 30%|███       | 30/100 [00:00<00:01, 53.89it/s][A
 36%|███▌      | 36/100 [00:00<00:01, 54.25it/s][A
 42%|████▏     | 42/100 [00:00<00:01, 54.40it/s][A
 48%|████▊     | 48/100 [00:00<00:00, 54.48it/s][A
 54%|█████▍    | 54/100 [00:00<00:00, 54.30it/s][A
 60%|██████    | 60/100 [00:01<00:00, 54.28it/s][A
 66%|██████▌   | 66/100 [00:01<00:00, 54.56it/s][A
 72%|███████▏  | 72/100 [00:01<00:00, 54.66it/s][A
 78%|███████▊  | 78/100 [00:01<00:00, 55.04it/s][A
 84%|████████▍ | 84/100 [00:01<00:00, 54.93it/s][A
 90%|█████████ | 90/100 [00:01<00:00, 54.13it/s][A
100%|██████████| 100/100 [00:01<00:00, 54.28it/s][A
 78%|███████▊  | 7/9 [00:08<00:02,  1.39s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  5%|▌         | 5/100 [00:00<00:02, 47.39it/s][A

calling get_graph with n_samples=400



 10%|█         | 10/100 [00:00<00:01, 47.35it/s][A
 15%|█▌        | 15/100 [00:00<00:01, 47.35it/s][A
 20%|██        | 20/100 [00:00<00:01, 47.19it/s][A
 25%|██▌       | 25/100 [00:00<00:01, 46.95it/s][A
 30%|███       | 30/100 [00:00<00:01, 46.05it/s][A
 35%|███▌      | 35/100 [00:00<00:01, 46.24it/s][A
 40%|████      | 40/100 [00:00<00:01, 46.30it/s][A
 45%|████▌     | 45/100 [00:00<00:01, 46.52it/s][A
 50%|█████     | 50/100 [00:01<00:01, 46.18it/s][A
 55%|█████▌    | 55/100 [00:01<00:01, 44.96it/s][A
 60%|██████    | 60/100 [00:01<00:00, 45.05it/s][A
 65%|██████▌   | 65/100 [00:01<00:00, 45.08it/s][A
 70%|███████   | 70/100 [00:01<00:00, 45.39it/s][A
 75%|███████▌  | 75/100 [00:01<00:00, 45.76it/s][A
 80%|████████  | 80/100 [00:01<00:00, 45.98it/s][A
 85%|████████▌ | 85/100 [00:01<00:00, 45.41it/s][A
 90%|█████████ | 90/100 [00:01<00:00, 46.02it/s][A
 95%|█████████▌| 95/100 [00:02<00:00, 45.73it/s][A
100%|██████████| 100/100 [00:02<00:00, 45.91it/s][A
 89%|█████

calling get_graph with n_samples=450



  9%|▉         | 9/100 [00:00<00:02, 38.37it/s][A
 13%|█▎        | 13/100 [00:00<00:02, 38.56it/s][A
 17%|█▋        | 17/100 [00:00<00:02, 38.48it/s][A
 21%|██        | 21/100 [00:00<00:02, 38.61it/s][A
 25%|██▌       | 25/100 [00:00<00:01, 38.15it/s][A
 29%|██▉       | 29/100 [00:00<00:01, 38.51it/s][A
 33%|███▎      | 33/100 [00:00<00:01, 38.23it/s][A
 37%|███▋      | 37/100 [00:00<00:01, 38.69it/s][A
 42%|████▏     | 42/100 [00:01<00:01, 39.17it/s][A
 46%|████▌     | 46/100 [00:01<00:01, 39.10it/s][A
 51%|█████     | 51/100 [00:01<00:01, 39.69it/s][A
 56%|█████▌    | 56/100 [00:01<00:01, 40.29it/s][A
 61%|██████    | 61/100 [00:01<00:00, 40.27it/s][A
 66%|██████▌   | 66/100 [00:01<00:00, 40.57it/s][A
 71%|███████   | 71/100 [00:01<00:00, 40.70it/s][A
 76%|███████▌  | 76/100 [00:01<00:00, 40.92it/s][A
 81%|████████  | 81/100 [00:02<00:00, 40.91it/s][A
 86%|████████▌ | 86/100 [00:02<00:00, 41.00it/s][A
 91%|█████████ | 91/100 [00:02<00:00, 41.24it/s][A
100%|███████

metric_to_sample_values_list computed!





In [12]:
# Rebuild `out` from metric_to_sample_values_list: one
# {n_samples: [values]} mapping per metric name.
out = {
    "metric_to_sample_values": {
        metric_name: {} for metric_name in get_metrics(1).keys()
    }
}
for metric_to_sample_values in metric_to_sample_values_list:
    for metric in out["metric_to_sample_values"]:
        per_size_values = metric_to_sample_values[metric]
        out["metric_to_sample_values"][metric].update(per_size_values)
