In [5]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and switch it to mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): this darkgrid style appears to be undone by the
# sns.reset_defaults() call further down in this cell — confirm which is intended.
sns.set(style='darkgrid')

from tqdm import tqdm
import random
import numpy as np
import seaborn as sns
import time
from collections import defaultdict
from sklearn.manifold import spectral_embedding, MDS, SpectralEmbedding
from scipy.spatial.distance import squareform, pdist, cdist
from sklearn.decomposition import PCA
from umap import UMAP
from scipy.spatial.distance import cdist 
from copy import deepcopy


import numpy as np
from numba import njit, jit
from numba import types
from numba.typed import Dict
from scipy.cluster.hierarchy import linkage
from matplotlib.colors import BASE_COLORS
from helpers import (
    mds, write_embedding_to_text_file, write_embedding_to_two_text_files, is_numeric, fit_laplacian_eigenmaps
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import SpectralEmbedding 
from sklearn.metrics.pairwise import rbf_kernel, euclidean_distances


# NOTE(review): reset_defaults() looks like it discards the
# sns.set(style='darkgrid') applied near the top of this cell — confirm
# which appearance is actually wanted.
sns.reset_defaults()
# Use the 'talk' plotting context with fonts scaled by 0.7.
sns.set_context(context='talk',font_scale=0.7)
# Repeats the %matplotlib inline already issued at the top of this cell.
%matplotlib inline


def load_mnist(path, kind='train'):
    """Load MNIST-format images and labels from gzip-compressed IDX files in `path`.

    Parameters
    ----------
    path : str
        Directory containing ``<kind>-labels-idx1-ubyte.gz`` and
        ``<kind>-images-idx3-ubyte.gz``.
    kind : str, default 'train'
        File-name prefix selecting which pair of files to read.

    Returns
    -------
    images : numpy.ndarray
        ``uint8`` array of shape ``(len(labels), 784)``; one flattened image
        per row.
    labels : numpy.ndarray
        ``uint8`` array holding one label per image.

    Raises
    ------
    OSError
        If either file cannot be opened.
    ValueError
        If the image payload cannot be reshaped to ``(len(labels), 784)``.
    """
    # Local imports keep the loader usable when copied out of the notebook.
    import os
    import gzip
    import numpy as np

    labels_path = os.path.join(path, '%s-labels-idx1-ubyte.gz' % kind)
    images_path = os.path.join(path, '%s-images-idx3-ubyte.gz' % kind)

    # offset=8 skips the leading 8 bytes of the label file before the data.
    with gzip.open(labels_path, 'rb') as lbpath:
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8, offset=8)

    # offset=16 skips the leading 16 bytes of the image file; the remaining
    # bytes are interpreted as 784 pixels per label.
    with gzip.open(images_path, 'rb') as imgpath:
        images = np.frombuffer(
            imgpath.read(), dtype=np.uint8, offset=16
        ).reshape(len(labels), 784)

    return images, labels

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from scipy.sparse import vstack 
from sklearn.decomposition import TruncatedSVD

def get_news_dataset():
    """Build a subsampled, vectorised 20-newsgroups dataset.

    Fetches the "train" and "test" subsets, keeps every third document and
    label from each, and pushes the concatenated documents through a
    CountVectorizer -> TfidfTransformer -> TruncatedSVD(500) pipeline that is
    fitted on train and test documents together.

    Returns
    -------
    tuple
        ``(all_features, train_features, train_labels, test_features,
        test_labels)`` where ``all_features`` holds the train rows followed
        by the test rows.
    """
    train_bunch = fetch_20newsgroups(subset="train")
    test_bunch = fetch_20newsgroups(subset="test")

    # Keep every third document (and its label) from each subset.
    train_docs = train_bunch['data'][::3]
    train_labels = train_bunch['target'][::3]
    test_docs = test_bunch['data'][::3]
    test_labels = test_bunch['target'][::3]

    vectorizer = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('svd', TruncatedSVD(n_components=500)),
    ])

    # One fit over train + test documents; the rows come back in that order,
    # so the first len(train_labels) rows belong to the training subset.
    features = vectorizer.fit_transform(train_docs + test_docs)
    n_train = len(train_labels)
    train_features = features[:n_train]
    test_features = features[n_train:]
    return features, train_features, train_labels, test_features, test_labels

def get_mnist_dataset():
    """Load the dataset stored under ``data/fashion`` and subsample it.

    Reads the 'train' and 't10k' splits with `load_mnist`, keeps every third
    image and label from each, and stacks the kept train rows on top of the
    kept test rows.

    Returns
    -------
    tuple
        ``(all_images, train_images, train_labels, test_images,
        test_labels)``.
    """
    train_images, train_labels = load_mnist('data/fashion', kind='train')
    test_images, test_labels = load_mnist('data/fashion', kind='t10k')

    # Same stride for images and labels so rows stay aligned.
    every_third = slice(None, None, 3)
    train_images, train_labels = train_images[every_third], train_labels[every_third]
    test_images, test_labels = test_images[every_third], test_labels[every_third]

    all_images = np.vstack((train_images, test_images))
    return all_images, train_images, train_labels, test_images, test_labels


# Registry: dataset name -> zero-argument loader returning
# (all_X, train_X, train_y, test_X, test_y).
dataset_fn_dict = dict(
    news=get_news_dataset,
    mnist=get_mnist_dataset,
)

In [None]:
import umap.distances as dist
from sklearn.manifold import SpectralEmbedding 



def get_pca_embeddings(rawX, train_count, n_components):
    """Embed `rawX` with PCA, normalise the result, and split it.

    Parameters
    ----------
    rawX : array-like
        Feature rows; the first `train_count` rows are treated as training
        rows and the rest as test rows.
    train_count : int
        Number of leading rows that form the training split.
    n_components : int
        Number of PCA components to keep.

    Returns
    -------
    tuple
        ``(train_embeddings, test_embeddings)`` taken from the PCA output
        after it has been passed through ``Normalizer``.
    """
    projector = PCA(n_components=n_components)
    projected = projector.fit_transform(rawX)
    normalised = Normalizer().fit_transform(projected)
    return normalised[:train_count], normalised[train_count:]


def get_le_embeddings(rawX, train_count, n_components, alpha, gamma=1.0):
    """Spectral embedding of `rawX` from a thresholded RBF affinity matrix.

    The affinity matrix is computed from the `rawX` argument itself rather
    than from a notebook-level variable, so the function works on a fresh
    kernel and the train/test split below covers every row of `rawX`.

    Parameters
    ----------
    rawX : array-like
        Feature rows; the first `train_count` rows are the training rows.
    train_count : int
        Number of leading rows that form the training split.
    n_components : int
        Dimensionality of the spectral embedding.
    alpha : float
        Threshold in ``[0, 1]``; affinities strictly below it are set to 0.
    gamma : float, default 1.0
        ``gamma`` passed to ``rbf_kernel``.

    Returns
    -------
    tuple
        ``(train_embeddings, test_embeddings)`` taken from the embedding
        after it has been passed through ``Normalizer``.

    Raises
    ------
    AssertionError
        If `alpha` lies outside ``[0, 1]``.
    """
    assert 0 <= alpha <= 1.0

    # Pairwise RBF affinities between all rows of rawX.
    affinities = rbf_kernel(rawX, rawX, gamma=gamma)
    # Multiplying by the boolean mask zeroes every affinity below alpha.
    affinities = affinities * (affinities >= alpha)

    embeddings = SpectralEmbedding(
        n_components=n_components,
        affinity="precomputed").fit_transform(affinities)
    embeddings = Normalizer().fit_transform(embeddings)

    train_embeddings = embeddings[:train_count]
    test_embeddings = embeddings[train_count:]
    return train_embeddings, test_embeddings


def get_msle_embeddings(rawX, train_count, n_components, gamma=1.0):
    """Spectral embedding of `rawX` from squared RBF affinities.

    The affinity matrix is computed from the `rawX` argument itself rather
    than from a notebook-level variable, so the function works on a fresh
    kernel and the train/test split below covers every row of `rawX`.

    Parameters
    ----------
    rawX : array-like
        Feature rows; the first `train_count` rows are the training rows.
    train_count : int
        Number of leading rows that form the training split.
    n_components : int
        Dimensionality of the spectral embedding.
    gamma : float, default 1.0
        ``gamma`` passed to ``rbf_kernel``.

    Returns
    -------
    tuple
        ``(train_embeddings, test_embeddings)`` taken from the embedding
        after it has been passed through ``Normalizer``.
    """
    # Element-wise square of the pairwise RBF affinities between rows of rawX.
    affinities = rbf_kernel(rawX, rawX, gamma=gamma) ** 2

    embeddings = SpectralEmbedding(
        n_components=n_components,
        affinity="precomputed").fit_transform(affinities)
    embeddings = Normalizer().fit_transform(embeddings)

    train_embeddings = embeddings[:train_count]
    test_embeddings = embeddings[train_count:]
    return train_embeddings, test_embeddings



def train_model(Xtrain, ytrain, Xtest, ytest):
    """Fit the configured classifiers and score them on train and test data.

    Currently a single ``KNeighborsClassifier(n_neighbors=5)`` is fitted.
    Scores are ``roc_auc_score`` values computed between the one-hot encoded
    true labels and the one-hot encoded *predicted labels* (hard predictions,
    not class probabilities).

    Parameters
    ----------
    Xtrain, Xtest : array-like
        Feature rows for the training and test splits.
    ytrain, ytest : array-like
        Labels for the two splits; converted to NumPy arrays internally.

    Returns
    -------
    dict
        ``{model_name: {"train_score": float, "test_score": float}}``.
    """
    ytrain = np.asarray(ytrain)
    ytest = np.asarray(ytest)

    # The encoder depends only on the training labels, so fit it once
    # instead of once per model.
    encoder = OneHotEncoder()
    encoder.fit(ytrain[:, None])

    def one_hot(labels):
        # .toarray() yields a plain ndarray; .todense() would yield an
        # np.matrix, which recent scikit-learn input validation rejects.
        return encoder.transform(np.asarray(labels)[:, None]).toarray()

    classifiers = [("KNeighborsClassifier", KNeighborsClassifier(n_neighbors=5))]

    results = {}
    for name, model in classifiers:
        model.fit(Xtrain, ytrain)
        results[name] = {
            "train_score": roc_auc_score(
                one_hot(ytrain), one_hot(model.predict(Xtrain))),
            "test_score": roc_auc_score(
                one_hot(ytest), one_hot(model.predict(Xtest))),
        }
    return results

In [None]:
# TODO: construct the n x n matrix W.
# W_ij = ...  (entry definition not yet written)


In [None]:
# TODO: would gradient descent work here?
# Reference: https://medium.com/swlh/euclidean-distance-matrix-4c3e1378d87f