In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')

from tqdm import tqdm
import random
import numpy as np
import seaborn as sns
import time
from collections import defaultdict
from sklearn.manifold import spectral_embedding, MDS, SpectralEmbedding
from scipy.spatial.distance import squareform, pdist, cdist
from sklearn.decomposition import PCA
from umap import UMAP
from scipy.spatial.distance import cdist 
from copy import deepcopy


import numpy as np
from numba import njit, jit
from numba import types
from numba.typed import Dict
from scipy.cluster.hierarchy import linkage
from matplotlib.colors import BASE_COLORS
from helpers import (
    mds, write_embedding_to_text_file, write_embedding_to_two_text_files, is_numeric, fit_laplacian_eigenmaps
)
import lumap
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier

sns.reset_defaults()
sns.set_context(context='talk',font_scale=0.7)
%matplotlib inline


def load_mnist(path, kind='train'):
    """Load a gzip-compressed MNIST-format image/label pair from `path`.

    Reads `<kind>-labels-idx1-ubyte.gz` and `<kind>-images-idx3-ubyte.gz`
    from the directory `path`.

    Args:
        path: Directory containing the two `.gz` files.
        kind: File-name prefix selecting the split (e.g. 'train', 't10k').

    Returns:
        (images, labels): `images` is a uint8 array of shape
        (len(labels), 784); `labels` is a 1-D uint8 array. Both are views
        over the decompressed bytes returned by `np.frombuffer`.

    Raises:
        OSError: If either file cannot be opened or decompressed.
        ValueError: If the image payload cannot be reshaped to
            (len(labels), 784).
    """
    # The docstring must be the first statement of the function; placed after
    # the imports it is just a discarded string expression.
    import os
    import gzip
    import numpy as np

    labels_path = os.path.join(path, '%s-labels-idx1-ubyte.gz' % kind)
    images_path = os.path.join(path, '%s-images-idx3-ubyte.gz' % kind)

    with gzip.open(labels_path, 'rb') as lbpath:
        # Skip the 8 leading header bytes of the label file.
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8,
                               offset=8)

    with gzip.open(images_path, 'rb') as imgpath:
        # Skip the 16 leading header bytes, then one flattened image per row.
        images = np.frombuffer(imgpath.read(), dtype=np.uint8,
                               offset=16).reshape(len(labels), 784)

    return images, labels

# Load Data

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from scipy.sparse import vstack 
from sklearn.decomposition import TruncatedSVD

def get_news_dataset():
    """Build dense features for a subsample of the 20-newsgroups corpus.

    Every third document (and its target) of the "train" and "test" subsets
    is kept. The kept documents are concatenated, train first, and pushed
    through one count -> tf-idf -> 500-component truncated-SVD pipeline that
    is fitted on the combined list.

    Returns:
        (rawX, raw_Xtrain, raw_ytrain, raw_Xtest, raw_ytest), where `rawX` is
        the transformed combined matrix and `raw_Xtrain` / `raw_Xtest` are
        its leading / trailing row slices.
    """
    train_bunch = fetch_20newsgroups(subset="train")
    test_bunch = fetch_20newsgroups(subset="test")

    featurizer = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('svd', TruncatedSVD(n_components=500)),
    ])

    # Keep one document in three from each subset.
    step = 3
    train_docs = train_bunch['data'][::step]
    test_docs = test_bunch['data'][::step]
    raw_ytrain = train_bunch['target'][::step]
    raw_ytest = test_bunch['target'][::step]

    # The pipeline is fitted on train and test documents together.
    rawX = featurizer.fit_transform(train_docs + test_docs)

    n_train = len(raw_ytrain)
    return rawX, rawX[:n_train], raw_ytrain, rawX[n_train:], raw_ytest

def get_mnist_dataset():
    """Load a one-in-three subsample of the MNIST-format data in 'data/fashion'.

    NOTE(review): the directory is named 'fashion' although the function says
    "mnist" - presumably this is Fashion-MNIST; confirm.

    Returns:
        (rawX, raw_Xtrain, raw_ytrain, raw_Xtest, raw_ytest), where `rawX`
        stacks the subsampled train rows on top of the subsampled test rows.
    """
    data_dir = 'data/fashion'
    step = 3

    train_images, train_labels = load_mnist(data_dir, kind='train')
    test_images, test_labels = load_mnist(data_dir, kind='t10k')

    # Keep every third sample of each split.
    raw_Xtrain, raw_ytrain = train_images[::step], train_labels[::step]
    raw_Xtest, raw_ytest = test_images[::step], test_labels[::step]

    rawX = np.vstack((raw_Xtrain, raw_Xtest))
    return rawX, raw_Xtrain, raw_ytrain, raw_Xtest, raw_ytest


# Registry of dataset loaders, keyed by the short name used in printed output
# and in the results dictionary. Each loader takes no arguments and returns
# (rawX, raw_Xtrain, raw_ytrain, raw_Xtest, raw_ytest).
dataset_fn_dict = dict(
    news=get_news_dataset,
    mnist=get_mnist_dataset,
)

# Train Model

In [None]:
import umap.distances as dist

def get_pca_embeddings(rawX, train_count, n_components):
    """Project `rawX` with PCA, rescale each row, and split into train/test.

    Args:
        rawX: Combined feature matrix with the training rows first.
        train_count: Number of leading rows that belong to the training set.
        n_components: Number of PCA components to keep.

    Returns:
        (train_embeddings, test_embeddings): the first `train_count` rows and
        the remaining rows of the PCA projection after a default-configured
        sklearn `Normalizer` has been applied.
    """
    projector = PCA(n_components=n_components)
    projected = projector.fit_transform(rawX)
    scaled = Normalizer().fit_transform(projected)
    return scaled[:train_count], scaled[train_count:]

def get_fumap_embeddings(rawX, train_count, n_components, n_neighbors):
    """Embed `rawX` with `lumap.fit_umap`, rescale rows, and split train/test.

    Args:
        rawX: Combined feature matrix with the training rows first.
        train_count: Number of leading rows that belong to the training set.
        n_components: Embedding dimensionality passed to `lumap.fit_umap`.
        n_neighbors: Neighbourhood size passed to `lumap.fit_umap`.

    Returns:
        (train_embeddings, test_embeddings): the first `train_count` rows and
        the remaining rows of the embedding after a default-configured
        sklearn `Normalizer` has been applied.
    """
    fit_kwargs = dict(
        X=rawX,
        n_components=n_components,
        n_neighbors=n_neighbors,
        metric="euclidean",
    )
    fitted = lumap.fit_umap(**fit_kwargs)
    scaled = Normalizer().fit_transform(fitted)
    return scaled[:train_count], scaled[train_count:]

def get_lumap_embeddings(rawX, train_count, n_components, n_neighbors):
    """Embed `rawX` with `lumap.fit_lumap`, rescale rows, and split train/test.

    Args:
        rawX: Combined feature matrix with the training rows first.
        train_count: Number of leading rows that belong to the training set.
        n_components: Embedding dimensionality passed to `lumap.fit_lumap`.
        n_neighbors: Neighbourhood size passed to `lumap.fit_lumap`.

    Returns:
        (train_embeddings, test_embeddings): the first `train_count` rows and
        the remaining rows of the embedding after a default-configured
        sklearn `Normalizer` has been applied.
    """
    fit_kwargs = dict(
        X=rawX,
        n_components=n_components,
        n_neighbors=n_neighbors,
        metric="euclidean",
    )
    fitted = lumap.fit_lumap(**fit_kwargs)
    scaled = Normalizer().fit_transform(fitted)
    return scaled[:train_count], scaled[train_count:]


def train_model(Xtrain, ytrain, Xtest, ytest):
    """Fit each downstream classifier and score it on the train and test sets.

    Currently a single 5-nearest-neighbours classifier is evaluated. Scores
    are `roc_auc_score` values computed between one-hot encoded true labels
    and one-hot encoded *predicted* labels.

    NOTE(review): because hard predictions (not `predict_proba` outputs) are
    scored, the reported AUC has a single operating point per class. This is
    kept as-is so previously reported numbers stay comparable.

    Args:
        Xtrain: Training feature matrix.
        ytrain: Training labels (1-D array-like).
        Xtest: Test feature matrix.
        ytest: Test labels (1-D array-like). The encoder is fitted on
            `ytrain` only.

    Returns:
        dict mapping model name -> {"train_score": float, "test_score": float}.
    """
    def as_column(labels):
        # OneHotEncoder expects 2-D input: one sample per row.
        return np.asarray(labels).reshape(-1, 1)

    # The encoder depends only on ytrain, so fit it once rather than once per
    # model.
    encoder = OneHotEncoder()
    encoder.fit(as_column(ytrain))

    def one_hot(labels):
        # .toarray() yields a plain ndarray; .todense() would yield a
        # numpy.matrix, which recent scikit-learn input validation rejects.
        return encoder.transform(as_column(labels)).toarray()

    ytrain_onehot = one_hot(ytrain)
    ytest_onehot = one_hot(ytest)

    results = {}
    for name, model in [("KNeighborsClassifier", KNeighborsClassifier(n_neighbors=5))]:
        model.fit(Xtrain, ytrain)
        results[name] = {}

        results[name]["train_score"] = roc_auc_score(
            ytrain_onehot,
            one_hot(model.predict(Xtrain)))

        results[name]["test_score"] = roc_auc_score(
            ytest_onehot,
            one_hot(model.predict(Xtest)))
    return results

In [None]:
# Sweep every registered dataset and every embedding size, scoring the
# embeddings from get_lumap_embeddings and get_fumap_embeddings with the same
# downstream classifier.
# Layout: all_results[dataset_name][n_components] ->
#     {"lumap_results": {...}, "fumap_results": {...}}
all_n_components = [25, 50]
all_results = {}
banner = "=========================="

for dataset_name, dataset_fn in dataset_fn_dict.items():
    print(banner)
    print(dataset_name)
    print(banner)
    rawX, raw_Xtrain, raw_ytrain, raw_Xtest, raw_ytest = dataset_fn()
    all_results[dataset_name] = {}

    for n_components in all_n_components:
        start = time.time()
        print(banner)
        result = {}

        # Both methods embed the combined train+test rows with the same
        # settings; the leading raw_Xtrain.shape[0] rows are the train split.
        embed_kwargs = dict(
            rawX=rawX,
            train_count=raw_Xtrain.shape[0],
            n_components=n_components,
            n_neighbors=15,
        )

        print("building lumap embeddings")
        train_lumap_embeddings, test_lumap_embeddings = get_lumap_embeddings(**embed_kwargs)
        print("training lumap model")
        result["lumap_results"] = train_model(
            Xtrain=train_lumap_embeddings,
            ytrain=raw_ytrain,
            Xtest=test_lumap_embeddings,
            ytest=raw_ytest,
        )

        print("building umap embeddings")
        train_fumap_embeddings, test_fumap_embeddings = get_fumap_embeddings(**embed_kwargs)
        print("training fumap model")
        result["fumap_results"] = train_model(
            Xtrain=train_fumap_embeddings,
            ytrain=raw_ytrain,
            Xtest=test_fumap_embeddings,
            ytest=raw_ytest,
        )

        print(f"n_components: {n_components}")
        for k, v in result.items():
            print(k, v)

        all_results[dataset_name][n_components] = result
        # Wall-clock time for both embeddings plus both classifier fits.
        print(f"iteration time: {time.time() - start}")
