In [16]:
import pandas as pd
import numpy as np
import umap
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from bokeh.plotting import figure, output_file, show
from bokeh.models import CategoricalColorMapper, ColumnDataSource
from bokeh.palettes import Category10
from bokeh.layouts import row

In [11]:
data = pd.read_csv('./data/Test_Data_1102_enumerated_filledin.csv')
y = data.columns
X = data.values

array([[2.500000e+01, 0.000000e+00, 1.221372e+06, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00],
       [2.100000e+02, 0.000000e+00, 1.229858e+06, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00],
       [5.000000e+02, 0.000000e+00, 1.235588e+06, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00],
       ...,
       [5.905350e+06, 0.000000e+00, 5.902531e+06, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00],
       [5.905351e+06, 0.000000e+00, 5.902654e+06, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00],
       [5.905353e+06, 0.000000e+00, 5.902863e+06, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00]])

In [14]:
umap_embedding = umap.UMAP(random_state=42, n_neighbors=50, min_dist=0.3, n_components=2).fit_transform(X)

In [None]:
tsne_embedding = TSNE(perplexity=50).fit_transform(X)

In [None]:
pca_embedding = PCA(n_components=5).fit_transform(X)

In [17]:
output_file("mydata_umap.html")

umap_source = ColumnDataSource(
    dict(
        x=umap_embedding[:,0],
        y=umap_embedding[:,1],
        label=y,
    )
)

cmap = CategoricalColorMapper(factors=np.unique(y), palette=Category10[10])

p1 = figure(title="UMAP")
p1.circle(
    x="x",
    y="y",
    source=umap_source,
    color={"field": "label", "transform": cmap},
)

tsne_source = ColumnDataSource(
    dict(
        x=tsne_embedding[:,0],
        y=tsne_embedding[:,1],
        label=y,
    )
)

p2 = figure(title="TSNE")
p2.circle(
    x="x",
    y="y",
    source=tsne_source,
    color={"field": "label", "transform": cmap},
)

pca_source = ColumnDataSource(
    dict(
        x=pca_embedding[:,0],
        y=pca_embedding[:,1],
        label=y,
    )
)

p3 = figure(title="PCA")
p3.circle(
    x="x",
    y="y",
    source=pca_source,
    color={"field": "label", "transform": cmap},
)

show(row(p1,p2, p3))

 '285.0' '291.8' '295.3' '296.0' '296.1' '296.2' '296.22' '297.1' '297.2'
 '3.0' '300.0' '300.1' '300.11' '300.12' '300.4' '300.9' '301.0' '306.0'
 '313.1' '316.0' '317.0' '317.1' '318.0' '327.3' '327.32' '327.4' '338.1'
 '338.2' '339.0' '340.0' '345.3' '351.0' '386.9' '4.0' '401.1' '411.4'
 '418.0' '426.7' '427.3' '427.7' '455.0' '457.0' '464.0' '465.0' '465.2'
 '475.0' '479.0' '480.0' '483.0' '495.0' '497.0' '5.0' '508.0' '509.1'
 '512.7' '512.8' '512.9' '530.1' '530.11' '532.0' '535.9' '550.2' '558.0'
 '561.1' '562.1' '563.0' '585.1' '591.0' '593.0' '599.3' '600.0' '665.0'
 '687.4' '7.0' '716.9' '721.1' '722.1' '722.6' '726.0' '740.1' '740.9'
 '745.0' '760.0' '761.0' '763.0' '770.0' '771.1' '773.0' '782.3' '783.0'
 '785.0' '788.0' '789.0' '790.6' '798.0' '8.0' '804.0' '809.0' '819.0'
 '830.0' '835.0' '840.0' '841.0' '870.3' '871.0' '871.2' '9.0' '915.0'
 '916.0' '939.0' '965.0' '965.1' '967.0' '969.0' '979.0' 'Age' 'CaseCtrl'
 'EducationMax' 'EgoMaritalStatus' 'Female' 'Month_Birth'