In [1]:
import pandas as pd
import numpy as np
import umap
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from bokeh.plotting import figure, output_file, show
from bokeh.models import CategoricalColorMapper, ColumnDataSource
from bokeh.palettes import Category10
from bokeh.layouts import row

In [40]:
# Load the non-control cohort. The 'Race' column holds the category code that
# is later turned into legend labels, so it is the target, not a feature.
data = pd.read_csv('./data/Test_Data_1102_non_control.csv')
y = data['Race'].values

# Exclude the target from the feature matrix: leaving 'Race' in X would leak
# the label into every embedding and artificially separate the groups that
# the plots are coloured by.
X = data.drop(columns=['Race']).values

In [41]:
# Standardise the feature matrix before running any dimensionality reduction.
std_data = StandardScaler().fit_transform(X)

# Numeric race code -> human-readable label used in the plot legends.
# For reference, the category strings noted from the raw data appear to be:
#   '1. American Indian or Alaska Native', '2. Asian',
#   '3. Native Hawaiian or Other Pacific Islander',
#   '4. Black or African American', '5. White',
#   '6. Multiple Races', '7. Unknown'
race_labels = {
    1: 'American Indian or Alaska Native',
    2: 'Asian',
    3: 'Native Hawaiian or Other Pacific Islander',
    4: 'Black or African American',
    5: 'White',
    6: 'Multiple Races',
    7: 'Unknown',
}

# Any code outside 1-7 falls back to 'Other'.
targets = [race_labels.get(code, 'Other') for code in y]

In [53]:
# Two-dimensional UMAP projection of the standardised data; the seed is fixed
# so that re-running the cell reproduces the same layout.
umap_embedding = umap.UMAP(
    n_components=2,
    n_neighbors=50,
    min_dist=0.3,
    random_state=42,
).fit_transform(std_data)

In [None]:
# t-SNE is stochastic: without a seed every run yields a different layout.
# Use the same seed as the UMAP cell so the notebook is reproducible under
# Restart & Run All.
tsne_embedding = TSNE(perplexity=30, random_state=42).fit_transform(std_data)

In [43]:
# Project onto the first two principal components. The seed only matters when
# scikit-learn's automatic solver choice selects a randomized SVD solver, but
# fixing it keeps the projection repeatable in that case too.
pca_embedding = PCA(n_components=2, random_state=42).fit_transform(std_data)

In [44]:
# Debug check: confirm `targets` is a plain Python list before it is handed to
# the plotting data sources.
print(type(targets))

<class 'list'>


In [55]:
def _label_source(embedding, labels):
    """Wrap a 2-D embedding and its per-point labels in a ColumnDataSource.

    Parameters
    ----------
    embedding : array indexed as ``embedding[:, 0]`` / ``embedding[:, 1]``
        First column becomes ``x``, second column becomes ``y``.
    labels : sequence
        One category label per row of ``embedding``; stored as ``label``.

    Returns
    -------
    ColumnDataSource with columns ``x``, ``y`` and ``label``.
    """
    return ColumnDataSource(
        dict(
            x=embedding[:, 0],
            y=embedding[:, 1],
            label=labels,
        )
    )


def _scatter_panel(source, title, color_mapper):
    """Create a figure with one circle glyph per point, coloured by ``label``.

    Parameters
    ----------
    source : ColumnDataSource
        Must provide the columns ``x``, ``y`` and ``label``.
    title : str
        Figure title.
    color_mapper : CategoricalColorMapper
        Maps each ``label`` value to a colour; shared between panels so the
        same category has the same colour everywhere.

    Returns
    -------
    The configured Bokeh figure.
    """
    panel = figure(title=title)
    panel.circle(
        x="x",
        y="y",
        source=source,
        color={"field": "label", "transform": color_mapper},
        legend_field="label",
    )
    return panel


output_file("dim_reduce_vis.html")

# Sorted, de-duplicated category labels as plain Python strings.
factors = np.unique(targets).tolist()

# One distinct colour per category. The previous palette listed 'blue' twice
# (positions 2 and 6), which made two categories indistinguishable.
palette = ['red', 'green', 'blue', 'orange', 'purple', 'brown', 'deeppink', 'gray']
cmap = CategoricalColorMapper(factors=factors, palette=palette)

umap_source = _label_source(umap_embedding, targets)
p1 = _scatter_panel(umap_source, "UMAP - True Case Subpopulation", cmap)

# t-SNE panel is currently disabled. To enable it, run the t-SNE cell first and
# use the string labels (`targets`), not the numeric codes in `y`, so that the
# values match the colour mapper's factors:
# tsne_source = _label_source(tsne_embedding, targets)
# p2 = _scatter_panel(tsne_source, "TSNE", cmap)

pca_source = _label_source(pca_embedding, targets)
p3 = _scatter_panel(pca_source, "PCA - True Case Subpopulation", cmap)

show(row(p1, p3))