In [1]:
import sys
sys.path.append("/home/ethan/mixture_embeddings/")

In [2]:
import numpy as np
import pandas as pd

from geomstats.learning.preprocessing import ToTangentSpace
from geomstats.geometry.hyperbolic import Hyperbolic

import plotly.express as px
import plotly.graph_objects as go

import sklearn.datasets
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import LocallyLinearEmbedding, MDS
from sklearn.model_selection import train_test_split

# local files
from src.util.data_handling.data_loader import load_dataset

from icecream import ic

INFO: Using numpy backend


# Load Data

In [3]:
ibd_euclidean_embeddings_path = '../data/processed/mixture_embeddings/ibd/cnn_euclidean_128_mixture_embeddings.pickle'
ibd_hyperbolic_embeddings_path = '../data/processed/mixture_embeddings/ibd/cnn_hyperbolic_128_mixture_embeddings.pickle'
ibd_raw_path = '../data/interim/ihmp/ibd_data.pickle'
ibd_metadata_path = '../data/interim/ihmp/ibd_metadata.pickle'

In [4]:
ibd_metadata = load_dataset(ibd_metadata_path)
ibd_metadata

Unnamed: 0_level_0,Participant ID,Project,External ID,date_of_receipt,ProjectSpecificID,visit_num,site_name,consent_age,diagnosis,hbi,sex,race,fecalcal,sccai
sample id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
CSM5FZ3N,C3001,G79084,CSM5FZ3N,2014-03-14,3001,4,Cedars-Sinai,43.0,CD,4.0,Female,White,193.89,0.0
CSM5FZ3X,C3002,G79124,CSM5FZ3X,2014-05-13,3002,5,Cedars-Sinai,76.0,CD,7.0,Female,White,71.48,0.0
CSM5FZ3Z,C3002,G79144,CSM5FZ3Z,2014-05-28,3002,6,Cedars-Sinai,76.0,CD,8.0,Female,White,156.73,0.0
CSM5FZ44,C3002,G79211,CSM5FZ44,2014-06-24,3002,8,Cedars-Sinai,76.0,CD,7.0,Female,White,54.33,0.0
CSM5FZ46,C3002,G79189,CSM5FZ46,2014-07-08,3002,9,Cedars-Sinai,76.0,CD,6.0,Female,White,54.74,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MSM5LLIO,M2021,G79228,MSM5LLIO,2014-06-17,2021,11,MGH,26.0,CD,2.0,Male,White,89.32,0.0
MSM5LLIQ,M2026,G79099,MSM5LLIQ,2014-04-16,2026,4,MGH,21.0,UC,0.0,Female,White,224.07,7.0
MSM5LLIS,M2027,G79114,MSM5LLIS,2014-05-02,2027,4,MGH,41.0,CD,0.0,Male,Other,194.74,0.0
MSM5ZOJY,M2014,G79103,MSM5ZOJY,2014-04-22,2014,9,MGH,30.0,CD,1.0,Male,White,219.23,0.0


In [45]:
ibd_metadata.columns.to_list()

['Participant ID',
 'Project',
 'External ID',
 'date_of_receipt',
 'ProjectSpecificID',
 'visit_num',
 'site_name',
 'consent_age',
 'diagnosis',
 'hbi',
 'sex',
 'race',
 'fecalcal',
 'sccai']

In [5]:
X_euclidean = load_dataset(ibd_euclidean_embeddings_path).astype('float32')
X_hyperbolic = load_dataset(ibd_hyperbolic_embeddings_path).astype('float32')

X_euclidean.shape, X_hyperbolic.shape

((96, 128), (96, 128))

In [6]:
# map hyperbolic data to Euclidean space TANGENT to the mean of the hyperbolic data
embedding_size = X_hyperbolic.shape[1]
hyperbolic = Hyperbolic(dim=embedding_size, default_coords_type='ball') # why do we have the -1 here?
to_tangent = ToTangentSpace(geometry=hyperbolic, method='adaptive', epsilon=1e-3)
to_tangent.fit(X_hyperbolic)
X_tangent = to_tangent.transform(X_hyperbolic).astype('float32')

X_tangent.shape

(96, 128)

In [7]:
X_raw = load_dataset(ibd_raw_path).values

# Note: many dimension reduction techniques need n_samples > n_components. And
# with IBD the n_samples = 96 < n_components = 128.
# dim_red = TruncatedSVD(n_components=embedding_size)
# dim_red = LocallyLinearEmbedding(n_components=95)
dim_red = MDS(n_components=embedding_size)
X_raw = dim_red.fit_transform(X_raw)
X_raw = X_raw.astype('float32')

X_raw.shape

(96, 128)

In [8]:
type_to_data = {
    'raw': X_raw,
    'euclidean': X_euclidean,
    'hyperbolic': X_hyperbolic,
    'tangent': X_tangent
}

# UMAP Data

In [40]:
from umap import UMAP

In [52]:
y_type = 'sex'
y = ibd_metadata[y_type].to_numpy()

In [53]:
n_neighbors_list = [3, 5, 8, 12, 15, 20, 25]

for n_neighbors in n_neighbors_list:

    type_to_umap = {}
    for type, X in type_to_data.items():
        if type == 'hyperbolic':
            hyperbolic = Hyperbolic(dim=X.shape[1], default_coords_type='ball')
            metric = hyperbolic._metric.dist
        else:
            metric='euclidean'
            
        X_red = UMAP(metric=metric, n_neighbors=n_neighbors, n_components=2).fit_transform(X)
        type_to_umap[type] = X_red
        
        df = pd.DataFrame(np.hstack((type_to_umap[type], y[:, np.newaxis])), columns=['umap1', 'umap2', y_type])
        fig = px.scatter(df, x='umap1', y='umap2', color=y_type)
        fig.update_layout(title={'text': 'Predict {} with UMAP on {} data with {} neighbors'.format(y_type, type, n_neighbors), 'xanchor': 'center', 'x':0.5})
        fig.show()


custom distance metric does not return gradient; inverse_transform will be unavailable. To enable using inverse_transform method, define a distance function that returns a tuple of (distance [float], gradient [np.array])




custom distance metric does not return gradient; inverse_transform will be unavailable. To enable using inverse_transform method, define a distance function that returns a tuple of (distance [float], gradient [np.array])




custom distance metric does not return gradient; inverse_transform will be unavailable. To enable using inverse_transform method, define a distance function that returns a tuple of (distance [float], gradient [np.array])




custom distance metric does not return gradient; inverse_transform will be unavailable. To enable using inverse_transform method, define a distance function that returns a tuple of (distance [float], gradient [np.array])




custom distance metric does not return gradient; inverse_transform will be unavailable. To enable using inverse_transform method, define a distance function that returns a tuple of (distance [float], gradient [np.array])




custom distance metric does not return gradient; inverse_transform will be unavailable. To enable using inverse_transform method, define a distance function that returns a tuple of (distance [float], gradient [np.array])




custom distance metric does not return gradient; inverse_transform will be unavailable. To enable using inverse_transform method, define a distance function that returns a tuple of (distance [float], gradient [np.array])



In [38]:
type = 'raw'
df = pd.DataFrame(np.hstack((type_to_umap[type], y[:, np.newaxis])), columns=['umap1', 'umap2', y_type])
df

Unnamed: 0,umap1,umap2,site_name
0,9.205824,7.978291,Cedars-Sinai
1,10.761662,6.881513,Cedars-Sinai
2,11.105639,6.558615,Cedars-Sinai
3,11.360098,1.513365,Cedars-Sinai
4,11.274348,6.184087,Cedars-Sinai
...,...,...,...
91,9.885825,7.070191,MGH
92,10.168128,8.439391,MGH
93,11.981827,1.279893,MGH
94,5.946434,6.281927,MGH


In [39]:
px.scatter(df, x='umap1', y='umap2', color='site_name')

In [10]:
seed = 42
results = []
y_types = ['diagnosis', 'site_name', 'sex']

for y_type in y_types:
    

IndentationError: expected an indented block (3284456953.py, line 6)