In [1]:
import sys
# Make the project's local packages importable (the notebook imports `src.util...` below).
# NOTE(review): absolute, machine-specific path — presumably the repository root; confirm
# and consider deriving it relative to the notebook location instead.
sys.path.append("/home/ethan/mixture_embeddings/")

In [2]:
import numpy as np
import pandas as pd

from geomstats.learning.preprocessing import ToTangentSpace
from geomstats.geometry.hyperbolic import Hyperbolic

import plotly.express as px
import plotly.graph_objects as go

import sklearn.datasets
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import LocallyLinearEmbedding, MDS
from sklearn.model_selection import train_test_split

# local files
from src.util.data_handling.data_loader import load_dataset

from icecream import ic

INFO: Using numpy backend


In [3]:
# Suppress pandas' SettingWithCopyWarning (chained-assignment check).
# Source: https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
pd.options.mode.chained_assignment = None

# Load Data

In [4]:
# Seed shared by the stochastic steps in this notebook (e.g. MDS and the classifiers).
seed = 42

In [5]:
# Input files for the IBD cohort: the two CNN mixture-embedding variants (Euclidean and
# hyperbolic), the raw data and the per-sample metadata.
ibd_euclidean_embeddings_path = '../data/processed/mixture_embeddings/ibd/cnn_euclidean_128_mixture_embeddings.pickle'
ibd_hyperbolic_embeddings_path = '../data/processed/mixture_embeddings/ibd/cnn_hyperbolic_128_mixture_embeddings.pickle'
ibd_raw_path = '../data/interim/ihmp/ibd_data.pickle'
ibd_metadata_path = '../data/interim/ihmp/ibd_metadata.pickle'

In [6]:
# Load the IBD sample metadata and display it.
ibd_metadata = load_dataset(ibd_metadata_path)
ibd_metadata

Unnamed: 0_level_0,Participant ID,Project,External ID,date_of_receipt,ProjectSpecificID,visit_num,site_name,consent_age,diagnosis,hbi,sex,race,fecalcal,sccai
sample id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
CSM5FZ3N,C3001,G79084,CSM5FZ3N,2014-03-14,3001,4,Cedars-Sinai,43.0,CD,4.0,Female,White,193.89,0.0
CSM5FZ3X,C3002,G79124,CSM5FZ3X,2014-05-13,3002,5,Cedars-Sinai,76.0,CD,7.0,Female,White,71.48,0.0
CSM5FZ3Z,C3002,G79144,CSM5FZ3Z,2014-05-28,3002,6,Cedars-Sinai,76.0,CD,8.0,Female,White,156.73,0.0
CSM5FZ44,C3002,G79211,CSM5FZ44,2014-06-24,3002,8,Cedars-Sinai,76.0,CD,7.0,Female,White,54.33,0.0
CSM5FZ46,C3002,G79189,CSM5FZ46,2014-07-08,3002,9,Cedars-Sinai,76.0,CD,6.0,Female,White,54.74,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MSM5LLIO,M2021,G79228,MSM5LLIO,2014-06-17,2021,11,MGH,26.0,CD,2.0,Male,White,89.32,0.0
MSM5LLIQ,M2026,G79099,MSM5LLIQ,2014-04-16,2026,4,MGH,21.0,UC,0.0,Female,White,224.07,7.0
MSM5LLIS,M2027,G79114,MSM5LLIS,2014-05-02,2027,4,MGH,41.0,CD,0.0,Male,Other,194.74,0.0
MSM5ZOJY,M2014,G79103,MSM5ZOJY,2014-04-22,2014,9,MGH,30.0,CD,1.0,Male,White,219.23,0.0


In [7]:
# List the metadata columns that are available as prediction targets.
ibd_metadata.columns.to_list()

['Participant ID',
 'Project',
 'External ID',
 'date_of_receipt',
 'ProjectSpecificID',
 'visit_num',
 'site_name',
 'consent_age',
 'diagnosis',
 'hbi',
 'sex',
 'race',
 'fecalcal',
 'sccai']

In [8]:
# Choose the metadata column to predict and inspect its class balance.
y_type = 'sex'
ibd_metadata[y_type].value_counts()

sex
Female    55
Male      41
Name: count, dtype: int64

In [9]:
# Target labels for the chosen column as a NumPy array.
y = ibd_metadata[y_type].to_numpy()

In [10]:
# Load both embedding variants, cast to float32, and show their shapes.
X_euclidean = load_dataset(ibd_euclidean_embeddings_path).astype('float32')
X_hyperbolic = load_dataset(ibd_hyperbolic_embeddings_path).astype('float32')

X_euclidean.shape, X_hyperbolic.shape

((96, 128), (96, 128))

In [11]:
# map hyperbolic data to Euclidean space TANGENT to the mean of the hyperbolic data
embedding_size = X_hyperbolic.shape[1]
hyperbolic = Hyperbolic(dim=embedding_size, default_coords_type='ball') # NOTE(review): an earlier comment asked about a "-1", but dim is passed unchanged here — confirm the intended dimension
to_tangent = ToTangentSpace(geometry=hyperbolic, method='adaptive', epsilon=1e-3)
to_tangent.fit(X_hyperbolic)
X_tangent = to_tangent.transform(X_hyperbolic).astype('float32')

# Display the shape of the tangent-space features.
X_tangent.shape

(96, 128)

In [12]:
# Reduce the raw IBD data to `embedding_size` dimensions with seeded MDS so it can be
# compared against the learned embeddings.
X_raw = load_dataset(ibd_raw_path).values

# Note: many dimension reduction techniques need n_samples > n_components. And
# with IBD the n_samples = 96 < n_components = 128.
# dim_red = TruncatedSVD(n_components=embedding_size)
# dim_red = LocallyLinearEmbedding(n_components=95)
dim_red = MDS(n_components=embedding_size, random_state=seed)
X_raw = dim_red.fit_transform(X_raw)
X_raw = X_raw.astype('float32')

X_raw.shape

(96, 128)

In [13]:
# Feature matrices keyed by the name of the data representation.
type_to_data = {
    'raw': X_raw,
    'euclidean': X_euclidean,
    'hyperbolic': X_hyperbolic,
    'tangent': X_tangent
}

In [14]:
# Stratified, shuffled train/test split of every representation. The fixed seed makes the
# split reproducible and, because all matrices share the same rows, puts the same samples
# in the test set of every representation.
type_to_split = {
    name: train_test_split(X, y, shuffle=True, stratify=y, random_state=seed)
    for name, X in type_to_data.items()
}

# AutoML

In [15]:
import autosklearn.classification
import autosklearn.metrics
import autosklearn.regression

from collections import defaultdict

from sklearn.utils.multiclass import type_of_target

In [17]:
results = []
# All supported targets are ['diagnosis', 'site_name', 'sex']; only 'sex' is evaluated for now.
y_types = ['sex']
# (y_type, data type, metric name) -> (best score so far, fitted classifier)
best = {}

for y_type in y_types:
    y = ibd_metadata[y_type].to_numpy()
    # Built locally on purpose: the notebook-level `type_to_data` is left untouched so
    # later cells still see the full, unmasked feature matrices.
    features = {
        'raw': X_raw,
        'euclidean': X_euclidean,
        'hyperbolic': X_hyperbolic,
        'tangent': X_tangent,
    }

    if y_type == 'diagnosis':
        # Drop the 'nonIBD' samples from the labels and from every feature matrix.
        keep = (ibd_metadata[y_type] != 'nonIBD').to_numpy()
        y = y[keep]
        features = {name: X[keep] for name, X in features.items()}

    # Same seed for every representation -> identical train/test rows across data types.
    type_to_split = {
        name: train_test_split(X, y, stratify=y, shuffle=True, random_state=seed)
        for name, X in features.items()
    }

    # The loop variable is `data_type`, not `type`, so the builtin `type` is not shadowed
    # in the notebook namespace after this cell has run.
    for data_type, (X_train, X_test, y_train, y_test) in type_to_split.items():

        clf = autosklearn.classification.AutoSklearnClassifier(
            seed=seed,
            metric=[autosklearn.metrics.f1_macro, autosklearn.metrics.balanced_accuracy],
            per_run_time_limit=15,
            time_left_for_this_task=65,
            include={"classifier": ["random_forest", "gradient_boosting"]},
            memory_limit=20000,
            tmp_folder='./tmp',
            delete_tmp_folder_after_terminate=True,
            initial_configurations_via_metalearning=0
            )

        clf.fit(X_train, y_train, dataset_name='ibd')

        # Score the fitted model on the held-out split.
        y_pred = clf.predict(X_test)
        f1 = autosklearn.metrics.f1_macro(y_test, y_pred)
        bal_acc = autosklearn.metrics.balanced_accuracy(y_test, y_pred)

        ic(f1, bal_acc)

        # Record the best value per metric; the first score for a key always wins, and a
        # later score replaces it only when strictly greater.
        for metric_name, score in (('f1', f1), ('bal acc', bal_acc)):
            key = (y_type, data_type, metric_name)
            if key not in best or score > best[key][0]:
                best[key] = (score, clf)

        results.append({
            'y_type': y_type,
            'type': data_type,
            'f1': f1,
            'balanced accuracy': bal_acc,
        })



ic| f1: 0.4957983193277311, bal_acc: 0.5285714285714286




In [None]:
# Collect the per-run score records into a DataFrame and display it.
clf_df = pd.DataFrame(results)
# clf_df.to_csv('results2.csv')
clf_df

In [None]:
# clf_df.to_csv('results.csv')
# clf_df = pd.read_csv('results2.csv')
# Map the raw metadata column names to display labels. Assigning the result of a single
# `replace` avoids chained assignment (`df[col][mask] = ...`), which writes to a temporary
# and leaves the frame unchanged when pandas Copy-on-Write is active.
clf_df['y_type'] = clf_df['y_type'].replace({
    'sex': 'Sex',
    'diagnosis': 'IBD Diagnosis',
    'site_name': 'Hospital Location',
})
# Rounded copy of the score, used as the bar label in the plot.
clf_df['f1 rounded'] = clf_df['f1'].round(3)

clf_df

In [None]:
# Grouped bar chart: one group per predicted target, one bar per data representation.
# NOTE(review): the title names "Random Forest Classifier" although `clf_df` holds the
# AutoML scores — confirm the intended wording.
fig = px.bar(
    clf_df,
    x='y_type',
    y='f1',
    color='type',
    barmode='group',
    text='f1 rounded',
)
fig.update_layout(
    title=dict(
        text='F1 Score for CNN 128 Embeddings with Random Forest Classifier',
        xanchor='center',
        x=0.5,
    )
)
fig.update_xaxes(title="Predicted IBD Metadata")
fig.show()

In [None]:
# Inspect the best-F1 classifier kept for one (target, representation) pair. The key must
# name a target that was actually evaluated; 'sex' is the only one in the AutoML `y_types`,
# so looking up 'site_name' raised KeyError.
f1, clf = best[('sex', 'hyperbolic', 'f1')]
list(vars(clf).keys())

In [None]:
# vars(clf.ensemble_class).keys()
# NOTE(review): this only references the attribute (it is not called), so the cell shows
# the attribute's repr rather than any models — confirm whether a call was intended.
clf.ensemble_class.get_models_with_weights

# WandB

In [None]:
# Read the exported results table from CSV and display it.
path = 'wandb_results.csv'
wandb = pd.read_csv(path)
wandb

In [None]:
# Manually entered NeuroSeed rows to show next to the loaded results.
records = [
    {'distance': 'NeuroSeed<br>Euclidean', 'embedding_size': 128, 'final/%rmse_test': 1.37},
    {'distance': 'NeuroSeed<br>Hyperbolic', 'embedding_size': 128, 'final/%rmse_test': 1.00}
    ]
# ignore_index=True renumbers the rows; without it the appended rows reuse labels 0 and 1
# and the frame ends up with duplicate index labels.
wandb = pd.concat([wandb, pd.DataFrame(records)], ignore_index=True)
wandb

In [None]:
wandb = wandb.rename(columns={'final/%rmse_test': '%rmse test', 'embedding_size': 'Embedding Dimension'})
# Capitalise the plain distance names. Assigning the result of `replace` avoids chained
# assignment (`df[col][mask] = ...`), which leaves the frame unchanged when pandas
# Copy-on-Write is active.
wandb['distance'] = wandb['distance'].replace({
    'euclidean': 'Euclidean',
    'hyperbolic': 'Hyperbolic',
})

# Short label per row: for 'NeuroSeed<br>X' entries keep only the part after '<br>',
# otherwise use the capitalised distance name.
wandb['text'] = [
    dist.split('<br>')[1].capitalize() if 'NeuroSeed' in dist else dist.capitalize()
    for dist in wandb['distance'].to_list()
]
wandb = wandb.sort_values(by=['Embedding Dimension'])
wandb

In [None]:
# Grouped bars of test %RMSE per embedding dimension, coloured and labelled by distance.
# NOTE(review): the bars are labelled with 'distance'; the 'text' column of `wandb` is not
# used here — confirm which label was intended.
fig = px.bar(
    wandb,
    x='Embedding Dimension',
    y='%rmse test',
    color='distance',
    barmode='group',
    text='distance',
)
# Treat the embedding dimension as a category rather than a numeric axis.
fig.update_xaxes(type='category')

fig.update_layout(
    title=dict(text='Predicting Edit Distance with CNN', xanchor='center', x=0.5)
)
fig

# Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

In [None]:
def random_forset_clf(y_types, type_to_data, metadata):
    """Score a random-forest classifier on every data representation for every target.

    Parameters
    ----------
    y_types : iterable of str
        Names of the `metadata` columns to predict.
    type_to_data : dict
        Maps a representation name to its feature matrix. Each matrix must support
        boolean-mask row indexing and have its rows aligned with `metadata`.
    metadata : pandas.DataFrame
        Per-sample metadata holding the target columns.

    Returns
    -------
    list of dict
        One record per (target, representation) with keys 'y_type', 'type' and 'f1'
        (macro-averaged F1 on the held-out split).

    Notes
    -----
    Uses the notebook-level `seed` for both the train/test split and the forest, so
    repeated calls give the same scores.
    """
    results = []

    for y_type in y_types:
        # Integer-encode the labels of the full column.
        y = LabelEncoder().fit_transform(metadata[y_type].to_numpy())

        data = type_to_data
        if y_type == 'diagnosis':
            # Drop 'nonIBD' samples. The mask is taken from the `metadata` argument (not a
            # notebook global) so it always lines up with the features that were passed in.
            keep = (metadata[y_type] != 'nonIBD').to_numpy()
            y = y[keep]
            data = {name: X[keep] for name, X in type_to_data.items()}

        # Same seed for every representation -> identical train/test rows across types.
        type_to_split = {
            name: train_test_split(X, y, stratify=y, random_state=seed)
            for name, X in data.items()
        }
        for data_type, (X_train, X_test, y_train, y_test) in type_to_split.items():

            clf = RandomForestClassifier(random_state=seed)
            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)
            f1 = f1_score(y_test, y_pred, average='macro')

            results.append({
                'y_type': y_type,
                'type': data_type,
                'f1': f1,
            })

    return results

In [None]:
# Run the random-forest classification benchmark on the three categorical IBD targets.
y_types = ['diagnosis', 'site_name', 'sex']
results = random_forset_clf(y_types, type_to_data, ibd_metadata)

In [None]:
df = pd.DataFrame(results)

# Map the raw metadata column names to display labels. Assigning the result of a single
# `replace` avoids chained assignment (`df[col][mask] = ...`), which leaves the frame
# unchanged when pandas Copy-on-Write is active.
df['y_type'] = df['y_type'].replace({
    'sex': 'Sex',
    'diagnosis': 'IBD Diagnosis',
    'site_name': 'Hospital Location',
})
# Rounded copy of the score, used as the bar label in the plot.
df['f1 rounded'] = df['f1'].round(3)

df

In [None]:
# Grouped bar chart of macro F1: one group per predicted target, one bar per representation.
fig = px.bar(
    df,
    x='y_type',
    y='f1',
    color='type',
    barmode='group',
    text='f1 rounded',
)
fig.update_layout(
    title=dict(
        text='F1 Score for CNN 128 Embeddings with Random Forest Classifier',
        xanchor='center',
        x=0.5,
    )
)
fig.update_xaxes(title="Predicted IBD Metadata")
fig.show()

In [None]:
def random_forest_regression(y_types, type_to_data, metadata):
    """Score a random-forest regressor on every data representation for every target.

    Parameters
    ----------
    y_types : iterable of str
        Names of the `metadata` columns to predict.
    type_to_data : dict
        Maps a representation name to its feature matrix. Each matrix must support
        boolean-mask row indexing and have its rows aligned with `metadata`.
    metadata : pandas.DataFrame
        Per-sample metadata holding the target columns.

    Returns
    -------
    list of dict
        One record per (target, representation) with keys 'y_type', 'type' and 'rmse'
        (root-mean-squared error on the held-out split).
    """
    seed = 42
    results = []

    for y_type in y_types:
        y = metadata[y_type].to_numpy()

        data = type_to_data
        if y_type == 'diagnosis':
            # Drop 'nonIBD' samples from the target and from every feature matrix.
            keep = (metadata[y_type] != 'nonIBD').to_numpy()
            y = y[keep]
            data = {name: X[keep] for name, X in type_to_data.items()}

        # Seeded split: reproducible, and identical rows across representations.
        type_to_split = {
            name: train_test_split(X, y, shuffle=True, random_state=seed)
            for name, X in data.items()
        }

        for data_type, (X_train, X_test, y_train, y_test) in type_to_split.items():

            reg = RandomForestRegressor(random_state=seed)
            reg.fit(X_train, y_train)

            y_pred = reg.predict(X_test)
            # RMSE as sqrt(MSE): the `squared=False` flag of `mean_squared_error` is
            # deprecated and dropped in newer scikit-learn, while this form works everywhere.
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))

            results.append({
                'y_type': y_type,
                'type': data_type,
                'rmse': rmse,
            })

    return results

In [None]:
y_types = ['hbi', 'fecalcal', 'sccai']
# Run the regression benchmark so that `results` holds RMSE records for these targets;
# without this call the next cell would still see the classifier records (no 'rmse' key).
results = random_forest_regression(y_types, type_to_data, ibd_metadata)

In [None]:
# Tabulate the result records and add a rounded RMSE column used as the bar label.
df = pd.DataFrame(results)
df['rmse rounded'] = df['rmse'].round(3)
df

In [None]:
# Grouped bar chart of RMSE: one group per predicted target, one bar per representation.
fig = px.bar(df, x='y_type', y='rmse', color='type', barmode='group', text='rmse rounded')
# The plotted metric is RMSE, so the title names RMSE (it previously said "F1 Score").
fig.update_layout(title={'text': 'RMSE for CNN 128 Embeddings with Random Forest Regression', 'xanchor': 'center', 'x':0.5})
fig.update_xaxes(title="Predicted IBD Metadata")
fig.show()

# T2D Data

In [None]:
# Input files for the T2D cohort: the two CNN mixture-embedding variants (Euclidean and
# hyperbolic), the raw data and the per-sample metadata.
t2d_euclidean_embeddings_path = '../data/processed/mixture_embeddings/t2d/cnn_euclidean_128_mixture_embeddings.pickle'
t2d_hyperbolic_embeddings_path = '../data/processed/mixture_embeddings/t2d/cnn_hyperbolic_128_mixture_embeddings.pickle'
t2d_raw_path = '../data/interim/ihmp/t2d_data.pickle'
t2d_metadata_path = '../data/interim/ihmp/t2d_metadata.pickle'

In [None]:
# Load the T2D sample metadata and display it.
t2d_metadata = load_dataset(t2d_metadata_path)
t2d_metadata

In [None]:
# List the metadata columns that are available as prediction targets.
t2d_metadata.columns.to_list()

In [None]:
# Inspect the 'Age' target column. The previous expression indexed the list `y_types_reg`
# with a string, and did so before that list was defined, so it could never run.
t2d_metadata['Age']

In [None]:
# T2D metadata columns used as regression targets and as classification targets.
y_types_reg = ['Age', 'BMI', 'SSPG']
y_types_clf = ['Sex', 'IR_IS_classification'] # Event

In [None]:
# Load both T2D embedding variants as float32.
X_euclidean = load_dataset(t2d_euclidean_embeddings_path).astype('float32')
X_hyperbolic = load_dataset(t2d_hyperbolic_embeddings_path).astype('float32')

# map hyperbolic data to Euclidean space TANGENT to the mean of the hyperbolic data
embedding_size = X_hyperbolic.shape[1]
hyperbolic = Hyperbolic(dim=embedding_size, default_coords_type='ball')
to_tangent = ToTangentSpace(geometry=hyperbolic, method='adaptive', epsilon=1e-3)
to_tangent.fit(X_hyperbolic)
X_tangent = to_tangent.transform(X_hyperbolic).astype('float32')

# compute MDS on X_raw; seeded (as in the IBD section) so the reduced features, and
# therefore the downstream scores, are reproducible between runs
X_raw = load_dataset(t2d_raw_path).values
dim_red = MDS(n_components=embedding_size, random_state=seed)
X_raw = dim_red.fit_transform(X_raw)
X_raw = X_raw.astype('float32')

# Feature matrices keyed by the name of the data representation.
type_to_data = {
    'raw': X_raw,
    'euclidean': X_euclidean,
    'hyperbolic': X_hyperbolic,
    'tangent': X_tangent
}

In [None]:
# Run the random-forest classification benchmark on the T2D classification targets.
results = random_forset_clf(y_types_clf, type_to_data, t2d_metadata)

In [None]:
# Tabulate the result records and add a rounded F1 column used as the bar label.
df = pd.DataFrame(results)
df['f1 rounded'] = df['f1'].round(3)
df

In [None]:
# Grouped bar chart of macro F1: one group per predicted target, one bar per representation.
fig = px.bar(
    df,
    x='y_type',
    y='f1',
    color='type',
    barmode='group',
    text='f1 rounded',
)
fig.update_layout(
    title=dict(
        text='F1 Score for CNN 128 Embeddings with Random Forest Classifier',
        xanchor='center',
        x=0.5,
    )
)
fig.update_xaxes(title="Predicted T2D Metadata")
fig.show()

In [None]:
# Run the random-forest regression benchmark on the T2D regression targets.
results = random_forest_regression(y_types_reg, type_to_data, t2d_metadata)

In [None]:
# Tabulate the result records and add a rounded RMSE column used as the bar label.
df = pd.DataFrame(results)
df['rmse rounded'] = df['rmse'].round(3)
df

In [None]:
# Grouped bar chart of RMSE: one group per predicted target, one bar per representation.
fig = px.bar(df, x='y_type', y='rmse', color='type', barmode='group', text='rmse rounded')
# The plotted metric is RMSE and the data is the T2D cohort; the labels previously said
# "F1 Score" and "IBD".
fig.update_layout(title={'text': 'RMSE for CNN 128 Embeddings with Random Forest Regression', 'xanchor': 'center', 'x':0.5})
fig.update_xaxes(title="Predicted T2D Metadata")
fig.show()