In [1]:
%load_ext autoreload
%autoreload 2

In [34]:
import pandas as pd

from tqdm import tqdm

from geomstats.learning.preprocessing import ToTangentSpace
from geomstats.geometry.hyperbolic import Hyperbolic

from sklearn.manifold import MDS
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold
from sklearn.dummy import DummyClassifier, DummyRegressor

import plotly.express as px
import plotly.graph_objects as go

# local files
from src.util.data_handling.data_loader import load_dataset
from src.classifiers.mlp import MLP

from icecream import ic

In [3]:
# Suppress pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# Source: https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
# NOTE(review): this global switch also hides any genuine chained assignment in
# later cells; prefer fixing those with .loc / .copy() and removing this line.
pd.options.mode.chained_assignment = None

In [4]:
# Fixed random seed, passed as random_state to MDS, the random forests and the CV splitters.
seed = 43

# Load Data

In [5]:
# Dataset identifier; it is substituted into every path template below.
data_name = 'ibd'
# Pickled mixture embeddings (Euclidean and hyperbolic variants), relative to this notebook.
euclidean_embeddings_path = '../../data/processed/mixture_embeddings/{}/cnn_euclidean_128_mixture_embeddings.pickle'.format(data_name)
hyperbolic_embeddings_path = '../../data/processed/mixture_embeddings/{}/cnn_hyperbolic_128_mixture_embeddings.pickle'.format(data_name)
# Pickled raw data table and the matching metadata table.
raw_path = '../../data/interim/ihmp/{}_data.pickle'.format(data_name)
metadata_path = '../../data/interim/ihmp/{}_metadata.pickle'.format(data_name)

In [6]:
# Metadata table; its columns are used as prediction targets further down.
metadata = load_dataset(metadata_path)

In [7]:
# Learned embeddings, stored as float32.
X_euclidean = load_dataset(euclidean_embeddings_path).astype('float32')
X_hyperbolic = load_dataset(hyperbolic_embeddings_path).astype('float32')
embedding_size = X_hyperbolic.shape[1]

# X_tangent: the hyperbolic embeddings passed through geomstats' ToTangentSpace,
# i.e. expressed in a Euclidean space tangent to the hyperbolic manifold.
# The manifold is built with dim equal to the embedding width (ball coordinates).
# NOTE(review): the summary tables below report identical random-forest scores for
# the 'hyperbolic' and 'tangent' spaces -- verify that this transform really
# changes the data.
hyperbolic = Hyperbolic(dim=embedding_size, default_coords_type='ball')
to_tangent = ToTangentSpace(geometry=hyperbolic, method='adaptive', epsilon=1e-3)
to_tangent.fit(X_hyperbolic)
X_tangent = to_tangent.transform(X_hyperbolic).astype('float32')

# Baseline: the raw data reduced with multidimensional scaling to the same
# number of dimensions as the embeddings.
dim_red = MDS(n_components=embedding_size, random_state=seed, normalized_stress='auto')
X_raw = dim_red.fit_transform(load_dataset(raw_path).values).astype('float32')

# Name of each representation -> its feature matrix.
space_to_data = {
    'raw': X_raw,
    'euclidean': X_euclidean,
    'hyperbolic': X_hyperbolic,
    'tangent': X_tangent,
}

# Predict

In [8]:
def my_f1_score(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``.

    Thin wrapper around ``sklearn.metrics.f1_score`` that fixes
    ``average='macro'`` so it can be used as a plain two-argument metric.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

In [35]:
# Metadata column -> kind of prediction task ('clf' or 'reg').
feature_to_task = {
    'diagnosis': 'clf',
    'site_name': 'clf',
    'sex': 'clf'
    }

# Task -> {model name: estimator}. Each task pairs a random forest with a dummy baseline.
task_to_models = {
    'clf': {'rf': RandomForestClassifier(random_state=seed), 'dummy': DummyClassifier()}, # 'mlp': MLP(duration='100ep'),
    'reg': {'rf': RandomForestRegressor(random_state=seed), 'dummy': DummyRegressor()}
}

# Task -> {metric name: callable(y_true, y_pred)}.
# The keys must match those of task_to_models: the evaluation loop looks up both
# dictionaries with the same task name (the regression entry used to be
# misspelled 'ref', which would raise KeyError for any 'reg' feature).
task_to_metrics = {
    'clf': {'f1': my_f1_score},
    'reg': {'mse': mean_squared_error}
}

In [36]:
# Repeated k-fold evaluation of every (feature, space, model) combination.
# Each fitted model contributes one record per split to `results`.
results = []
n_splits = 5
n_repeats = 5

for feature, task in feature_to_task.items():
    # Everything below depends only on the feature/task, so it is prepared once
    # per feature instead of once per embedding space.
    y = metadata[feature].to_numpy()

    if feature == 'diagnosis':
        # Samples labelled 'nonIBD' are excluded from the diagnosis task.
        # A plain NumPy boolean mask is used so that it can index both y and X
        # without relying on pandas index alignment.
        mask = (metadata[feature] == 'nonIBD').to_numpy()
        y = y[~mask]

    if task == 'clf':
        # Classification: integer-encode the labels and stratify the folds.
        encoder = LabelEncoder()
        y = encoder.fit_transform(y)
        folds = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)
    else:
        folds = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

    model_name_to_model = task_to_models[task]
    metric_name_to_metric = task_to_metrics[task]

    for space, X in tqdm(space_to_data.items(), desc='Predict {}'.format(feature)):
        if feature == 'diagnosis':
            # Keep X aligned with the filtered targets.
            X = X[~mask]

        # loop over splits
        for split_idx, (train_idx, test_idx) in enumerate(folds.split(X, y)):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # loop over models
            for model_name, model in model_name_to_model.items():

                # fit model and predict
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                # measure and save metrics
                scores = {
                    metric_name: metric(y_test, y_pred)
                    for metric_name, metric in metric_name_to_metric.items()
                }
                results.append({
                    'feature': feature,
                    'task': task,
                    'space': space,
                    'model': model_name,
                    'split': split_idx,
                    } | scores
                )

Predict diagnosis: 100%|██████████| 4/4 [00:15<00:00,  3.78s/it]
Predict site_name: 100%|██████████| 4/4 [00:15<00:00,  3.86s/it]
Predict sex: 100%|██████████| 4/4 [00:15<00:00,  3.77s/it]


In [62]:
# One row per (feature, space, model, split) record collected by the evaluation loop.
df = pd.DataFrame(results)
df

Unnamed: 0,feature,task,space,model,split,f1
0,diagnosis,clf,raw,rf,0,0.441176
1,diagnosis,clf,raw,dummy,0,0.441176
2,diagnosis,clf,raw,rf,1,0.654545
3,diagnosis,clf,raw,dummy,1,0.441176
4,diagnosis,clf,raw,rf,2,0.441176
...,...,...,...,...,...,...
595,sex,clf,tangent,dummy,22,0.366667
596,sex,clf,tangent,rf,23,0.773810
597,sex,clf,tangent,dummy,23,0.366667
598,sex,clf,tangent,rf,24,0.784091


# Plot Results

In [81]:
# Summarise the per-split scores for every (feature, task, space, model) group.
group = df.groupby(['feature', 'task', 'space', 'model'])
# Ask for the median explicitly so the '50%' column does not depend on
# describe() adding it implicitly when given an empty percentile list.
group_describe = group.describe(percentiles=[0.5])
# 'split' is only the fold index; statistics over it are meaningless.
group_describe = group_describe.drop(columns=['split'])
# Drop the 'count' statistic by its exact name on the statistics level, rather
# than by substring match on the flattened names (which would also remove any
# metric whose own name happens to contain 'count').
group_describe = group_describe.drop(columns='count', level=1)
# Flatten the (metric, statistic) column MultiIndex into names such as 'f1 mean'.
group_describe.columns = [' '.join(column).strip() for column in group_describe.columns]
group_describe = group_describe.reset_index()
df2 = group_describe
df2

Unnamed: 0,feature,task,space,model,f1 mean,f1 std,f1 min,f1 50%,f1 max
0,diagnosis,clf,euclidean,dummy,0.441176,0.0,0.441176,0.441176,0.441176
1,diagnosis,clf,euclidean,rf,0.563234,0.119222,0.387097,0.604167,0.802083
2,diagnosis,clf,hyperbolic,dummy,0.441176,0.0,0.441176,0.441176,0.441176
3,diagnosis,clf,hyperbolic,rf,0.529307,0.105542,0.40625,0.525,0.737327
4,diagnosis,clf,raw,dummy,0.441176,0.0,0.441176,0.441176,0.441176
5,diagnosis,clf,raw,rf,0.525847,0.107301,0.424242,0.441176,0.654545
6,diagnosis,clf,tangent,dummy,0.441176,0.0,0.441176,0.441176,0.441176
7,diagnosis,clf,tangent,rf,0.529307,0.105542,0.40625,0.525,0.737327
8,sex,clf,euclidean,dummy,0.364301,0.004829,0.354839,0.366667,0.366667
9,sex,clf,euclidean,rf,0.587546,0.080072,0.413333,0.590769,0.724638


In [82]:
# Split the summary table by model. Explicit copies are taken because a later
# cell adds and rewrites columns on rf_df; independent frames make that safe and
# leave df2 untouched, without relying on the suppressed SettingWithCopyWarning.
rf_df = df2.loc[df2['model'] == 'rf'].copy()
dummy_df = df2.loc[df2['model'] == 'dummy'].copy()

In [85]:
# Summary rows of the random-forest models only.
rf_df

Unnamed: 0,feature,task,space,model,f1 mean,f1 std,f1 min,f1 50%,f1 max
1,diagnosis,clf,euclidean,rf,0.563234,0.119222,0.387097,0.604167,0.802083
3,diagnosis,clf,hyperbolic,rf,0.529307,0.105542,0.40625,0.525,0.737327
5,diagnosis,clf,raw,rf,0.525847,0.107301,0.424242,0.441176,0.654545
7,diagnosis,clf,tangent,rf,0.529307,0.105542,0.40625,0.525,0.737327
9,sex,clf,euclidean,rf,0.587546,0.080072,0.413333,0.590769,0.724638
11,sex,clf,hyperbolic,rf,0.645773,0.099105,0.450549,0.680135,0.784091
13,sex,clf,raw,rf,0.616481,0.120403,0.366667,0.634615,0.784091
15,sex,clf,tangent,rf,0.645773,0.099105,0.450549,0.680135,0.784091
17,site_name,clf,euclidean,rf,0.40935,0.089153,0.25,0.416667,0.583333
19,site_name,clf,hyperbolic,rf,0.377588,0.094687,0.180556,0.4,0.575752


In [94]:
# Mean F1 of the dummy baseline, one value per (feature, space) summary row.
dummy_values = dummy_df['f1 mean']
dummy_df

Unnamed: 0,feature,task,space,model,f1 mean,f1 std,f1 min,f1 50%,f1 max
0,diagnosis,clf,euclidean,dummy,0.441176,0.0,0.441176,0.441176,0.441176
2,diagnosis,clf,hyperbolic,dummy,0.441176,0.0,0.441176,0.441176,0.441176
4,diagnosis,clf,raw,dummy,0.441176,0.0,0.441176,0.441176,0.441176
6,diagnosis,clf,tangent,dummy,0.441176,0.0,0.441176,0.441176,0.441176
8,sex,clf,euclidean,dummy,0.364301,0.004829,0.354839,0.366667,0.366667
10,sex,clf,hyperbolic,dummy,0.364301,0.004829,0.354839,0.366667,0.366667
12,sex,clf,raw,dummy,0.364301,0.004829,0.354839,0.366667,0.366667
14,sex,clf,tangent,dummy,0.364301,0.004829,0.354839,0.366667,0.366667
16,site_name,clf,euclidean,dummy,0.14709,0.00216,0.142857,0.148148,0.148148
18,site_name,clf,hyperbolic,dummy,0.14709,0.00216,0.142857,0.148148,0.148148


In [87]:
# Human-readable labels for the plotted features.
feature_labels = {
    'sex': 'Sex',
    'diagnosis': 'IBD Diagnosis',
    'site_name': 'Hospital Location',
}

# Build a new frame instead of assigning through chained indexing
# (rf_df['feature'][mask] = ...), which only works with the pandas warning
# suppressed and silently does nothing under pandas Copy-on-Write.
# 'feature' keeps its position; 'f1 mean rounded' is appended as the last column.
# Re-running the cell is harmless: already relabelled values are not keys of the
# mapping and are left as they are.
rf_df = rf_df.assign(**{
    'f1 mean rounded': rf_df['f1 mean'].round(3),
    'feature': rf_df['feature'].replace(feature_labels),
})

In [88]:
# Check the relabelled features and the rounded means used as bar text.
rf_df

Unnamed: 0,feature,task,space,model,f1 mean,f1 std,f1 min,f1 50%,f1 max,f1 mean rounded
1,IBD Diagnosis,clf,euclidean,rf,0.563234,0.119222,0.387097,0.604167,0.802083,0.563
3,IBD Diagnosis,clf,hyperbolic,rf,0.529307,0.105542,0.40625,0.525,0.737327,0.529
5,IBD Diagnosis,clf,raw,rf,0.525847,0.107301,0.424242,0.441176,0.654545,0.526
7,IBD Diagnosis,clf,tangent,rf,0.529307,0.105542,0.40625,0.525,0.737327,0.529
9,Sex,clf,euclidean,rf,0.587546,0.080072,0.413333,0.590769,0.724638,0.588
11,Sex,clf,hyperbolic,rf,0.645773,0.099105,0.450549,0.680135,0.784091,0.646
13,Sex,clf,raw,rf,0.616481,0.120403,0.366667,0.634615,0.784091,0.616
15,Sex,clf,tangent,rf,0.645773,0.099105,0.450549,0.680135,0.784091,0.646
17,Hospital Location,clf,euclidean,rf,0.40935,0.089153,0.25,0.416667,0.583333,0.409
19,Hospital Location,clf,hyperbolic,rf,0.377588,0.094687,0.180556,0.4,0.575752,0.378


In [110]:
# Grouped bar chart of the random-forest summary: one group per predicted
# feature, one bar per space, bar height = mean F1, error bar = F1 standard
# deviation, bar text = rounded mean.
model = 'RF'
fig = px.bar(
    rf_df,
    x='feature',
    y='f1 mean',
    color='space',
    barmode='group',
    error_y='f1 std',
    text='f1 mean rounded',
)

# Horizontally centred figure title.
centered_title = {
    'text': '{}: Mean F1 Score for CNN 128 Embeddings'.format(model),
    'xanchor': 'center',
    'x': 0.5,
}
fig.update_layout(title=centered_title)
fig.update_xaxes(title="Predicted IBD Metadata")
fig.show()

In [106]:
# Print the x categories and y values of every trace in the figure (one trace per space).
# NOTE(review): `data`, `x` and `y` stay defined at notebook scope after the loop.
# `y` overwrites the target array used in the evaluation loop, and the next cell
# reads the leaked `x` of the last trace.
for data in fig.data:
    x = data['x']
    y = data['y']
    print(x, y)

['IBD Diagnosis' 'Sex' 'Hospital Location'] [0.56323364 0.5875464  0.40935022]
['IBD Diagnosis' 'Sex' 'Hospital Location'] [0.52930694 0.64577349 0.37758785]
['IBD Diagnosis' 'Sex' 'Hospital Location'] [0.5258467  0.61648098 0.28948548]
['IBD Diagnosis' 'Sex' 'Hospital Location'] [0.52930694 0.64577349 0.37758785]


In [108]:
# NOTE(review): `x` here is the category array left over from the trace-printing
# loop above. Per plotly's annotation reference, `ax` is presumably the x offset
# of the arrow tail (a single value), and no text or x/y position is given, so
# this call looks like an unfinished experiment -- confirm the intent (perhaps
# marking the dummy baseline?) or remove the cell.
fig.add_annotation(ax=x)
fig.show()

In [None]:
# model = 'MLP'
# fig = px.bar(mlp_df, x='feature', y='f1', color='space', barmode='group', text='f1 rounded')
# fig.update_layout(title={'text': 'F1 Score for CNN 128 Embeddings with {}'.format(model), 'xanchor': 'center', 'x':0.5})
# fig.update_xaxes(title="Predicted IBD Metadata")
# fig.show()