In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from tqdm import tqdm

from geomstats.learning.preprocessing import ToTangentSpace
from geomstats.geometry.hyperbolic import Hyperbolic

from sklearn.manifold import MDS
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold
from sklearn.dummy import DummyClassifier, DummyRegressor

import plotly.express as px
import plotly.graph_objects as go

# local files
from src.util.data_handling.data_loader import load_dataset
from src.classifiers.mlp import MLP

from icecream import ic

INFO: Using numpy backend
  warn(f"Failed to load image Python extension: {e}")


In [3]:
# Suppress pandas' SettingWithCopyWarning.
# Source: https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
# NOTE: this is a global setting, so chained assignments anywhere later in the
# notebook will no longer warn, even when they do not modify the intended frame.
pd.options.mode.chained_assignment = None

In [4]:
# Random seed passed to MDS, the random-forest models and the CV splitters below.
# NOTE(review): the MLP's captured log output reports "Setting seed to 42", so this
# value is apparently not what seeds the MLP - confirm.
seed = 43

# Load Data

In [5]:
# Dataset identifier; it selects both the embedding folder and the interim file names.
data_name = 'ibd'

# Directory prefixes shared by the files loaded below (relative to this notebook).
embeddings_dir = f'../../data/processed/mixture_embeddings/{data_name}'
interim_dir = '../../data/interim/ihmp'

# 128-dimensional CNN mixture embeddings, one file per geometry.
euclidean_embeddings_path = f'{embeddings_dir}/cnn_euclidean_128_mixture_embeddings.pickle'
hyperbolic_embeddings_path = f'{embeddings_dir}/cnn_hyperbolic_128_mixture_embeddings.pickle'

# Raw abundance table and the metadata used as prediction targets.
raw_path = f'{interim_dir}/{data_name}_data.pickle'
metadata_path = f'{interim_dir}/{data_name}_metadata.pickle'

In [6]:
# Metadata table; its 'diagnosis', 'site_name' and 'sex' columns are used as prediction
# targets below. Assumed to be row-aligned with the embedding matrices - TODO confirm.
metadata = load_dataset(metadata_path)

In [7]:
def inverse_log_norm_transformation(X):
    """Undo the log transform that was applied when X_raw was generated.

    X_raw was produced by adding 1e-10 and taking the log. The inverse
    exponentiates and subtracts an offset. The offset is deliberately 1e-11
    rather than 1e-10: subtracting the full 1e-10 would, through floating-point
    round-off, leave some entries slightly negative, whereas the smaller offset
    keeps every value positive.
    """
    offset = 1e-11
    exponentiated = np.exp(X)
    return exponentiated - offset

In [8]:
# Learned embeddings, cast to float32.
X_euclidean = load_dataset(euclidean_embeddings_path).astype('float32')
X_hyperbolic = load_dataset(hyperbolic_embeddings_path).astype('float32')
embedding_size = X_hyperbolic.shape[1]

# X_tangent: the hyperbolic embeddings mapped into a Euclidean tangent space of the
# hyperbolic manifold (ball coordinates).
# NOTE(review): an earlier comment here asked "why do we have the -1 here?", but no
# "-1" remains in this call; confirm that dim=embedding_size is the intended dimension.
hyperbolic = Hyperbolic(dim=embedding_size, default_coords_type='ball')
to_tangent = ToTangentSpace(geometry=hyperbolic, method='adaptive', epsilon=1e-3)
to_tangent.fit(X_hyperbolic)
X_tangent = to_tangent.transform(X_hyperbolic).astype('float32')

# Baseline: undo the log transform on the raw table, then reduce it with
# multidimensional scaling to the same width as the embeddings.
X_raw = inverse_log_norm_transformation(load_dataset(raw_path).values)
assert np.all(X_raw >= 0) # sanity check
dim_red = MDS(n_components=embedding_size, random_state=seed, normalized_stress='auto')
X_raw = dim_red.fit_transform(X_raw).astype('float32')

# Feature matrices keyed by the name of the space they live in.
space_to_data = dict(
    raw=X_raw,
    euclidean=X_euclidean,
    hyperbolic=X_hyperbolic,
    tangent=X_tangent,
)

# Predict

In [9]:
def my_f1_score(y_true, y_pred):
    """Return the macro-averaged F1 score of `y_pred` against `y_true`.

    Thin wrapper around sklearn's `f1_score` with `average='macro'`, so it can be
    used as a plain two-argument metric.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

In [10]:
# Metadata column -> kind of prediction task ('clf' or 'reg').
feature_to_task = {
    'diagnosis': 'clf',
    'site_name': 'clf',
    'sex': 'clf',
}

# Task -> {short model name: estimator}. The short name ends up in the 'model'
# column of the results table.
# NOTE(review): the plotting cells further down filter on 'rf' and 'dummy', which are
# commented out for 'clf' here; re-enable them (or adapt the plots) before plotting.
task_to_models = {
    'clf': {
        # 'rf': RandomForestClassifier(random_state=seed),
        # 'dummy': DummyClassifier(),
        'mlp': MLP(duration='100ep'),
    },
    'reg': {
        'rf': RandomForestRegressor(random_state=seed),
        'dummy': DummyRegressor(),
    },
}

# Task -> {metric name: callable(y_true, y_pred)}. Keys must match the task names
# used in feature_to_task / task_to_models; the regression entry was previously
# misspelled 'ref', which made every 'reg' feature fail with a KeyError.
task_to_metrics = {
    'clf': {'f1': my_f1_score},
    'reg': {'mse': mean_squared_error},
}

In [11]:
# One record per (feature, space, split, model); turned into a DataFrame afterwards.
results = []
n_splits = 5
n_repeats = 1

for feature, task in feature_to_task.items():
    space_iter = tqdm(space_to_data.items(), desc='Predict {}'.format(feature))
    for space, X in space_iter:
        y = metadata[feature].to_numpy()

        # For the diagnosis task, samples labelled 'nonIBD' are excluded.
        if feature == 'diagnosis':
            mask = metadata[feature] == 'nonIBD'
            y, X = y[~mask], X[~mask]

        # Classification targets are mapped to integer class ids.
        if task == 'clf':
            encoder = LabelEncoder()
            y = encoder.fit_transform(y)

        model_name_to_model = task_to_models[task]
        metric_name_to_metric = task_to_metrics[task]

        # Stratified folds for classification, plain folds otherwise.
        splitter_cls = RepeatedStratifiedKFold if task == 'clf' else RepeatedKFold
        folds = splitter_cls(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

        for split_idx, (train_idx, test_idx) in enumerate(folds.split(X, y)):
            X_train, y_train = X[train_idx], y[train_idx]
            X_test, y_test = X[test_idx], y[test_idx]

            # NOTE(review): the same estimator objects are refit on every split; this
            # assumes fit() starts from scratch - confirm for the project-local MLP.
            for model_name, model in model_name_to_model.items():
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                # Identification fields first, then one entry per metric.
                record = {
                    'feature': feature,
                    'task': task,
                    'space': space,
                    'model': model_name,
                    'split': split_idx,
                }
                for metric_name, metric in metric_name_to_metric.items():
                    record[metric_name] = metric(y_test, y_pred)
                results.append(record)

Predict diagnosis:   0%|          | 0/4 [00:00<?, ?it/s]INFO: Setting seed to 42
INFO: Run name: 1683515860-elite-gaur
INFO: Stepping schedulers every batch. To step schedulers every epoch, set `step_schedulers_every_batch=False`.
INFO: Setting seed to 42
INFO: Setting seed to 42
INFO: Using precision Precision.AMP_FP16


In [None]:
# One row per (feature, space, model, split) record collected by the evaluation loop.
df = pd.DataFrame(results)
df

# Plot Results

In [None]:
# Summary statistics of every metric over the CV splits, per (feature, task, space, model).
group = df.groupby(['feature', 'task', 'space', 'model'])
group_describe = group.describe(percentiles=[]).drop(columns=['split'])

# Flatten the (metric, statistic) column MultiIndex into names like 'f1 mean'.
group_describe.columns = [' '.join(column).strip() for column in group_describe.columns]
group_describe = group_describe.reset_index()

# The per-group counts are not needed downstream.
count_columns = [column for column in group_describe.columns if 'count' in column]
group_describe = group_describe.drop(columns=count_columns)

df2 = group_describe
df2

In [None]:
# Split the summary table into one frame per model name.
model_column = df2['model']
rf_df = df2.loc[model_column == 'rf']
dummy_df = df2.loc[model_column == 'dummy']
mlp_df = df2.loc[model_column == 'mlp']

In [None]:
# Display the summary rows for the 'rf' model.
rf_df

In [None]:
# Mean F1 of the 'dummy' model rows; dummy_values is not referenced again in the visible cells.
dummy_values = dummy_df['f1 mean']
dummy_df

In [None]:
# Human-readable axis labels for the metadata columns.
feature_labels = {
    'sex': 'Sex',
    'diagnosis': 'IBD Diagnosis',
    'site_name': 'Hospital Location',
}

# Build a new frame instead of writing through `rf_df['feature'][mask] = ...`:
# that chained assignment targets a temporary object and is silently ignored under
# pandas Copy-on-Write. `assign` keeps 'feature' in place and appends the rounded
# column; re-running the cell gives the same result because the new labels are not
# keys of the mapping.
rf_df = rf_df.assign(**{
    'f1 mean rounded': rf_df['f1 mean'].round(3),
    'feature': rf_df['feature'].replace(feature_labels),
})

In [None]:
# Display rf_df again, now with the rounded column and the relabelled features.
rf_df

In [None]:
# Grouped bar chart: mean F1 per predicted metadata column, one bar per space,
# with the standard deviation over splits as error bars.
model = 'RF'
plot_title = '{}: Mean F1 Score for CNN 128 Embeddings'.format(model)

fig = px.bar(
    rf_df,
    x='feature',
    y='f1 mean',
    color='space',
    barmode='group',
    error_y='f1 std',
    text='f1 mean rounded',
)
fig.update_layout(title=dict(text=plot_title, xanchor='center', x=0.5))
fig.update_xaxes(title="Predicted IBD Metadata")
fig.show()

In [None]:
# Print the x and y values of every trace in the figure.
# `x` and `y` keep the values of the last trace after the loop ends.
for data in fig.data:
    x, y = data['x'], data['y']
    print(x, y)

In [None]:
# NOTE(review): `x` is whatever the trace-printing loop left behind (the x values of
# the last trace). plotly documents the annotation `ax` property as a single number,
# so this call presumably fails validation - confirm which annotation was intended.
fig.add_annotation(ax=x)
fig.show()

In [None]:
# model = 'MLP'
# fig = px.bar(mlp_df, x='feature', y='f1', color='space', barmode='group', text='f1 rounded')
# fig.update_layout(title={'text': 'F1 Score for CNN 128 Embeddings with {}'.format(model), 'xanchor': 'center', 'x':0.5})
# fig.update_xaxes(title="Predicted IBD Metadata")
# fig.show()