In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import numpy as np

In [2]:
# Read the pre-split train and test tables; the first CSV column becomes the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../results/data/ml/train_data.csv',
        '../../results/data/ml/test_data.csv',
    )
)

In [14]:
# Number of training rows per value of the `group` column.
train_df['group'].value_counts()

group
HCC    225
CHB    102
LC     102
HC      70
Name: count, dtype: int64

In [3]:
# Collapse the multi-class `group` label into a binary target:
# True for HCC rows, False for every other group.
#
# The label is rewritten with a plain Series assignment (the same form the
# pairwise cells further down use) instead of a DataFrame-to-DataFrame
# assignment through a one-element list key.
#
# NOTE(review): later cells still filter `train_df` / `test_df` on the original
# string labels, so this relies on TabularDataset(...) handing back a new frame
# rather than the input object itself — confirm for the installed AutoGluon version.
train_data = TabularDataset(train_df)
train_data['group'] = train_data['group'] == 'HCC'
test_data = TabularDataset(test_df)
test_data['group'] = test_data['group'] == 'HCC'

In [4]:
# Binary HCC-vs-rest model trained on every column of `train_data`,
# with ROC AUC as the evaluation metric and a time_limit of 120.
predictor = TabularPredictor(
    label='group',
    eval_metric='roc_auc',
).fit(
    train_data,
    time_limit=120,
    presets='best_quality',
)

No path specified. Models will be saved in: "AutogluonModels/ag-20241203_065008"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.8
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.1.0: Thu Oct 10 21:02:26 PDT 2024; root:xnu-11215.41.3~2/RELEASE_ARM64_T8122
CPU Count:          8
Memory Avail:       5.45 GB / 16.00 GB (34.1%)
Disk Space Avail:   31.42 GB / 460.43 GB (6.8%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on

In [5]:
# Score the all-column model on the test split.
predictor.evaluate(data=test_data)

{'roc_auc': 0.9562920678197153,
 'accuracy': 0.8899082568807339,
 'balanced_accuracy': 0.8824231064156087,
 'mcc': 0.7793228131593767,
 'f1': 0.8681318681318682,
 'precision': 0.9294117647058824,
 'recall': 0.8144329896907216}

In [6]:
# Clinical covariates, listed by hand.
clinical_features = [
    'AST', 'ALT', 'GGT', 'ALB', 'TBIL',
    'TP', 'AFP', 'child_pugh', 'AAR', 'ALBI_score',
]

# Every remaining column of the training table, in its original order, is
# treated as a glycan feature — except the label, which is dropped afterwards.
is_clinical = train_df.columns.isin(clinical_features)
glycan_features = train_df.columns[~is_clinical].tolist()
glycan_features.remove('group')

In [5]:
# Same binary target, but only the clinical covariates as predictors.
clinical_columns = ['group', *clinical_features]
clinical_train_data = train_data[clinical_columns]
clinical_test_data = test_data[clinical_columns]

clinical_predictor = TabularPredictor(
    label='group',
    eval_metric='roc_auc',
).fit(
    clinical_train_data,
    time_limit=120,
    presets='best_quality',
)
# Last expression of the cell: test-split metrics for the clinical-only model.
clinical_predictor.evaluate(data=clinical_test_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20241203_062808"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.8
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.1.0: Thu Oct 10 21:02:26 PDT 2024; root:xnu-11215.41.3~2/RELEASE_ARM64_T8122
CPU Count:          8
Memory Avail:       5.80 GB / 16.00 GB (36.2%)
Disk Space Avail:   27.92 GB / 460.43 GB (6.1%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on

{'roc_auc': 0.9520320354434694,
 'accuracy': 0.8990825688073395,
 'balanced_accuracy': 0.8957996080770214,
 'mcc': 0.7954033705719252,
 'f1': 0.8842105263157894,
 'precision': 0.9032258064516129,
 'recall': 0.865979381443299}

In [6]:
# Same binary target, but only the glycan columns as predictors.
glycan_columns = ['group', *glycan_features]
glycan_train_data = train_data[glycan_columns]
glycan_test_data = test_data[glycan_columns]

glycan_predictor = TabularPredictor(
    label='group',
    eval_metric='roc_auc',
).fit(
    glycan_train_data,
    time_limit=120,
    presets='best_quality',
)
# Last expression of the cell: test-split metrics for the glycan-only model.
glycan_predictor.evaluate(data=glycan_test_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20241203_063025"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.8
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.1.0: Thu Oct 10 21:02:26 PDT 2024; root:xnu-11215.41.3~2/RELEASE_ARM64_T8122
CPU Count:          8
Memory Avail:       5.87 GB / 16.00 GB (36.7%)
Disk Space Avail:   27.81 GB / 460.43 GB (6.0%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on

{'roc_auc': 0.880122688932436,
 'accuracy': 0.7706422018348624,
 'balanced_accuracy': 0.7657834199539917,
 'mcc': 0.5341213678827081,
 'f1': 0.7368421052631579,
 'precision': 0.7526881720430108,
 'recall': 0.7216494845360825}

In [8]:
# HCC vs HC using only glycan features
# (one of three pairwise models: HCC vs HC, HCC vs CHB, HCC vs LC).

# Keep only the rows of the two groups being compared.
HCC_HC_train_df = train_df[train_df['group'].isin(['HCC', 'HC'])]
HCC_HC_test_df = test_df[test_df['group'].isin(['HCC', 'HC'])]

# Binarise the label (True = HCC), then keep the label plus the glycan columns.
HCC_HC_train_data = TabularDataset(HCC_HC_train_df)
HCC_HC_train_data['group'] = HCC_HC_train_data['group'] == 'HCC'
HCC_HC_train_data = HCC_HC_train_data[['group', *glycan_features]]

HCC_HC_test_data = TabularDataset(HCC_HC_test_df)
HCC_HC_test_data['group'] = HCC_HC_test_data['group'] == 'HCC'
HCC_HC_test_data = HCC_HC_test_data[['group', *glycan_features]]

HCC_HC_predictor = TabularPredictor(
    label='group',
    eval_metric='roc_auc',
).fit(
    HCC_HC_train_data,
    time_limit=120,
    presets='best_quality',
)
HCC_HC_predictor.evaluate(data=HCC_HC_test_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20241203_065447"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.8
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.1.0: Thu Oct 10 21:02:26 PDT 2024; root:xnu-11215.41.3~2/RELEASE_ARM64_T8122
CPU Count:          8
Memory Avail:       5.19 GB / 16.00 GB (32.5%)
Disk Space Avail:   31.30 GB / 460.43 GB (6.8%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on

{'roc_auc': 0.9527768540073163,
 'accuracy': 0.921875,
 'balanced_accuracy': 0.8935816428333887,
 'mcc': 0.7871632856667775,
 'f1': 0.9484536082474226,
 'precision': 0.9484536082474226,
 'recall': 0.9484536082474226}

In [9]:
# HCC vs CHB using only glycan features
# (one of three pairwise models: HCC vs HC, HCC vs CHB, HCC vs LC).

# Keep only the rows of the two groups being compared.
HCC_CHB_train_df = train_df[train_df['group'].isin(['HCC', 'CHB'])]
HCC_CHB_test_df = test_df[test_df['group'].isin(['HCC', 'CHB'])]

# Binarise the label (True = HCC), then keep the label plus the glycan columns.
HCC_CHB_train_data = TabularDataset(HCC_CHB_train_df)
HCC_CHB_train_data['group'] = HCC_CHB_train_data['group'] == 'HCC'
HCC_CHB_train_data = HCC_CHB_train_data[['group', *glycan_features]]

HCC_CHB_test_data = TabularDataset(HCC_CHB_test_df)
HCC_CHB_test_data['group'] = HCC_CHB_test_data['group'] == 'HCC'
HCC_CHB_test_data = HCC_CHB_test_data[['group', *glycan_features]]

HCC_CHB_predictor = TabularPredictor(
    label='group',
    eval_metric='roc_auc',
).fit(
    HCC_CHB_train_data,
    time_limit=120,
    presets='best_quality',
)
HCC_CHB_predictor.evaluate(data=HCC_CHB_test_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20241203_065746"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.8
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.1.0: Thu Oct 10 21:02:26 PDT 2024; root:xnu-11215.41.3~2/RELEASE_ARM64_T8122
CPU Count:          8
Memory Avail:       5.08 GB / 16.00 GB (31.7%)
Disk Space Avail:   31.21 GB / 460.43 GB (6.8%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on

{'roc_auc': 0.9040091638029782,
 'accuracy': 0.8169014084507042,
 'balanced_accuracy': 0.7766323024054982,
 'mcc': 0.5680314683204689,
 'f1': 0.8686868686868687,
 'precision': 0.8514851485148515,
 'recall': 0.8865979381443299}

In [10]:
# HCC vs LC using only glycan features
# (one of three pairwise models: HCC vs HC, HCC vs CHB, HCC vs LC).

# Keep only the rows of the two groups being compared.
HCC_LC_train_df = train_df[train_df['group'].isin(['HCC', 'LC'])]
HCC_LC_test_df = test_df[test_df['group'].isin(['HCC', 'LC'])]

# Binarise the label (True = HCC), then keep the label plus the glycan columns.
HCC_LC_train_data = TabularDataset(HCC_LC_train_df)
HCC_LC_train_data['group'] = HCC_LC_train_data['group'] == 'HCC'
HCC_LC_train_data = HCC_LC_train_data[['group', *glycan_features]]

HCC_LC_test_data = TabularDataset(HCC_LC_test_df)
HCC_LC_test_data['group'] = HCC_LC_test_data['group'] == 'HCC'
HCC_LC_test_data = HCC_LC_test_data[['group', *glycan_features]]

HCC_LC_predictor = TabularPredictor(
    label='group',
    eval_metric='roc_auc',
).fit(
    HCC_LC_train_data,
    time_limit=120,
    presets='best_quality',
)
HCC_LC_predictor.evaluate(data=HCC_LC_test_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20241203_070019"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.8
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.1.0: Thu Oct 10 21:02:26 PDT 2024; root:xnu-11215.41.3~2/RELEASE_ARM64_T8122
CPU Count:          8
Memory Avail:       5.01 GB / 16.00 GB (31.3%)
Disk Space Avail:   31.15 GB / 460.43 GB (6.8%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on

{'roc_auc': 0.8258877434135166,
 'accuracy': 0.7746478873239436,
 'balanced_accuracy': 0.668270332187858,
 'mcc': 0.4410901292841179,
 'f1': 0.8532110091743119,
 'precision': 0.768595041322314,
 'recall': 0.9587628865979382}