In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import numpy as np

In [2]:
# Load the train and test splits; index_col=0 makes the first CSV column the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../results/data/ml/train_data.csv',
        '../../results/data/ml/test_data.csv',
    )
)

In [3]:
# Number of training samples per diagnostic group.
train_df['group'].value_counts()

group
HCC    226
LC     104
CHB    102
HC      70
Name: count, dtype: int64

In [4]:
# Wrap both splits for AutoGluon.
train_data = TabularDataset(train_df)
test_data = TabularDataset(test_df)

# One-vs-rest target: True where the sample's group is HCC, False for every other group.
train_data["group"] = train_data["group"] == "HCC"
test_data["group"] = test_data["group"] == "HCC"

In [5]:
# Clinical feature column names.
clinical_features = [
    'AST', 'ALT', 'GGT', 'ALB', 'TBIL',
    'TP', 'AFP', 'child_pugh', 'AAR', 'ALBI_score',
]

# Glycan feature column names; these are selected together with the 'group'
# label column to build the glycan-only model inputs.
glycan_features = [
    'H3N4F1', 'H3N5F1',
    'H4N3S1', 'H4N4', 'H4N4S1', 'H4N4F1', 'H4N5F1',
    'H5N2', 'H5N4', 'H5N4S1', 'H5N4S2', 'H5N4F1', 'H5N4F1S1', 'H5N4F1S2',
    'H5N5S1', 'H5N5F1', 'H5N5F1S1', 'H5N5F1S2',
    'H6N2', 'H6N5S2', 'H6N5S3', 'H6N5F1S2', 'H6N5F1S3',
    'H7N2', 'H8N2', 'H9N2',
]

In [6]:
# Glycan-only model: keep the binary 'group' label plus the glycan columns.
glycan_train_data = train_data.loc[:, ['group'] + glycan_features]
glycan_test_data = test_data.loc[:, ['group'] + glycan_features]

# Fit AutoGluon on the training split, optimising ROC AUC,
# with a 300-second time limit and the 'best_quality' preset.
glycan_predictor = (
    TabularPredictor(label='group', eval_metric='roc_auc')
    .fit(train_data=glycan_train_data, presets='best_quality', time_limit=300)
)

No path specified. Models will be saved in: "AutogluonModels/ag-20241209_014559"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.8
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.1.0: Thu Oct 10 21:02:26 PDT 2024; root:xnu-11215.41.3~2/RELEASE_ARM64_T8122
CPU Count:          8
Memory Avail:       5.03 GB / 16.00 GB (31.4%)
Disk Space Avail:   26.35 GB / 460.43 GB (5.7%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on

{'roc_auc': 0.8665757859759734,
 'accuracy': 0.7935779816513762,
 'balanced_accuracy': 0.7884893925193832,
 'mcc': 0.5805761764934408,
 'f1': 0.7619047619047619,
 'precision': 0.782608695652174,
 'recall': 0.7422680412371134}

In [7]:
# Score the glycan-only HCC-vs-rest model on the test split.
glycan_predictor.evaluate(data=glycan_test_data)

{'roc_auc': 0.8665757859759734,
 'accuracy': 0.7935779816513762,
 'balanced_accuracy': 0.7884893925193832,
 'mcc': 0.5805761764934408,
 'f1': 0.7619047619047619,
 'precision': 0.782608695652174,
 'recall': 0.7422680412371134}

In [8]:
# Train three models to predict HCC vs HC, HCC vs CHB, HCC vs LC
# using only glycans.

# HCC vs HC: one of the three pairwise glycan-only models
# (HCC vs HC, HCC vs CHB, HCC vs LC).

# Keep only the HCC and HC samples from each split.
HCC_HC_train_df = train_df[train_df['group'].isin(['HCC', 'HC'])]
HCC_HC_test_df = test_df[test_df['group'].isin(['HCC', 'HC'])]

# Training inputs: binary target (True = HCC, False = HC) plus the glycan columns.
HCC_HC_train_data = TabularDataset(HCC_HC_train_df)
HCC_HC_train_data['group'] = HCC_HC_train_data['group'] == 'HCC'
HCC_HC_train_data = HCC_HC_train_data[['group'] + glycan_features]

# Test inputs, prepared the same way.
HCC_HC_test_data = TabularDataset(HCC_HC_test_df)
HCC_HC_test_data['group'] = HCC_HC_test_data['group'] == 'HCC'
HCC_HC_test_data = HCC_HC_test_data[['group'] + glycan_features]

# Fit AutoGluon, optimising ROC AUC, with a 300-second time limit.
HCC_HC_predictor = (
    TabularPredictor(label='group', eval_metric='roc_auc')
    .fit(train_data=HCC_HC_train_data, presets='best_quality', time_limit=300)
)

No path specified. Models will be saved in: "AutogluonModels/ag-20241209_015153"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.8
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.1.0: Thu Oct 10 21:02:26 PDT 2024; root:xnu-11215.41.3~2/RELEASE_ARM64_T8122
CPU Count:          8
Memory Avail:       4.87 GB / 16.00 GB (30.4%)
Disk Space Avail:   26.15 GB / 460.43 GB (5.7%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on

In [9]:
# Score the HCC-vs-HC model on the held-out HCC/HC test samples.
HCC_HC_predictor.evaluate(data=HCC_HC_test_data)

{'roc_auc': 0.9334885267708679,
 'accuracy': 0.84375,
 'balanced_accuracy': 0.7871632856667775,
 'mcc': 0.574326571333555,
 'f1': 0.8969072164948454,
 'precision': 0.8969072164948454,
 'recall': 0.8969072164948454}

In [10]:
# Train three models to predict HCC vs HC, HCC vs CHB, HCC vs LC
# using only glycans.

# HCC vs CHB: one of the three pairwise glycan-only models
# (HCC vs HC, HCC vs CHB, HCC vs LC).

# Keep only the HCC and CHB samples from each split.
HCC_CHB_train_df = train_df[train_df['group'].isin(['HCC', 'CHB'])]
HCC_CHB_test_df = test_df[test_df['group'].isin(['HCC', 'CHB'])]

# Training inputs: binary target (True = HCC, False = CHB) plus the glycan columns.
HCC_CHB_train_data = TabularDataset(HCC_CHB_train_df)
HCC_CHB_train_data['group'] = HCC_CHB_train_data['group'] == 'HCC'
HCC_CHB_train_data = HCC_CHB_train_data[['group'] + glycan_features]

# Test inputs, prepared the same way.
HCC_CHB_test_data = TabularDataset(HCC_CHB_test_df)
HCC_CHB_test_data['group'] = HCC_CHB_test_data['group'] == 'HCC'
HCC_CHB_test_data = HCC_CHB_test_data[['group'] + glycan_features]

# Fit AutoGluon, optimising ROC AUC, with a 300-second time limit.
HCC_CHB_predictor = (
    TabularPredictor(label='group', eval_metric='roc_auc')
    .fit(train_data=HCC_CHB_train_data, presets='best_quality', time_limit=300)
)

No path specified. Models will be saved in: "AutogluonModels/ag-20241209_015808"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.8
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.1.0: Thu Oct 10 21:02:26 PDT 2024; root:xnu-11215.41.3~2/RELEASE_ARM64_T8122
CPU Count:          8
Memory Avail:       4.95 GB / 16.00 GB (30.9%)
Disk Space Avail:   25.94 GB / 460.43 GB (5.6%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on

In [11]:
# Score the HCC-vs-CHB model on the held-out HCC/CHB test samples.
HCC_CHB_predictor.evaluate(data=HCC_CHB_test_data)

{'roc_auc': 0.8969072164948453,
 'accuracy': 0.8661971830985915,
 'balanced_accuracy': 0.8603665521191295,
 'mcc': 0.7020817408643542,
 'f1': 0.8994708994708994,
 'precision': 0.9239130434782609,
 'recall': 0.8762886597938144}

In [12]:
# Train three models to predict HCC vs HC, HCC vs CHB, HCC vs LC
# using only glycans.

# HCC vs LC: one of the three pairwise glycan-only models
# (HCC vs HC, HCC vs CHB, HCC vs LC).

# Keep only the HCC and LC samples from each split.
HCC_LC_train_df = train_df[train_df['group'].isin(['HCC', 'LC'])]
HCC_LC_test_df = test_df[test_df['group'].isin(['HCC', 'LC'])]

# Training inputs: binary target (True = HCC, False = LC) plus the glycan columns.
HCC_LC_train_data = TabularDataset(HCC_LC_train_df)
HCC_LC_train_data['group'] = HCC_LC_train_data['group'] == 'HCC'
HCC_LC_train_data = HCC_LC_train_data[['group'] + glycan_features]

# Test inputs, prepared the same way.
HCC_LC_test_data = TabularDataset(HCC_LC_test_df)
HCC_LC_test_data['group'] = HCC_LC_test_data['group'] == 'HCC'
HCC_LC_test_data = HCC_LC_test_data[['group'] + glycan_features]

# Fit AutoGluon, optimising ROC AUC, with a 300-second time limit.
HCC_LC_predictor = (
    TabularPredictor(label='group', eval_metric='roc_auc')
    .fit(train_data=HCC_LC_train_data, presets='best_quality', time_limit=300)
)

No path specified. Models will be saved in: "AutogluonModels/ag-20241209_020344"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.8
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.1.0: Thu Oct 10 21:02:26 PDT 2024; root:xnu-11215.41.3~2/RELEASE_ARM64_T8122
CPU Count:          8
Memory Avail:       5.00 GB / 16.00 GB (31.3%)
Disk Space Avail:   25.75 GB / 460.43 GB (5.6%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on

In [13]:
# Score the HCC-vs-LC model on the held-out HCC/LC test samples.
HCC_LC_predictor.evaluate(data=HCC_LC_test_data)

{'roc_auc': 0.838717067583047,
 'accuracy': 0.7746478873239436,
 'balanced_accuracy': 0.6742268041237114,
 'mcc': 0.44004763507398714,
 'f1': 0.8518518518518519,
 'precision': 0.773109243697479,
 'recall': 0.9484536082474226}