In [None]:
# %%

from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config

import numpy as np
import pandas as pd

import helper_funs as helper

from sklearnex import patch_sklearn
patch_sklearn()

# --- Run configuration ------------------------------------------------------
label = 'signature'             # target column
time_limit = 10 * 60            # max training time (seconds)
infer_limit = 1 / 2500          # prediction seconds per row
finalModel = 'CatBoost_BAG_L2'  # model name passed as `model=` in the later prediction / importance cells
seed_value = 2024               # shared seed for the test split and helper.set_global_seed
importance_threshold = 1e-6     # features with importance below this are listed as low importance

# --- Data -------------------------------------------------------------------
data_url = 'https://raw.githubusercontent.com/mli/ag-docs/main/knot_theory/'

train_data = TabularDataset(data_url + 'train.csv')
test_data = TabularDataset(data_url + 'test.csv')

# Split the test set in two: a random half is used while pruning features,
# and the rows that were not sampled are kept for the final evaluation.
pruning_test_data = test_data.sample(frac=0.5, random_state=seed_value)
final_test_data = test_data.drop(index=pruning_test_data.index)

train_data.head()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Unnamed: 0.1,Unnamed: 0,chern_simons,cusp_volume,hyperbolic_adjoint_torsion_degree,hyperbolic_torsion_degree,injectivity_radius,longitudinal_translation,meridinal_translation_imag,meridinal_translation_real,short_geodesic_imag_part,short_geodesic_real_part,Symmetry_0,Symmetry_D3,Symmetry_D4,Symmetry_D6,Symmetry_D8,Symmetry_Z/2 + Z/2,volume,signature
0,70746,0.09053,12.226322,0,10,0.507756,10.685555,1.144192,-0.519157,-2.760601,1.015512,0.0,0.0,0.0,0.0,0.0,1.0,11.393225,-2
1,240827,0.232453,13.800773,0,14,0.413645,10.453156,1.320249,-0.158522,-3.013258,0.827289,0.0,0.0,0.0,0.0,0.0,1.0,12.742782,0
2,155659,-0.144099,14.76103,0,14,0.436928,13.405199,1.101142,0.768894,2.233106,0.873856,0.0,0.0,0.0,0.0,0.0,0.0,15.236505,2
3,239963,-0.171668,13.738019,0,22,0.249481,27.819496,0.493827,-1.188718,-2.042771,0.498961,0.0,0.0,0.0,0.0,0.0,0.0,17.27989,-8
4,90504,0.235188,15.896359,0,10,0.389329,15.330971,1.036879,0.722828,-3.056138,0.778658,0.0,0.0,0.0,0.0,0.0,0.0,16.749298,4


In [None]:
# %%

# Detect features to prune: fit a first-pass predictor on all columns of
# `train_data`, then score every trained model on the pruning half of the test set.
helper.set_global_seed(seed_value)

pruning_predictor = TabularPredictor(
    label=label,
    problem_type='multiclass',
    eval_metric='log_loss',
    log_to_file=True,
)

# Explicit keyword arguments given alongside the "good_quality" preset.
pruning_fit_options = dict(
    presets="good_quality",
    dynamic_stacking=False,
    save_bag_folds=True,
    refit_full=False,
    set_best_to_refit_full=False,
    time_limit=time_limit,
)
pruning_predictor.fit(train_data, **pruning_fit_options)

# improves prediction time, consumes more memory
pruning_predictor.persist(models='all', max_memory=0.5)

pruning_leaders = pruning_predictor.leaderboard(pruning_test_data)
pruning_leaders

No path specified. Models will be saved in: "AutogluonModels\ag-20240827_071420"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.14
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          20
Memory Avail:       22.47 GB / 31.75 GB (70.8%)
Disk Space Avail:   754.27 GB / 952.44 GB (79.2%)
Presets specified: ['good_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "AutogluonModels\ag-20240827_071420"
Train Data Rows:    10000
Train Data Columns: 18
Label Column:       signature
Problem Type:       multiclass
Preprocessing data ...
Duplicated 2 samples from 2 rare classes in training set because eval_metric requires all classes have at least 2 samples.
Train Data Class Count: 13
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Avai

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-0.128677,-0.129257,log_loss,2.538114,5.634845,388.302915,0.000997,0.0,1.332026,3,True,20
1,LightGBMXT_BAG_L2,-0.131531,-0.145355,log_loss,1.814165,4.351843,347.40027,0.191883,0.474871,20.443135,2,True,12
2,ExtraTreesEntr_BAG_L2,-0.134237,-0.134954,log_loss,1.685683,4.277002,327.758272,0.063401,0.400031,0.801137,2,True,18
3,RandomForestEntr_BAG_L2,-0.134551,-0.139837,log_loss,1.687678,4.260287,329.075239,0.065396,0.383315,2.118104,2,True,15
4,ExtraTreesGini_BAG_L2,-0.137222,-0.136955,log_loss,1.687839,4.246156,327.838089,0.065557,0.369185,0.880953,2,True,17
5,XGBoost_BAG_L2,-0.13752,-0.149996,log_loss,1.783203,4.133575,342.236297,0.160921,0.256604,15.279162,2,True,19
6,RandomForestGini_BAG_L2,-0.137822,-0.140348,log_loss,1.741264,4.24455,328.800148,0.118983,0.367579,1.843013,2,True,14
7,NeuralNetFastAI_BAG_L2,-0.138582,-0.152474,log_loss,2.055516,4.120023,348.329352,0.433234,0.243052,21.372216,2,True,11
8,WeightedEnsemble_L2,-0.143905,-0.141226,log_loss,0.745884,1.307647,312.70287,0.0,0.0,0.67343,2,True,10
9,CatBoost_BAG_L1,-0.154606,-0.161188,log_loss,0.079497,0.136322,277.441824,0.079497,0.136322,277.441824,1,True,6


In [None]:
# %%

# Use a single shuffle set when the evaluation frame has more than 10000 rows,
# otherwise 30 shuffle sets.
test_rows = len(pruning_test_data)
num_sets = 1 if test_rows > 10000 else 30

df_pruning_features = pruning_predictor.feature_importance(
    pruning_test_data,
    num_shuffle_sets=num_sets,
)

# Collect the names (index labels) of features whose importance is below the threshold.
is_low_importance = df_pruning_features['importance'] < importance_threshold
low_importance_features = df_pruning_features.loc[is_low_importance].index.to_list()
low_importance_features

These features in provided data are not utilized by the predictor and will be ignored: ['Symmetry_D8']
Computing feature importance via permutation shuffling for 17 features using 2500 rows with 30 shuffle sets...
	1406.17s	= Expected runtime (46.87s per shuffle set)
2024-08-27 00:28:03,729	ERROR worker.py:406 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2024-08-27 00:28:05,558	ERROR worker.py:406 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2024-08-27 00:28:06,135	ERROR worker.py:406 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
	971.05s	= Actual runtime (Completed 30 of 30 shuff

['Symmetry_D3', 'Unnamed: 0']

In [None]:
# %%

# Training frame for the final run: `train_data` without the columns flagged as low importance.
final_train_data = train_data.drop(labels=low_importance_features, axis='columns')

In [None]:
# %%

# Start from a stock hyperparameter config.
# Available names: ['default', 'zeroshot', 'light', 'very_light', 'toy', 'multimodal']
custom_hyperparameters = get_hyperparameter_config('zeroshot')

# Four linear-model variants: (name suffix, penalty, extra parameters).
# NOTE(review): scikit-learn's LogisticRegression documents `l1_ratio` as the mixing
# parameter for penalty='elasticnet'; none is set here — confirm how AutoGluon's LR
# model handles the 'ElasticNet' variant.
lr_variants = [
    ('Base', None, {}),
    ('Ridge', 'l2', {'C': 0.1}),
    ('Lasso', 'l1', {'C': 0.1}),
    ('ElasticNet', 'elasticnet', {'C': 0.1}),
]

custom_hyperparameters['LR'] = [
    {
        'multi_class': 'multinomial',
        'penalty': penalty,
        'tol': 1e-6,
        'max_iter': 10000,
        **extra_params,
        'ag_args': {'name_suffix': suffix},
    }
    for suffix, penalty, extra_params in lr_variants
]

# Fit options bundled as a preset dict; the final run gets twice the pruning time budget.
custom_preset = dict(
    auto_stack=True,
    dynamic_stacking=False,
    hyperparameters=custom_hyperparameters,
    refit_full=False,
    set_best_to_refit_full=False,
    save_bag_folds=True,
    time_limit=time_limit * 2,
    infer_limit=infer_limit,
)

In [None]:
# %%

# Call the seeding helper again with the shared seed before the final training run.
helper.set_global_seed(seed_value)

predictor = TabularPredictor(
    label=label,
    problem_type='multiclass',
    eval_metric='log_loss',
    log_to_file=True,
)

# Every fit option (hyperparameters, time and inference limits, ...) is carried by the preset dict.
predictor.fit(
    final_train_data,
    presets=custom_preset,
)

No path specified. Models will be saved in: "AutogluonModels\ag-20240827_074105"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.14
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          20
Memory Avail:       18.96 GB / 31.75 GB (59.7%)
Disk Space Avail:   753.23 GB / 952.44 GB (79.1%)
Presets specified: [{'auto_stack': True, 'dynamic_stacking': False, 'hyperparameters': {'NN_TORCH': [{}, {'activation': 'elu', 'dropout_prob': 0.10077639529843717, 'hidden_size': 108, 'learning_rate': 0.002735937344002146, 'num_layers': 4, 'use_batchnorm': True, 'weight_decay': 1.356433327634438e-12, 'ag_args': {'name_suffix': '_r79', 'priority': -2}}, {'activation': 'elu', 'dropout_prob': 0.11897478034205347, 'hidden_size': 213, 'learning_rate': 0.0010474382260641949, 'num_layers': 4, 'use_batchnorm': False, 'weight_decay': 5.594471067786272e-10, 'ag_args': {'name_suffix': '_r22', 'priority': -7}}, {'activation': 'elu'

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x266d65bfcd0>

In [None]:
# %%

# improves prediction time, consumes more memory
predictor.persist(models='all', max_memory=0.5)

# Score the trained models on the half of the test set that was not used for pruning.
df_leaders = predictor.leaderboard(final_test_data)

# First 40 leaderboard rows.
df_leaders.head(n=40)

2024-08-27 01:01:16,258	ERROR worker.py:406 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2024-08-27 01:01:17,374	ERROR worker.py:406 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2024-08-27 01:01:18,957	ERROR worker.py:406 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2024-08-27 01:01:21,277	ERROR worker.py:406 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
Persisting 30 models in memory. Models will require 8.52% of memory.


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-0.122544,-0.121375,log_loss,2.432668,6.206836,754.113328,0.003152,0.00299,0.429714,3,True,30
1,RandomForestEntr_BAG_L2,-0.123744,-0.128786,log_loss,1.832968,4.566584,726.644917,0.062187,0.582266,2.586139,2,True,25
2,NeuralNetFastAI_BAG_L2,-0.126413,-0.141157,log_loss,2.228153,4.2688,747.751715,0.457372,0.284482,23.692937,2,True,21
3,LightGBMXT_BAG_L2,-0.130706,-0.150571,log_loss,1.951755,4.403926,757.980687,0.180974,0.419608,33.921909,2,True,22
4,WeightedEnsemble_L2,-0.131103,-0.134926,log_loss,0.970292,1.49028,666.402033,0.003987,0.002985,1.3384,2,True,20
5,XGBoost_BAG_L2,-0.131895,-0.144149,log_loss,2.009077,4.330472,761.866662,0.238296,0.346154,37.807884,2,True,29
6,ExtraTreesEntr_BAG_L2,-0.132201,-0.129777,log_loss,1.818408,4.581914,724.989782,0.047627,0.597596,0.931004,2,True,28
7,ExtraTreesGini_BAG_L2,-0.135271,-0.136412,log_loss,1.827584,4.602322,725.133937,0.056803,0.618004,1.075159,2,True,27
8,LightGBM_BAG_L2,-0.135654,-0.158192,log_loss,1.92512,4.323624,782.339352,0.154339,0.339306,58.280574,2,True,23
9,RandomForestGini_BAG_L2,-0.136945,-0.132346,log_loss,1.862329,4.739501,726.473534,0.091548,0.755184,2.414756,2,True,24


In [None]:
# %%

# Last 20 leaderboard rows.
df_leaders.tail(n=20)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
10,CatBoost_BAG_L2,-0.141043,-0.148371,log_loss,1.824476,4.068731,925.658645,0.053695,0.084413,201.599867,2,True,26
11,NeuralNetTorch_BAG_L1,-0.142321,-0.160065,log_loss,0.119746,0.064471,166.635675,0.119746,0.064471,166.635675,1,True,16
12,NeuralNetFastAI_BAG_L1,-0.144109,-0.154927,log_loss,0.314326,0.178922,19.34765,0.314326,0.178922,19.34765,1,True,3
13,CatBoost_r177_BAG_L1,-0.150365,-0.157574,log_loss,0.074319,0.082093,141.220167,0.074319,0.082093,141.220167,1,True,18
14,CatBoost_BAG_L1,-0.150895,-0.158194,log_loss,0.09276,0.093734,316.843327,0.09276,0.093734,316.843327,1,True,8
15,XGBoost_BAG_L1,-0.155557,-0.170709,log_loss,0.147746,0.500662,11.537262,0.147746,0.500662,11.537262,1,True,11
16,LightGBM_BAG_L1,-0.163646,-0.175614,log_loss,0.217408,0.567412,9.479552,0.217408,0.567412,9.479552,1,True,5
17,LightGBM_r131_BAG_L1,-0.169139,-0.176062,log_loss,0.89639,10.728252,22.965112,0.89639,10.728252,22.965112,1,True,19
18,LightGBMXT_BAG_L1,-0.169733,-0.179032,log_loss,0.629535,3.541508,17.095763,0.629535,3.541508,17.095763,1,True,4
19,RandomForestEntr_BAG_L1,-0.191107,-0.192703,log_loss,0.062251,0.289208,1.333576,0.062251,0.289208,1.333576,1,True,7


In [None]:
# %%

# Per-class probabilities from the chosen model, predicted from the frame without the label column.
final_test_features = final_test_data.drop(columns=[label])
y_pred = predictor.predict_proba(final_test_features, model=finalModel)

# True label next to the predicted probabilities, first 10 rows.
labels_and_probabilities = pd.concat([final_test_data[label], y_pred], axis=1)
labels_and_probabilities.head(10)

Unnamed: 0,signature,-12,-10,-8,-6,-4,-2,0,2,4,6,8,10,12
2,0,0.000127,0.000133,0.000154,0.000208,0.000567,0.004446,0.989641,0.003719,0.000423,0.000189,0.000136,0.00013,0.000127
3,4,0.000227,0.00024,0.000289,0.000443,0.000549,0.000668,0.000869,0.004807,0.98203,0.008778,0.000575,0.000291,0.000234
4,2,0.000149,0.000154,0.000167,0.000215,0.000477,0.001153,0.008403,0.985989,0.002539,0.000283,0.000168,0.000152,0.000149
7,2,0.000139,0.000145,0.000164,0.000207,0.000456,0.001049,0.006168,0.988799,0.002166,0.00027,0.000157,0.000141,0.00014
10,-4,0.000175,0.000197,0.000371,0.003097,0.985888,0.006988,0.001138,0.000745,0.000476,0.000345,0.000222,0.000185,0.000174
11,2,0.000149,0.000158,0.00018,0.000244,0.000481,0.000837,0.003478,0.987682,0.005845,0.00045,0.000187,0.000157,0.000152
12,2,0.000149,0.000155,0.000171,0.000228,0.000467,0.00081,0.003901,0.987541,0.005668,0.000417,0.000181,0.000158,0.000153
14,0,0.000128,0.000132,0.000148,0.000187,0.000422,0.00295,0.989933,0.005052,0.000473,0.000184,0.000134,0.00013,0.000128
15,-2,0.000186,0.000197,0.000239,0.000415,0.007437,0.98235,0.007011,0.00092,0.00043,0.00025,0.000191,0.000188,0.000186
18,0,0.000124,0.000129,0.000149,0.000193,0.00049,0.003925,0.990178,0.003859,0.000397,0.000175,0.00013,0.000127,0.000125


In [None]:
# %%

# Feature importance of the chosen model on the final half of the test set.
df_important_features = predictor.feature_importance(
    data=final_test_data,
    model=finalModel,
)

# Show the feature names as a regular column instead of the index.
df_important_features.reset_index()

These features in provided data are not utilized by the predictor and will be ignored: ['Unnamed: 0', 'Symmetry_D3', 'Symmetry_D8']
Computing feature importance via permutation shuffling for 15 features using 2500 rows with 5 shuffle sets...
	144.3s	= Expected runtime (28.86s per shuffle set)
	90.79s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,index,importance,stddev,p_value,n,p99_high,p99_low
0,meridinal_translation_real,2.502541,0.023866,9.925371e-10,5,2.551682,2.4534
1,longitudinal_translation,0.918611,0.010663,2.178014e-09,5,0.940566,0.896656
2,meridinal_translation_imag,0.790986,0.013335,9.69086e-09,5,0.818443,0.763528
3,short_geodesic_imag_part,0.150811,0.006068,3.138315e-07,5,0.163305,0.138317
4,cusp_volume,0.077883,0.002976,2.55315e-07,5,0.08401,0.071755
5,volume,0.070419,0.005829,5.581663e-06,5,0.082421,0.058418
6,hyperbolic_torsion_degree,0.058907,0.004085,2.758517e-06,5,0.067319,0.050495
7,injectivity_radius,0.039335,0.001707,4.240722e-07,5,0.042848,0.035821
8,short_geodesic_real_part,0.020886,0.001461,2.858354e-06,5,0.023895,0.017877
9,chern_simons,0.008967,0.000969,1.610202e-05,5,0.010962,0.006972


In [None]:
# %%

# Names of features that still fall below the importance threshold after pruning.
final_low_importance_features = (
    df_important_features
    .loc[lambda frame: frame['importance'] < importance_threshold]
    .index
    .to_list()
)

final_low_importance_features

[]