In [1]:
import os

from autogluon.tabular import TabularDataset, TabularPredictor

In [2]:
train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')

In [3]:
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,178478,Bachelors,13,Never-married,Tech-support,Own-child,White,Female,0,0,40,United-States,<=50K
1,23,State-gov,61743,5th-6th,3,Never-married,Transport-moving,Not-in-family,White,Male,0,0,35,United-States,<=50K
2,46,Private,376789,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,15,United-States,<=50K
3,55,?,200235,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,50,United-States,>50K
4,36,Private,224541,7th-8th,4,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,El-Salvador,<=50K


In [4]:
print(train_data.shape)

(39073, 15)


In [5]:
SAMPLE_SIZE = 1000  # subsample subset of data for faster demo, try setting this to much larger values
SEED = 1234

train_data = train_data.sample(n=SAMPLE_SIZE, random_state=SEED)
print(train_data.shape)

(1000, 15)


In [6]:
print(*list(train_data.columns), sep='\n')

age
workclass
fnlwgt
education
education-num
marital-status
occupation
relationship
race
sex
capital-gain
capital-loss
hours-per-week
native-country
class


In [7]:
label = 'class'
print("Summary of class variable: \n", train_data[label].describe())

Summary of class variable: 
 count       1000
unique         2
top        <=50K
freq         746
Name: class, dtype: object


In [8]:
train_data[label].unique()

array([' <=50K', ' >50K'], dtype=object)

In [9]:
PATH_MODEL = os.path.join('models', 'Inc')

In [10]:
predictor = TabularPredictor(label=label, path=PATH_MODEL).fit(train_data)

Beginning AutoGluon training ...
AutoGluon will save models to "models/Inc/"
AutoGluon Version:  0.8.2
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #167~18.04.1-Ubuntu SMP Wed May 24 00:51:42 UTC 2023
Disk Space Avail:   271.94 GB / 501.38 GB (54.2%)
Train Data Rows:    1000
Train Data Columns: 14
Label Column: class
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [' <=50K', ' >50K']
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 =  >50K, class 0 =  <=50K
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive ( >50K) vs negative ( <=50K) class.
	To explicitly set the positive_class, ei

In [11]:
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
y_test = test_data[label]  # values to predict
test_data_nolab = test_data.drop(columns=[label])  # delete label column to prove we're not cheating
test_data_nolab.head()

Loaded data from: https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv | Columns = 15 / 15 | Rows = 9769 -> 9769


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,31,Private,169085,11th,7,Married-civ-spouse,Sales,Wife,White,Female,0,0,20,United-States
1,17,Self-emp-not-inc,226203,12th,8,Never-married,Sales,Own-child,White,Male,0,0,45,United-States
2,47,Private,54260,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1887,60,United-States
3,21,Private,176262,Some-college,10,Never-married,Exec-managerial,Own-child,White,Female,0,0,30,United-States
4,17,Private,241185,12th,8,Never-married,Prof-specialty,Own-child,White,Male,0,0,20,United-States


## WARNING from the tutorial [URL](https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html)
`TabularPredictor.load()` uses pickle module implicitly, which is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Never load data that could have come from an untrusted source, or that could have been tampered with. **Only load data you trust.**

In [12]:
# unnecessary, just demonstrates how to load previously-trained predictor from file
predictor = TabularPredictor.load(PATH_MODEL) 

In [13]:
y_pred = predictor.predict(test_data_nolab)
print("Predictions:  \n", y_pred)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.8526973078104207
Evaluations on test data:
{
    "accuracy": 0.8526973078104207,
    "balanced_accuracy": 0.7469619749808614,
    "mcc": 0.5606941470674345,
    "f1": 0.6374401612496851,
    "precision": 0.7662023016353725,
    "recall": 0.5457290767903364
}


Predictions:  
 0        <=50K
1        <=50K
2         >50K
3        <=50K
4        <=50K
         ...  
9764     <=50K
9765     <=50K
9766     <=50K
9767     <=50K
9768     <=50K
Name: class, Length: 9769, dtype: object


In [14]:
predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT,0.852697,0.85,0.013949,0.004889,0.306255,0.013949,0.004889,0.306255,1,True,3
1,WeightedEnsemble_L2,0.852697,0.85,0.016309,0.005723,0.88345,0.00236,0.000834,0.577194,2,True,14
2,CatBoost,0.852185,0.85,0.01057,0.004397,0.909081,0.01057,0.004397,0.909081,1,True,7
3,RandomForestEntr,0.844918,0.83,0.087283,0.049466,0.43338,0.087283,0.049466,0.43338,1,True,6
4,XGBoost,0.844508,0.83,0.022909,0.008022,0.195282,0.022909,0.008022,0.195282,1,True,11
5,RandomForestGini,0.844201,0.82,0.109213,0.060169,0.459152,0.109213,0.060169,0.459152,1,True,5
6,ExtraTreesGini,0.838366,0.815,0.150601,0.083551,0.421542,0.150601,0.083551,0.421542,1,True,8
7,ExtraTreesEntr,0.835602,0.83,0.160435,0.047928,0.41572,0.160435,0.047928,0.41572,1,True,9
8,NeuralNetTorch,0.834886,0.845,0.043411,0.01228,2.327821,0.043411,0.01228,2.327821,1,True,12
9,LightGBMLarge,0.834272,0.85,0.086572,0.008196,0.967196,0.086572,0.008196,0.967196,1,True,13


In [22]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd

In [26]:
dataset_breast_cancer = load_breast_cancer(return_X_y=False, as_frame=True)
df_bc = pd.DataFrame(data=dataset_breast_cancer['data'], columns=dataset_breast_cancer.feature_names)
df_bc.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [29]:
df_bc['label'] = dataset_breast_cancer['target']

In [30]:
data_train, data_test = train_test_split(df_bc, test_size=0.2)

In [31]:
print(data_train.shape)

(455, 31)


In [32]:
LABEL = 'label'
PATH_MODEL_BC = os.path.join('models', 'BreastCancer')

In [33]:
predictor = TabularPredictor(label=LABEL, path=PATH_MODEL_BC).fit(data_train)

Beginning AutoGluon training ...
AutoGluon will save models to "models/BreastCancer/"
AutoGluon Version:  0.8.2
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #167~18.04.1-Ubuntu SMP Wed May 24 00:51:42 UTC 2023
Disk Space Avail:   271.59 GB / 501.38 GB (54.2%)
Train Data Rows:    455
Train Data Columns: 30
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11013.06 MB
	Train Data (Original)  Memory Usage: 0.11 MB (0.0% of 

In [35]:
y_test = data_test[LABEL]
X_test = data_test.drop(columns=[LABEL])
X_test.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
512,13.4,20.52,88.64,556.7,0.1106,0.1469,0.1445,0.08172,0.2116,0.07325,...,16.41,29.66,113.3,844.4,0.1574,0.3856,0.5106,0.2051,0.3585,0.1109
457,13.21,25.25,84.1,537.9,0.08791,0.05205,0.02772,0.02068,0.1619,0.05584,...,14.35,34.23,91.29,632.9,0.1289,0.1063,0.139,0.06005,0.2444,0.06788
439,14.02,15.66,89.59,606.5,0.07966,0.05581,0.02087,0.02652,0.1589,0.05586,...,14.91,19.31,96.53,688.9,0.1034,0.1017,0.0626,0.08216,0.2136,0.0671
298,14.26,18.17,91.22,633.1,0.06576,0.0522,0.02475,0.01374,0.1635,0.05586,...,16.22,25.26,105.8,819.7,0.09445,0.2167,0.1565,0.0753,0.2636,0.07676
37,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,0.1467,0.05863,...,13.3,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169


In [36]:
y_pred = predictor.predict(X_test)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.9736842105263158
Evaluations on test data:
{
    "accuracy": 0.9736842105263158,
    "balanced_accuracy": 0.9712607176881549,
    "mcc": 0.9456799777237261,
    "f1": 0.9777777777777777,
    "precision": 0.9705882352941176,
    "recall": 0.9850746268656716
}


In [38]:
predictor.leaderboard(data_test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost,0.982456,0.967033,0.014349,0.002823,0.161041,0.014349,0.002823,0.161041,1,True,11
1,LightGBM,0.973684,0.978022,0.005225,0.001407,0.238137,0.005225,0.001407,0.238137,1,True,4
2,WeightedEnsemble_L2,0.973684,0.978022,0.006607,0.002037,0.732667,0.001382,0.00063,0.49453,2,True,14
3,RandomForestEntr,0.973684,0.923077,0.050222,0.034441,0.400939,0.050222,0.034441,0.400939,1,True,6
4,RandomForestGini,0.973684,0.923077,0.05305,0.036039,0.402883,0.05305,0.036039,0.402883,1,True,5
5,LightGBMLarge,0.964912,0.956044,0.00255,0.001765,0.468213,0.00255,0.001765,0.468213,1,True,13
6,NeuralNetFastAI,0.964912,0.978022,0.022865,0.005832,0.437223,0.022865,0.005832,0.437223,1,True,10
7,CatBoost,0.95614,0.967033,0.002551,0.001566,0.785673,0.002551,0.001566,0.785673,1,True,7
8,ExtraTreesEntr,0.95614,0.945055,0.080575,0.083443,0.39745,0.080575,0.083443,0.39745,1,True,9
9,NeuralNetTorch,0.947368,0.967033,0.019988,0.01612,1.045643,0.019988,0.01612,1.045643,1,True,12


In [43]:
predictor.get_model_best()

'WeightedEnsemble_L2'

In [45]:
TIME_LIMIT = 60  # for quick demonstration only, you should set this to longest time you are willing to wait (in seconds)
METRIC = 'roc_auc'  # specify your evaluation metric here
predictor = TabularPredictor(label=LABEL, path=PATH_MODEL_BC, eval_metric=METRIC).fit(data_train, time_limit=TIME_LIMIT, presets='best_quality')
predictor.leaderboard(data_test, silent=True)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=5, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 60s
AutoGluon will save models to "models/BreastCancer/"
AutoGluon Version:  0.8.2
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #167~18.04.1-Ubuntu SMP Wed May 24 00:51:42 UTC 2023
Disk Space Avail:   271.56 GB / 501.38 GB (54.2%)
Train Data Rows:    455
Train Data Columns: 30
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fi

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.999365,0.995674,0.461199,0.1415,6.580222,0.004261,0.000484,0.342338,2,True,14
1,NeuralNetFastAI_BAG_L1,0.999047,0.994922,0.060814,0.03468,2.117915,0.060814,0.03468,2.117915,1,True,10
2,LightGBMXT_BAG_L1,0.99873,0.994754,0.047462,0.013407,0.744351,0.047462,0.013407,0.744351,1,True,3
3,LightGBMLarge_BAG_L1,0.99873,0.979457,0.17721,0.014211,1.21967,0.17721,0.014211,1.21967,1,True,13
4,LightGBM_BAG_L1,0.998412,0.992727,0.015743,0.008993,0.652921,0.015743,0.008993,0.652921,1,True,4
5,XGBoost_BAG_L1,0.998095,0.989655,0.044183,0.013526,0.997327,0.044183,0.013526,0.997327,1,True,11
6,RandomForestEntr_BAG_L1,0.99746,0.988036,0.051106,0.068825,0.479139,0.051106,0.068825,0.479139,1,True,6
7,CatBoost_BAG_L1,0.997142,0.992539,0.0084,0.005279,27.900305,0.0084,0.005279,27.900305,1,True,7
8,RandomForestGini_BAG_L1,0.997142,0.989122,0.056639,0.079008,0.440969,0.056639,0.079008,0.440969,1,True,5
9,NeuralNetTorch_BAG_L1,0.996507,0.993626,0.332919,0.083935,2.722697,0.332919,0.083935,2.722697,1,True,12


In [46]:
predictor.get_model_best()

'WeightedEnsemble_L2'