In [1]:
import os

from autogluon.tabular import TabularDataset, TabularPredictor

In [2]:
# Load the training CSV of the "Inc" dataset from the AutoGluon S3 bucket.
train_url = 'https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv'
train_data = TabularDataset(train_url)

In [3]:
# Preview the first five rows to see the available columns and their value formats.
train_data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,178478,Bachelors,13,Never-married,Tech-support,Own-child,White,Female,0,0,40,United-States,<=50K
1,23,State-gov,61743,5th-6th,3,Never-married,Transport-moving,Not-in-family,White,Male,0,0,35,United-States,<=50K
2,46,Private,376789,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,15,United-States,<=50K
3,55,?,200235,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,50,United-States,>50K
4,36,Private,224541,7th-8th,4,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,El-Salvador,<=50K


In [4]:
SAMPLE_SIZE = 1000  # number of rows to subsample for a faster demo; try setting this to much larger values
SEED = 1234  # fixed seed so the same rows are drawn on every run

# Cap the requested size at the number of rows actually available.
# DataFrame.sample draws without replacement by default and raises ValueError
# when n exceeds the population, which would happen if SAMPLE_SIZE is set above
# the dataset size, or if this cell is re-run with a larger SAMPLE_SIZE after
# train_data has already been replaced by its subsample.
train_data = train_data.sample(n=min(SAMPLE_SIZE, len(train_data)), random_state=SEED)
print(train_data.shape)

(1000, 15)


## As a Regression task

In [5]:
# Column used as the regression target in this section.
age_column = 'age'

# Descriptive statistics (count, mean, std, quartiles, min/max) of the target
# on the subsampled training data.
age_summary = train_data[age_column].describe()
print("Summary of age variable: \n", age_summary)

Summary of age variable: 
 count    1000.000000
mean       38.082000
std        13.893607
min        17.000000
25%        26.000000
50%        36.000000
75%        47.000000
max        90.000000
Name: age, dtype: float64


In [6]:
# Load the test CSV of the same "Inc" dataset; it is used below for evaluation,
# the leaderboard and feature importance.
test_url = 'https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv'
test_data = TabularDataset(test_url)

# Display (rows, columns) of the test set.
test_data.shape

(9769, 15)

In [7]:
# Directory where AutoGluon saves the trained models for the age predictor.
PATH_MODEL = os.path.join('models', 'Inc_age')

# Configure a regression predictor that targets the age column ...
predictor_age = TabularPredictor(
    label=age_column,
    problem_type='regression',
    path=PATH_MODEL,
)
# ... and train it on the subsampled data with a 60 second time limit.
predictor_age = predictor_age.fit(train_data, time_limit=60)

Beginning AutoGluon training ... Time limit = 60s
AutoGluon will save models to "models/Inc_age/"
AutoGluon Version:  0.8.2
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #167~18.04.1-Ubuntu SMP Wed May 24 00:51:42 UTC 2023
Disk Space Avail:   271.22 GB / 501.38 GB (54.1%)
Train Data Rows:    1000
Train Data Columns: 14
Label Column: age
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11826.5 MB
	Train Data (Original)  Memory Usage: 0.64 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 2 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting 

In [8]:
# Score the fitted age predictor on the test data and keep the returned metrics.
# The evaluation output reports every score in higher-is-better form, so error
# metrics such as root_mean_squared_error appear negated; multiply by -1 to
# recover the actual metric value.
performance = predictor_age.evaluate(test_data)

Evaluation: root_mean_squared_error on test data: -10.305794077072315
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "root_mean_squared_error": -10.305794077072315,
    "mean_squared_error": -106.20939155901881,
    "mean_absolute_error": -7.943027014668418,
    "r2": 0.4322901847752414,
    "pearsonr": 0.6580951411800383,
    "median_absolute_error": -6.451377868652344
}


In [9]:
# Per-model comparison on the test data: score_test is computed on test_data,
# score_val on AutoGluon's internal validation data, alongside timing columns.
# NOTE(review): silent=True presumably suppresses the printed copy so only the
# returned frame is displayed - confirm against the AutoGluon docs.
leaderboard_age = predictor_age.leaderboard(test_data, silent=True)
leaderboard_age

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-10.305794,-9.464772,0.377097,0.108818,6.024659,0.003297,0.000453,0.243836,2,True,12
1,CatBoost,-10.435411,-9.522196,0.011259,0.00494,1.036179,0.011259,0.00494,1.036179,1,True,6
2,NeuralNetFastAI,-10.558365,-10.045126,0.192057,0.009408,1.263601,0.192057,0.009408,1.263601,1,True,8
3,LightGBMXT,-10.568216,-9.76505,0.013332,0.003844,0.253806,0.013332,0.003844,0.253806,1,True,3
4,ExtraTreesMSE,-10.703582,-9.86332,0.092194,0.036487,0.337062,0.092194,0.036487,0.337062,1,True,7
5,NeuralNetTorch,-10.779115,-9.978223,0.052191,0.011463,2.839568,0.052191,0.011463,2.839568,1,True,10
6,XGBoost,-10.800047,-9.782344,0.026099,0.046067,0.304413,0.026099,0.046067,0.304413,1,True,9
7,LightGBM,-10.827582,-10.163458,0.014792,0.00362,0.257063,0.014792,0.00362,0.257063,1,True,4
8,RandomForestMSE,-10.907342,-10.092777,0.099144,0.036606,0.380692,0.099144,0.036606,0.380692,1,True,5
9,LightGBMLarge,-11.118354,-10.327089,0.022393,0.004607,0.689464,0.022393,0.004607,0.689464,1,True,11


In [10]:
# Name of the model the predictor considers best.
best_model_name = predictor_age.get_model_best()
best_model_name

'WeightedEnsemble_L2'

In [11]:
# Permutation-shuffling feature importance computed on the test data; the
# result has one row per feature with importance, stddev, p_value and
# 99% confidence bounds (p99_high / p99_low).
importance_age = predictor_age.feature_importance(test_data)
importance_age

Computing feature importance via permutation shuffling for 14 features using 5000 rows with 5 shuffle sets...
	24.49s	= Expected runtime (4.9s per shuffle set)
	10.39s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
marital-status,3.119337,0.133667,4.036095e-07,5,3.394558,2.844115
relationship,0.866419,0.045178,8.838824e-07,5,0.959441,0.773398
hours-per-week,0.66392,0.027309,3.42763e-07,5,0.72015,0.607689
class,0.192623,0.010259,9.619481e-07,5,0.213747,0.1715
workclass,0.173799,0.027852,7.650687e-05,5,0.231146,0.116451
education-num,0.112694,0.018973,9.287302e-05,5,0.15176,0.073628
education,0.105436,0.01734,8.470995e-05,5,0.14114,0.069733
native-country,0.063011,0.018839,0.0008544001,5,0.1018,0.024222
fnlwgt,0.06221,0.023693,0.002101809,5,0.110994,0.013425
occupation,0.038962,0.01,0.0004779635,5,0.059552,0.018372
