In [1]:
import numpy as np
import pandas as pd 
import os 
import random
import pickle
import warnings
warnings.filterwarnings(action='ignore')

from autogluon.tabular import TabularDataset, TabularPredictor
import autogluon.eda.auto as auto

In [2]:
# Locations of the competition files and names of the columns used below.
DATA_DIR = "playground-series-s3e10"
TRAIN_PATH = f"{DATA_DIR}/train.csv"
TEST_PATH = f"{DATA_DIR}/test.csv"
SAMPLE_SUBMISSION_PATH = f"{DATA_DIR}/sample_submission.csv"
# Backward-compatible alias: the constant was originally spelled without the "M".
SAMPLE_SUBISSION_PATH = SAMPLE_SUBMISSION_PATH
SUBMISSION_PATH = "submission.csv"  # file name for the generated submission CSV
TARGET = "Class"  # label column passed to TabularPredictor
NEGATIVE_FEATURES = "negative_features"  # not referenced by the cells shown in this notebook


SEED = 2022


def seed_everything(seed=SEED):
    """Seed the global random number generators used by this notebook.

    Args:
        seed: Integer seed applied to Python's ``random`` module and to
            NumPy's legacy global generator. Its string form is also
            written to the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


# Seed once, right away, with the notebook-wide default.
seed_everything()

# --- AutoGluon training settings ---
MODEL_TIME_LIMIT = 3600               # fit() time budget in seconds (60 * 60)
MODEL_SAVE_PATH = "autogluon_model_2/"  # directory where the predictor is saved
MODEL_VERBOSE = 2                     # passed as TabularPredictor verbosity
MODEL_PRESETS = "best_quality"        # passed as fit() presets

# --- Bagging / stacking settings forwarded to fit() ---
NUM_BAG_FOLDS = 5
NUM_BAG_SETS = 1
NUM_STACK_LEVELS = 1

In [3]:
# Columns removed from both the train and test frames before modelling.
# Try 1 kept all features and dropped only the identifier column: ["id"].
# Try 2 (the active setting) also removes the "negative" features listed here.
negative_features_list = [
    "id",
    "Mean_Integrated",
    "Mean_DMSNR_Curve",
    "Skewness",
]

In [4]:
# Training frame with the unwanted columns removed (the label column is kept).
train_data = pd.read_csv(TRAIN_PATH).drop(columns=negative_features_list)

# Keep the untouched test frame so the "id" column is still available for the
# submission, then derive the model input by dropping the same columns.
test_data_all = pd.read_csv(TEST_PATH)
test_data_id = test_data_all["id"]
test_data = test_data_all.drop(columns=negative_features_list)

In [5]:
# AutoGluon's pre-made, default feature pipeline.
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

# AutoMLPipelineFeatureGenerator is an implementation of PipelineFeatureGenerator
# that bundles several preprocessing stages; for most users it is all that is
# needed to obtain model-ready features.
auto_ml_pipeline_feature_generator = AutoMLPipelineFeatureGenerator()

# Fit the pipeline on the training features only (label column excluded).
train_data_features = train_data.drop(TARGET, axis=1)
train_data_features_transform = auto_ml_pipeline_feature_generator.fit_transform(
    X=train_data_features
)

# The test frame has no label column, so it goes through the already fitted
# pipeline unchanged.
test_data_transform = auto_ml_pipeline_feature_generator.transform(X=test_data)

In [6]:
# Re-attach the label column to the transformed training features so the frame
# can be handed to TabularPredictor.fit(); values are aligned on the index.
train_data_features_transform = train_data_features_transform.assign(
    **{TARGET: train_data[TARGET]}
)

In [7]:
# Sanity check: display (rows, columns) of the training frame after the label
# column has been re-attached.
train_data_features_transform.shape

(117564, 6)

In [8]:
# Build the predictor first, then train it. The 0/1 label is deliberately
# modelled as a regression target (switched from binary classification).
predictor = TabularPredictor(
    label=TARGET,
    problem_type="regression",
    path=MODEL_SAVE_PATH,
    sample_weight="auto_weight",
    verbosity=MODEL_VERBOSE,
)

# fit() hands back the predictor, so rebinding keeps the same name bound to the
# fitted object and avoids echoing its repr as the cell output.
predictor = predictor.fit(
    train_data_features_transform,
    presets=MODEL_PRESETS,
    time_limit=MODEL_TIME_LIMIT,
    num_bag_folds=NUM_BAG_FOLDS,
    num_bag_sets=NUM_BAG_SETS,
    num_stack_levels=NUM_STACK_LEVELS,
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=5, num_bag_sets=1
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "autogluon_model_2/"
AutoGluon Version:  0.6.3b20230205
Python Version:     3.9.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #161-Ubuntu SMP Fri Feb 3 14:49:04 UTC 2023
Train Data Rows:    117564
Train Data Columns: 5
Label Column: Class
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    41767.47 MB
	Train Data (Original)  Memory Usage: 4.7 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting Id

In [9]:
# Print the fit() summary (per-model validation score, fit/predict times and
# stack level) and display the returned summary dictionary.
predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                     model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L3  -0.086267      57.266458  1096.354354                0.001946           1.386949            3       True         22
1      WeightedEnsemble_L2  -0.086333      47.137641   570.456184                0.002109           1.507957            2       True         12
2          CatBoost_BAG_L2  -0.086541      48.618765   580.055256                0.048019           6.992199            2       True         16
3   NeuralNetFastAI_BAG_L2  -0.086693      50.784335   776.762270                2.213590         203.699213            2       True         18
4          CatBoost_BAG_L1  -0.086722       0.038104     8.688755                0.038104           8.688755            1       True          6
5        LightGBMXT_BAG_L2  -0.087209      49.022678   576.621778         

{'model_types': {'KNeighborsUnif_BAG_L1': 'StackerEnsembleModel_KNN',
  'KNeighborsDist_BAG_L1': 'StackerEnsembleModel_KNN',
  'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L1': 'StackerEnsembleModel_LGB',
  'RandomForestMSE_BAG_L1': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L1': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesMSE_BAG_L1': 'StackerEnsembleModel_XT',
  'NeuralNetFastAI_BAG_L1': 'StackerEnsembleModel_NNFastAiTabular',
  'XGBoost_BAG_L1': 'StackerEnsembleModel_XGBoost',
  'NeuralNetTorch_BAG_L1': 'StackerEnsembleModel_TabularNeuralNetTorch',
  'LightGBMLarge_BAG_L1': 'StackerEnsembleModel_LGB',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel',
  'LightGBMXT_BAG_L2': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L2': 'StackerEnsembleModel_LGB',
  'RandomForestMSE_BAG_L2': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L2': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesMSE_BAG_L2': 'StackerEnsembleModel_XT',
  'NeuralNetFastAI_BAG_L2': 'StackerEnsembleModel_NNFa

In [10]:
# Reload the trained predictor from MODEL_SAVE_PATH instead of relying on the
# in-memory object left behind by the training cell.
predictor = TabularPredictor.load(MODEL_SAVE_PATH)
pred_test = predictor.predict(test_data_transform)

# pd.concat(axis=1) aligns rows on the index; a mismatch would silently yield
# NaN-padded rows, so fail loudly instead.
assert pred_test.index.equals(test_data_id.index), (
    "prediction index does not match the test id index"
)

# Two-column submission: the test ids next to the predicted values. The output
# location comes from the config cell rather than a repeated string literal.
submission = pd.concat([test_data_id, pred_test], axis=1)
submission.to_csv(SUBMISSION_PATH, index=False)

In [11]:
# Load the trained models into memory; the call reports how many models were
# persisted and returns the list of their names.
predictor.persist_models()

Persisting 18 models in memory. Models will require 0.71% of memory.


['LightGBM_BAG_L1',
 'LightGBMXT_BAG_L1',
 'ExtraTreesMSE_BAG_L1',
 'NeuralNetTorch_BAG_L2',
 'CatBoost_BAG_L1',
 'NeuralNetTorch_BAG_L1',
 'LightGBMLarge_BAG_L1',
 'XGBoost_BAG_L1',
 'RandomForestMSE_BAG_L2',
 'LightGBMLarge_BAG_L2',
 'WeightedEnsemble_L3',
 'NeuralNetFastAI_BAG_L1',
 'CatBoost_BAG_L2',
 'KNeighborsUnif_BAG_L1',
 'KNeighborsDist_BAG_L1',
 'NeuralNetFastAI_BAG_L2',
 'RandomForestMSE_BAG_L1',
 'ExtraTreesMSE_BAG_L2']