In [1]:
import os
import random
import warnings

import numpy as np
import pandas as pd

# Installed before the imports below so that their import-time warnings are suppressed as well.
warnings.filterwarnings(action='ignore')

import autogluon.eda.auto as auto
from autogluon.features.generators import AutoMLPipelineFeatureGenerator
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import GroupKFold


In [2]:
# Input / output file locations, relative to the notebook's working directory.
TRAIN_PATH = "playground-series-s3e10/train.csv"
TEST_PATH = "playground-series-s3e10/test.csv"
SAMPLE_SUBMISSION_PATH = "playground-series-s3e10/sample_submission.csv"
# Original (misspelled) name, kept as an alias so any existing reference keeps working.
SAMPLE_SUBISSION_PATH = SAMPLE_SUBMISSION_PATH
SUBMISSION_PATH = "submission.csv"

# Name of the label column; it is passed as `label=` to TabularPredictor further down.
TARGET = "Class"
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
NEGATIVE_FEATURES = "negative_features"

SEED = 2022


def seed_everything(seed=SEED):
    """Seed the random number sources this notebook touches.

    Seeds Python's ``random`` module and NumPy's global generator, and writes
    the seed to the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed; defaults to the module-level ``SEED``.
    """
    # The interpreter reads PYTHONHASHSEED at startup, so this assignment only
    # influences processes launched after this point, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything()

# Directory handed to TabularPredictor as `path=` (and later to TabularPredictor.load).
MODEL_SAVE_PATH = "output/playground-series-s3e10_feature_engineering/"
# Passed as `verbosity=` to TabularPredictor.
MODEL_VERBOSE = 2
# Passed as `presets=` to fit().
MODEL_PRESETS = "best_quality"
# Passed as `time_limit=` to fit(); the training log reports it in seconds.
TIME_LIMIT = 120

# Bagging / stacking configuration forwarded to fit().
NUM_BAG_FOLDS = 5
NUM_BAG_SETS = 1
NUM_STACK_LEVELS = 1

In [3]:
# Load the training CSV into a DataFrame; later cells derive features and the label from it.
train_data = pd.read_csv(TRAIN_PATH)

In [4]:
# Run the raw predictors through AutoGluon's default feature pipeline.
# AutoMLPipelineFeatureGenerator is imported in the notebook's top import cell.
auto_ml_pipeline_feature_generator = AutoMLPipelineFeatureGenerator()

# Drop the label and the row identifier so that neither is treated as a feature.
train_data_features = train_data.drop(columns=[TARGET, "id"])

# NOTE(review): the generator is fitted on every row, including the rows that a
# later cell samples out as the hold-out set - confirm this is acceptable.
train_data_features_transform = auto_ml_pipeline_feature_generator.fit_transform(X=train_data_features)

# Summary statistics of the transformed features (rendered as the cell output).
train_data_features_transform.describe()

Unnamed: 0,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve
count,117564.0,117564.0,117564.0,117564.0,117564.0,117564.0,117564.0,117564.0
mean,111.2483,46.713535,0.503498,1.886385,11.962921,26.190678,8.037488,93.881076
std,24.906474,6.102941,1.127093,6.515466,26.719946,20.041937,3.84098,79.96211
min,6.054688,24.783273,-1.730782,-1.791886,0.213211,7.370432,-2.597872,-1.976976
25%,104.546875,43.44339,0.049761,-0.188956,2.090301,14.955405,6.742911,49.409136
50%,116.664062,47.478932,0.186498,0.09172,2.808528,18.164924,8.442883,83.421375
75%,126.296875,50.862718,0.39562,0.691613,4.12291,24.732218,10.003237,122.09329
max,189.367188,93.602933,7.879628,65.385974,217.371238,109.890785,34.539844,1191.000837


In [5]:
# Re-attach the label column to the transformed features (pandas aligns the
# assigned Series on the row index). This mutates the frame in place, so the
# describe() output shown for the previous cell no longer reflects all columns.
train_data_features_transform[TARGET] = train_data[TARGET]

In [6]:
# Hold-out split: draw 25% of the rows from each class separately, with a fixed
# random_state so the same rows are drawn on every run.
# Reference: https://stackoverflow.com/questions/56191448/sample-pandas-dataframe-based-on-values-in-column
train_data_features_transform_test = (
    train_data_features_transform
    .groupby(TARGET)
    .sample(frac=0.25, random_state=1)
)
train_data_features_transform_test.head()

Unnamed: 0,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class
83648,122.09375,53.94566,-0.02746,-0.50668,4.076923,22.929187,7.049416,56.839443,0
113745,116.867188,48.754745,0.254735,0.067584,3.01505,17.572804,8.118838,80.71541,0
103750,126.359375,46.442499,0.075528,0.176053,2.55602,17.559953,8.734353,89.873374,0
34497,151.0,46.673078,-0.338401,0.064067,3.020903,19.43898,8.073839,72.967109,0
50996,130.578125,50.161891,0.003526,-0.164029,2.141304,19.382942,9.599411,96.829541,0


In [7]:
# Row labels that went into the hold-out sample.
sample_idxs = train_data_features_transform_test.index.tolist()

# The training portion is every row whose label is not in the hold-out sample.
train_data_features_transform_train = train_data_features_transform.drop(index=sample_idxs)
train_data_features_transform_train.head()

Unnamed: 0,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class
0,133.171875,59.716081,0.043133,-0.703383,54.917224,70.084438,0.749798,-0.649512,0
2,112.640625,39.818393,0.379639,0.922306,2.730769,15.68969,8.193471,85.649785,0
3,120.679688,45.918448,-0.09849,0.011775,2.696488,20.954662,8.183874,70.332899,0
5,131.632812,52.56321,-0.075253,-0.495825,2.194816,15.537425,9.033439,97.032406,0
7,120.203125,49.927902,-0.08999,-0.321367,3.2801,18.37684,8.190561,77.917237,0


In [8]:
# Report (rows, columns) of the training part, then of the hold-out part, one per line.
print(
    train_data_features_transform_train.shape,
    train_data_features_transform_test.shape,
    sep="\n",
)

(88173, 9)
(29391, 9)


In [9]:
# Build the predictor for the TARGET column and fit it on the training portion
# only; the hold-out rows are kept back for the evaluation cells that follow.
predictor = (
    TabularPredictor(label=TARGET, path=MODEL_SAVE_PATH, verbosity=MODEL_VERBOSE)
    .fit(
        train_data=train_data_features_transform_train,
        presets=MODEL_PRESETS,
        time_limit=TIME_LIMIT,
        num_stack_levels=NUM_STACK_LEVELS,
        num_bag_sets=NUM_BAG_SETS,
        num_bag_folds=NUM_BAG_FOLDS,
    )
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=5, num_bag_sets=1
Beginning AutoGluon training ... Time limit = 120s
AutoGluon will save models to "output/playground-series-s3e10_feature_engineering/"
AutoGluon Version:  0.6.3b20230205
Python Version:     3.9.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #161-Ubuntu SMP Fri Feb 3 14:49:04 UTC 2023
Train Data Rows:    88173
Train Data Columns: 8
Label Column: Class
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelin

In [10]:
# Reload the predictor from the directory it was saved to during fit().
predictor = TabularPredictor.load(path=MODEL_SAVE_PATH)

# Predict on the hold-out rows; the frame passed in still carries the label column.
pred_test = predictor.predict(data=train_data_features_transform_test)

# Per-model leaderboard scored against the hold-out rows (rendered as the cell output).
predictor.leaderboard(data=train_data_features_transform_test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost_BAG_L1,0.990575,0.991188,0.077526,0.036767,7.711259,0.077526,0.036767,7.711259,1,True,7
1,ExtraTreesEntr_BAG_L1,0.990575,0.990995,0.222032,2.497108,2.372678,0.222032,2.497108,2.372678,1,True,9
2,XGBoost_BAG_L1,0.990473,0.991222,0.092498,0.106954,1.415082,0.092498,0.106954,1.415082,1,True,11
3,LightGBM_BAG_L1,0.990473,0.991301,0.102826,0.120411,1.628629,0.102826,0.120411,1.628629,1,True,4
4,ExtraTreesGini_BAG_L1,0.990405,0.991052,0.258766,2.440366,2.493839,0.258766,2.440366,2.493839,1,True,8
5,WeightedEnsemble_L2,0.990405,0.991403,0.795659,7.427211,39.789535,0.003911,0.191828,26.33171,2,True,12
6,LightGBMXT_BAG_L2,0.990405,0.991324,4.362384,12.836717,49.457833,0.056434,0.094537,1.78151,2,True,13
7,LightGBMXT_BAG_L1,0.990371,0.990882,0.422875,0.896195,3.262503,0.422875,0.896195,3.262503,1,True,3
8,RandomForestGini_BAG_L1,0.990337,0.991063,0.208123,2.177496,6.962677,0.208123,2.177496,6.962677,1,True,5
9,NeuralNetFastAI_BAG_L1,0.990337,0.990779,2.292827,1.378218,14.587529,2.292827,1.378218,14.587529,1,True,10


In [11]:
# Feature importance measured on the hold-out rows; the log printed below
# describes the method as permutation shuffling.
feature_importances = predictor.feature_importance(
    data=train_data_features_transform_test,
)

Computing feature importance via permutation shuffling for 8 features using 5000 rows with 5 shuffle sets...
	20.3s	= Expected runtime (4.06s per shuffle set)
	7.35s	= Actual runtime (Completed 5 of 5 shuffle sets)


In [12]:
# Show the importance table under a heading; the printed string is Japanese
# for "Feature importance:".
print("特徴量の重要度:")
display(feature_importances)

特徴量の重要度:


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
EK,0.15396,0.00562,2.12727e-07,5,0.165532,0.142388
SD,0.00196,0.001203,0.01096156,5,0.004438,-0.000518
Skewness_DMSNR_Curve,0.00044,0.000498,0.0596967,5,0.001465,-0.000585
EK_DMSNR_Curve,8e-05,0.000415,0.3442285,5,0.000934,-0.000774
SD_DMSNR_Curve,4e-05,0.000639,0.4477029,5,0.001355,-0.001275
Skewness,-4e-05,0.000329,0.6005171,5,0.000637,-0.000717
Mean_DMSNR_Curve,-0.0002,0.000583,0.7570669,5,0.001001,-0.001401
Mean_Integrated,-0.00024,0.000385,0.8822518,5,0.000552,-0.001032


In [13]:
# Persist the importance table to a CSV file in the working directory.
feature_importances.to_csv("feature_importances.csv")