In [1]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [21]:
df = pd.read_csv('datasets/train.csv')
df_test = pd.read_csv('datasets/test.csv')
test_ids = df_test['id']
df.drop(['id','CustomerId', 'Surname'], axis=1, inplace=True)
df_test.drop(['id','CustomerId', 'Surname'], axis=1, inplace=True)

In [3]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [4]:
transformer = ColumnTransformer([('cat', OneHotEncoder(drop='first'), ['Geography', 'Gender']),
                                 ('num', StandardScaler(), ['CreditScore', 'Age', 'Tenure', 'NumOfProducts'])], remainder='passthrough')

In [5]:
df_trans = transformer.fit_transform(df)
df_prep = pd.DataFrame(df_trans, columns=transformer.get_feature_names_out().tolist())
df_prep.rename(columns={'remainder__Exited': 'Exited'}, inplace=True)

In [6]:
df_prep.head()

Unnamed: 0,cat__Geography_Germany,cat__Geography_Spain,cat__Gender_Male,num__CreditScore,num__Age,num__Tenure,num__NumOfProducts,remainder__Balance,remainder__HasCrCard,remainder__IsActiveMember,remainder__EstimatedSalary,Exited
0,0.0,0.0,1.0,0.144135,-0.578074,-0.719973,0.814298,0.0,1.0,0.0,181449.97,0.0
1,0.0,0.0,1.0,-0.367706,-0.578074,-1.432694,0.814298,0.0,1.0,1.0,49503.5,0.0
2,0.0,0.0,1.0,0.268974,0.211354,1.774548,0.814298,0.0,1.0,0.0,184866.69,0.0
3,0.0,0.0,1.0,-0.941966,-0.465299,-1.076334,-1.013348,148882.54,1.0,1.0,84560.88,0.0
4,0.0,1.0,1.0,0.743362,-0.578074,-0.007253,0.814298,0.0,1.0,1.0,15068.83,0.0


In [6]:
train = TabularDataset(df_prep)

In [8]:
automl = TabularPredictor(label='Exited', problem_type='binary', eval_metric='roc_auc')
automl.fit(train, presets='best_quality')

No path specified. Models will be saved in: "AutogluonModels/ag-20240113_180310"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 3600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels/ag-20240113_180310/ds_sub_fit/sub_fit_ho.
2024-01-13 15:03:10,693	INFO util.py:159 -- Outdated packages:
  ipywidgets==6.0.0 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
Beginning Au

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f4df44ef940>

In [9]:
automl.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,0.891296,roc_auc,101.774953,2278.422915,0.01891,48.487794,3,True,33
1,CatBoost_BAG_L2,0.890875,roc_auc,74.409143,1787.76448,0.085586,72.854531,2,True,25
2,CatBoost_r177_BAG_L2,0.890799,roc_auc,74.410446,1757.454303,0.086889,42.544354,2,True,32
3,LightGBM_BAG_L2,0.890657,roc_auc,75.27096,1723.439467,0.947403,8.529519,2,True,22
4,XGBoost_BAG_L2,0.890644,roc_auc,75.217634,1724.227375,0.894078,9.317426,2,True,29
5,LightGBMXT_BAG_L2,0.890581,roc_auc,76.807351,1729.869982,2.483794,14.960034,2,True,21
6,WeightedEnsemble_L2,0.890502,roc_auc,36.342378,1029.232386,0.018787,37.329552,2,True,20
7,LightGBMLarge_BAG_L2,0.890491,roc_auc,78.188381,1735.061882,3.864824,20.151933,2,True,31
8,NeuralNetFastAI_BAG_L2,0.890387,roc_auc,76.643249,1970.980654,2.319692,256.070706,2,True,28
9,LightGBM_r131_BAG_L1,0.890179,roc_auc,19.688056,54.712108,19.688056,54.712108,1,True,16


In [9]:
df_test_prep = transformer.fit_transform(df_test)
df_test_prep = pd.DataFrame(df_test_prep, columns=transformer.get_feature_names_out().tolist())
df_test_prep = TabularDataset(df_test_prep)

In [10]:
df_test_prep

Unnamed: 0,cat__Geography_Germany,cat__Geography_Spain,cat__Gender_Male,num__CreditScore,num__Age,num__Tenure,num__NumOfProducts,remainder__Balance,remainder__HasCrCard,remainder__IsActiveMember,remainder__EstimatedSalary
0,0.0,0.0,0.0,-0.878176,-1.706504,-1.067887,0.820030,0.00,0.0,1.0,160976.75
1,0.0,0.0,0.0,0.329567,0.888990,-1.067887,-1.015806,0.00,1.0,0.0,72549.27
2,0.0,0.0,0.0,-0.006609,-0.465181,0.713922,0.820030,0.00,1.0,0.0,138882.09
3,0.0,0.0,1.0,0.304665,-0.239486,1.070284,-1.015806,0.00,1.0,0.0,113931.57
4,1.0,0.0,1.0,1.188684,-0.013791,1.783008,-1.015806,121263.62,1.0,0.0,139431.00
...,...,...,...,...,...,...,...,...,...,...,...
110018,0.0,1.0,1.0,-1.077392,-1.029419,0.713922,-1.015806,116099.82,1.0,1.0,148087.62
110019,0.0,0.0,0.0,-1.015137,-0.239486,-0.355164,-1.015806,178032.53,1.0,1.0,42181.68
110020,0.0,0.0,1.0,0.690645,-0.803724,-1.067887,0.820030,0.00,1.0,0.0,16287.38
110021,0.0,0.0,0.0,0.653292,-0.690876,-0.711526,-1.015806,0.00,1.0,1.0,158816.58


In [11]:
predictor = TabularPredictor.load("AutogluonModels/ag-20240113_180310")
prediction = predictor.predict_proba(df_test_prep)

In [25]:
submission = pd.DataFrame({'id': test_ids, 'Exited': prediction[1]})
submission.to_csv('submissions/automl.csv', index=False)