# Prepare everything

credit:
@kdmitrie

In [4]:
# pip install autogluon

In [5]:
%%time

from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import numpy as np

TRAIN = 'data/train.csv'
TEST = 'data/test.csv'
ORIGINAL = 'data/Churn_Modelling.csv'

df_train = pd.read_csv(TRAIN)
df_orig = pd.read_csv(ORIGINAL)
df_test = pd.read_csv(TEST)

# Include the original dataset
df_train = pd.concat((df_train, df_orig), axis=0).drop(['id', 'RowNumber'], axis=1).drop_duplicates().dropna()

  from .autonotebook import tqdm as notebook_tqdm


CPU times: total: 109 ms
Wall time: 1.77 s


# Apply different preprocessing schemes

In [6]:
%%time
# Below is simple feature processing from https://www.kaggle.com/code/lordpatil/automl-autogluon

def processing1(df):
    df['Surname_len'] = df['Surname'].apply(len)
    df['Geography'] = df['Geography'].map({'France': 0, 'Spain': 1, 'Germany': 2})
    df['Gender'] = (df['Gender']=='Male').astype(np.int64)
    return df

def processing2(df):
    cols = ['CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember','EstimatedSalary', 'Exited']
    df = df.loc[df.groupby(cols)['Tenure'].idxmax()].drop_duplicates()
    df = df[(df.NumOfProducts < 4) | (df.HasCrCard == 1)]
    return df

def processing3(df):
    df['IsSenior'] = df['Age'].apply(lambda x: 1 if x >= 60 else 0)
    df['IsActive_by_CreditCard'] = df['HasCrCard'] * df['IsActiveMember']
    df['Products_Per_Tenure'] =  df['Tenure'] / df['NumOfProducts']
    df['AgeCat'] = np.round(df.Age/20).astype('int').astype('category')
    df['Sur_Geo_Gend_Sal'] = df['Surname']+df['Geography']+df['Gender']+np.round(df.EstimatedSalary).astype('str')
    return df

#df_train = processing1(df_train)
#df_test = processing1(df_test)

df_train = processing2(df_train)

df_train = processing3(df_train)
df_test = processing3(df_test)

CPU times: total: 1.17 s
Wall time: 5.32 s


# Fit the AutoGluon predictor

In [7]:
%%time

train = TabularDataset(df_train)
test = TabularDataset(df_test)

automl = TabularPredictor(label='Exited', problem_type='binary', eval_metric='roc_auc')
automl.fit(train, presets='best_quality')

No path specified. Models will be saved in: "AutogluonModels\ag-20240130_091433"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 3600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels\ag-20240130_091433/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 900s
AutoGluon will save models to "AutogluonModels\ag-20240130_091433/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.10.13
Operating System:   

[1000]	valid_set's binary_logloss: 0.312006


	0.8921	 = Validation score   (roc_auc)
	24.27s	 = Training   runtime
	1.0s	 = Validation runtime
Fitting model: LightGBM_BAG_L1 ... Training model for up to 1751.78s of the 2642.3s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
	0.8895	 = Validation score   (roc_auc)
	10.09s	 = Training   runtime
	0.26s	 = Validation runtime
Fitting model: RandomForestGini_BAG_L1 ... Training model for up to 1741.18s of the 2631.7s of remaining time.
	0.8797	 = Validation score   (roc_auc)
	9.85s	 = Training   runtime
	4.81s	 = Validation runtime
Fitting model: RandomForestEntr_BAG_L1 ... Training model for up to 1725.96s of the 2616.48s of remaining time.
	0.8794	 = Validation score   (roc_auc)
	11.49s	 = Training   runtime
	5.03s	 = Validation runtime
Fitting model: CatBoost_BAG_L1 ... Training model for up to 1708.89s of the 2599.4s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
	

CPU times: total: 2h 11min 31s
Wall time: 1h 32s


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x217c2bfff70>

# Explore the leaderboard

In [8]:
%%time
automl.leaderboard()

CPU times: total: 0 ns
Wall time: 4 ms


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost_BAG_L2,0.908835,roc_auc,28.893836,2482.744485,0.472107,738.353466,2,True,18
1,WeightedEnsemble_L3,0.908835,roc_auc,28.917383,2523.091958,0.023547,40.347473,3,True,20
2,WeightedEnsemble_L2,0.896166,roc_auc,6.864506,1713.956405,0.022999,27.116842,2,True,13
3,ExtraTreesGini_BAG_L2,0.895873,roc_auc,33.352828,1751.516688,4.931098,7.125669,2,True,19
4,LightGBMXT_BAG_L2,0.895814,roc_auc,28.782301,1757.504567,0.360572,13.113548,2,True,14
5,CatBoost_BAG_L1,0.895809,roc_auc,0.400192,911.988342,0.400192,911.988342,1,True,7
6,RandomForestEntr_BAG_L2,0.894033,roc_auc,34.289248,1772.928778,5.867519,28.537759,2,True,17
7,LightGBM_BAG_L2,0.893524,roc_auc,28.605752,1753.860365,0.184023,9.469346,2,True,15
8,RandomForestGini_BAG_L2,0.89248,roc_auc,34.314754,1769.323259,5.893024,24.93224,2,True,16
9,LightGBMXT_BAG_L1,0.89211,roc_auc,0.996759,24.267942,0.996759,24.267942,1,True,3


# Make a 'pure' submission file

In [9]:
%%time
prediction = automl.predict_proba(test)

data_submit = pd.read_csv('data/sample_submission.csv')
data_submit.Exited = prediction[1]
data_submit[['id', 'Exited']].to_csv('ag_processing.csv', index=False)

!head ag_processing.csv

CPU times: total: 2min 26s
Wall time: 42.6 s


'head'��(��) ���� �Ǵ� �ܺ� ����, ������ �� �ִ� ���α׷�, �Ǵ�
��ġ ������ �ƴմϴ�.


# Add external solutions to produce an ensemble
- https://www.kaggle.com/code/ravi20076/playgrounds4e01-baseline-v2

In [10]:
df_ext1 = pd.read_csv('baseline-submission.csv') # 0.89651

data_submit.Exited = 0.15 * data_submit.Exited + 0.85 * df_ext1.Exited
data_submit[['id', 'Exited']].to_csv('ensemble.csv', index=False)

!head ensemble.csv

'head'��(��) ���� �Ǵ� �ܺ� ����, ������ �� �ִ� ���α׷�, �Ǵ�
��ġ ������ �ƴմϴ�.
