In [1]:
import warnings
# Install a blanket "ignore" filter: no category, message or module is given,
# so every warning raised after this line is suppressed until the filter list
# is changed again.
# NOTE(review): this also hides diagnostics emitted by the libraries used
# below - consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import roc_auc_score


from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric



import matplotlib.pyplot as plt
import lightgbm as lgb
import pandas as pd
import numpy as np

In [3]:
# Load the training CSV; the trailing expression renders the first rows.
train_csv = '/kaggle/input/playground-series-s4e1/train.csv'
df_train = pd.read_csv(train_csv)
df_train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [4]:
# Load the test CSV.
test_csv = '/kaggle/input/playground-series-s4e1/test.csv'
df_test = pd.read_csv(test_csv)
# Independent snapshot of the raw test frame, taken before any derived
# columns are added to df_test.
df_test_ov = df_test.copy()
df_test.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [5]:
# Numeric columns that get a min-max scaled companion named "<column>_scaled".
scale_cols = ['Age','CreditScore', 'Balance','EstimatedSalary']

# The bounds are taken from the training frame only and reused for the test
# frame, so scaled test values are not guaranteed to stay inside [0, 1].
for col in scale_cols:
    lo, hi = df_train[col].min(), df_train[col].max()
    span = hi - lo
    scaled_name = f"{col}_scaled"
    df_train[scaled_name] = (df_train[col] - lo) / span
    df_test[scaled_name] = (df_test[col] - lo) / span

In [6]:
def getFeats(df):
    """Return a copy of ``df`` with the engineered feature columns added.

    The input frame is not modified: the features are written to a copy, so
    calling this function twice on the same frame (or re-running the calling
    cell) always starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Age``, ``HasCrCard``, ``IsActiveMember``,
        ``Tenure``, ``NumOfProducts``, ``Surname``, ``Geography``, ``Gender``
        and ``EstimatedSalary``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every input column plus:

        * ``IsSenior`` - 1 where ``Age >= 60``, otherwise 0 (missing ages give 0).
        * ``IsActive_by_CreditCard`` - ``HasCrCard * IsActiveMember``.
        * ``Products_Per_Tenure`` - ``Tenure / NumOfProducts``.
        * ``AgeCat`` - ``Age // 20`` as an integer-valued categorical.
        * ``Sur_Geo_Gend_Sal`` - concatenation of surname, geography, gender
          and the rounded salary rendered as a string.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If ``Age`` contains missing values (the integer cast for ``AgeCat``
        cannot represent them).
    """
    # Work on a copy so the caller's frame is never mutated behind its back.
    df = df.copy()

    # Vectorised comparison; NaN compares False, matching the 0 the row-wise
    # lambda produced for missing ages.
    df['IsSenior'] = (df['Age'] >= 60).astype('int64')

    df['IsActive_by_CreditCard'] = df['HasCrCard'] * df['IsActiveMember']

    # Despite its name this is tenure divided by product count; the formula is
    # kept as-is so the feature fed to the model does not change.
    df['Products_Per_Tenure'] = df['Tenure'] / df['NumOfProducts']

    # 20-year age buckets stored as a categorical column.
    df['AgeCat'] = (df['Age'] // 20).astype('int').astype('category')

    # High-cardinality key built by plain string concatenation (no separator).
    df['Sur_Geo_Gend_Sal'] = (df['Surname'] +
                              df['Geography'] +
                              df['Gender'] +
                              np.round(df['EstimatedSalary']).astype('str'))

    return df

In [7]:
# Add the engineered columns to both frames.
df_train, df_test = getFeats(df_train), getFeats(df_test)

# Model inputs: every column except the row id, the target and the unscaled
# originals of the scaled columns (their "_scaled" companions stay in).
non_features = ['id', 'Exited', *scale_cols]
feat_cols = df_train.columns.drop(non_features)

In [8]:
# Feature matrix and target taken from the training frame.
X, y = df_train[feat_cols], df_train['Exited']

# Positional indices of every column whose dtype is not float64 (integer,
# object and categorical columns alike); they are later passed to CatBoost
# as the categorical feature indices.
is_not_float64 = X.dtypes != np.float64
cat_features = np.where(is_not_float64)[0]

In [9]:
# import optuna
# from optuna.samplers import TPESampler



# def objective(trial):
        
#     folds = StratifiedKFold(
#         n_splits     = 5,
#         random_state = 42,
#         shuffle      = True
#     )

#     param = {
#         "iterations"         : 1000,
#         "used_ram_limit"     : "25gb",
#         "eval_metric"        : 'AUC',
#         "objective"          : 'Logloss',

#         "depth"              : trial.suggest_int("depth", 1, 15),
#         "min_data_in_leaf"   : trial.suggest_int("min_data_in_leaf", 1, 100),
#         "colsample_bylevel"  : trial.suggest_float("colsample_bylevel", 0.05, 1.0),
#         "learning_rate"      : trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),        
#         "l2_leaf_reg"        : trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
#         "random_strength"    : trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
#         "od_wait"            : trial.suggest_int("od_wait", 10, 50),

#     }


    
#     auc_vals = []
#     for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    
#         X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
#         X_val, y_val     = X.iloc[valid_idx], y.iloc[valid_idx]
        

#         train_pool = Pool(X_train, 
#                           y_train, 
#                           cat_features = cat_features)

#         val_pool   = Pool(X_val, 
#                           y_val,
#                           cat_features = cat_features)
    
#         clf = CatBoostClassifier(**param)
#         clf.fit(train_pool, 
#                 eval_set              = val_pool,
#                 verbose               = 50,
#                 early_stopping_rounds = 100
#         )

    
#         y_pred_val = clf.predict_proba(X_val[feat_cols])[:,1]
#         auc_val    = roc_auc_score(y_val, y_pred_val)
#         auc_vals.append(auc_val)
#     return np.mean(auc_vals)




# if __name__ == "__main__":


#     study = optuna.create_study(direction="maximize")
#     study.optimize(objective, n_trials=20)

#     print("Number of finished trials: {}".format(len(study.trials)))
#     print("Best trial:")
#     trial = study.best_trial

#     print("  Value: {}".format(trial.value))
#     print("  Params: ")
#     for key, value in trial.params.items():
#         print("    {}: {}".format(key, value))

In [10]:
# bestParams = trial.params
# print(bestParams)

# {'depth': 7, 'min_data_in_leaf': 62, 'colsample_bylevel': 0.9907583019259885, 'learning_rate': 0.059637293057937336, 'l2_leaf_reg': 2.6598917320414883, 'random_strength': 2.879328229526927e-08}

In [11]:
# CatBoost settings: the six tuned values recorded from the hyper-parameter
# search, followed by the fixed evaluation metric and iteration budget.
bestParams = {
    'depth': 7,
    'min_data_in_leaf': 62,
    'colsample_bylevel': 0.9907583019259885,
    'learning_rate': 0.059637293057937336,
    'l2_leaf_reg': 2.6598917320414883,
    'random_strength': 2.879328229526927e-08,
    'eval_metric': 'AUC',
    'iterations': 3500,
}
bestParams

{'depth': 7,
 'min_data_in_leaf': 62,
 'colsample_bylevel': 0.9907583019259885,
 'learning_rate': 0.059637293057937336,
 'l2_leaf_reg': 2.6598917320414883,
 'random_strength': 2.879328229526927e-08,
 'eval_metric': 'AUC',
 'iterations': 3500}

In [12]:
# Single source of truth for the fold count. The splitter and the per-fold
# prediction buffer must agree: with a mismatch, rows of the np.empty buffer
# are either left uninitialised (and later averaged into the submission) or
# indexed out of range.
N_SPLITS = 5

folds = StratifiedKFold(
    n_splits     = N_SPLITS,
    random_state = 42,
    shuffle      = True)

# Row i receives fold i's predictions (second predict_proba column) for every
# test row.
test_preds = np.empty((N_SPLITS, len(df_test)))
auc_vals = []

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val     = X.iloc[valid_idx], y.iloc[valid_idx]

    train_pool = Pool(X_train,
                      y_train,
                      cat_features = cat_features)

    val_pool   = Pool(X_val,
                      y_val,
                      cat_features = cat_features)

    # A fresh model per fold, all built from the same parameter dict.
    # NOTE(review): no early stopping is passed to fit(), so the full
    # iteration budget is trained for each fold.
    clf = CatBoostClassifier(**bestParams)

    clf.fit(train_pool,
            eval_set = val_pool,
            verbose  = 500)

    # X_val already consists of exactly the feat_cols columns, so it is passed
    # through without re-selecting them.
    y_pred_val = clf.predict_proba(X_val)[:,1]
    auc_val    = roc_auc_score(y_val, y_pred_val)


    print("AUC for fold ",n_fold,": ",auc_val)
    auc_vals.append(auc_val)

    # Score the test frame with this fold's model and store it in its own row.
    y_pred_test = clf.predict_proba(df_test[feat_cols])[:,1]
    test_preds[n_fold, :] = y_pred_test
    print("----------------")

0:	test: 0.8792377	best: 0.8792377 (0)	total: 371ms	remaining: 21m 36s
500:	test: 0.8973893	best: 0.8973893 (500)	total: 2m 8s	remaining: 12m 50s
1000:	test: 0.8973894	best: 0.8976271 (820)	total: 4m 20s	remaining: 10m 51s
1500:	test: 0.8969645	best: 0.8976271 (820)	total: 6m 32s	remaining: 8m 43s
2000:	test: 0.8963402	best: 0.8976271 (820)	total: 8m 47s	remaining: 6m 34s
2500:	test: 0.8958983	best: 0.8976271 (820)	total: 11m	remaining: 4m 23s
3000:	test: 0.8953652	best: 0.8976271 (820)	total: 13m 15s	remaining: 2m 12s
3499:	test: 0.8946370	best: 0.8976271 (820)	total: 15m 29s	remaining: 0us

bestTest = 0.8976270672
bestIteration = 820

Shrink model to first 821 iterations.
AUC for fold  0 :  0.8976270671917286
----------------
0:	test: 0.8799592	best: 0.8799592 (0)	total: 348ms	remaining: 20m 18s
500:	test: 0.8977592	best: 0.8977738 (495)	total: 2m 10s	remaining: 13m 3s
1000:	test: 0.8980102	best: 0.8980473 (988)	total: 4m 22s	remaining: 10m 55s
1500:	test: 0.8977658	best: 0.8980473 (

In [13]:
# Load the external churn dataset and rename its label column so it cannot
# collide with the competition's own 'Exited' column.
df_orig = pd.read_csv("/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv")
df_orig = df_orig.rename(columns = {'Exited':'Exited_Orig'})

# The external label is inverted (0 and 1 swapped) before it is used as an
# override for matching test rows.
# NOTE(review): presumably intentional for this competition's data - confirm.
df_orig['Exited_Orig'] = df_orig['Exited_Orig'].map({0:1,1:0})

# Join on every column the external data provides except its row counter and
# the label itself.
join_cols = list(df_orig.columns.drop(['RowNumber','Exited_Orig']))

# Keep one lookup row per key. A duplicated key on the right side of a left
# merge would multiply the matching test rows and shift every later row out
# of line with the model predictions.
orig_lookup = df_orig.drop_duplicates(subset = join_cols)

n_test_rows = len(df_test_ov)
df_test_ov = df_test_ov.merge(
                                orig_lookup,
                                on  = join_cols,
                                how = 'left'
                             )[['id','Exited_Orig']].fillna(-1)   # -1 marks "no match"

assert len(df_test_ov) == n_test_rows, "lookup merge changed the number of test rows"

# Explicit copy: df_sub receives new columns later and must not be a slice
# of df_test_ov.
df_sub = df_test_ov[['id','Exited_Orig']].copy()

In [14]:
# Average the per-fold test predictions into one probability per test row.
y_pred = test_preds.mean(axis=0)

# -1 is the fill value for test rows without a match in the external data:
# those rows take the model probability, matched rows keep the looked-up label.
unmatched = df_sub.Exited_Orig == -1
df_sub['Exited'] = np.where(unmatched, y_pred, df_sub.Exited_Orig)

df_sub.head()

Unnamed: 0,id,Exited_Orig,Exited
0,165034,-1.0,0.0116
1,165035,-1.0,0.858492
2,165036,-1.0,0.013951
3,165037,-1.0,0.187841
4,165038,-1.0,0.406509


In [15]:
# Remove the helper column before writing the file. The drop is not done in
# place and tolerates the column already being gone, so this cell can be
# re-run without raising KeyError.
df_sub = df_sub.drop(columns = ['Exited_Orig'], errors = 'ignore')

df_sub.to_csv("submission_base_1.csv", index=False)


In [16]:
# Preview the submission frame now that only 'id' and 'Exited' remain.
df_sub.head()

Unnamed: 0,id,Exited
0,165034,0.0116
1,165035,0.858492
2,165036,0.013951
3,165037,0.187841
4,165038,0.406509


In [17]:
# External submission used as the dominant component of the first blend.
# NOTE(review): the trailing number is presumably its leaderboard score - confirm.
df_ext1 = pd.read_csv('/kaggle/input/playgrounds4e01-baseline-v2/Submission_V2.csv') # 0.89651

# Align the external predictions on 'id' instead of relying on both frames
# sharing the same row order; duplicate ids raise inside merge, missing ids
# trip the assertion below.
ext1_exited = df_sub[['id']].merge(
    df_ext1[['id', 'Exited']],
    on       = 'id',
    how      = 'left',
    validate = 'one_to_one')['Exited']
assert ext1_exited.notna().all(), "external submission is missing some test ids"

# 15% own model, 85% external submission.
df_sub['Exited'] = 0.15 * df_sub['Exited'].to_numpy() + 0.85 * ext1_exited.to_numpy()
df_sub[['id', 'Exited']].to_csv('submission_base_2.csv', index=False)

In [18]:
# Preview the submission frame after the blend with the external predictions.
df_sub.head()

Unnamed: 0,id,Exited
0,165034,0.013135
1,165035,0.832799
2,165036,0.014746
3,165037,0.191087
4,165038,0.414602


In [19]:
# Reload the first submission file written earlier in this notebook and preview it.
base_1_csv = '/kaggle/working/submission_base_1.csv' # 0.89644
df_ext1 = pd.read_csv(base_1_csv)
df_ext1.head()

Unnamed: 0,id,Exited
0,165034,0.0116
1,165035,0.858492
2,165036,0.013951
3,165037,0.187841
4,165038,0.406509


In [20]:
# Reload the blended submission file written earlier in this notebook and preview it.
base_2_csv = '/kaggle/working/submission_base_2.csv' # 0.89653
df_ext2 = pd.read_csv(base_2_csv)
df_ext2.head()

Unnamed: 0,id,Exited
0,165034,0.013135
1,165035,0.832799
2,165036,0.014746
3,165037,0.191087
4,165038,0.414602


In [21]:
# Load the second external submission and preview it.
ensemble_csv = '/kaggle/input/pgs41-more-lines-of-autogluon/ensemble.csv' # 0.89654
df_ext3 = pd.read_csv(ensemble_csv)
df_ext3.head()

Unnamed: 0,id,Exited
0,165034,0.013035
1,165035,0.830491
2,165036,0.01486
3,165037,0.192448
4,165038,0.413958


In [22]:
def _exited_by_id(ids, submission):
    """Return ``submission['Exited']`` as an array ordered like ``ids``.

    ``ids`` is a one-column frame holding the target ``id`` order. Duplicate
    ids on either side raise inside ``merge``; ids absent from ``submission``
    trip the assertion.
    """
    aligned = ids.merge(
        submission[['id', 'Exited']],
        on       = 'id',
        how      = 'left',
        validate = 'one_to_one')['Exited']
    assert aligned.notna().all(), "a blended submission is missing some test ids"
    return aligned.to_numpy()


# The three components of the final blend.
# NOTE(review): the trailing numbers are presumably leaderboard scores - confirm.
df_ext1 = pd.read_csv('/kaggle/working/submission_base_1.csv') # 0.89644
df_ext2 = pd.read_csv('/kaggle/working/submission_base_2.csv') # 0.89653
df_ext3 = pd.read_csv('/kaggle/input/pgs41-more-lines-of-autogluon/ensemble.csv') # 0.89654

# Blend on matching ids rather than on row position, so a component file with
# a different row order cannot silently mix predictions of different customers.
target_ids = df_sub[['id']]
df_sub['Exited'] = ((0.05 * _exited_by_id(target_ids, df_ext1))
                    + (0.45 * _exited_by_id(target_ids, df_ext2))
                    + (0.5 * _exited_by_id(target_ids, df_ext3)))
df_sub[['id', 'Exited']].to_csv('submission.csv', index=False)
df_sub.head()

Unnamed: 0,id,Exited
0,165034,0.013008
1,165035,0.83293
2,165036,0.014764
3,165037,0.191605
4,165038,0.413875
