# Utility functions (for loading and preprocessing data):

In [1]:
# load data
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def load_data(path):
    """Read the CSV file at ``path`` and return its contents as a DataFrame."""
    return pd.read_csv(path)

# drop the columns that should not be used as features
def clean_features(df, bad_features):
    """Return a new DataFrame equal to ``df`` minus the ``bad_features`` columns.

    ``df`` itself is left untouched; ``DataFrame.drop`` raises ``KeyError`` if
    one of the listed columns does not exist.
    """
    return df.drop(labels=bad_features, axis='columns')

# separate into input/output
def get_input_output(df, output_feature, bad_features=None):
    """Split ``df`` into model inputs and the target column.

    Args:
        df: Source DataFrame containing both features and the target.
        output_feature: Name of the target column.
        bad_features: Optional iterable of extra column names to exclude from
            the inputs. ``None`` (the default) excludes nothing extra.

    Returns:
        A tuple ``(input_df, output_df)``: ``input_df`` is a new DataFrame
        without the target and the excluded columns, ``output_df`` is the
        target column as a Series.
    """
    # A None default avoids sharing one mutable list between calls, and
    # unpacking accepts any iterable (list, tuple, ...) of column names.
    excluded = [output_feature, *(bad_features if bad_features is not None else ())]
    input_df = df.drop(columns=excluded)
    output_df = df[output_feature]
    return input_df, output_df

# map a feature's values (for categorical features)
def map_feature_values(df, feature, mapping):
    """Replace the values of column ``feature`` according to ``mapping``.

    The column is overwritten in place on ``df`` and the same ``df`` object is
    returned. The lookup is done with ``Series.map``, so when ``mapping`` is a
    dict every value that is not one of its keys becomes NaN.
    """
    remapped = df[feature].map(mapping)
    df[feature] = remapped
    return df

# mark categorical features
def mark_categorical_features(df, model, categorical_features=None):
    """Prepare the categorical columns of ``df`` for the given model family.

    Args:
        df: DataFrame to modify; the listed columns are overwritten in place.
        model: ``'lightgbm'`` converts each column to the pandas ``category``
            dtype; ``'xgboost'`` label-encodes each column with a fresh
            ``LabelEncoder``. Any other value leaves ``df`` unchanged.
        categorical_features: Optional iterable of column names to process.
            ``None`` (the default) processes nothing.

    Returns:
        The same ``df`` object, after modification.
    """
    # A None default avoids a mutable list shared between calls.
    columns = [] if categorical_features is None else categorical_features
    if model == 'lightgbm':
        for col in columns:
            # astype returns a new Series; it must be assigned back, otherwise
            # the conversion is silently discarded.
            df[col] = df[col].astype('category')
    elif model == 'xgboost':
        for col in columns:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
    return df

# derive a 0/1 feature from a condition on an existing feature
def add_boolean_feature(df, base_feature, new_feature, condition):
    """Add column ``new_feature`` holding ``condition`` applied to ``base_feature``.

    ``condition`` is called on each value of ``df[base_feature]``; the results
    are cast to ``int`` and stored in ``df[new_feature]``. ``df`` is modified
    in place and returned.
    """
    flags = df[base_feature].apply(condition)
    df[new_feature] = flags.astype(int)
    return df

## 1. Load and preprocess data:


In [None]:
import numpy as np

TRAIN_FILEPATH = 'train.csv'
TEST_FILEPATH = 'test.csv'

# load data
train_df = load_data(TRAIN_FILEPATH)
test_df = load_data(TEST_FILEPATH)

# get input/output dataframes
# (bad_features are excluded from the model inputs; the test set keeps them
# for now and they are dropped from it later, right before prediction)
bad_features = ['CustomerId', 'Surname', 'id']
X, y = get_input_output(train_df, 'Exited', bad_features)

# encode non-numeric features (Geography, Gender) as integers;
# any value missing from these mappings would become NaN
gender_mapping = {
    'Male': 0,
    'Female': 1,
}
geography_mapping = {
    'France': 0,
    'Germany': 1,
    'Spain': 2,
}

X = map_feature_values(X, 'Gender', gender_mapping)
test_df = map_feature_values(test_df, 'Gender', gender_mapping)

X = map_feature_values(X, 'Geography', geography_mapping)
test_df = map_feature_values(test_df, 'Geography', geography_mapping)

# add a 0/1 feature telling whether the balance is strictly positive
X = add_boolean_feature(X, 'Balance', 'HasBalance', lambda x: x > 0)
test_df = add_boolean_feature(test_df, 'Balance', 'HasBalance', lambda x: x > 0)

# replace zero balances with NaN.
# Series.map with a dict turns every value that is NOT a key into NaN, so
# mapping with {0: np.nan} would blank out the whole column; Series.replace
# only touches the zeros and keeps every other balance as it is.
X['Balance'] = X['Balance'].replace(0, np.nan)
test_df['Balance'] = test_df['Balance'].replace(0, np.nan)

# mark the categorical features (those with few discrete values)
categorical_features = ['HasCrCard', 'IsActiveMember', 'Geography', 'Gender', 'HasBalance']

# lightgbm
X = mark_categorical_features(X, model='lightgbm', categorical_features=categorical_features)
test_df = mark_categorical_features(test_df, model='lightgbm', categorical_features=categorical_features)

## 2. Optimize the hyperparameters using Optuna:


In [None]:
import optuna
import numpy as np
import warnings
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Suppress everything reported through the `warnings` module from here on.
warnings.filterwarnings('ignore')

study = optuna.create_study(direction='maximize') # create a study whose role is to maximize the objective function (in our case roc_auc_score)

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated ROC AUC of a LightGBM model.

    Samples one hyperparameter configuration from ``trial``, trains a booster
    on each stratified fold of the module-level ``X`` / ``y``, and scores it
    on the held-out part of that fold.

    Args:
        trial: The Optuna trial used to sample hyperparameter values.

    Returns:
        The mean of the per-fold ROC AUC scores (the study maximizes it).
    """
    # Imported locally so this cell runs on its own: the module-level
    # `import lightgbm as lgb` only happens in a later cell.
    import lightgbm as lgb

    # Hyperparameters values space
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'num_leaves': trial.suggest_int('num_leaves', 10, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 50),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
    }
    # The number of boosting rounds is passed to lgb.train explicitly instead
    # of sitting in `param` next to a conflicting hard-coded num_boost_round.
    # It is still recorded under 'n_estimators' in study.best_params.
    num_boost_round = trial.suggest_int('n_estimators', 100, 1000)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
        valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features)

        model = lgb.train(param, train_data,
                          valid_sets=[valid_data],
                          num_boost_round=num_boost_round,
                         )

        preds = model.predict(X_valid)
        auc = roc_auc_score(y_valid, preds)
        aucs.append(auc)

    return np.mean(aucs)

# Run the hyperparameter search, then report the best configuration found.
study.optimize(objective, n_trials=1000)

print('Best trial:', "Best parameters", sep='\n')
print(study.best_params)
print("Best AUC score:", study.best_value)

## 3. Build and train model (LightGBM):


In [None]:
import lightgbm as lgb

# Build the final classifier from the best hyperparameters found by the study,
# keeping the binary objective and AUC metric fixed.
model = lgb.LGBMClassifier(objective='binary', metric='auc', **study.best_params)

# Fit on the whole training set (no hold-out split at this stage).
model.fit(X=X, y=y, categorical_feature=categorical_features)

## 4. Prediction for test set:


In [None]:
# Keep the ids for the submission file before they are dropped together with
# the other columns the model was not trained on.
ids = test_df['id']
test_df = test_df.drop(columns=bad_features)

# Take the second column of predict_proba's output as the 'Exited' score.
test_df_predict = model.predict_proba(test_df)[:, 1]

submission = pd.DataFrame({'id': ids}).assign(Exited=test_df_predict)
submission.to_csv('/kaggle/working/submission.csv', index=False)