# Load Data & Overview

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

import lightgbm as lgb
import xgboost as xgb

# sklearn tools for model training and assessment
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.metrics import roc_curve, auc, accuracy_score,roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.base import clone

import gc
import os
print(os.listdir("../input"))
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the train and test CSV files, then preview the first training rows.
train, test = (pd.read_csv(csv_path)
               for csv_path in ('../input/train.csv', '../input/test.csv'))
train.head()

In [None]:
# Print a summary of the training frame (as reported by DataFrame.info()).
train.info()

# EDA

## The Target

In [None]:
# Bar chart of the class counts in the target column.
# The column is passed as `x=` explicitly: recent seaborn releases treat the
# first positional argument as `data`, so a positional Series is no longer
# interpreted as the variable to count.
sns.countplot(x=train['target']);

The target is imbalanced, so we will use AUC as the metric, as the competition requires. I tried SMOTE oversampling before, but it didn't help.

In [None]:
# Checking missing values: prints True if the frame holds at least one null,
# first for train and then for test.
for frame in (train, test):
    print(frame.isnull().values.any())

In [None]:
# Features that have high correlations with the target
# A feature is kept when |corr(target, feature)| exceeds CORR_THRESHOLD.
CORR_THRESHOLD = 0.05

# Correlate every candidate column (from the third column onward) with the
# target in a single pass instead of calling Series.corr up to three times
# per feature.
target_corr = train.iloc[:, 2:].corrwith(train['target'])
strong_corr = target_corr[target_corr.abs() > CORR_THRESHOLD]

# `features` is reused by later cells, so it stays a plain list of column
# names in the original column order.
features = strong_corr.index.tolist()
cor = strong_corr.tolist()

df_corr = (pd.DataFrame({'Features': features, 'Correlations': cor})
           .sort_values(by='Correlations')
           .set_index('Features'))

df_corr.plot(kind='barh', figsize=(10, 8))

In [None]:
# Features with high skewness
# A feature is listed when |skewness| is at least SKEW_THRESHOLD.
SKEW_THRESHOLD = 0.5

# Compute the skewness of every candidate column once (vectorized) rather
# than calling Series.skew up to three times per feature.
feature_skew = train.iloc[:, 2:].skew()
high_skew = feature_skew[feature_skew.abs() >= SKEW_THRESHOLD]

featuresSkew = high_skew.index.tolist()
skewness = high_skew.tolist()

df_skew = pd.DataFrame({'Features': featuresSkew, 'Skewness': skewness})
df_skew

There is no transformation needed.

# Feature Engineering

In [None]:
import featuretools as ft

# Register the correlation-selected training columns as the 'train' entity
# of a new entity set; make_index=True asks featuretools to create the
# 'index' column itself.
es = ft.EntitySet(id='Santander')
es.entity_from_dataframe(entity_id='train',
                         dataframe=train[features],
                         index='index',
                         make_index=True)

# Run deep feature synthesis at depth 1 with the multiply_numeric and
# add_numeric transform primitives. `fm` is the resulting feature matrix and
# `feat` the list of feature definitions.
fm, feat = ft.dfs(entityset=es,
                  target_entity='train',
                  max_depth=1,
                  trans_primitives=['multiply_numeric', 'add_numeric'])

In [None]:
# Append the feature-matrix columns that come after the first len(features)
# ones (presumably the untouched input columns) to the training frame.
generated_train = fm.iloc[:, len(features):]
train = pd.concat([train, generated_train], axis=1)
# release some memory
del fm, generated_train
gc.collect()
# Rebind `fm` to an empty frame so the name still exists after the cleanup.
fm = pd.DataFrame()
train.info()

In [None]:
# Register the same selected columns of the test frame as the 'test' entity
# of the existing entity set.
es.entity_from_dataframe(entity_id='test',
                         dataframe=test[features],
                         index='index',
                         make_index=True)

# Same depth-1 synthesis as for the training entity; note that `feat` is
# overwritten with the test feature definitions.
fm_test, feat = ft.dfs(entityset=es,
                       target_entity='test',
                       max_depth=1,
                       trans_primitives=['multiply_numeric', 'add_numeric'])

In [None]:
# Append the feature-matrix columns that come after the first len(features)
# ones (presumably the untouched input columns) to the test frame.
generated_test = fm_test.iloc[:, len(features):]
test = pd.concat([test, generated_test], axis=1)
# release some memory
del fm_test, generated_test
gc.collect()
# Rebind `fm_test` to an empty frame so the name still exists after cleanup.
fm_test = pd.DataFrame()
test.info()

# Prediction

In [None]:
# Cross validate model with Kfold stratified cross val
# Seed shared by the LightGBM models trained in later cells.
random_state = 123
# The folds are not shuffled, so the split is already deterministic and no
# random_state is passed: recent scikit-learn raises ValueError when
# random_state is set while shuffle=False. The resulting folds are the same
# as before.
kfold = StratifiedKFold(n_splits=12, shuffle=False)
# Out-of-fold predictions, one slot per training row.
pred_val = np.zeros(len(train))
# Columns 2..201 of the training frame: the baseline feature set.
feature_base = train.columns.tolist()[2:202]

In [None]:
# Parameters are from https://www.kaggle.com/jesucristo/santander-magic-lgb
# LightGBM settings shared by every model in this notebook; they are
# unpacked into lgb.LGBMClassifier(**param).
param = dict(
    bagging_freq=5,
    bagging_fraction=0.38,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.045,
    learning_rate=0.01,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)

In [None]:
# Baseline model: LightGBM with no feature engineering and tuning.
# Trains one model per stratified fold on the baseline features and stores
# the out-of-fold positive-class probabilities in pred_val.
# NOTE(review): passing early_stopping_rounds/verbose to fit() relies on an
# older LightGBM release; newer ones expect callbacks instead. Confirm the
# installed version before upgrading.
X_base = train[feature_base]
y_base = train['target']
for foldIdx, (trn_idx, val_idx) in enumerate(kfold.split(X_base, y_base)):
    print("Fold {}".format(foldIdx))
    # kfold.split yields positional indices, so every slice uses .iloc
    # (mixing in label-based lookups only works on a default 0..n-1 index).
    # Each fold is sliced once and reused for fitting, evaluation and
    # prediction.
    X_trn, y_trn = X_base.iloc[trn_idx], y_base.iloc[trn_idx]
    X_val, y_val = X_base.iloc[val_idx], y_base.iloc[val_idx]

    lgbm_base = lgb.LGBMClassifier(n_estimators=100000, random_state=random_state, **param)
    lgbm_base.fit(X_trn, y_trn,
                  eval_set=[(X_trn, y_trn), (X_val, y_val)],
                  early_stopping_rounds=5000,
                  verbose=2000)
    # Keep only the probability of the positive class (column 1).
    pred_val[val_idx] = lgbm_base.predict_proba(X_val, num_iteration=lgbm_base.best_iteration_)[:, 1]

In [None]:
# Evaluation: AUC of the out-of-fold predictions against the true target.
auc_base = roc_auc_score(train['target'], pred_val)
print('AUC score: %.5f' % auc_base)

In [None]:
# Split X and y
# The label is the 'target' column; the feature matrices skip the leading
# columns of each frame (the first two for train, the first one for test).
y_train = train['target']
X_train = train.iloc[:, 2:]
X_test = test.iloc[:, 1:]

In [None]:
# Feature selection with LightGBM for all features
# The selector model is sized with the best iteration of `lgbm_base`, i.e.
# the model left over from the last baseline fold.
lgbm_sel = lgb.LGBMClassifier(**param,
                              random_state=random_state,
                              n_estimators=lgbm_base.best_iteration_)

# Select features using the threshold string '1.25*median'.
embeded_lgb_selector = SelectFromModel(estimator=lgbm_sel, threshold='1.25*median')
embeded_lgb_selector.fit(X_train, y_train)

In [None]:
# Boolean mask of the columns kept by the selector, turned into a list of
# column names.
embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X_train.columns[embeded_lgb_support].tolist()
print(len(embeded_lgb_feature), 'selected features')

In [None]:
# Fit the lightGBM with embeded_lgb_feature
# Trains one model per stratified fold on the selected features, stores the
# out-of-fold probabilities in pred_val and averages the per-fold test
# predictions into pred_test.
# NOTE(review): passing early_stopping_rounds/verbose to fit() relies on an
# older LightGBM release; newer ones expect callbacks instead. Confirm the
# installed version before upgrading.
pred_val = np.zeros(len(train))
pred_test = np.zeros(len(test))

# Restrict both matrices to the selected columns once, outside the loop,
# instead of re-slicing them in every fold.
X_sel = X_train[embeded_lgb_feature]
X_test_sel = X_test[embeded_lgb_feature]

for foldIdx, (trn_idx, val_idx) in enumerate(kfold.split(X_sel, y_train)):
    print("Fold {}".format(foldIdx))
    # kfold.split yields positional indices, so every slice uses .iloc
    # (mixing in label-based lookups only works on a default 0..n-1 index).
    X_trn, y_trn = X_sel.iloc[trn_idx], y_train.iloc[trn_idx]
    X_val, y_val = X_sel.iloc[val_idx], y_train.iloc[val_idx]

    lgbm = lgb.LGBMClassifier(n_estimators=100000, random_state=random_state, **param)
    lgbm.fit(X_trn, y_trn,
             eval_set=[(X_trn, y_trn), (X_val, y_val)],
             early_stopping_rounds=5000,
             verbose=2000)
    # Keep only the probability of the positive class (column 1).
    pred_val[val_idx] = lgbm.predict_proba(X_val, num_iteration=lgbm.best_iteration_)[:, 1]
    # Each fold contributes an equal share to the averaged test prediction.
    pred_test += lgbm.predict_proba(X_test_sel, num_iteration=lgbm.best_iteration_)[:, 1] / kfold.n_splits

In [None]:
# Evaluation: AUC of the out-of-fold predictions against the training labels.
auc_selected = roc_auc_score(y_train, pred_val)
print('AUC score: %.5f' % auc_selected)

The AUC score is not better with more features, and the model tended to overfit. I will explore more ideas on 1) feature engineering, 2) feature selection, and 3) ensemble modeling.

In [None]:
# Submission
# Pair each test ID_code with its averaged predicted probability and write
# the result to CSV without the frame index.
submission = pd.DataFrame({
    'ID_code': test['ID_code'].values,
    'target': pred_test,
})
submission.to_csv("LGBM_V1.csv", index=False)

# Reference

[What is the acceptable range of skewness and kurtosis for normal distribution of data?](https://codeburst.io/2-important-statistics-terms-you-need-to-know-in-data-science-skewness-and-kurtosis-388fef94eeaa)

[Auto feature engineering with feature tool](https://docs.featuretools.com/loading_data/using_entitysets.html)

[Auto feature engineering Kaggle case](https://www.kaggle.com/willkoehrsen/featuretools-for-good)

[How to choose metrics for imbalance dataset](https://towardsdatascience.com/what-metrics-should-we-use-on-imbalanced-data-set-precision-recall-roc-e2e79252aeba)

[Santander Magic LGB](https://www.kaggle.com/jesucristo/santander-magic-lgb)

[6 Ways for Feature Selection](https://www.kaggle.com/sz8416/6-ways-for-feature-selection)
