In [4]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# from light_gbm import kfold_lightgbm

# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

# Display/plot feature importance
def display_importances(feature_importance_df_, image_name):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout
    plt.savefig(image_name)

# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, num_folds, submission_file_name, image_name, train_record_file, stratified = False, debug= False, seed=123):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=seed)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        dtrain = lgb.Dataset(data=train_df[feats].iloc[train_idx], 
                             label=train_df['TARGET'].iloc[train_idx], 
                             free_raw_data=False, silent=True)
        dvalid = lgb.Dataset(data=train_df[feats].iloc[valid_idx], 
                             label=train_df['TARGET'].iloc[valid_idx], 
                             free_raw_data=False, silent=True)

        # LightGBM parameters found by Bayesian optimization
        params = {
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'nthread': 4,
            'learning_rate': 0.02,  # 02,
            'num_leaves': 20,
            'colsample_bytree': 0.9497036,
            'subsample': 0.8715623,
            'subsample_freq': 1,
            'max_depth': 8,
            'reg_alpha': 0.041545473,
            'reg_lambda': 0.0735294,
            'min_split_gain': 0.0222415,
            'min_child_weight': 60, # 39.3259775,
            'seed': 0,
            'verbose': -1,
            'metric': 'auc',
        }
        
        clf = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=10000,
            valid_sets=[dtrain, dvalid],
            early_stopping_rounds=200,
            verbose_eval=False
        )

        oof_preds[valid_idx] = clf.predict(dvalid.data)
        sub_preds += clf.predict(test_df[feats]) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance(importance_type='gain')
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(dvalid.label, oof_preds[valid_idx])))
#         del clf, dtrain, dvalid
        del dtrain, dvalid
        gc.collect()
    roc_auc = roc_auc_score(train_df['TARGET'], oof_preds)
    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    with open(train_record_file,'a') as f:
        f.write('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds)+'\n')
    # Write submission file and plot feature importance
    if not debug:
        sub_df = test_df[['SK_ID_CURR']].copy()
        sub_df['TARGET'] = sub_preds
        sub_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
    display_importances(feature_importance_df, image_name)
    return feature_importance_df, clf, roc_auc


# all_feature_df = pd.read_csv('All_features.csv')

# Preprocess application_train.csv and application_test.csv
def original_application_train_test(num_rows = None, nan_as_category = False):
    # Read data and merge
    df = pd.read_csv('../input/application_train.csv', nrows= num_rows)
    test_df = pd.read_csv('../input/application_test.csv', nrows= num_rows)
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    df = df.append(test_df).reset_index()
    # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
    df = df[df['CODE_GENDER'] != 'XNA']
    
    docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
    live = [_f for _f in df.columns if ('FLAG_' in _f) & ('FLAG_DOC' not in _f) & ('_FLAG_' not in _f)]
    
    # NaN values for DAYS_EMPLOYED: 365.243 -> nan
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
    
    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])
    # Categorical features with One-Hot encode
    df, cat_cols = one_hot_encoder(df, nan_as_category)
    
    del test_df
    gc.collect()
    return df


# Preprocess application_train.csv and application_test.csv
def application_train_test(num_rows = None, nan_as_category = False):
    # Read data and merge
    df = pd.read_csv('../input/application_train.csv', nrows= num_rows)
    test_df = pd.read_csv('../input/application_test.csv', nrows= num_rows)
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    df = df.append(test_df).reset_index()
    # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
    df = df[df['CODE_GENDER'] != 'XNA']
    
    docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
    live = [_f for _f in df.columns if ('FLAG_' in _f) & ('FLAG_DOC' not in _f) & ('_FLAG_' not in _f)]
    
    # NaN values for DAYS_EMPLOYED: 365.243 -> nan
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)

    inc_by_org = df[['AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']

    # df['NEW_CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    # df['NEW_CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    # df['NEW_DOC_IND_AVG'] = df[docs].mean(axis=1)
    # df['NEW_DOC_IND_STD'] = df[docs].std(axis=1)
    # df['NEW_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
    # df['NEW_LIVE_IND_SUM'] = df[live].sum(axis=1)
    # df['NEW_LIVE_IND_STD'] = df[live].std(axis=1)
    # df['NEW_LIVE_IND_KURT'] = df[live].kurtosis(axis=1)
    # df['NEW_INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
    # df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org)
    # df['NEW_EMPLOY_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    # df['NEW_ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / (1 + df['AMT_INCOME_TOTAL'])
    # df['NEW_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    # df['NEW_EXT_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    # df['NEW_SCORES_STD'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    # df['NEW_SCORES_STD'] = df['NEW_SCORES_STD'].fillna(df['NEW_SCORES_STD'].mean())
    # df['NEW_CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    # df['NEW_CAR_TO_EMPLOY_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    # df['NEW_PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
    # df['NEW_PHONE_TO_EMPLOY_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']
    # df['NEW_CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    
    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])
    # Categorical features with One-Hot encode
    df, cat_cols = one_hot_encoder(df, nan_as_category)
    
    del test_df
    gc.collect()
    return df

def record_importance(feat_importance, record_file_name):
    all_features_sort = feat_importance[["feature","importance"]]\
    .groupby("feature").mean().sort_values(by="importance", ascending=False)
    important_list = []
    # print(all_features_sort['importance']['NEW_EXT_SOURCES_MEAN'])
    important_list = all_features_sort.index.values
    with open(record_file_name,'w') as f:
        for name in important_list:
            f.write(str(name) + '\t'+ str(all_features_sort['importance'][name])+'\n')
    return 


In [45]:
# new_train_df = application_train_test()
def basic_feature_test(original_train_df, test_num):
    score_sum = []
    for i in range(test_num):
        feat_importance, clf, score = kfold_lightgbm(original_train_df, num_folds = 5, train_record_file = './new_features/Original_features_train.txt'.format(i), submission_file_name = './new_features/Original_features_{}.csv'.format(i), \
        image_name = './new_features/Original_importance.png'.format(i), stratified= False, debug = False, seed = i)
        record_importance(feat_importance, './new_features/Original_features_importance_rank.txt'.format(i))
        score_sum.append(score)
    return sum(score_sum)/len(score_sum)

def new_single_feature_test(new_train_df, feature_name, test_num):
    score_sum = []
    for i in range(test_num):       
        feat_importance, clf, score = kfold_lightgbm(new_train_df, num_folds = 5, train_record_file = './new_features/{}_train.txt'.format(feature_name), submission_file_name = './new_features/new_feature_{}.csv'.format(feature_name), \
        image_name = './new_features/{}_importance.png'.format(feature_name), stratified= False, debug = False, seed = i)
        record_importance(feat_importance,'./new_features/{}_importance_rank.txt'.format(feature_name))
        score_sum.append(score)
    return sum(score_sum)/len(score_sum)

In [41]:
score_dict = dict()
df = original_application_train_test()
score_dict['original'] = basic_feature_test(df,1)

Train samples: 307511, test samples: 48744


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Starting LightGBM. Train shape: (307507, 243), test shape: (48744, 243)
Fold  1 AUC : 0.768332
Fold  2 AUC : 0.762283
Fold  3 AUC : 0.753738
Fold  4 AUC : 0.759918
Fold  5 AUC : 0.765644
Full AUC score 0.761944


  stat_data = remove_na(group_data)


In [50]:
docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
live = [_f for _f in df.columns if ('FLAG_' in _f) & ('FLAG_DOC' not in _f) & ('_FLAG_' not in _f)]

In [57]:
df = original_application_train_test()
original_length = len(df.columns)
new_df = df.copy()
try:
    new_df['NEW_CREDIT_TO_ANNUITY_RATIO'] = new_df['AMT_CREDIT'] / new_df['AMT_ANNUITY']
    score_dict['NEW_CREDIT_TO_ANNUITY_RATIO'] = new_single_feature_test(new_df, 'NEW_CREDIT_TO_ANNUITY_RATIO',1)
except:
    print('NEW_CREDIT_TO_ANNUITY_RATIO')
print(len(df.columns))
assert len(df.columns) == original_length

new_df = df.copy()
try:
    new_df['NEW_CREDIT_TO_GOODS_RATIO'] = new_df['AMT_CREDIT'] / new_df['AMT_GOODS_PRICE']
    score_dict['NEW_CREDIT_TO_GOODS_RATIO'] = new_single_feature_test(new_df, 'NEW_CREDIT_TO_GOODS_RATIO',1)
except:
    print('NEW_CREDIT_TO_GOODS_RATIO')
    print(len(df.columns))
    assert len(df.columns) == original_length

new_df = df.copy()
try:
    new_df['NEW_DOC_IND_AVG'] = new_df[docs].mean(axis=1)
    score_dict['NEW_DOC_IND_AVG'] = new_single_feature_test(new_df, 'NEW_DOC_IND_AVG',1)
except:
    print('NEW_DOC_IND_AVG')
print(len(df.columns))
assert len(df.columns) == original_length

new_df = df.copy()
try:
    new_df['NEW_DOC_IND_STD'] = new_df[docs].std(axis=1)
    score_dict['NEW_DOC_IND_STD'] = new_single_feature_test(new_df, 'NEW_DOC_IND_STD',1)
except:
    print('NEW_DOC_IND_STD')
print(len(df.columns))

new_df = df.copy()
try:
    new_df['NEW_DOC_IND_KURT'] = new_df[docs].kurtosis(axis=1)
    score_dict['NEW_DOC_IND_KURT'] = new_single_feature_test(new_df, 'NEW_DOC_IND_KURT',1)
except:
    print('NEW_DOC_IND_KURT')
    print(len(df.columns))

new_df = df.copy()
try:
    new_df['NEW_LIVE_IND_SUM'] = new_df[live].sum(axis=1)
    score_dict['NEW_LIVE_IND_SUM'] = new_single_feature_test(new_df, 'NEW_LIVE_IND_SUM',1)
except:
    print('NEW_LIVE_IND_SUM')
print(len(df.columns))

new_df = df.copy()
try:
    new_df['NEW_LIVE_IND_STD'] = new_df[live].std(axis=1)
    score_dict['NEW_LIVE_IND_STD'] = new_single_feature_test(new_df, 'NEW_LIVE_IND_STD',1)
except:
    print('NEW_LIVE_IND_STD')

new_df = df.copy()
try:
    new_df['NEW_LIVE_IND_KURT'] = new_df[live].kurtosis(axis=1)
    score_dict['NEW_LIVE_IND_KURT'] = new_single_feature_test(new_df, 'NEW_LIVE_IND_KURT',1)
except:
    print('NEW_LIVE_IND_KURT')

new_df = df.copy()
try:
    new_df['NEW_INC_PER_CHLD'] = new_df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
    score_dict['NEW_INC_PER_CHLD'] = new_single_feature_test(new_df, 'NEW_INC_PER_CHLD',1)
except:
    print('NEW_INC_PER_CHLD')  

new_df = df.copy()
try:
    new_df['NEW_INC_BY_ORG'] = new_df['ORGANIZATION_TYPE'].map(inc_by_org)
    score_dict['NEW_INC_BY_ORG'] = new_single_feature_test(new_df, 'NEW_INC_BY_ORG',1)
except:
    print('NEW_INC_BY_ORG')  

new_df = df.copy()
try:
    new_df['NEW_EMPLOY_TO_BIRTH_RATIO'] = new_df['DAYS_EMPLOYED'] / new_df['DAYS_BIRTH']
    score_dict['NEW_EMPLOY_TO_BIRTH_RATIO'] = new_single_feature_test(new_df, 'NEW_EMPLOY_TO_BIRTH_RATIO',1)
except:
    print('NEW_EMPLOY_TO_BIRTH_RATIO')  

new_df = df.copy()
try:
    new_df['NEW_ANNUITY_TO_INCOME_RATIO'] = new_df['AMT_ANNUITY'] / (1 + new_df['AMT_INCOME_TOTAL'])
    score_dict['NEW_ANNUITY_TO_INCOME_RATIO'] = new_single_feature_test(new_df, 'NEW_ANNUITY_TO_INCOME_RATIO',1)
except:
    print('NEW_ANNUITY_TO_INCOME_RATIO')  

new_df = df.copy()
try:
    new_df['NEW_SOURCES_PROD'] = new_df['EXT_SOURCE_1'] * new_df['EXT_SOURCE_2'] * new_df['EXT_SOURCE_3']
    score_dict['NEW_SOURCES_PROD'] = new_single_feature_test(new_df, 'NEW_SOURCES_PROD',1)
except:
    print('NEW_SOURCES_PROD')  

new_df = df.copy()
try:
    new_df['NEW_EXT_SOURCES_MEAN'] = new_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    score_dict['NEW_EXT_SOURCES_MEAN'] = new_single_feature_test(new_df, 'NEW_EXT_SOURCES_MEAN',1)
except:
    print('NEW_EXT_SOURCES_MEAN')  

new_df = df.copy()
try:
    new_df['NEW_SCORES_STD'] = new_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    score_dict['NEW_SCORES_STD'] = new_single_feature_test(new_df, 'NEW_SCORES_STD',1)
except:
    print('NEW_SCORES_STD')  

new_df = df.copy()
try:
    new_df['NEW_SCORES_STD'] = new_df['NEW_SCORES_STD'].fillna(df['NEW_SCORES_STD'].mean())
    score_dict['NEW_SCORES_STD'] = new_single_feature_test(new_df, 'NEW_SCORES_STD',1)
except:
    print('NEW_SCORES_STD')  

new_df = df.copy()
try:
    new_df['NEW_CAR_TO_BIRTH_RATIO'] = new_df['OWN_CAR_AGE'] / new_df['DAYS_BIRTH']
    score_dict['NEW_CAR_TO_BIRTH_RATIO'] = new_single_feature_test(new_df, 'NEW_CAR_TO_BIRTH_RATIO',1)
except:
    print('NEW_CAR_TO_BIRTH_RATIO')  

new_df = df.copy()
try:
    new_df['NEW_CAR_TO_EMPLOY_RATIO'] = new_df['OWN_CAR_AGE'] / new_df['DAYS_EMPLOYED']
    score_dict['NEW_CAR_TO_EMPLOY_RATIO'] = new_single_feature_test(new_df, 'NEW_CAR_TO_EMPLOY_RATIO',1)
except:
    print('NEW_CAR_TO_EMPLOY_RATIO')  

new_df = df.copy()
try:
    new_df['NEW_PHONE_TO_BIRTH_RATIO'] = new_df['DAYS_LAST_PHONE_CHANGE'] / new_df['DAYS_BIRTH']
    score_dict['NEW_PHONE_TO_BIRTH_RATIO'] = new_single_feature_test(new_df, 'NEW_PHONE_TO_BIRTH_RATIO',1)
except:
    print('NEW_PHONE_TO_BIRTH_RATIO')  

new_df = df.copy()
try:
    new_df['NEW_PHONE_TO_EMPLOY_RATIO'] = new_df['DAYS_LAST_PHONE_CHANGE'] / new_df['DAYS_EMPLOYED']
    score_dict['NEW_PHONE_TO_EMPLOY_RATIO'] = new_single_feature_test(new_df, 'NEW_PHONE_TO_EMPLOY_RATIO',1)
except:
    print('NEW_PHONE_TO_EMPLOY_RATIO')  

new_df = df.copy()
try:
    new_df['NEW_CREDIT_TO_INCOME_RATIO'] = new_df['AMT_CREDIT'] / new_df['AMT_INCOME_TOTAL']
    score_dict['NEW_CREDIT_TO_INCOME_RATIO'] = new_single_feature_test(new_df, 'NEW_CREDIT_TO_INCOME_RATIO',1)
except:
    print('NEW_CREDIT_TO_INCOME_RATIO')  



Train samples: 307511, test samples: 48744


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Starting LightGBM. Train shape: (307507, 244), test shape: (48744, 244)
Fold  1 AUC : 0.776609
Fold  2 AUC : 0.770611
Fold  3 AUC : 0.761543
Fold  4 AUC : 0.768248
Fold  5 AUC : 0.774085
Full AUC score 0.770172


  stat_data = remove_na(group_data)


243
Starting LightGBM. Train shape: (307507, 244), test shape: (48744, 244)
Fold  1 AUC : 0.768897
Fold  2 AUC : 0.762443
Fold  3 AUC : 0.754656
Fold  4 AUC : 0.760148
Fold  5 AUC : 0.766903
Full AUC score 0.762556
Starting LightGBM. Train shape: (307507, 244), test shape: (48744, 244)
Fold  1 AUC : 0.768597
Fold  2 AUC : 0.762542
Fold  3 AUC : 0.753952
Fold  4 AUC : 0.759910
Fold  5 AUC : 0.765751
Full AUC score 0.762080
243
Starting LightGBM. Train shape: (307507, 244), test shape: (48744, 244)
Fold  1 AUC : 0.768803
Fold  2 AUC : 0.762597
Fold  3 AUC : 0.753854
Fold  4 AUC : 0.759805
Fold  5 AUC : 0.765699
Full AUC score 0.762105
243
Starting LightGBM. Train shape: (307507, 244), test shape: (48744, 244)
Fold  1 AUC : 0.768967
Fold  2 AUC : 0.762409
Fold  3 AUC : 0.754388
Fold  4 AUC : 0.760070
Fold  5 AUC : 0.765891
Full AUC score 0.762309
Starting LightGBM. Train shape: (307507, 244), test shape: (48744, 244)
Fold  1 AUC : 0.768497
Fold  2 AUC : 0.762114
Fold  3 AUC : 0.753950
Fol

In [66]:
sorted_names = sorted(score_dict, key=score_dict.__getitem__, reverse=True)
for k in sorted_names:
    print("{} : {}".format(k, score_dict[k]))

NEW_CREDIT_TO_ANNUITY_RATIO : 0.7701718897950962
NEW_CREDIT_TO_GOODS_RATIO : 0.7625557909334466
NEW_DOC_IND_KURT : 0.7623092532609511
NEW_DOC_IND_STD : 0.7621047141082732
NEW_ANNUITY_TO_INCOME_RATIO : 0.7620956222284385
NEW_DOC_IND_AVG : 0.7620804281030955
NEW_CREDIT_TO_INCOME_RATIO : 0.7620576039977538
NEW_PHONE_TO_BIRTH_RATIO : 0.7619975364301655
original : 0.7619437351247256
NEW_INC_PER_CHLD : 0.7619278283606187
NEW_PHONE_TO_EMPLOY_RATIO : 0.7618876348788381
NEW_LIVE_IND_KURT : 0.7618848233685779
NEW_LIVE_IND_STD : 0.7618751755991575
NEW_EMPLOY_TO_BIRTH_RATIO : 0.7618725661528378
NEW_CAR_TO_BIRTH_RATIO : 0.7618712667021504
NEW_LIVE_IND_SUM : 0.7618523747211939
NEW_SOURCES_PROD : 0.7618296976750811
NEW_EXT_SOURCES_MEAN : 0.7617132403031236
NEW_CAR_TO_EMPLOY_RATIO : 0.7617093044737577
NEW_SCORES_STD : 0.7616722482555295


In [33]:
ratio_tuples = [('EXT_SOURCE_3','AMT_ANNUITY'),
               ('EXT_SOURCE_2','AMT_CREDIT'),
               ('AMT_ANNUITY','DAYS_BIRTH'),
               ('AMT_CREDIT','DAYS_EMPLOYED')]

246
247


In [None]:
for t in ratio_tuples:
    new_df = df.copy()
    try:
        new_feature_name = t[0]+'_'+t[1]+'_ratio'
        execline = "new_df['{}'] = new_df['{}'] / new_df['{}']".format(new_feature_name,t[0],t[1])
        print(execline)
        exec(execline)
        score_dict[new_feature_name] = new_single_feature_test(new_df, new_feature_name,1)
    except:
        print('new feature: {} is wrong'.format(new_feature_name))
        assert len(df.columns) == original_length

In [23]:
sorted_names = sorted(score_dict, key=score_dict.__getitem__, reverse=True)
for k in sorted_names:
    print("{} : {}".format(k, score_dict[k]))

{'NEW_CREDIT_TO_ANNUITY_RATIO'}