In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pt
%matplotlib inline
import gc
import lightgbm as lgb
import catboost as cb
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, KFold
from sklearn.metrics import roc_auc_score

pd.set_option('display.max_columns', 500)

In [42]:
%%time
train = pd.read_csv('train.csv')
test = pd.read_csv('test_bqCt9Pv.csv')

Wall time: 14.2 s


In [43]:
# some functions to prepare data
def prep_yrs(x):
    x_yrs = x.split()[0]
    x_mon = x.split()[1]
    n_years = x_yrs[:(len(x_yrs)-3)]
    n_mon = x_mon[:(len(x_mon)-3)]
    return int(n_years)*12 + int(n_mon)

def birth_prep(x):
    if x[6:] != '00':
        x = x[:6]+'19'+x[6:]
    else:
        x = x[:6]+'2000'
    return x

def preproc_score(x):
    if 'Not Scored' in x or 'No Bureau History Available' in x:
        return 'Not Scored'
    else:
        return x
    
def means_mapping(var, by, func = 'mean'):
    if type(by) != list:
        by = [by]
    name = '{0}_by_{1}_mean'.format(var, by)
    grp = df.groupby(by)[[var]].agg(func)
    grp.columns = [name]
    return pd.merge(df[by], grp, left_on = by, right_index = True, how = 'left')[name]

In [44]:
target = train['loan_default']
del train['loan_default']
gc.collect()

11

In [45]:
df = pd.concat((train, test))

In [46]:
# columns with only one unique value is useless
for col in df.columns:
    if df[col].nunique() == 1:
        df.drop(col, axis=1, inplace=True)
        print(f'{col} dropped, nunique=1')

MobileNo_Avl_Flag dropped, nunique=1


In [None]:
# preparing data
df['AVERAGE.ACCT.AGE'] = df['AVERAGE.ACCT.AGE'].apply(prep_yrs)
df['CREDIT.HISTORY.LENGTH'] = df['CREDIT.HISTORY.LENGTH'].apply(prep_yrs)

df['Date.of.Birth'] = df['Date.of.Birth'].apply(birth_prep)

df['Date.of.Birth'] = pd.to_datetime(df['Date.of.Birth'], format='%d-%m-%Y', cache=True, errors='coerce')
df['DisbursalDate'] = pd.to_datetime(df['DisbursalDate'], format='%d-%m-%y', cache=True)

df['PERFORM_CNS.SCORE.DESCRIPTION'] = df['PERFORM_CNS.SCORE.DESCRIPTION'].apply(preproc_score)

In [53]:
# generating some useful features based on basic features
df['years_on_loan'] = (df['DisbursalDate'] - df['Date.of.Birth']).astype('timedelta64[Y]')
df['dis_as_diff'] = df['asset_cost'] - df['disbursed_amount']
df['dis_as_share'] = df['asset_cost'] / df['disbursed_amount']
df['diff_ltv_'] = df['dis_as_share'] - df['ltv']

df['dayofweek'] = df['DisbursalDate'].dt.weekday
df['day'] = df['DisbursalDate'].dt.day

df['outstanding_now'] = df['disbursed_amount'] + df['PRI.CURRENT.BALANCE']
df['disbursed_tot'] = df['PRI.DISBURSED.AMOUNT'] + df['disbursed_amount']
df['out_to_dsbrsd'] = df['outstanding_now'] / df['disbursed_tot']
df['share_overdue'] = df['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS'] - df['NEW.ACCTS.IN.LAST.SIX.MONTHS']

df['sec_overdue_share'] = df['SEC.OVERDUE.ACCTS'] / df['SEC.NO.OF.ACCTS']
df['pri_overdue_share'] = df['PRI.OVERDUE.ACCTS'] / df['PRI.NO.OF.ACCTS']

In [54]:
# define list of categorical features to group by
cat_cols_names = ['branch_id', 'supplier_id',  'manufacturer_id',
                  'Employment.Type', 'State_ID', 'Employee_code_ID',
                  'PERFORM_CNS.SCORE.DESCRIPTION', 'Current_pincode_ID', 'years_on_loan']

In [55]:
# define list of numeric features to calculate by groups
# final list was selected by cv
numeric_cols = ['disbursed_amount', 'asset_cost', 'ltv',
                'PERFORM_CNS.SCORE', 'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
                'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES', 'pri_overdue_share']

In [56]:
# additional check
numeric_cols = [x for x in numeric_cols if x not in cat_cols_names]

In [58]:
# fill NA with some str value
df['Employment.Type'].fillna('miss', inplace=True)

In [59]:
# calculate means by categorical features
for name in cat_cols_names:
    temp = df.groupby([name])[numeric_cols].mean()
    temp.columns = [col+'_grpd_by_'+name for col in temp.columns]
    temp = temp.reset_index()
    df = df.merge(temp, how='left', on=name)

Wall time: 25.5 s


In [61]:
# calculate difference between actual value and mean by group
grpd_cols = [col for col in df.columns if 'grpd' in col]

for col in grpd_cols:
    numcol = col.split('_grpd_by_')[0]
    df[col] = df[numcol] - df[col]

Wall time: 725 ms


In [62]:
# frequency encoding for categories
for col in cat_cols_names:
    df[col] = df[col].map(df[col].value_counts(normalize=True))

In [63]:
# differnt relative features calcucated by combinations of groups
df['scr_by_sup_man'] = df['PERFORM_CNS.SCORE'] / means_mapping('PERFORM_CNS.SCORE', ['supplier_id',  'manufacturer_id'])
df['ltv_by_sup_man'] = df['ltv'] / means_mapping('ltv', ['supplier_id',  'manufacturer_id'])
#
df['scr_by_sup_branch'] = df['PERFORM_CNS.SCORE'] / means_mapping('PERFORM_CNS.SCORE', ['supplier_id',  'branch_id'])
df['ltv_by_sup_branch'] = df['ltv'] / means_mapping('ltv', ['supplier_id',  'branch_id'])
#
df['scr_by_man_branch'] = df['PERFORM_CNS.SCORE'] / means_mapping('PERFORM_CNS.SCORE', ['manufacturer_id',  'branch_id'])
df['ltv_by_man_branch'] = df['ltv'] / means_mapping('ltv', ['manufacturer_id',  'branch_id'])
#
df['scr_by_man_branch_sup'] = df['PERFORM_CNS.SCORE'] / means_mapping('PERFORM_CNS.SCORE', ['manufacturer_id',
                                                                                            'branch_id',
                                                                                            'supplier_id'])
df['ltv_by_man_branch_sup'] = df['ltv'] / means_mapping('ltv', ['manufacturer_id',
                                                                'branch_id',
                                                                'supplier_id'])
#
df['scr_by_state_branch'] = df['PERFORM_CNS.SCORE'] / means_mapping('PERFORM_CNS.SCORE', ['State_ID',  'branch_id'])
df['ltv_by_state_branch'] = df['ltv'] / means_mapping('ltv', ['State_ID',  'branch_id'])
#
df['scr_by_state_man'] = df['PERFORM_CNS.SCORE'] / means_mapping('PERFORM_CNS.SCORE', ['State_ID',  'manufacturer_id'])
df['ltv_by_state_man'] = df['ltv'] / means_mapping('ltv', ['State_ID',  'manufacturer_id'])
#
df['scr_by_state_sup'] = df['PERFORM_CNS.SCORE'] / means_mapping('PERFORM_CNS.SCORE', ['State_ID',  'supplier_id'])
df['ltv_by_state_sup'] = df['ltv'] / means_mapping('ltv', ['State_ID',  'supplier_id'])
#
df['scr_by_emp_branch'] = df['PERFORM_CNS.SCORE'] / means_mapping('PERFORM_CNS.SCORE', ['Employee_code_ID',  'branch_id'])
df['ltv_by_emp_branch'] = df['ltv'] / means_mapping('ltv', ['Employee_code_ID',  'branch_id'])
#
df['scr_by_sup_emp'] = df['PERFORM_CNS.SCORE'] / means_mapping('PERFORM_CNS.SCORE', ['Employee_code_ID',  'supplier_id'])
df['ltv_by_sup_emp'] = df['ltv'] / means_mapping('ltv', ['Employee_code_ID',  'supplier_id'])
#
df['scr_by_emp_branch_sup'] = df['PERFORM_CNS.SCORE'] / means_mapping('PERFORM_CNS.SCORE', ['Employee_code_ID',
                                                                                            'branch_id',
                                                                                            'supplier_id'])
df['ltv_by_emp_branch_sup'] = df['ltv'] / means_mapping('ltv', ['Employee_code_ID',
                                                                'branch_id',
                                                                'supplier_id'])
#
df['inq_by_sup_man'] = df['NO.OF_INQUIRIES'] / means_mapping('NO.OF_INQUIRIES', ['supplier_id',  'manufacturer_id'])
df['inq_by_sup_branch'] = df['NO.OF_INQUIRIES'] / means_mapping('NO.OF_INQUIRIES', ['supplier_id',  'branch_id'])
df['inq_by_man_branch'] = df['NO.OF_INQUIRIES'] / means_mapping('disbursed_amount', ['manufacturer_id',  'branch_id'])
df['inq_by_man_branch_sup'] = df['NO.OF_INQUIRIES'] / means_mapping('NO.OF_INQUIRIES', ['manufacturer_id',
                                                                                        'branch_id',
                                                                                        'supplier_id'])
df['inq_by_emp_branch'] = df['NO.OF_INQUIRIES'] / means_mapping('NO.OF_INQUIRIES',
                                                                ['Employee_code_ID',  'branch_id'])
#
df['scr_by_age_emptype'] = df['PERFORM_CNS.SCORE'] / means_mapping('PERFORM_CNS.SCORE', ['Employment.Type', 
                                                                                         'years_on_loan'])
df['ltv_by_age_emptype'] = df['ltv'] / means_mapping('ltv', ['Employment.Type', 'years_on_loan'])

In [64]:
# splip to train and test again
train_df = df.iloc[:len(train), :]
test_df = df.iloc[len(train):, :]

In [65]:
train_df['loan_default'] = target

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [66]:
del train, test, df
gc.collect()

7

In [68]:
# set the cv
# used simple SKF because in my case it shows good correlation with lb
# and also public/private split was "random" so i was not sure that it is time-wise
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
oof = train_df[['UniqueID', 'loan_default']]
oof['predict'] = 0
predictions = test_df[['UniqueID']]
val_aucs = []

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [69]:
features = [col for col in train_df.columns if col not in ['UniqueID', 'loan_default', 'Date.of.Birth', 'DisbursalDate']]
X_test = test_df[features].values

In [70]:
lgb_clf = lgb.LGBMClassifier(
                              boosting_type= 'gbdt',
                              learning_rate= 0.05,
                              n_estimators= 750,
                              num_leaves= 16,
                              objective= 'binary',
                              reg_lambda= 5,
                              seed= 15,
                              subsample= 0.5,
                              colsample_bytree=0.5
                              )

In [74]:
for fold, (trn_idx, val_idx) in enumerate(skf.split(train_df, train_df['loan_default'])):
    X_train, y_train = train_df.iloc[trn_idx][features], train_df.iloc[trn_idx]['loan_default']
    X_valid, y_valid = train_df.iloc[val_idx][features], train_df.iloc[val_idx]['loan_default']
    
    p_valid,yp = 0,0
    
    lgb_clf.fit(X_train, y_train)
    p_valid += lgb_clf.predict_proba(X_valid)[:, 1]
    yp += lgb_clf.predict_proba(X_test)[:, 1]
        
    oof['predict'][val_idx] = p_valid
    val_score = roc_auc_score(y_valid, p_valid)
    val_aucs.append(val_score)
    
    predictions['fold{}'.format(fold+1)] = yp

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._set_labels(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code, glob, local_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.o

Wall time: 1h 11min 15s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [75]:
mean_auc = np.mean(val_aucs)
std_auc = np.std(val_aucs)
all_auc = roc_auc_score(oof['loan_default'], oof['predict'])
print("Mean auc: %.9f, std: %.9f. All auc: %.9f." % (mean_auc, std_auc, all_auc))

Mean auc: 0.673137125, std: 0.002077454. All auc: 0.673116388.


In [76]:
predictions['loan_default'] = np.mean(predictions[[col for col in predictions.columns if col not in ['UniqueID', 'loan_default']]].values, axis=1)
predictions.to_csv('lgb_all_predictions_wmean1_mod3.csv', index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [77]:
sample_sub = pd.read_csv('sample_submission_24jSKY6.csv')

In [78]:
sample_sub['loan_default'] = predictions['loan_default'].values

In [79]:
sample_sub.to_csv('lgb_oof_wmean1_pred_mod3.csv', index=False)

In [80]:
oof.to_csv('oof_lgb_wmean1_mod3.csv', index=False)