In [73]:
%pylab inline
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

Populating the interactive namespace from numpy and matplotlib


### preprocess

In [2]:
%%time
usecols_test = np.array(['AMT_ANNUITY', 'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM', 
    'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT', 'AMT_REQ_SOURCE_YEAR',
    'CREDIT_DELAY30', 'CREDIT_DELAY_MORE', 'CREDIT_FACILITY', 'DTIME_CREDIT_UPDATE',
    'DTIME_CREDIT_ENDDATE', 'DTIME_CREDIT_ENDDATE_FACT', 'CREDIT_TYPE',
    'DTIME_CREDIT', 'ID', 'TEXT_PAYMENT_DISCIPLINE'])
usecols_train = np.hstack((usecols_test, 'DEF'))
parse_dates=['DTIME_CREDIT', 'DTIME_CREDIT_ENDDATE',
    'DTIME_CREDIT_ENDDATE_FACT', 'DTIME_CREDIT_UPDATE']

data = pd.read_csv('test.csv', usecols=usecols_test,
    parse_dates=parse_dates, dayfirst=True, infer_datetime_format=True).append(
    pd.read_csv('train.csv', usecols=usecols_train,
    parse_dates=parse_dates, dayfirst=True, infer_datetime_format=True))
data.reset_index(drop=True, inplace=True)

CPU times: user 35.1 s, sys: 520 ms, total: 35.6 s
Wall time: 35.5 s


In [3]:
%%time
sorts = data['DTIME_CREDIT'].sort_values()
bad_idxs = sorts.index[:3]
true_idx = sorts.index[3]
data.loc[bad_idxs, 'DTIME_CREDIT'] = data.loc[true_idx, 'DTIME_CREDIT']

sorts = data['DTIME_CREDIT_UPDATE'].sort_values(ascending=False)
bad_idxs = sorts.index[:2]
true_idx = sorts.index[3]
data.loc[bad_idxs, 'DTIME_CREDIT_UPDATE'] = data.loc[true_idx, 'DTIME_CREDIT_UPDATE']

bad_idxs = data[['CREDIT_TYPE']].query('CREDIT_TYPE in (2, 8, 9, 10)').index
data.loc[bad_idxs, 'CREDIT_TYPE'] = 99

CPU times: user 948 ms, sys: 100 ms, total: 1.05 s
Wall time: 1.04 s


In [4]:
%%time
data['DELTA'] = (data['DTIME_CREDIT_ENDDATE'] - data['DTIME_CREDIT']).dt.days
data['DELTA_FACT'] = (data['DTIME_CREDIT_ENDDATE'] - data['DTIME_CREDIT_ENDDATE_FACT']).dt.days
data['DELTA_UPDATE_END'] = (data['DTIME_CREDIT_UPDATE'] - data['DTIME_CREDIT_ENDDATE']).dt.days
data['DELTA_UPDATE_START'] = (data['DTIME_CREDIT_UPDATE'] - data['DTIME_CREDIT']).dt.days

data['DAY_MONTH_CREDIT'] = data['DTIME_CREDIT'].dt.day
data['DAY_WEEK_CREDIT'] = data['DTIME_CREDIT'].dt.dayofweek
data['DAY_YEAR_CREDIT'] = data['DTIME_CREDIT'].dt.dayofyear

data['DAY_MONTH_CREDIT_ENDDATE'] = data['DTIME_CREDIT_ENDDATE'].dt.day
data['DAY_WEEK_CREDIT_ENDDATE'] = data['DTIME_CREDIT_ENDDATE'].dt.dayofweek
data['DAY_YEAR_CREDIT_ENDDATE'] = data['DTIME_CREDIT_ENDDATE'].dt.dayofyear

data['DAY_MONTH_CREDIT_UPDATE'] = data['DTIME_CREDIT_UPDATE'].dt.day
data['DAY_WEEK_CREDIT_UPDATE'] = data['DTIME_CREDIT_UPDATE'].dt.dayofweek
data['DAY_YEAR_CREDIT_UPDATE'] = data['DTIME_CREDIT_UPDATE'].dt.dayofyear

data['DAY_MONTH_CREDIT_ENDDATE_FACT'] = data['DTIME_CREDIT_ENDDATE_FACT'].dt.day
data['DAY_WEEK_CREDIT_ENDDATE_FACT'] = data['DTIME_CREDIT_ENDDATE_FACT'].dt.dayofweek
data['DAY_YEAR_CREDIT_ENDDATE_FACT'] = data['DTIME_CREDIT_ENDDATE_FACT'].dt.dayofyear

CPU times: user 1min 24s, sys: 100 ms, total: 1min 24s
Wall time: 1min 24s


In [7]:
%%time
data['TEXT_PAYMENT_DISCIPLINE'].fillna('', inplace=True)
counters = data['TEXT_PAYMENT_DISCIPLINE'].apply(
    lambda s:(s.count('0'), s.count('1'), s.count('C'), s.count('X'),
              s.count('2') + s.count('3') + s.count('4') + s.count('5')))
counters = pd.DataFrame(np.vstack(counters), data.index)
counters = counters.rename_axis({0:'COUNT_0', 1:'COUNT_1',
    2:'COUNT_C', 3:'COUNT_X', 4:'COUNT_MORE'}, axis=1)
anuar = data['TEXT_PAYMENT_DISCIPLINE'].apply(
    lambda s: sum([1/(j + 1) for j, c in enumerate(s) if c.isdigit() and c=='0']))
data = pd.concat((data, counters, anuar.rename('ANUAR')), axis=1)

CPU times: user 54.7 s, sys: 436 ms, total: 55.2 s
Wall time: 55.2 s


In [18]:
%%time
first_key = ['ID', 'DTIME_CREDIT', 'CREDIT_TYPE']
data = data.groupby(first_key).median()
data.to_csv('data.csv')

CPU times: user 1min 17s, sys: 1.04 s, total: 1min 18s
Wall time: 1min 18s


### inference

In [20]:
%%time
first_key = ['ID', 'DTIME_CREDIT', 'CREDIT_TYPE']
data = pd.read_csv('data.csv', parse_dates=['DTIME_CREDIT'], infer_datetime_format=True)
data = data.groupby(first_key).first()
data.reset_index(['DTIME_CREDIT', 'CREDIT_TYPE'], inplace=True)
data.drop('DTIME_CREDIT', axis=1, inplace=True)

CPU times: user 6.38 s, sys: 284 ms, total: 6.67 s
Wall time: 6.67 s


In [28]:
%%time
lengths = data.groupby('ID').size()
labels = data.groupby('ID')['DEF'].first()
idxs = data.index
data.drop('DEF', axis=1, inplace=True)
data.loc[data['CREDIT_TYPE'] != 19, 'CREDIT_TYPE'] = 0
data.loc[data['CREDIT_TYPE'] == 19, 'CREDIT_TYPE'] = 1
data.loc[data['AMT_CREDIT_SUM_DEBT'].isnull(), 'AMT_CREDIT_SUM_DEBT'] = 0
data.loc[data['CREDIT_FACILITY'].isnull(), 'CREDIT_FACILITY'] = 0

CPU times: user 260 ms, sys: 44 ms, total: 304 ms
Wall time: 302 ms


In [107]:
%%time
grouped_mean = data.groupby('ID').mean()
grouped_std = data.groupby('ID').std()
grouped_median = data.groupby('ID').median()
grouped_max = data.groupby('ID').max()
grouped_min = data.groupby('ID').min()
grouped_data = pd.concat((grouped_mean, grouped_std, grouped_median,
    grouped_max, grouped_min, lengths.rename('LEN')), axis=1)

train_idxs = labels[labels.notnull()].index
test_idxs = labels[labels.isnull()].index
train_data = grouped_data.loc[train_idxs]
train_labels = labels.loc[train_idxs]
test_data = grouped_data.loc[test_idxs]

CPU times: user 3.64 s, sys: 280 ms, total: 3.92 s
Wall time: 3.92 s


In [142]:
%%time
model = LGBMClassifier(n_estimators=300, max_depth=2, reg_lambda=1)
print(cross_val_score(model, train_data, train_labels, cv=5, scoring='roc_auc').mean())

0.684501059057
CPU times: user 1min 51s, sys: 540 ms, total: 1min 51s
Wall time: 16.6 s


In [126]:
model = LGBMClassifier(n_estimators=300, max_depth=2, reg_lambda=1)
model.fit(train_data, train_labels)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
        max_bin=255, max_depth=2, min_child_samples=10, min_child_weight=5,
        min_split_gain=0.0, n_estimators=300, n_jobs=-1, num_leaves=31,
        objective=None, random_state=0, reg_alpha=0.0, reg_lambda=1,
        silent=True, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1)

In [143]:
%%time
model = LGBMClassifier(n_estimators=300, max_depth=2, reg_lambda=1)
model.fit(train_data, train_labels)
test_labels = model.predict_proba(test_data)[:, 1]
pd.DataFrame(test_labels, index=test_idxs).rename_axis(
    {0:'Score'}, axis=1).to_csv('submit.csv')

CPU times: user 29.3 s, sys: 180 ms, total: 29.5 s
Wall time: 4.71 s
