In [1]:
%pylab inline
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score

Populating the interactive namespace from numpy and matplotlib


### preprocess

In [2]:
# Columns read from the test extract; the train extract additionally
# carries the 'DEF' column.
usecols_test = np.array(['AMT_ANNUITY', 'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM', 
    'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT', 'AMT_REQ_SOURCE_YEAR',
    'CREDIT_DELAY30', 'CREDIT_DELAY_MORE', 'CREDIT_FACILITY', 'DTIME_CREDIT_UPDATE',
    'DTIME_CREDIT_ENDDATE', 'DTIME_CREDIT_ENDDATE_FACT', 'CREDIT_TYPE',
    'DTIME_CREDIT', 'ID', 'NUM_SOURCE', 'TEXT_PAYMENT_DISCIPLINE'])
usecols_train = np.hstack((usecols_test, 'DEF'))
parse_dates = ['DTIME_CREDIT', 'DTIME_CREDIT_ENDDATE',
    'DTIME_CREDIT_ENDDATE_FACT', 'DTIME_CREDIT_UPDATE']

# Parsing options shared by both reads so the two extracts cannot drift apart.
# infer_datetime_format is kept for the pandas version this notebook was
# written against; pandas >= 2.0 documents it as deprecated with no effect.
read_kwargs = dict(parse_dates=parse_dates, dayfirst=True,
    infer_datetime_format=True)

# Stack test rows first, then train rows, into one frame with a fresh
# 0..n-1 index. pd.concat replaces DataFrame.append (removed in pandas 2.0),
# and ignore_index=True makes a separate reset_index step unnecessary.
# Test rows end up with a missing 'DEF' because that column is only read
# from the train file.
data = pd.concat(
    [pd.read_csv('sample/test.csv', usecols=usecols_test, **read_kwargs),
     pd.read_csv('sample/train.csv', usecols=usecols_train, **read_kwargs)],
    ignore_index=True)

time: 37.5 s


In [3]:
%%time
# Overwrite the three earliest DTIME_CREDIT values with the fourth-earliest one.
# NOTE(review): presumably these are outlier / data-entry dates — confirm.
by_start = data['DTIME_CREDIT'].sort_values().index
data.loc[by_start[:3], 'DTIME_CREDIT'] = data.loc[by_start[3], 'DTIME_CREDIT']

# Overwrite the two latest DTIME_CREDIT_UPDATE values with the fourth-latest one.
# NOTE(review): position 2 (third-latest) is neither replaced nor used as the
# replacement value — confirm that skipping it is intentional.
by_update = data['DTIME_CREDIT_UPDATE'].sort_values(ascending=False).index
data.loc[by_update[:2], 'DTIME_CREDIT_UPDATE'] = data.loc[
    by_update[3], 'DTIME_CREDIT_UPDATE']

# Collapse CREDIT_TYPE codes 2, 8, 9 and 10 into the single code 99.
merged_types = data['CREDIT_TYPE'].isin([2, 8, 9, 10])
data.loc[merged_types, 'CREDIT_TYPE'] = 99

CPU times: user 992 ms, sys: 56 ms, total: 1.05 s
Wall time: 1.04 s


In [4]:
%%time
# Each entry is (new column, minuend column, subtrahend column); the new
# column holds the whole-day difference minuend - subtrahend.
day_deltas = [
    ('DELTA', 'DTIME_CREDIT_ENDDATE', 'DTIME_CREDIT'),
    ('DELTA_FACT', 'DTIME_CREDIT_ENDDATE', 'DTIME_CREDIT_ENDDATE_FACT'),
    ('DELTA_UPDATE_END', 'DTIME_CREDIT_UPDATE', 'DTIME_CREDIT_ENDDATE'),
    ('DELTA_UPDATE_START', 'DTIME_CREDIT_UPDATE', 'DTIME_CREDIT'),
]
for delta_col, minuend_col, subtrahend_col in day_deltas:
    data[delta_col] = (data[minuend_col] - data[subtrahend_col]).dt.days

CPU times: user 1min 17s, sys: 60 ms, total: 1min 17s
Wall time: 1min 16s


In [5]:
%%time
# Per-row character counts over TEXT_PAYMENT_DISCIPLINE: occurrences of
# '0', '1', 'C', 'X', and the combined occurrences of '2'-'5'.
count_cols = ['COUNT_0', 'COUNT_1', 'COUNT_C', 'COUNT_X', 'COUNT_MORE']

masks = data['TEXT_PAYMENT_DISCIPLINE'].notnull()
# Select only the column (not the whole frame) for the non-null rows.
discipline = data.loc[masks, 'TEXT_PAYMENT_DISCIPLINE']

# Column names are given at construction instead of via
# rename_axis({...}, axis=1), which no longer relabels columns on
# pandas >= 1.0. Building from a list of tuples also works when there are
# no non-null rows, where np.vstack would raise.
counters = pd.DataFrame(
    [(s.count('0'), s.count('1'), s.count('C'), s.count('X'),
      s.count('2') + s.count('3') + s.count('4') + s.count('5'))
     for s in discipline],
    index=discipline.index, columns=count_cols)

# Rows whose TEXT_PAYMENT_DISCIPLINE is null get missing counts from the join.
data = data.join(counters)

CPU times: user 16.2 s, sys: 444 ms, total: 16.7 s
Wall time: 16.7 s


In [None]:
%%time
# Collapse rows sharing (ID, DTIME_CREDIT, CREDIT_TYPE) into one row holding
# the per-column median.
# NOTE(review): presumably these are duplicate reports of the same credit
# from several sources (cf. NUM_SOURCE) — confirm.
first_key = ['ID', 'DTIME_CREDIT', 'CREDIT_TYPE']

# Aggregate numeric columns only, chosen explicitly: median() over string /
# datetime columns raises on pandas >= 2.0 instead of silently skipping them.
# NUM_SOURCE is left out here, which replaces dropping it after aggregation.
value_cols = [col for col in data.select_dtypes(include=['number']).columns
              if col not in first_key and col != 'NUM_SOURCE']
data = data.groupby(first_key)[value_cols].median()

# Keep only ID as the index; the other two key levels are discarded.
data = data.reset_index(['DTIME_CREDIT', 'CREDIT_TYPE'], drop=True)
data.to_csv('sample/data_merged.csv')

In [19]:
# Writes the current `data` frame to the merged-data CSV that the inference
# section reads back.
data.to_csv(path_or_buf='sample/data_merged.csv')

### inference

In [3]:
%%time
# Reload the merged table; it has one or more rows per ID.
data = pd.read_csv('sample/data_merged.csv')

# Per-ID row count and per-ID label (first non-missing DEF within the ID).
rows_by_id = data.groupby('ID')
lengths = rows_by_id.size()
labels = rows_by_id['DEF'].first()

# The label must not remain among the feature columns.
data = data.drop('DEF', axis=1)

# Missing values in these two columns are replaced with 0.
for zero_filled in ('AMT_CREDIT_SUM_DEBT', 'CREDIT_FACILITY'):
    data[zero_filled] = data[zero_filled].fillna(0)

CPU times: user 2.44 s, sys: 128 ms, total: 2.56 s
Wall time: 2.56 s
time: 2.57 s


In [4]:
%%time
rows_by_id = data.groupby('ID')

# Per-ID means; features still missing are filled with the column-wise mean
# taken across IDs.
grouped_mean = rows_by_id.mean()
grouped_mean = grouped_mean.fillna(grouped_mean.mean())

# Per-ID variances; missing variances are filled with 0.
grouped_var = rows_by_id.var().fillna(0)

# Index-on-index outer merge; overlapping column names receive pandas'
# default '_x' (mean) and '_y' (variance) suffixes.
grouped_data = pd.merge(grouped_mean, grouped_var,
    how='outer', left_index=True, right_index=True)

# IDs with a label form the training set; IDs without one form the test set.
has_label = labels.notnull()
train_idxs = labels.index[has_label.values]
test_idxs = labels.index[~has_label.values]
train_data = grouped_data.loc[train_idxs]
train_labels = labels.loc[train_idxs]
test_data = grouped_data.loc[test_idxs]

CPU times: user 640 ms, sys: 120 ms, total: 760 ms
Wall time: 755 ms
time: 769 ms


In [32]:
%%time
# Mean ROC AUC over cross_val_score's default folds.
lgbm_params = dict(n_estimators=300, max_depth=-1, reg_lambda=1, num_leaves=4)
model = LGBMClassifier(**lgbm_params)
fold_auc = cross_val_score(model, train_data, train_labels, scoring='roc_auc')
print(fold_auc.mean())

0.680878832429
CPU times: user 20.7 s, sys: 20 ms, total: 20.7 s
Wall time: 2.99 s
time: 3 s


In [33]:
%%time
# Fit the final classifier on every labelled ID.
model = LGBMClassifier(
    num_leaves=4,
    max_depth=-1,
    n_estimators=300,
    reg_lambda=1)
model.fit(train_data, train_labels)

CPU times: user 10 s, sys: 20 ms, total: 10 s
Wall time: 1.44 s
time: 1.45 s


In [35]:
%%time
# Score of each test ID: the second column of predict_proba.
test_labels = model.predict_proba(test_data)[:, 1]
np.save('sample/test_labels.npy', test_labels)

# The 'Score' column is named at construction rather than through
# rename_axis({0: 'Score'}, axis=1), which no longer relabels columns on
# pandas >= 1.0 and would leave the header as 0.
pd.DataFrame({'Score': test_labels}, index=test_idxs).to_csv('submit.csv')

CPU times: user 2.38 s, sys: 12 ms, total: 2.39 s
Wall time: 606 ms
time: 611 ms
