# SAS competition (Home Credit). Прогнозирование невозврата кредита по кредитной истории

In [None]:
%pylab inline
import pandas as pd
import lightgbm as lgb
import seaborn as sns

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold, KFold, cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder

from collections import Counter

In [None]:
# Aggregation functions applied per client ID to the per-credit columns below;
# each resulting feature name is suffixed with the function's __name__.
fstats = [np.max, np.min, np.mean, np.std, np.median, np.sum]

In [None]:
# Directory with the competition CSV files, relative to the notebook.
INPUT_PATH = '../input/'

# Raw per-credit tables, read with pandas' default parsing.
train, test = [pd.read_csv(INPUT_PATH + file_name) for file_name in ('train.csv', 'test.csv')]

__Внимание__. Датасет состоит из двух аналогичных частей: одна рассчитывается только по активным кредитным заявкам, другая — по всем строкам выборки. Далее части конкатенируются. Для этого весь большой фрагмент ниже нужно запустить два раза: первый раз — с фильтрацией по активным заявкам, второй раз — заново загрузив датасет и закомментировав ячейку с этой фильтрацией.

Работа с датами:

In [None]:
def _parse_date(value, fmt):
    """Parse `value` with strptime format `fmt`.

    Anything that is not a string (pandas stores missing cells as float NaN)
    yields None, which pandas turns into NaT in a datetime column.
    """
    if isinstance(value, str):
        return datetime.datetime.strptime(value, fmt)
    return None


# Columns holding 'DD.MM.YYYY' text dates. They all share one missing-value rule now:
# the previous `x is not None` guard on DTIME_CREDIT never filtered NaN, so a missing
# cell would have been passed to strptime as the string 'nan' and raised ValueError.
TEXT_DATE_COLUMNS = [
    'DTIME_CREDIT',
    'DTIME_CREDIT_ENDDATE',
    'DTIME_CREDIT_ENDDATE_FACT',
    'DTIME_CREDIT_UPDATE',
]

for frame in (train, test):
    # SK_DATE_DECISION is parsed through str() with the compact '%Y%m%d' layout.
    frame['SK_DATE_DECISION'] = frame['SK_DATE_DECISION'].apply(
        lambda x: datetime.datetime.strptime(str(x), '%Y%m%d')
    )
    for date_col in TEXT_DATE_COLUMNS:
        frame[date_col] = frame[date_col].apply(lambda x: _parse_date(x, '%d.%m.%Y'))

Подготовка простых весов:

In [None]:
# Per-credit weight: days between this credit's start and the client's earliest credit,
# divided by the client's largest such gap. `sum_weight` is the per-client total of weights.
for frame in (train, test):
    frame['min_credit_date'] = frame['ID'].map(frame.groupby('ID')['DTIME_CREDIT'].min())

    gap = frame['DTIME_CREDIT'] - frame['min_credit_date']
    frame['delta_credit_date'] = gap
    frame['delta_credit_date'] = frame['delta_credit_date'].apply(lambda x: x.days)

    largest_gap = frame['ID'].map(frame.groupby('ID')['delta_credit_date'].max())
    frame['weight'] = frame['delta_credit_date'].astype(float) / largest_gap

    frame['sum_weight'] = frame['ID'].map(frame.groupby('ID')['weight'].sum())

Немного кодирования:

In [None]:
# Integer-encode the currency. The encoder is fitted on train only and then
# applied unchanged to both tables.
le = LabelEncoder().fit(train['CREDIT_CURRENCY'])
train['CREDIT_CURRENCY'] = le.transform(train['CREDIT_CURRENCY'])
test['CREDIT_CURRENCY'] = le.transform(test['CREDIT_CURRENCY'])

Небольшая работа с выбросами:

In [None]:
# Credit start dates earlier than this day are treated as outliers and moved up to it.
# The clip is applied to BOTH tables: it used to touch train only, which left test
# with differently preprocessed dates.
# NOTE(review): the recency weights in the cell above are computed from the unclipped
# dates — confirm that ordering is intended.
MIN_CREDIT_DATE = datetime.datetime(1993, 9, 17)
for frame in (train, test):
    frame.loc[frame['DTIME_CREDIT'] < MIN_CREDIT_DATE, 'DTIME_CREDIT'] = MIN_CREDIT_DATE

# Two hand-picked rows (addressed by index label) take DTIME_CREDIT_UPDATE from the row
# directly before them. Presumably these are known-bad values — verify against the data.
train.loc[626482, 'DTIME_CREDIT_UPDATE'] = train.loc[626481, 'DTIME_CREDIT_UPDATE']
test.loc[1563857, 'DTIME_CREDIT_UPDATE'] = test.loc[1563856, 'DTIME_CREDIT_UPDATE']

In [None]:
# 1 when the factual end date is later than the planned end date, else 0
# (comparisons against a missing date evaluate to False, hence 0).
for frame in (train, test):
    planned_end = frame['DTIME_CREDIT_ENDDATE']
    actual_end = frame['DTIME_CREDIT_ENDDATE_FACT']
    frame['late_return'] = (actual_end > planned_end).astype(int)

__Внимание__. Следующую ячейку нужно закомментировать при повторном проходе.

In [None]:
# only active
# NOTE(review): status code 1 is treated as "active" here, while a later cell counts
# code 0 as `num_active` and code 1 as `num_close` — confirm which mapping is right.
for frame in (train, test):
    frame['is_current'] = (frame['CREDIT_ACTIVE'] == 1).astype(int)

# Keep only the flagged rows and renumber them from zero.
train = train[train['is_current'] == 1].reset_index(drop=True).copy()
test = test[test['is_current'] == 1].reset_index(drop=True).copy()

In [None]:
# Client-level feature tables: one row per distinct ID (sorted by np.unique), no columns yet.
train_ids = np.unique(train['ID'])
test_ids = np.unique(test['ID'])
df_train = pd.DataFrame(0, index=train_ids, columns=[])
df_test = pd.DataFrame(0, index=test_ids, columns=[])

In [None]:
# Expose the index (client ID) as a regular column so that `.ID.map(...)` can be used below.
for client_frame in (df_train, df_test):
    client_frame['ID'] = client_frame.index

Для честных ctr по группам:

In [None]:
# `cv` splits clients; `cv_ctr` splits credit rows by the group label assigned below.
cv = KFold(n_splits=5, shuffle=True, random_state=14)
cv_ctr = GroupKFold(n_splits=5)

# Label every credit row with the number of the client-level fold whose hold-out part
# contains that row's client, so all rows of one client share a group.
groups = np.zeros(len(train))
for fold_no, (_, holdout_pos) in enumerate(cv.split(df_train)):
    holdout_ids = df_train['ID'].iloc[holdout_pos]
    groups[np.in1d(train['ID'], holdout_ids)] = fold_no

In [None]:
# Day differences between the credit's key dates (start, planned end, factual end, last update).
for frame in (train, test):
    opened = frame['DTIME_CREDIT']
    planned_end = frame['DTIME_CREDIT_ENDDATE']
    actual_end = frame['DTIME_CREDIT_ENDDATE_FACT']
    updated = frame['DTIME_CREDIT_UPDATE']

    frame['day_delta'] = (planned_end - opened).dt.days
    frame['day_delta_fact'] = (planned_end - actual_end).dt.days
    frame['day_delta_update_start'] = (updated - opened).dt.days
    frame['day_delta_update_end'] = (updated - planned_end).dt.days

Ненужные признаки:

In [None]:
# This column is not used by any feature below.
unused_columns = ['CREDIT_COLLATERAL']
train = train.drop(unused_columns, axis=1)
test = test.drop(unused_columns, axis=1)

Работа с текстовым признаком (количества символов), остальное позже:

In [None]:
# Per-credit symbol counts in TEXT_PAYMENT_DISCIPLINE, then per-client aggregates of them.
for raw, agg in ((train, df_train), (test, df_test)):
    # Non-string cells (missing text) count as "no symbols".
    symbol_counts = raw['TEXT_PAYMENT_DISCIPLINE'].apply(
        lambda s: Counter(s) if type(s) == str else {}
    )
    for char in ['C', 'X', '0', '1', '2', '3', '4', '5']:
        raw['num_{}'.format(char)] = symbol_counts.apply(lambda cnt, ch=char: cnt.get(ch, 0))

    # Symbols '2'..'5' are pooled into a single "other" count.
    raw['num_other'] = raw[['num_2', 'num_3', 'num_4', 'num_5']].sum(axis=1)

    # Plain statistics of each count per client.
    for char in ['C', 'X', '0', '1', 'other']:
        count_col = 'num_{}'.format(char)
        for func in fstats:
            agg['{}_{}'.format(count_col, func.__name__)] = agg['ID'].map(
                raw.groupby('ID')[count_col].agg(func).fillna(0)
            )

    # Weighted variant: per-client mean of count * weight, divided by the client's weight sum.
    for char in ['C', 'X', '0', '1', 'other']:
        raw['tmp'] = raw['num_{}'.format(char)] * raw['weight']
        by_id = raw.groupby('ID')
        agg['num_{}_weighted'.format(char)] = agg['ID'].map(
            by_id['tmp'].mean() / by_id['sum_weight'].first()
        )

Различные признаки:

In [None]:
# Count, share and weighted mean of late returns per client.
col = 'late_return'
new_col = 'num_late_return_weighted'
for raw, agg in ((train, df_train), (test, df_test)):
    late_by_id = raw.groupby('ID')[col]
    agg['num_late_return'] = agg['ID'].map(late_by_id.sum())
    agg['ratio_late_return'] = agg['ID'].map(late_by_id.mean())

    raw['tmp'] = raw[col] * raw['weight']
    by_id = raw.groupby('ID')
    agg[new_col] = agg['ID'].map(by_id['tmp'].mean() / by_id['sum_weight'].first())

In [None]:
# Number of credit rows per client, plus the weighted analogue (mean weight / weight sum).
new_col = 'num_records_weighted'
for raw, agg in ((train, df_train), (test, df_test)):
    agg['num_records'] = agg['ID'].map(raw.groupby('ID').size())

    raw['tmp'] = np.ones(len(raw)) * raw['weight']
    by_id = raw.groupby('ID')
    agg[new_col] = agg['ID'].map(by_id['tmp'].mean() / by_id['sum_weight'].first())

In [None]:
# Client-level label: the DEF value of the client's first credit row.
first_def_by_id = train.groupby('ID')['DEF'].first()
df_train['DEF'] = df_train['ID'].map(first_def_by_id)

In [None]:
# How many credits of each CREDIT_ACTIVE code a client has (a Counter returns 0 for absent codes).
# NOTE(review): code 0 is named "active" here, whereas the "only active" filter cell keeps
# rows with CREDIT_ACTIVE == 1 — confirm the intended code-to-name mapping.
status_features = [('num_active', 0), ('num_close', 1), ('num_sell', 2), ('num_bad', 3)]

for raw, agg in ((train, df_train), (test, df_test)):
    tmp = raw.groupby('ID')['CREDIT_ACTIVE'].agg(lambda x: list(x))
    tmp = tmp.apply(lambda x: Counter(x))
    for feature_name, status_code in status_features:
        agg[feature_name] = agg['ID'].map(tmp.apply(lambda cnt, code=status_code: cnt[code]))

In [None]:
# Shares of the two most common status counts among all of the client's records.
for agg in (df_train, df_test):
    for status in ('active', 'close'):
        agg['ratio_' + status] = agg['num_' + status] / agg['num_records']

In [None]:
# Per-client statistics and weighted mean for every day-difference column.
for raw, agg in ((train, df_train), (test, df_test)):
    for col in ['day_delta', 'day_delta_fact', 'day_delta_update_start', 'day_delta_update_end']:
        per_client = raw.groupby('ID')[col]
        for func in fstats:
            agg['{}_{}'.format(col, func.__name__)] = agg['ID'].map(per_client.agg(func))

        new_col = col + '_weighted'
        raw['tmp'] = raw[col] * raw['weight']
        by_id = raw.groupby('ID')
        agg[new_col] = agg['ID'].map(by_id['tmp'].mean() / by_id['sum_weight'].first())

In [None]:
# Statistics of CREDIT_DAY_OVERDUE per client: over all credits, and over credits with a
# positive overdue only (clients without such credits get 0).
col = 'CREDIT_DAY_OVERDUE'
new_col = 'num_day_overdue_weighted'

for raw, agg in ((train, df_train), (test, df_test)):
    all_rows = raw.groupby('ID')[col]
    for func in fstats:
        agg['day_overdue_{}'.format(func.__name__)] = agg['ID'].map(all_rows.agg(func))

        overdue_rows = raw.loc[raw[col] > 0]
        agg['day_overdue_non_zero_{}'.format(func.__name__)] = agg['ID'].map(
            overdue_rows.groupby('ID')[col].agg(func)
        ).fillna(0)

    raw['tmp'] = raw[col] * raw['weight']
    by_id = raw.groupby('ID')
    agg[new_col] = agg['ID'].map(by_id['tmp'].mean() / by_id['sum_weight'].first())

In [None]:
# Missing facility codes become their own sentinel category (-1).
# The result is assigned back explicitly: calling fillna(inplace=True) on a column reached
# through attribute access is chained assignment, which newer pandas warns about and which
# leaves the frame untouched once copy-on-write is enabled.
train['CREDIT_FACILITY'] = train['CREDIT_FACILITY'].fillna(-1)
test['CREDIT_FACILITY'] = test['CREDIT_FACILITY'].fillna(-1)

Кодирование категориальных признаков:

In [None]:
# Client-level label vector, in df_train row order (one value per client ID).
target = df_train.DEF.values

In [None]:
# Fold label for every credit row: the number of the client-level `cv` fold that holds the
# row's client out. `cv` has a fixed random_state, so repeated splits give the same folds.
groups = np.zeros(len(train))
for label, (i_tr, i_ts) in enumerate(cv.split(df_train)):
    inds = df_train['ID'].values[i_ts]
    in_fold = np.in1d(train['ID'], inds)
    groups[in_fold] = label

In [None]:
# Smoothed out-of-fold target encoding ("ctr") of the categorical columns.
# For a category with `counts` rows and mean target `means` in the fitting folds:
#     ctr = (counts * means + alpha * global_mean) / (counts + alpha)
# Train rows receive the value fitted on the other folds; test rows receive the average
# over all folds. Categories unseen in the fitting folds fall back to global_mean.
global_mean = target.mean()
alpha = 50.0
cat_cols = ['CREDIT_TYPE', 'CREDIT_FACILITY', 'CREDIT_CURRENCY']

# Taken from the splitter so the test average stays correct if n_splits changes
# (this used to be a hard-coded 5.0).
n_ctr_folds = float(cv_ctr.get_n_splits())

for col in cat_cols:
    ctr_col = 'ctr_' + col.lower()
    test[ctr_col] = 0.0
    for i_tr, i_ts in cv_ctr.split(train, groups=groups):
        fit_rows = train.iloc[i_tr]
        counts = fit_rows.groupby(col).size()
        means = fit_rows.groupby(col)['DEF'].mean()
        smoothed = (counts * means + alpha * global_mean) / (counts + alpha)

        # The splitter yields positions; translate them to index labels for .loc so the
        # assignment does not depend on `train` having a 0..n-1 index.
        holdout_labels = train.index[i_ts]
        train.loc[holdout_labels, ctr_col] = (
            train[col].iloc[i_ts].map(smoothed).fillna(global_mean)
        )
        test[ctr_col] += test[col].map(smoothed).fillna(global_mean) / n_ctr_folds

In [None]:
# Per-client statistics and weighted mean of every encoded categorical column.
for raw, agg in ((train, df_train), (test, df_test)):
    for col in ['ctr_credit_type', 'ctr_credit_facility', 'ctr_credit_currency']:
        per_client = raw.groupby('ID')[col]
        for func in fstats:
            agg['{}_{}'.format(col, func.__name__)] = agg['ID'].map(per_client.agg(func))

        new_col = col + '_weighted'
        raw['tmp'] = raw[col] * raw['weight']
        by_id = raw.groupby('ID')
        agg[new_col] = agg['ID'].map(by_id['tmp'].mean() / by_id['sum_weight'].first())

Вторая часть работы с текстовым признаком (длины, доли, логрегрессия на tfidf и мешках слов, ...):

In [None]:
def aggregate_symbols(s, c):
    """Position-discounted count of symbol `c` in string `s`.

    Every occurrence of `c` at 0-based position i contributes 1 / (i + 1), so
    occurrences near the start of the string weigh more.

    Parameters
    ----------
    s : str
        String to scan; may be empty.
    c : str
        Single symbol to look for.

    Returns
    -------
    int or numpy.float64
        0 when `c` does not occur in `s`, otherwise the sum of 1 / (i + 1)
        over all positions i where s[i] == c.
    """
    # Positions are collected in plain Python: comparing np.array(list('')) (an empty
    # float array) with a str relies on NumPy's version-dependent comparison fallback.
    positions = [i for i, ch in enumerate(s) if ch == c]
    if not positions:
        return 0
    return np.sum(1.0 / (np.asarray(positions) + 1))

In [None]:
# Symbols kept when cleaning TEXT_PAYMENT_DISCIPLINE; everything else is dropped.
DISCIPLINE_SYMBOLS = ['0', '1', '2', '3', '4', '5', 'C', 'X']


def _clean_discipline(raw_text):
    """Keep only the known symbols and merge '3', '4', '5' into '2'.

    Written with str.join so it behaves the same on Python 2 and 3: the previous
    `filter(...).replace(...)` only worked on Python 2, where filter() applied to
    a str returns a str.
    """
    kept = ''.join(ch for ch in raw_text if ch in DISCIPLINE_SYMBOLS)
    return kept.replace('3', '2').replace('4', '2').replace('5', '2')


# Length of the untouched text (missing text counts as empty).
train['len_text_raw'] = train['TEXT_PAYMENT_DISCIPLINE'].fillna('').apply(len)
test['len_text_raw'] = test['TEXT_PAYMENT_DISCIPLINE'].fillna('').apply(len)

# Character 1- to 3-gram vectorizers, fitted on the cleaned train text only.
cvect = CountVectorizer(analyzer='char', ngram_range=(1, 3))
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))

text = train['TEXT_PAYMENT_DISCIPLINE'].fillna('').apply(_clean_discipline)
train_features_tfidf = tfidf.fit_transform(text)
train_features_cvect = cvect.fit_transform(text)
train['len_text_clean'] = text.apply(len)
for c in ['0', '1', '2', 'C', 'X']:
    # Bind `c` as a default so each lambda keeps its own symbol.
    train['golden_{}'.format(c)] = text.apply(lambda x, c=c: aggregate_symbols(x, c))

text = test['TEXT_PAYMENT_DISCIPLINE'].fillna('').apply(_clean_discipline)
test_features_tfidf = tfidf.transform(text)
test_features_cvect = cvect.transform(text)
test['len_text_clean'] = text.apply(len)
for c in ['0', '1', '2', 'C', 'X']:
    test['golden_{}'.format(c)] = text.apply(lambda x, c=c: aggregate_symbols(x, c))

# Scale the raw counts to unit variance; with_mean=False keeps the matrices sparse.
scaler = StandardScaler(with_mean=False)
train_features_cvect = scaler.fit_transform(train_features_cvect)
test_features_cvect = scaler.transform(test_features_cvect)

In [None]:
def _stack_fold(name, feats_train, feats_test, i_tr, i_ts, n_folds):
    """Fit a logistic regression on one fold and store its predictions in column `name`.

    Hold-out train rows get their out-of-fold probability; test rows accumulate
    1 / n_folds of the fold model's probability.
    """
    clf = LogisticRegression()
    clf.fit(feats_train[i_tr], train['DEF'].iloc[i_tr])
    # The splitter yields positions; .loc needs index labels.
    train.loc[train.index[i_ts], name] = clf.predict_proba(feats_train[i_ts])[:, 1]
    test[name] += clf.predict_proba(feats_test)[:, 1] / n_folds


# Float columns from the start, since probabilities are written into them.
for stacked_col in ('cvect', 'tfidf'):
    train[stacked_col] = 0.0
    test[stacked_col] = 0.0

# Taken from the splitter instead of a hard-coded 5.0 so the test average stays correct.
n_stack_folds = float(cv_ctr.get_n_splits())

for i_tr, i_ts in cv_ctr.split(train, groups=groups):
    _stack_fold('cvect', train_features_cvect, test_features_cvect, i_tr, i_ts, n_stack_folds)
    _stack_fold('tfidf', train_features_tfidf, test_features_tfidf, i_tr, i_ts, n_stack_folds)

In [None]:
# The sparse n-gram matrices are no longer needed; drop the names so memory can be reclaimed.
del (train_features_cvect, test_features_cvect,
     train_features_tfidf, test_features_tfidf)

In [None]:
# Share of each symbol in the raw and in the cleaned text, aggregated per client.
for raw, agg in ((train, df_train), (test, df_test)):
    for char in ['C', 'X', '0', '1', 'other']:
        raw_ratio = 'ratio_raw_{}'.format(char)
        clean_ratio = 'ratio_clean_{}'.format(char)

        symbol_count = raw['num_{}'.format(char)].astype(float)
        raw[raw_ratio] = symbol_count / raw['len_text_raw']
        raw[clean_ratio] = symbol_count / raw['len_text_clean']

        # Plain statistics (clean first, then raw, for every function).
        for func in fstats:
            for ratio_col in (clean_ratio, raw_ratio):
                agg['{}_{}'.format(ratio_col, func.__name__)] = agg['ID'].map(
                    raw.groupby('ID')[ratio_col].agg(func).fillna(0)
                )

        # Weighted means (raw first, then clean).
        for col in (raw_ratio, clean_ratio):
            new_col = col + '_weighted'
            raw['tmp'] = raw[col] * raw['weight']
            by_id = raw.groupby('ID')
            agg[new_col] = agg['ID'].map(by_id['tmp'].mean() / by_id['sum_weight'].first())

In [None]:
# Per-client statistics and weighted mean of the stacked text predictions and text lengths.
for raw, agg in ((train, df_train), (test, df_test)):
    for col in ['tfidf', 'cvect', 'len_text_raw', 'len_text_clean']:
        per_client = raw.groupby('ID')[col]
        for func in fstats:
            agg['{}_{}'.format(col, func.__name__)] = agg['ID'].map(per_client.agg(func))

        new_col = col + '_weighted'
        raw['tmp'] = raw[col] * raw['weight']
        by_id = raw.groupby('ID')
        agg[new_col] = agg['ID'].map(by_id['tmp'].mean() / by_id['sum_weight'].first())

In [None]:
# Per-client statistics and weighted mean of the position-discounted symbol scores.
for raw, agg in ((train, df_train), (test, df_test)):
    for c in ['0', '1', '2', 'C', 'X']:
        golden_col = 'golden_{}'.format(c)
        per_client = raw.groupby('ID')[golden_col]
        for func in fstats:
            agg['{}_{}'.format(golden_col, func.__name__)] = agg['ID'].map(per_client.agg(func))

        new_col = golden_col + '_weighted'
        raw['tmp'] = raw[golden_col] * raw['weight']
        by_id = raw.groupby('ID')
        agg[new_col] = agg['ID'].map(by_id['tmp'].mean() / by_id['sum_weight'].first())

Ещё различные признаки:

In [None]:
# Indicator name -> CREDIT_TYPE code it flags.
# NOTE(review): 'potreb' uses the same code (3) as 'ipoteka', so the two feature families are
# identical. This looks like a copy-paste slip; the code is kept as-is until the right
# CREDIT_TYPE value for consumer credits is confirmed.
credit_type_flags = [('micro_credit', 19), ('ipoteka', 3), ('potreb', 3)]

for raw, agg in ((train, df_train), (test, df_test)):
    # Per-credit 0/1 indicators.
    for kind, type_code in credit_type_flags:
        raw['has_' + kind] = (raw['CREDIT_TYPE'] == type_code).astype(int)

    # Number of such credits per client.
    for kind, _ in credit_type_flags:
        agg['num_' + kind] = agg['ID'].map(raw.groupby('ID')['has_' + kind].sum())

    # Share of such credits per client.
    for kind, _ in credit_type_flags:
        agg['ratio_' + kind] = agg['ID'].map(raw.groupby('ID')['has_' + kind].mean())

    # Weighted mean of the indicators.
    for col in ['has_micro_credit', 'has_ipoteka', 'has_potreb']:
        new_col = col + '_weighted'
        raw['tmp'] = raw[col] * raw['weight']
        by_id = raw.groupby('ID')
        agg[new_col] = agg['ID'].map(by_id['tmp'].mean() / by_id['sum_weight'].first())

In [None]:
# AMT_ANNUITY statistics per client: over all credits and over credits with a positive
# annuity only, followed by the weighted mean.
col = 'AMT_ANNUITY'
new_col = 'amt_annuity_weighted'

for raw, agg in ((train, df_train), (test, df_test)):
    positive_rows = raw.loc[raw[col] > 0]
    for func in fstats:
        agg['amt_annuity_{}'.format(func.__name__)] = agg['ID'].map(
            raw.groupby('ID')[col].agg(func))
        agg['amt_annuity_nonzero_{}'.format(func.__name__)] = agg['ID'].map(
            positive_rows.groupby('ID')[col].agg(func))

    raw['tmp'] = raw[col] * raw['weight']
    by_id = raw.groupby('ID')
    agg[new_col] = agg['ID'].map(by_id['tmp'].mean() / by_id['sum_weight'].first())

In [None]:
# AMT_ANNUITY statistics restricted to credits whose planned end date lies after the
# decision date.
for raw, agg in ((train, df_train), (test, df_test)):
    open_at_decision = raw.loc[raw['DTIME_CREDIT_ENDDATE'] > raw['SK_DATE_DECISION']]
    annuity_by_id = open_at_decision.groupby('ID')['AMT_ANNUITY']
    for func in fstats:
        agg['amt_annuity_current_{}'.format(func.__name__)] = agg['ID'].map(annuity_by_id.agg(func))

In [None]:
# Total over all AMT_REQ_SOURCE* columns. The column list is taken from train and reused
# for test.
request_cols = [x for x in train.columns if x.startswith('AMT_REQ_SOURCE')]
col = 'AMT_REQ'
new_col = 'num_requests_weighted'

for raw, agg in ((train, df_train), (test, df_test)):
    raw[col] = raw[request_cols].sum(axis=1)

    for func in fstats:
        agg['num_requests_{}'.format(func.__name__)] = agg['ID'].map(
            raw.groupby('ID')[col].agg(func))

    raw['tmp'] = raw[col] * raw['weight']
    by_id = raw.groupby('ID')
    agg[new_col] = agg['ID'].map(by_id['tmp'].mean() / by_id['sum_weight'].first())

In [None]:
# (source column, feature-name prefix) for the numeric credit attributes. Each gets the
# per-client statistics `<prefix>_<func>` and the weighted mean `<prefix>_weighted`.
amount_features = [
    ('CNT_CREDIT_PROLONG', 'num_prolong'),
    ('AMT_CREDIT_MAX_OVERDUE', 'amt_max_overdue'),
    ('AMT_CREDIT_SUM', 'amt_sum'),
    ('AMT_CREDIT_SUM_DEBT', 'amt_sum_debt'),
    ('AMT_CREDIT_SUM_LIMIT', 'amt_sum_limit'),
    ('AMT_CREDIT_SUM_OVERDUE', 'amt_sum_overdue'),
]

for raw, agg in ((train, df_train), (test, df_test)):
    for col, prefix in amount_features:
        for func in fstats:
            agg['{}_{}'.format(prefix, func.__name__)] = agg['ID'].map(
                raw.groupby('ID')[col].agg(func)
            )

        new_col = prefix + '_weighted'
        raw['tmp'] = raw[col] * raw['weight']
        by_id = raw.groupby('ID')
        agg[new_col] = agg['ID'].map(by_id['tmp'].mean() / by_id['sum_weight'].first())

In [None]:
# Per-client statistics and weighted mean of CREDIT_SUM_TYPE.
col = 'CREDIT_SUM_TYPE'
new_col = 'sum_type_weighted'

for raw, agg in ((train, df_train), (test, df_test)):
    sum_type_by_id = raw.groupby('ID')[col]
    for func in fstats:
        agg['sum_type_{}'.format(func.__name__)] = agg['ID'].map(sum_type_by_id.agg(func))

    raw['tmp'] = raw[col] * raw['weight']
    by_id = raw.groupby('ID')
    agg[new_col] = agg['ID'].map(by_id['tmp'].mean() / by_id['sum_weight'].first())

In [None]:
# Row-wise total over the CREDIT_DELAY* columns.
# The total's own column is excluded from the source list; otherwise re-running this cell
# would add the previous total into the new one.
train_delay_cols = [x for x in train.columns
                    if x.startswith('CREDIT_DELAY') and x != 'CREDIT_DELAY_ALL']
test_delay_cols = [x for x in test.columns
                   if x.startswith('CREDIT_DELAY') and x != 'CREDIT_DELAY_ALL']

train['CREDIT_DELAY_ALL'] = train[train_delay_cols].sum(axis=1)
test['CREDIT_DELAY_ALL'] = test[test_delay_cols].sum(axis=1)

In [None]:
# Per-client statistics and weighted mean for every CREDIT_DELAY* column present in train
# (the list is fixed up front and reused for test).
delay_feature_cols = [x for x in train.columns if x.startswith('CREDIT_DELAY')]

for raw, agg in ((train, df_train), (test, df_test)):
    for col in delay_feature_cols:
        prefix = col.lower()
        for func in fstats:
            agg['{}_{}'.format(prefix, func.__name__)] = agg['ID'].map(
                raw.groupby('ID')[col].agg(func)
            )

        new_col = prefix + '_weighted'
        raw['tmp'] = raw[col] * raw['weight']
        by_id = raw.groupby('ID')
        agg[new_col] = agg['ID'].map(by_id['tmp'].mean() / by_id['sum_weight'].first())

Здесь сохраним данные от первого прохода (при втором проходе эту ячейку выполнять не нужно, иначе сохранённые признаки будут перезаписаны):

In [None]:
# Snapshot of the client-level features built so far, kept under separate names.
df_train_active, df_test_active = df_train.copy(), df_test.copy()

__Внимание__. Теперь необходимо вернуться назад и заново загрузить датасет, сделать второй проход.

Объединение признаков:

In [None]:
# Move ID from a column into the index of all four feature tables (in place), so they can
# be aligned on it.
for feature_table in (df_train, df_test, df_train_active, df_test_active):
    feature_table.set_index('ID', inplace=True)

In [None]:
# Put the "active" feature set next to the current one, aligned on the ID index; every
# column coming from the active tables gets an '_active' suffix.
train_active_renamed = df_train_active.rename(columns=lambda name: name + '_active')
test_active_renamed = df_test_active.rename(columns=lambda name: name + '_active')

df_train = pd.concat((df_train, train_active_renamed), axis=1)
df_test = pd.concat((df_test, test_active_renamed), axis=1)

# The label is already present as DEF; its suffixed copy is dropped.
df_train = df_train.drop('DEF_active', axis=1)

Обучим несколько моделей с различными отложенными выборками и параметрами, усредним:

In [None]:
# LightGBM parameter sets; the predictions of the resulting models are averaged afterwards.
list_params = [
    {'lambda_l2': 1.1391988554694428e-40, 'num_leaves': 15},
    {'lambda_l2': 1.0, 'num_leaves': 26},
]

# `df` was referenced but never defined in the notebook (leftover kernel state). It is
# defined here as the client-level feature matrix: df_train without the label column.
# The label vector is re-read from the same frame so rows are guaranteed to line up.
df = df_train.drop('DEF', axis=1)
target = df_train['DEF'].values

# Test predictions must use exactly the training columns: df_test accumulates
# 'prediction_*' columns in this loop, and those must not be fed back into the model.
feature_cols = list(df.columns)

# Taken from the splitter instead of a hard-coded 5.0 so the test average stays correct.
n_folds = float(cv.get_n_splits())

for i, params in enumerate(list_params):
    params['objective'] = 'binary'
    params['metric'] = 'auc'
    params['learning_rate'] = 0.02

    pred_col = 'prediction_{}'.format(str(i))
    df_test[pred_col] = 0.0

    scores = []
    for i_tr, i_ts in cv.split(df):
        X_tr = df.iloc[i_tr]
        X_ts = df.iloc[i_ts]
        y_tr = target[i_tr]
        y_ts = target[i_ts]
        ltrain = lgb.Dataset(X_tr, y_tr)
        lvalid = ltrain.create_valid(X_ts, y_ts)
        # The hold-out fold doubles as the early-stopping validation set.
        bst = lgb.train(params, train_set=ltrain, valid_sets=lvalid,
                        num_boost_round=2000, early_stopping_rounds=50,
                        verbose_eval=False)
        y_pred = bst.predict(X_ts)
        scores.append(roc_auc_score(y_ts, y_pred))
        y_pr = bst.predict(df_test[feature_cols])
        df_test[pred_col] += y_pr / n_folds

    # Mean and spread of the fold AUCs; a single formatted string prints the same text
    # under both Python 2 and Python 3.
    print('{} {}'.format(np.mean(scores), np.std(scores)))

Сохраним предсказания:

In [None]:
prediction_cols = [x for x in df_test.columns if x.startswith('prediction_')]

# Final score: plain average of the per-parameter-set predictions.
df_test['Score'] = df_test[prediction_cols].mean(axis=1)

# ID was moved into the index by the set_index cell, so `df_test[['ID', 'Score']]` raised
# KeyError. Take the IDs from wherever they currently live.
if 'ID' in df_test.columns:
    submission_ids = df_test['ID'].values
else:
    submission_ids = df_test.index.values

submission = pd.DataFrame(
    {'ID': submission_ids, 'Score': df_test['Score'].values},
    columns=['ID', 'Score'],
)
submission.to_csv('../output/submission.csv', index=False)

# Correlation between the individual models' predictions; kept as the cell's last
# expression so the notebook actually displays it.
df_test[prediction_cols].corr()