In [1]:
import pandas as pd
import numpy as np

from feature_pr import * #make_counters, make_counters_test, compress_vals, replace_val, make_pairs

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
%matplotlib inline
plt.rcParams['figure.figsize'] = (10,7)

In [5]:
data = pd.read_csv('data/credit_train.csv', sep=';', encoding='cp1251')
data_test = pd.read_csv('data/credit_test.csv', sep=';', encoding='cp1251')

In [6]:
df = pd.concat([data, data_test])

In [7]:
counters_cols = ['tariff_id', 'living_region', 'age', 'credit_count', 'overdue_credit_count',
                 'gender', 'marital_status', 'job_position', 'credit_month', 'education']

other_cols = ['credit_sum', 'score_shk', 'monthly_income', 'monthly_credit_sum',
              'lr_median_income_dif', 'lr_median_credit_sum_dif', 'income_credit_sum_dif',
              'income_credit_sum_frac', 'lr_median_credit_sum_frac']

df.living_region.fillna('N', inplace=True)
df.credit_count.fillna(10, inplace=True)
df.overdue_credit_count.fillna(2, inplace=True)
df['credit_sum'] = df.credit_sum.apply(lambda x: float(x.replace(',', '.')))
df['score_shk'] = df.score_shk.apply(lambda x: float(x.replace(',', '.')))

# нормализация имен регионов для подсчета статистик, для классификации использовались исходные
a = ['РЕСП', 'ОБЛ', 'ОБЛАСТЬ', 'КРАЙ', 'ОБЛ.', 'РЕСП.', 'Р-Н', 'АО', 'КРАЙ.', '-', 'Г', 'Г.', 'АОБЛ', 'РЕСПУБЛИКА',
    'ОКРУГ', 'АВТОНОМНЫЙ']
df['living_region2'] = df.living_region.apply(lambda x: ''.join([y for y in x.upper().split() if y not in a]))
df['living_region2'] = df.living_region2.apply(lambda x: ''.join([y for y in x.split('.') if y not in a]))

df.loc[df.living_region2 == 'САХА/ЯКУТИЯ/', 'living_region2'] = 'САХА'
df.loc[df.living_region2 == 'САХА(ЯКУТИЯ)', 'living_region2'] = 'САХА'
df.loc[df.living_region2 == 'ХАНТЫ-МАНСИЙСКИЙЮГРА', 'living_region2'] = 'ХАНТЫ-МАНСИЙСКИЙ'
df.loc[df.living_region2 == 'ХАНТЫ-МАНСИЙСКИЙЮ', 'living_region2'] = 'ХАНТЫ-МАНСИЙСКИЙ'
df.loc[df.living_region2 == 'ЕВРЕЙСКАЯАВТОНОМНАЯ', 'living_region2'] = 'ЕВРЕЙСКАЯ'
df.loc[df.living_region2 == 'ЧУВАШСКАЯЧУВАШИЯ', 'living_region2'] = 'ЧУВАШСКАЯ'
df.loc[df.living_region2 == 'ЧУВАШИЯЧУВАШСКАЯ', 'living_region2'] = 'ЧУВАШСКАЯ'
df.loc[df.living_region2 == 'СЕВЕРНАЯОСЕТИЯАЛАНИЯ', 'living_region2'] = 'ОСЕТИЯ'
df.loc[df.living_region2 == 'СЕВОСЕТИЯАЛАНИЯ', 'living_region2'] = 'ОСЕТИЯ'
df.loc[df.living_region2 == 'ГОРЬКОВСКАЯ', 'living_region2'] = 'НИЖЕГОРОДСКАЯ'
df.loc[df.living_region2 == 'ПЕРМСКАЯ', 'living_region2'] = 'ПЕРМСКИЙ'
df.loc[df.living_region2 == 'КАМЧАТСКАЯ', 'living_region2'] = 'КАМЧАТСКИЙ'
df.loc[df.living_region2 == 'ЧУКОТСКИЙАO', 'living_region2'] = 'ЧУКОТСКИЙ'
df.loc[df.living_region2 == 'АЛТАЙ', 'living_region2'] = 'АЛТАЙСКИЙ'

# преобразования вещественных признаков
f = lambda x: x.fillna(x.median(), inplace=True)
df.groupby(df.living_region2).monthly_income.transform(f)

df['monthly_credit_sum'] = df.credit_sum / df.credit_month

median_income = dict(df.groupby(df.living_region2).monthly_income.median())
df['lr_median_income_dif'] = df.monthly_income - df.living_region2.map(median_income)

median_credit_sum = dict(df.groupby(df.living_region2).credit_sum.median())
df['lr_median_credit_sum_dif'] = df.credit_sum - df.living_region2.map(median_credit_sum)
df['lr_median_credit_sum_frac'] = df.credit_sum / df.living_region2.map(median_credit_sum)

df['income_credit_sum_dif'] = df.monthly_income - df.monthly_credit_sum
df['income_credit_sum_frac'] = df.monthly_income / df.monthly_credit_sum

# Объединим редко встречающиеся значения признаков
df.loc[df.credit_count > 10, 'credit_count'] = 10
df.loc[df.overdue_credit_count > 2, 'overdue_credit_count'] = 2

compress_vals(df, 'living_region', 5, 'N')
compress_vals(df, 'tariff_id', 5, 2.0)
compress_vals(df, 'job_position', 20, 'N')


replace_val(df, 'credit_month', [30, 31, 32], 36)
replace_val(df, 'credit_month', [21, 22, 23, 25, 26, 27, 28, 29], 24)
replace_val(df, 'credit_month', [17], 18)

for col in ['living_region', 'gender', 'marital_status', 'job_position', 'education']:
    df[col], _ = pd.factorize(df[col])

# частоты встреч
for col in counters_cols:
    df[col + '_count'] = df[col].map(df[col].value_counts())

In [8]:
data = df.iloc[:data.shape[0], :]
data_test = df.iloc[-data_test.shape[0]:, :]

In [9]:
# кодирование средним таргетом
X_train_prob = make_counters(X=data[counters_cols].values, y=data.open_account_flg.values, n_folds=10)
X_test_prob = make_counters_test(data_test[counters_cols].values, data[counters_cols].values, 
                            data.open_account_flg.values)
for i in range(len(counters_cols)):
    data[counters_cols[i] + '_prob'] = X_train_prob[:, i]
    data_test[counters_cols[i] + '_prob'] = X_test_prob[:, i]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
feature_names = []
for name in counters_cols:
    feature_names.append(name + '_count')
    feature_names.append(name + '_prob')
for name in other_cols:
    feature_names.append(name)

In [None]:
from xgboost.sklearn import XGBClassifier

In [None]:
xgb = XGBClassifier(
 learning_rate=0.01,
 n_estimators=3030,
 max_depth=5,
 min_child_weight=5,
 gamma=0.0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective='binary:logistic',
 scale_pos_weight=1,
 seed=27,
 nthread=nthr)

xgb.fit(data[feature_names], data.open_account_flg.values, eval_metric='auc')
feat_imp = pd.Series(alg.booster().get_score(importance_type='gain')).sort_values(ascending=False)

answer = xgb.predict_proba(data_test[feature_names])
an = pd.DataFrame({'_ID_': _ID_, '_VAL_': answer[:, 1]})
an.to_csv(output_name, index=False)

* validation: 0.77195
* public test: 0.7687
* private test: 0.7713