In [1]:
# %run preprocessing.ipynb

import pandas as pd
train = pd.read_csv('data/credit_train.csv', sep = ';', encoding='cp1251', index_col = 'client_id')
X = train.drop('open_account_flg', axis = 1)
y = train.open_account_flg
X_test = pd.read_csv('data/credit_test.csv', sep = ';', encoding='cp1251', index_col = 'client_id')
df = pd.concat([X, X_test], axis=0)
df.fillna(-999, inplace = True)
df['credit_sum'] = [float(s.replace(',', '.')) for s in df.credit_sum]
df['score_shk'] = [float(s.replace(',', '.')) for s in df.score_shk]
education_map = { 'SCH' : 0, 'UGR' : 1, 'GRD' : 2, 'PGR' : 3, 'ACD' : 4 }
df.replace({'education' : education_map}, inplace = True)
cat_variables = [i for i in df.columns if df[i].dtype == 'O']
from sklearn.preprocessing import LabelEncoder
encods = [LabelEncoder() for col in cat_variables]
for i, col in enumerate(cat_variables):
    df[col] = encods[i].fit_transform(df[col].astype(str))
X = df[:len(X)]
X_test = df[len(X):]

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score

import xgboost as xgb

from datetime import datetime

In [3]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size = 0.4)

In [4]:
dtrain = xgb.DMatrix(X_train, y_train, missing=-999)
deval = xgb.DMatrix(X_eval, y_eval, missing=-999)
dtest = xgb.DMatrix(X_test, missing=-999)

In [5]:
param = {
    'max_depth' : 7,
    'objective':'binary:logistic'
}
param['eval_metric'] = 'auc'
evallist  = [(deval,'eval'), (dtrain,'train')]

In [6]:
num_round = 1000
bst = xgb.train(param, 
                dtrain, 
                num_round,
                evals = evallist,
                verbose_eval = 100
               )

[0]	eval-auc:0.732437	train-auc:0.740606
[100]	eval-auc:0.756861	train-auc:0.86523
[200]	eval-auc:0.748634	train-auc:0.909786
[300]	eval-auc:0.742346	train-auc:0.939339
[400]	eval-auc:0.737913	train-auc:0.957205
[500]	eval-auc:0.733446	train-auc:0.970081
[600]	eval-auc:0.729127	train-auc:0.978128
[700]	eval-auc:0.725401	train-auc:0.985476
[800]	eval-auc:0.72216	train-auc:0.990886
[900]	eval-auc:0.719792	train-auc:0.993907


In [7]:
y_pred = bst.predict(deval)
auc = roc_auc_score(y_eval, y_pred)
print('The auc of prediction is:', auc)

The auc of prediction is: 0.71807248517


In [8]:
pred = bst.predict(dtest)

datename = datetime.now().strftime(format = '%d%m_%H%M')
filename = 'submissions/' + 'xgb_' + datename + '_' + str(round(auc, 5)) + '.csv'
print('saving to %s...' % filename)

pd.DataFrame({'_ID_': X_test.index, '_VAL_': pred}).to_csv(filename, index=False, sep=';')

saving to submissions/xgb_1301_1818_0.71807.csv...


In [9]:
print()




In [10]:
%ls submissions/

gbm_1201_1932.csv          gbm_1201_2136_0.76083.csv
gbm_1201_1942_0.73039.csv  gbm_1201_2140_0.76138.csv
gbm_1201_1943_0.75541.csv  gbm_1301_1600_0.76456.csv
gbm_1201_2037_0.75769.csv  gbm_1301_1604_0.76904.csv
gbm_1201_2038_0.75769.csv  xgb_1201_1843.csv
gbm_1201_2106_0.5002.csv   xgb_1201_1848.csv
gbm_1201_2107_0.7604.csv   xgb_1301_1630_0.76089.csv
gbm_1201_2120_0.75891.csv  xgb_1301_1818_0.71807.csv
