In [2]:
from sklearn.model_selection import train_test_split
from sklearn import metrics   #Additional scklearn functions
import pandas as pd
import numpy as np
import xgboost as xgb

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 4

In [3]:
train_df = pd.read_csv('../dump/train_modified.csv')
test_df = pd.read_csv('../dump/test_modified.csv')

In [4]:
target='acc_now_delinq'
IDcol = 'member_id'
train_df[target].value_counts()

0.0    697707
1.0      3258
Name: acc_now_delinq, dtype: int64

In [31]:
predictors = [x for x in train_df.columns if x not in [target,IDcol]]
params = {
        'objective': 'binary:logistic',
        'eta': 0.1,
        'max_depth': 9,
        'seed': 27,
        'n_estimators':1000,
        'min_child_weight':1,
        'gamma':0,
        'subsample':0.8,
        'colsample_bytree':0.8,
        'n_jobs' : -1,
        'scale_pos_weight':100,
#     'eval_metric' : 'auc'
    }

In [15]:
def f1_error(preds,dtrain): #preds是结果（概率值），dtrain是个带label的DMatrix
    beta=2
    label=dtrain.get_label() #提取label
    
    preds = 1.0/(1.0 + np.exp(-preds))
    pred = [int(i>=0.5) for i in preds]
    
    tp=sum([int(i==1 and j==1) for i,j in zip(pred,label)])
    precision = float(tp)/sum(pred)
    recall=float(tp)/sum(label)
    
    return 'f1-score', -1*(precision*recall/(precision+recall))

In [16]:
def maxRecall(preds,dtrain): #preds是结果（概率值），dtrain是个带label的DMatrix
    labels=dtrain.get_label() #提取label
    preds=1-preds
    precision,recall,threshold = metrics.precision_recall_curve(labels,preds,pos_label=0)
    pr=pd.DataFrame({'precision':precision,'recall':recall})
    return 'Max Recall:',pr[pr.precision>=0.5].recall.max()

In [33]:
def f1_error(preds,dtrain):
    beta=2
    label=dtrain.get_label() #提取label
    
    preds = 1.0/(1.0 + np.exp(-preds))
    pred = [int(i>=0.5) for i in preds]
    
    tp=sum([int(i==1 and j==1) for i,j in zip(pred,label)])
    precision = float(tp)/sum(pred)
    recall=float(tp)/sum(label)
    
    return 'f1-score', -1*(precision*recall/(precision+recall))
    

In [9]:
# Split Train/Test data
x_train, x_valid, y_train, y_valid = train_test_split(train_df[predictors], train_df[target], test_size=0.15, random_state=14)

In [34]:

xgbtrain = xgb.DMatrix(train_df[predictors], train_df[target])
xgbvalid = xgb.DMatrix(x_valid, label=y_valid)
xgbtest = xgb.DMatrix(test_df[predictors])
watchlist = [(xgbtrain, 'train'), (xgbtrain, 'test')]
num_rounds = 60
model = xgb.train(params, xgbtrain, num_rounds, watchlist, early_stopping_rounds=15
                  , feval = f1_error
                 )
p_valid = model.predict(xgbvalid)

[0]	train-error:0.154731	test-error:0.154731	train-f1-score:-0.004626	test-f1-score:-0.004626
Multiple eval metrics have been passed: 'test-f1-score' will be used for early stopping.

Will train until test-f1-score hasn't improved in 15 rounds.
[1]	train-error:0.101992	test-error:0.101992	train-f1-score:-0.004626	test-f1-score:-0.004626
[2]	train-error:0.089529	test-error:0.089529	train-f1-score:-0.004626	test-f1-score:-0.004626
[3]	train-error:0.082404	test-error:0.082404	train-f1-score:-0.004626	test-f1-score:-0.004626
[4]	train-error:0.07991	test-error:0.07991	train-f1-score:-0.004626	test-f1-score:-0.004626
[5]	train-error:0.077546	test-error:0.077546	train-f1-score:-0.004626	test-f1-score:-0.004626
[6]	train-error:0.074071	test-error:0.074071	train-f1-score:-0.004626	test-f1-score:-0.004626
[7]	train-error:0.073648	test-error:0.073648	train-f1-score:-0.004626	test-f1-score:-0.004626
[8]	train-error:0.073061	test-error:0.073061	train-f1-score:-0.004626	test-f1-score:-0.004626
[9]	t

In [29]:
#Predict training set:
dtrain_predictions = model.predict(xgbvalid)
dtrain_predprob = model.predict_proba(xgbvalid)[:,1]

AttributeError: 'Booster' object has no attribute 'predict_proba'

In [None]:
#Print model report:
print("\nModel Report")
print("Accuracy : %.4g" % metrics.accuracy_score(train_df['acc_now_delinq'].values, dtrain_predictions))
print("Recall : %.4g" % metrics.recall_score(train_df['acc_now_delinq'].values, dtrain_predictions))
print("Fbeta Score : %.4g" % metrics.fbeta_score(train_df['acc_now_delinq'].values, dtrain_predictions,beta=2))
print("AUC Score (Train): %f" % metrics.roc_auc_score(train_df['acc_now_delinq'], dtrain_predprob))
print(metrics.classification_report(train_df['acc_now_delinq'], dtrain_predictions))

#     Predict on testing data:
test_df['predprob'] = xgb1.predict_proba(test_df[predictors])[:,1]
#     results = test_results.merge(dtest[['ID','predprob']], on='ID')
#     print('AUC Score (Test): %f' % metrics.roc_auc_score(results['Disbursed'], results['predprob']))

feat_imp = pd.Series(xgb1.get_booster().get_score(importance_type='weight')).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')