In [1]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from util import *

In [2]:
data = load_data("data/data-202008.csv")

In [3]:
cat_features = ["province_code", "district_code","viettel_bank_code","channel_code"
                ,"staff_code","process_code","shop_code"]

data = preprocess_data(data, cat_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [4]:
# Label encoder
for c in data.columns[data.dtypes == 'object']:
    data[c] = data[c].factorize()[0]

In [5]:
data.dtypes.value_counts()

int64      10
float64     6
bool        4
dtype: int64

In [6]:
data.head()

Unnamed: 0,province_code,district_code,date_diff,viettel_bank_code,channel_type_id,channel_code,staff_code,trans_amount,trans_fee,trans_type,process_code,channel_fee,shop_code,customer_fee,fee_partner,is_fraud,same_name,same_phone,same_phone_channel,same_phone_channel_ben
0,0,0,16,0,164.0,0,0,500000.0,8182.0,1,0,5400.0,0,18000.0,0.0,0,False,True,False,False
1,1,1,16,0,164.0,1,1,3000000.0,10000.0,2,1,5500.0,1,11000.0,0.0,0,False,True,False,False
2,2,2,16,1,6.0,2,2,600000.0,0.0,2,2,1260.0,2,0.0,0.0,0,True,True,False,False
3,1,3,16,2,197.0,3,3,393692.0,1000.0,3,3,770.0,3,0.0,1100.0,0,False,False,False,False
4,3,4,16,2,1.0,4,4,349384.0,909.0,3,4,700.0,4,0.0,1000.0,0,False,False,True,False


In [7]:
X, y, X_train, y_train, X_val, y_val, X_test, y_test = split_data(data)

In [8]:
# from xgboost import XGBClassifier
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedStratifiedKFold
# # define model
# model = XGBClassifier()
# # define evaluation procedure
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=1)
# # evaluate model
# scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=2)
# # summarize performance
# print('Mean ROC AUC: %.5f' % mean(scores))

In [9]:
D_train = xgb.DMatrix(X_train, label=y_train)
D_val = xgb.DMatrix(X_val, label=y_val)
D_test = xgb.DMatrix(X_test, label=y_test)

# parameters = {'max_depth': 4,
#          'eta': 0.01,
#          'objective': 'binary:logistic',
#          'eval_metric': 'error',
#          'nthread': 4}

parameters = {'nthread':4, #when use hyperthread, xgboost may become slower
              'objective':'binary:logistic',
              'learning_rate': 0.05, #so called `eta` value
              'max_depth': 6,
              'min_child_weight': 11,
              'silent': 1,
              'subsample': 0.8,
              'colsample_bytree': 0.7,
              'n_estimators': 5, #number of trees, change it to 1000 for better results
              'missing':-999,
              'seed': 1337,
                'scale_pos_weight':10}

steps = 20  # The number of training iterations

model = xgb.train(parameters, D_train, steps)
# #xgb.fit(X_train, y_train)


Parameters: { missing, n_estimators, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [10]:
save_model(model, 'xgb-model.saved')

In [11]:
#eval_model(model, X_test, y_test)
model = load_model('xgb-model.saved')
y_pred = model.predict(D_test)
print('classification_report: \n{}'.format(classification_report(y_test, y_pred.round())))
print('confusion_matrix: \n{}'.format(confusion_matrix(y_test,  y_pred.round())))
print('roc_auc_score: {}'.format(roc_auc_score(y_test,  y_pred.round())))
print('f1_score: {}'.format(f1_score(y_test,  y_pred.round())))
print('precision_score: {}'.format(precision_score(y_test,  y_pred.round())))
print('recall_score: {}'.format(recall_score(y_test,  y_pred.round())))

classification_report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1433272
           1       0.38      0.67      0.49      5430

    accuracy                           0.99   1438702
   macro avg       0.69      0.83      0.74   1438702
weighted avg       1.00      0.99      1.00   1438702

confusion_matrix: 
[[1427400    5872]
 [   1775    3655]]
roc_auc_score: 0.8345077096810526
f1_score: 0.488734371866016
precision_score: 0.38364647842972605
recall_score: 0.6731123388581952


In [12]:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import *

xgb_model = xgb.XGBClassifier()

#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [1337]}


clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
                   cv=StratifiedKFold(train['QuoteConversion_Flag'], n_folds=5, shuffle=True), 
                   scoring='roc_auc',
                   verbose=2, refit=True)

clf.fit(train[features], train["QuoteConversion_Flag"])


ModuleNotFoundError: No module named 'sklearn.grid_search'