In [1]:
# Reload the autoreload extension and re-import changed modules before every cell run (mode 2).
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
from scipy import sparse
import pickle
from sklearn.feature_selection import SelectFromModel

import gc
import random

In [3]:
import matplotlib.pyplot as plt

In [4]:
# Directory layout, relative to the notebook's working directory.
# The two features_* directories are not referenced by any other visible cell.
features_dir227 = './features_227/'
features_dir306 = './features_306/'
# Raw input CSVs (train.csv and pred_users.csv are read from here).
data_dir = "./data/FT_Camp_2/"
# Pickled intermediates: feature matrices, labels, hyperparameter pools, models, predictions.
inter_dir = "./inter_data_repo/2/"

In [5]:
# Training data
train227 = pd.read_csv(data_dir + 'train.csv')

# Target users to generate predictions for
pred_users306 = pd.read_csv(data_dir + 'pred_users.csv')

In [9]:
# Fix Python's RNG so the shuffled pools below are identical on every run.
random.seed(42)

# Candidate pools for the random search; the search loop indexes into them per iteration.
random_seed = list(range(43))
max_depth = [3, 4]
lambd = list(range(5))
subsample = [step / 1000.0 for step in range(700, 800)]
colsample_bytree = [step / 1000.0 for step in range(700, 800)]
min_child_weight = [step / 100.0 for step in range(150, 250)]
# (An n_feature pool, range(150, 282, 2), is currently disabled.)

# Shuffle in place, in this exact order: each shuffle consumes RNG state,
# so reordering the pools here would change every result.
for _pool in (random_seed, max_depth, lambd, subsample, colsample_bytree, min_child_weight):
    random.shuffle(_pool)
# random.shuffle(n_feature)

In [None]:
# Persist every shuffled hyperparameter pool to inter_dir as '<name>_<iteration>.p'.
# NOTE(review): `iteration` is not assigned in any cell visible above this one;
# it must be defined before this cell runs, otherwise a NameError is raised.
_pools_to_save = (
    ('random_seed', random_seed),
    ('max_depth', max_depth),
    ('lambd', lambd),
    ('subsample', subsample),
    ('colsample_bytree', colsample_bytree),
    ('min_child_weight', min_child_weight),
)
for _pool_name, _pool_values in _pools_to_save:
    # `with` guarantees the handle is flushed and closed; the bare open() calls never closed it.
    with open(inter_dir + _pool_name + '_' + str(iteration) + '.p', 'wb') as _fh:
        pickle.dump(_pool_values, _fh)
# pickle.dump(random_seed, open(inter_dir + 'random_seed_'+str(iteration)+'.p', 'wb'))

## Start training

In [9]:
# Load the pickled feature tables (X227, X306) and training labels (Y227) from inter_dir.
# Each file is opened in a `with` block so the handle is closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code - only load pickles you produced yourself.
with open(inter_dir + 'X227.p', 'rb') as _fh:
    X227 = pickle.load(_fh)
with open(inter_dir + 'X306.p', 'rb') as _fh:
    X306 = pickle.load(_fh)

with open(inter_dir + 'Y227.p', 'rb') as _fh:
    Y227 = pickle.load(_fh)

In [10]:
# Select X306's columns in X227's column order so both tables share one feature layout.
# Assumes both are pandas DataFrames; a column missing from X306 would raise KeyError.
X306 = X306[list(X227.columns)]

In [11]:
# X227.columns == X306.columns

In [19]:
# Convert both feature tables to SciPy CSR sparse matrices; the training pipeline
# slices X227_csr by row for each fold and predicts on X306_csr.
X227_csr = sparse.csr_matrix(X227)
X306_csr = sparse.csr_matrix(X306)

In [20]:
# X227_csr = sparse.csr_matrix(X227.loc[:,sfeatures])
# X306_csr = sparse.csr_matrix(X306.loc[:,sfeatures])

In [21]:
# Sanity check: both matrices should report the same number of feature columns.
print(X227_csr.shape, X306_csr.shape, sep='\n')

(108252, 694)
(94655, 694)


In [9]:
# def pipeline(iteration,random_seed,max_depth,lambd,subsample,colsample_bytree,min_child_weight):
    
#     X_train, y_train = X227_csr, Y227
#     if max_depth==3:
#         n_estimators = 400
#     elif max_depth==4:
#         n_estimators = 300

#     model = xgb.XGBClassifier(n_estimators=n_estimators,
#                               max_depth = max_depth,
#                               reg_lambda = lambd,
#                               subsample = subsample,
#                               colsample_bytree = colsample_bytree,
#                               min_child_weight = min_child_weight,
#                               n_jobs = 6,
#                               random_state=random_seed)
#     eval_set = [(X_train, y_train)]
#     models.fit(X_train, y_train, eval_metric=['auc'], eval_set=eval_set, verbose=False)

In [41]:
# Number of boosting rounds paired with each supported tree depth.
_N_ESTIMATORS_BY_DEPTH = {3: 400, 4: 300}


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``; the file handle is closed even if dumping fails."""
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)


def pipeline(iteration, random_seed, max_depth, lambd, subsample, colsample_bytree,
             min_child_weight, n_splits=5, n_jobs=6):
    """Run one cross-validated XGBoost training round and persist its artifacts.

    Reads the notebook globals ``X227_csr`` (training features), ``Y227``
    (training labels), ``X306_csr`` (rows to score) and ``inter_dir``.

    For every stratified fold an ``xgb.XGBClassifier`` is fitted on the
    training part, its validation ROC AUC is printed, and its positive-class
    probabilities for ``X306_csr`` are accumulated. The fold-averaged
    predictions, the fitted models and the per-fold ROC AUC scores are pickled
    to ``inter_dir`` as ``preds306_<iteration>.p``, ``models_<iteration>.p``
    and ``roc_scores_<iteration>.p``. The mean ROC AUC is printed at the end.

    Parameters
    ----------
    iteration : suffix used in the names of the output pickle files.
    random_seed : ``random_state`` passed to each classifier.
    max_depth : tree depth; must be 3 (400 estimators) or 4 (300 estimators).
    lambd : value passed as ``reg_lambda``.
    subsample, colsample_bytree, min_child_weight : passed through to the classifier.
    n_splits : number of stratified folds (default 5, the original fixed value).
    n_jobs : threads per classifier (default 6, the original fixed value).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``max_depth`` has no configured number of estimators.
    """
    # Validate first: previously an unsupported depth left n_estimators unbound
    # and failed later with an UnboundLocalError inside the fold loop.
    if max_depth not in _N_ESTIMATORS_BY_DEPTH:
        raise ValueError(
            "max_depth must be one of {}, got {!r}".format(
                sorted(_N_ESTIMATORS_BY_DEPTH), max_depth))
    n_estimators = _N_ESTIMATORS_BY_DEPTH[max_depth]

    # The fold split is seeded with a constant so every iteration is scored on the same folds.
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # kf.split yields positional indices. Indexing a pandas Series with [] is
    # label-based, so convert the labels to an ndarray to make the lookup positional
    # regardless of Y227's index.
    labels = np.asarray(Y227)

    preds306 = np.zeros(X306_csr.shape[0])
    models = []
    roc_scores = []
    for train_index, val_index in kf.split(X227_csr, labels):
        X_train, X_val = X227_csr[train_index], X227_csr[val_index]
        y_train, y_val = labels[train_index], labels[val_index]
        eval_set = [(X_train, y_train), (X_val, y_val)]

        model = xgb.XGBClassifier(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  reg_lambda=lambd,
                                  subsample=subsample,
                                  colsample_bytree=colsample_bytree,
                                  min_child_weight=min_child_weight,
                                  n_jobs=n_jobs,
                                  random_state=random_seed)
        # NOTE(review): recent xgboost releases expect eval_metric on the constructor
        # instead of fit() - confirm against the installed version before upgrading.
        model.fit(X_train, y_train, eval_metric=['auc', "error", "logloss"],
                  eval_set=eval_set, verbose=False)
        models.append(model)

        # Validation ROC AUC from the positive-class probability.
        roc = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
        roc_scores.append(roc)
        print(roc)

        # Accumulate this fold's scores for the prediction set; averaged after the loop.
        preds306 += model.predict_proba(X306_csr)[:, 1]

    preds306 /= kf.get_n_splits()

    suffix = str(iteration)
    _dump_pickle(preds306, inter_dir + 'preds306_' + suffix + '.p')
    _dump_pickle(models, inter_dir + 'models_' + suffix + '.p')
    _dump_pickle(roc_scores, inter_dir + 'roc_scores_' + suffix + '.p')
    print("mean_roc_score: {}".format(np.mean(roc_scores)))
    

In [None]:
# Run the first 30 search iterations. The depth and lambda pools hold only
# 2 and 5 values, so they are cycled with a modulo on the iteration number.
for i in range(30):
    print("iter:", i)
    trial = (random_seed[i], max_depth[i % 2], lambd[i % 5],
             subsample[i], colsample_bytree[i], min_child_weight[i])
    print(*trial)
    pipeline(i, *trial)


iter: 0
12 4 1 0.724 0.752 1.64
0.6963155052551382
0.6988293898152477
0.6977228138528138
0.6962828896103896
0.6946056060606061
mean_roc_score: 0.6967512409188391
iter: 1
19 3 4 0.708 0.731 1.61
0.7005322126123319
0.7001805642131145
0.6981005735930735
0.6968932575757576


In [1]:
# for i in range(kf.get_n_splits()):
#     results = models[i].evals_result()
#     print(len(results['validation_0']['auc']))