In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score,roc_curve
import lightgbm as lgb
import gc
import feature_engineering as fe

def score(y_true, y_score):
    """ Evaluation metric
    """
    fpr, tpr, thresholds = roc_curve(y_true, y_score, pos_label=1)
    score = 0.4 * tpr[np.where(fpr >= 0.001)[0][0]] + \
            0.3 * tpr[np.where(fpr >= 0.005)[0][0]] + \
            0.3 * tpr[np.where(fpr >= 0.01)[0][0]]

    return score


def evaluate(y_true, y_pred, y_prob):
    """ 估计结果: precision, recall, f1, auc, mayi_score
    """
    p = precision_score(y_true, y_pred)
    r = recall_score(y_true, y_pred)
    f1  = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob)
    mayi = score(y_true, y_prob)
    
    return [p,r,f1,auc,mayi] 

def cv(clf_fit_params,clf,X,y,n_splits=5):
    models=[]
    i=0
    eval_train = pd.DataFrame(index=range(n_splits), columns=['P','R','F1','AUC','mayi'])
    eval_test  = pd.DataFrame(index=range(n_splits), columns=['P','R','F1','AUC','mayi'])
    kf=KFold(n_splits=5,shuffle=True,random_state=2018)
    for train_index,test_index in kf.split(X):
        X_,X_test=X[train_index],X[test_index]
        y_,y_test=y[train_index],y[test_index]
        X_train,X_valid,y_train,y_valid=train_test_split(X,y,test_size=0.2,random_state=2018)
        del X_,y_
        gc.collect()
        
        
        clf.fit(X_train, y_train,eval_set=[(X_valid,y_valid)], eval_metric='auc',
        verbose=True, **clf_fit_params)
        
        ## Model Testing
        # On training set
        y_prob_train = clf.predict_proba(X_train)[:,1]
        y_pred_train = clf.predict(X_train)
        eval_train.iloc[i,:] = evaluate(y_train, y_pred_train, y_prob_train)
        
        # On testing set
        y_prob_test = clf.predict_proba(X_test)[:,1]
        y_pred_test = clf.predict(X_test)
        eval_test.iloc[i,:] = evaluate(y_test, y_pred_test, y_prob_test)
        models.append(clf)
        
        i+=1
        
    return models,eval_train,eval_test




In [2]:
train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")
train.fillna(-999,inplace=True)
test.fillna(-999,inplace=True)
train=fe.transfer_date2_week(train)
test=fe.transfer_date2_week(test)

y=train["label"].values
train.drop(["label","date"],axis=1,inplace=True)
test.drop(["label","date"],axis=1,inplace=True)

features=train.columns.tolist()
categorical_features=['f1','f2','f3','f4','f6','f7','f8','f9','f10','f11','f12','f13','f14','f15','f16','f17','f18','f19','isholiday']
X=train[features].values



In [12]:
clf=lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=1000,
    max_bin=255,
    subsample_for_bin=200000,
    objective=None,
    min_split_gain=0.0,
    min_child_weight=0.001,
    min_child_samples=20,
    subsample=1.0,
    subsample_freq=1,
    colsample_bytree=1.0,
    reg_alpha=0.0,
    reg_lambda=0.0,
    random_state=None, 
    n_jobs=-1,
    silent=True
)


clf_fit_params = {
                'feature_name': features,
                  'categorical_feature': categorical_features,
                  'early_stopping_rounds':100
                 }

models,eval_train,eval_test=cv(clf_fit_params,clf,X,y,n_splits=5)
test_prob_final = np.zeros((len(test),))
for model in models:
    test_prob = model.predict_proba(test)[:,1]
    test_prob_final += (test_prob*0.2)



[1]	valid_0's auc: 0.913976
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.951484
[3]	valid_0's auc: 0.950283
[4]	valid_0's auc: 0.950406
[5]	valid_0's auc: 0.954821
[6]	valid_0's auc: 0.954541
[7]	valid_0's auc: 0.957878
[8]	valid_0's auc: 0.958955
[9]	valid_0's auc: 0.959545
[10]	valid_0's auc: 0.95964
[11]	valid_0's auc: 0.96229
[12]	valid_0's auc: 0.962618
[13]	valid_0's auc: 0.962652
[14]	valid_0's auc: 0.9647
[15]	valid_0's auc: 0.965432
[16]	valid_0's auc: 0.965623
[17]	valid_0's auc: 0.968667
[18]	valid_0's auc: 0.969401
[19]	valid_0's auc: 0.969803
[20]	valid_0's auc: 0.970359
[21]	valid_0's auc: 0.970618
[22]	valid_0's auc: 0.970754
[23]	valid_0's auc: 0.97201
[24]	valid_0's auc: 0.973268
[25]	valid_0's auc: 0.974216
[26]	valid_0's auc: 0.974371
[27]	valid_0's auc: 0.974598
[28]	valid_0's auc: 0.974974
[29]	valid_0's auc: 0.975331
[30]	valid_0's auc: 0.975533
[31]	valid_0's auc: 0.976637
[32]	valid_0's auc: 0.976896
[33]	valid_0's auc: 0.9

  if diff:
  if diff:


[1]	valid_0's auc: 0.913976
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.951484
[3]	valid_0's auc: 0.950283
[4]	valid_0's auc: 0.950406
[5]	valid_0's auc: 0.954821
[6]	valid_0's auc: 0.954541
[7]	valid_0's auc: 0.957878
[8]	valid_0's auc: 0.958955
[9]	valid_0's auc: 0.959545
[10]	valid_0's auc: 0.95964
[11]	valid_0's auc: 0.96229
[12]	valid_0's auc: 0.962618
[13]	valid_0's auc: 0.962652
[14]	valid_0's auc: 0.9647
[15]	valid_0's auc: 0.965432
[16]	valid_0's auc: 0.965623
[17]	valid_0's auc: 0.968667
[18]	valid_0's auc: 0.969401
[19]	valid_0's auc: 0.969803
[20]	valid_0's auc: 0.970359
[21]	valid_0's auc: 0.970618
[22]	valid_0's auc: 0.970754
[23]	valid_0's auc: 0.97201
[24]	valid_0's auc: 0.973268
[25]	valid_0's auc: 0.974216
[26]	valid_0's auc: 0.974371
[27]	valid_0's auc: 0.974598
[28]	valid_0's auc: 0.974974
[29]	valid_0's auc: 0.975331
[30]	valid_0's auc: 0.975533
[31]	valid_0's auc: 0.976637
[32]	valid_0's auc: 0.976896
[33]	valid_0's auc: 0.9

  if diff:
  if diff:


[1]	valid_0's auc: 0.913976
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.951484
[3]	valid_0's auc: 0.950283
[4]	valid_0's auc: 0.950406
[5]	valid_0's auc: 0.954821
[6]	valid_0's auc: 0.954541
[7]	valid_0's auc: 0.957878
[8]	valid_0's auc: 0.958955
[9]	valid_0's auc: 0.959545
[10]	valid_0's auc: 0.95964
[11]	valid_0's auc: 0.96229
[12]	valid_0's auc: 0.962618
[13]	valid_0's auc: 0.962652
[14]	valid_0's auc: 0.9647
[15]	valid_0's auc: 0.965432
[16]	valid_0's auc: 0.965623
[17]	valid_0's auc: 0.968667
[18]	valid_0's auc: 0.969401
[19]	valid_0's auc: 0.969803
[20]	valid_0's auc: 0.970359
[21]	valid_0's auc: 0.970618
[22]	valid_0's auc: 0.970754
[23]	valid_0's auc: 0.97201
[24]	valid_0's auc: 0.973268
[25]	valid_0's auc: 0.974216
[26]	valid_0's auc: 0.974371
[27]	valid_0's auc: 0.974598
[28]	valid_0's auc: 0.974974
[29]	valid_0's auc: 0.975331
[30]	valid_0's auc: 0.975533
[31]	valid_0's auc: 0.976637
[32]	valid_0's auc: 0.976896
[33]	valid_0's auc: 0.9

  if diff:
  if diff:


[1]	valid_0's auc: 0.913976
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.951484
[3]	valid_0's auc: 0.950283
[4]	valid_0's auc: 0.950406
[5]	valid_0's auc: 0.954821
[6]	valid_0's auc: 0.954541
[7]	valid_0's auc: 0.957878
[8]	valid_0's auc: 0.958955
[9]	valid_0's auc: 0.959545
[10]	valid_0's auc: 0.95964
[11]	valid_0's auc: 0.96229
[12]	valid_0's auc: 0.962618
[13]	valid_0's auc: 0.962652
[14]	valid_0's auc: 0.9647
[15]	valid_0's auc: 0.965432
[16]	valid_0's auc: 0.965623
[17]	valid_0's auc: 0.968667
[18]	valid_0's auc: 0.969401
[19]	valid_0's auc: 0.969803
[20]	valid_0's auc: 0.970359
[21]	valid_0's auc: 0.970618
[22]	valid_0's auc: 0.970754
[23]	valid_0's auc: 0.97201
[24]	valid_0's auc: 0.973268
[25]	valid_0's auc: 0.974216
[26]	valid_0's auc: 0.974371
[27]	valid_0's auc: 0.974598
[28]	valid_0's auc: 0.974974
[29]	valid_0's auc: 0.975331
[30]	valid_0's auc: 0.975533
[31]	valid_0's auc: 0.976637
[32]	valid_0's auc: 0.976896
[33]	valid_0's auc: 0.9

  if diff:
  if diff:


[1]	valid_0's auc: 0.913976
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.951484
[3]	valid_0's auc: 0.950283
[4]	valid_0's auc: 0.950406
[5]	valid_0's auc: 0.954821
[6]	valid_0's auc: 0.954541
[7]	valid_0's auc: 0.957878
[8]	valid_0's auc: 0.958955
[9]	valid_0's auc: 0.959545
[10]	valid_0's auc: 0.95964
[11]	valid_0's auc: 0.96229
[12]	valid_0's auc: 0.962618
[13]	valid_0's auc: 0.962652
[14]	valid_0's auc: 0.9647
[15]	valid_0's auc: 0.965432
[16]	valid_0's auc: 0.965623
[17]	valid_0's auc: 0.968667
[18]	valid_0's auc: 0.969401
[19]	valid_0's auc: 0.969803
[20]	valid_0's auc: 0.970359
[21]	valid_0's auc: 0.970618
[22]	valid_0's auc: 0.970754
[23]	valid_0's auc: 0.97201
[24]	valid_0's auc: 0.973268
[25]	valid_0's auc: 0.974216
[26]	valid_0's auc: 0.974371
[27]	valid_0's auc: 0.974598
[28]	valid_0's auc: 0.974974
[29]	valid_0's auc: 0.975331
[30]	valid_0's auc: 0.975533
[31]	valid_0's auc: 0.976637
[32]	valid_0's auc: 0.976896
[33]	valid_0's auc: 0.9

  if diff:
  if diff:


In [18]:
eval_test

Unnamed: 0,P,R,F1,AUC,mayi
0,0.698565,0.488703,0.575086,0.98556,0.531799
1,0.767363,0.545345,0.637579,0.990603,0.626485
2,0.75828,0.532219,0.625449,0.990565,0.607586
3,0.749716,0.543505,0.63017,0.991274,0.616454
4,0.734822,0.542954,0.624483,0.990347,0.605459


In [15]:
submission=pd.read_csv("./sub_sample/submission.csv")
submission["score"]=test_prob_final
submission.to_csv("submission.csv",index=False)

In [16]:
for model in models:
    print(model.feature_importances_)

[  5   1   0  23  98  54 215   7  16   2   7  14   4  46  70   1  21  44
  24  38  14  18  14  40  50  51  41  40  46  54  72  53  40   3   1   2
   3   1   5   0   0   0   0   1   0   0   1   3   5  11   6   0   3   0
   0   3   3  45   3   0   0   2   0  10   8   5   0   1   0   1   3   0
   1   0   6   4  41  22   4   9  26  23 162   1   3   5   7  28   6   8
  21   2   6   2   1   2   1   6   7   7   3   6  10   8   6  13  65  44
  13  25  71   2  16  14  19  50  35  25  16  11  52   9   9   4  25  31
   1   2   0   0   0  29  39  17  45  38  12   4  11   0   0   1  11   6
   3   0   2   2   1   3  14  10  10   6   8   9   5  18   8  10   0  10
  22  10   4   8   7   6  12   4   7]
[  5   1   0  23  98  54 215   7  16   2   7  14   4  46  70   1  21  44
  24  38  14  18  14  40  50  51  41  40  46  54  72  53  40   3   1   2
   3   1   5   0   0   0   0   1   0   0   1   3   5  11   6   0   3   0
   0   3   3  45   3   0   0   2   0  10   8   5   0   1   0   1   3   0
   1   0   6 