In [8]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import time
import datetime
import os

from utils import raw_data_path,dump_pickle,load_pickle,cal_log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import lightgbm as lgb

# LightGBM hyper-parameters; the same mapping is passed to both lgb.train calls below.
# The trailing numbers are carried over from the original inline notes.
params = dict(
    max_depth=4,                # 4
    # min_data_in_leaf=40,      (disabled)
    feature_fraction=1,         # 1
    learning_rate=0.04,         # 0.04
    boosting_type='gbdt',
    objective='binary',
    # verbose=-1,               (disabled)
    metric='binary_logloss',
)

if __name__ == '__main__':
    
    
    t0 = time.time()
    train_data = load_pickle(path='../data/train_final_onehot.pkl')    
    train_Y = train_data['is_trade']
    train_data.drop('is_trade', axis=1, inplace=True) 
#    new_cvr = load_pickle(path=cache_pkl_path +'train_data_cvr_fusion')
#    train_data['new_cvr'] = new_cvr.values
    
    cv_data = load_pickle(path='../data/valid_final_onehot.pkl')
    cv_Y = cv_data['is_trade']
    cv_data.drop('is_trade', axis=1, inplace=True) 
#    new_cvr = load_pickle(path=cache_pkl_path +'cv_data_cvr_fusion')
#    cv_data['new_cvr'] = new_cvr.values
    
    test_data = load_pickle(path='../data/test_final_onehot.pkl')
#    new_cvr = load_pickle(path=cache_pkl_path +'test_data_cvr_fusion')
#    test_data['new_cvr'] = new_cvr.values
    
    test_file = 'round1_ijcai_18_test_a_20180301.txt'
    test = pd.read_table(raw_data_path + test_file,delim_whitespace=True)
    test_id = test.instance_id
    
    drop_cols = ['user_id','shop_id','item_id','item_brand_id']
    train_data.drop(drop_cols,axis=1,inplace=True)
    cv_data.drop(drop_cols,axis=1,inplace=True)
    test_data.drop(drop_cols,axis=1,inplace=True)
    
    print('train shap:',train_data.shape)
    print('cv shape', cv_data.shape)
    print('test shape', test_data.shape)
    
    lgb_train = lgb.Dataset(train_data.values, train_Y)
    lgb_cv = lgb.Dataset(cv_data.values, cv_Y)
    gbm = lgb.train(params=params,            #参数
                    train_set=lgb_train,      #要训练的数据
                    num_boost_round=500,     #迭代次数
                    valid_sets=lgb_cv,        #训练时需要评估的列表
                    verbose_eval=False,       #
                    
                    early_stopping_rounds=500)
    
    predict_train = gbm.predict(train_data.values)
    predict_cv = gbm.predict(cv_data.values)
    predict_test = gbm.predict(test_data.values)
    
    feat_imp = pd.Series(gbm.feature_importance(), index=train_data.columns).sort_values(ascending=False)

    print('训练损失:',log_loss(train_Y, predict_train))
    print('测试损失:',log_loss(cv_Y, predict_cv))
    t1 = time.time()
    print('训练时间:',t1 - t0)
    
    #全量评测
    train_data = pd.concat([train_data, cv_data],axis=0)
    train_Y = np.append(train_Y, cv_Y)
    
    lgb_train = lgb.Dataset(train_data.values, train_Y)
    gbm = lgb.train(params=params,            #参数
                    train_set=lgb_train,      #要训练的数据
                    num_boost_round=300,     #迭代次数
                    verbose_eval=True)
    predict_test = gbm.predict(test_data.values)
    print('训练损失:',cal_log_loss(gbm.predict(train_data.values), train_Y))
    
#     submission = pd.DataFrame(build_train_dataset{'instance_id':test_id,'predicted_score':predict_test})
#     print('预测正样本比例:',len(submission.loc[submission.predicted_score>=0.5])/len(submission))
#     submission.to_csv(r'../result/lgb_{}.txt'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')),
#                   index=False, sep=' ',line_terminator='\r')
    
    
    


train shap: (439088, 120)
cv shape (57421, 120)
test shape (55113, 121)


ValueError: y_true and y_pred contain different number of classes 3, 2. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [-1.  0.  1.]

In [10]:
# Display the training labels. The recorded output below contains -1.0 entries
# next to 0.0/1.0, which matches the three classes reported by the ValueError above.
train_Y

0         0.0
1        -1.0
2         0.0
3        -1.0
4         0.0
5        -1.0
6         0.0
7        -1.0
8         0.0
9        -1.0
10        0.0
11       -1.0
12        0.0
13       -1.0
14        0.0
15       -1.0
16        0.0
17       -1.0
18        0.0
19       -1.0
20        0.0
21       -1.0
22        0.0
23       -1.0
24        0.0
25       -1.0
26        0.0
27       -1.0
28        1.0
29       -1.0
         ... 
439058    0.0
439059    1.0
439060    0.0
439061    0.0
439062    0.0
439063    0.0
439064    1.0
439065    0.0
439066    0.0
439067    0.0
439068    0.0
439069    0.0
439070    0.0
439071    0.0
439072    0.0
439073    0.0
439074    0.0
439075    0.0
439076    0.0
439077    0.0
439078    0.0
439079    0.0
439080    0.0
439081    0.0
439082    0.0
439083    0.0
439084    0.0
439085    0.0
439086    0.0
439087    0.0
Name: is_trade, Length: 439088, dtype: float64