In [6]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import time
import datetime
import os

from utils import raw_data_path,dump_pickle,load_pickle,cal_log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import lightgbm as lgb

# Booster settings; this mapping is passed as `params=` to both lgb.train()
# calls in the script below. Trailing "# n" notes are the author's recorded
# values; the two commented-out entries are options that are currently disabled.
params = dict(
    max_depth=4,                 # 4
    # min_data_in_leaf=40,
    feature_fraction=1,          # 1
    learning_rate=0.04,          # 0.04
    boosting_type='gbdt',
    objective='binary',
    # verbose=-1,
    metric='binary_logloss',
)

if __name__ == '__main__':
    # Pipeline of this cell:
    #   1. load the train / validation / test feature frames,
    #   2. fit LightGBM on train with early stopping on the validation set and
    #      print the log-loss on both,
    #   3. refit on train + validation and write the test predictions to a
    #      timestamped submission file under ../result.

    t0 = time.time()

    # --- Load features; 'is_trade' is the label column and is split off. ---
    train_data = load_pickle(path='../data/train_final_onehot.pkl')
    train_Y = train_data['is_trade']
    train_data = train_data.drop('is_trade', axis=1)
    # new_cvr = load_pickle(path=cache_pkl_path +'train_data_cvr_fusion')
    # train_data['new_cvr'] = new_cvr.values

    cv_data = load_pickle(path='../data/valid_final_onehot.pkl')
    cv_Y = cv_data['is_trade']
    cv_data = cv_data.drop('is_trade', axis=1)
    # new_cvr = load_pickle(path=cache_pkl_path +'cv_data_cvr_fusion')
    # cv_data['new_cvr'] = new_cvr.values

    test_data = load_pickle(path='../data/test_final_onehot.pkl')
    test_data = test_data.drop('is_trade', axis=1)
    # new_cvr = load_pickle(path=cache_pkl_path +'test_data_cvr_fusion')
    # test_data['new_cvr'] = new_cvr.values

    # Instance ids come from the raw test file, not from the pickled features.
    # NOTE(review): this assumes the raw file and the pickled test frame share
    # the same row order - confirm against the feature pipeline.
    test_file = 'round1_ijcai_18_test_a_20180301.txt'
    test = pd.read_table(raw_data_path + test_file,delim_whitespace=True)
    test_id = test.instance_id

    # Fail before training (rather than when the submission frame is built)
    # if the ids cannot be paired one-to-one with the test feature rows.
    if len(test_id) != len(test_data):
        raise ValueError(
            'test ids ({}) and test features ({}) differ in length'.format(
                len(test_id), len(test_data)))

    # drop_cols = ['index', 'context_id', 'user_id','shop_id','item_id','item_brand_id']
    # train_data.drop(drop_cols,axis=1,inplace=True)
    # cv_data.drop(drop_cols,axis=1,inplace=True)
    # test_data.drop(drop_cols,axis=1,inplace=True)

    print('train shap:',train_data.shape)
    print('cv shape', cv_data.shape)
    print('test shape', test_data.shape)

    # --- Stage 1: train on the training split, validate on the cv split. ---
    lgb_train = lgb.Dataset(train_data.values, train_Y)
    lgb_cv = lgb.Dataset(cv_data.values, cv_Y)
    gbm = lgb.train(params=params,            # booster parameters
                    train_set=lgb_train,      # data to train on
                    num_boost_round=2000,     # maximum number of boosting rounds
                    valid_sets=lgb_cv,        # data evaluated during training
                    verbose_eval=False,       # no per-round evaluation output
                    early_stopping_rounds=500)

    # The test set is not scored here: the original prediction at this point
    # was overwritten by the refit model below before ever being used.
    predict_train = gbm.predict(train_data.values)
    predict_cv = gbm.predict(cv_data.values)

    # Feature importances keyed by column name, largest first. Not used again
    # in this cell; presumably kept for interactive inspection.
    feat_imp = pd.Series(gbm.feature_importance(), index=train_data.columns).sort_values(ascending=False)

    # The second figure is computed on the validation (cv) split.
    print('训练损失:',log_loss(train_Y, predict_train))
    print('测试损失:',log_loss(cv_Y, predict_cv))
    t1 = time.time()
    print('训练时间:',t1 - t0)

    # --- Stage 2: refit on train + validation and predict the test set. ---
    train_data = pd.concat([train_data, cv_data],axis=0)
    train_Y = np.append(train_Y, cv_Y)

    lgb_train = lgb.Dataset(train_data.values, train_Y)
    # NOTE(review): the round count is a fixed 300 and is not derived from the
    # early-stopped run above - confirm this is the intended value.
    gbm = lgb.train(params=params,            # booster parameters
                    train_set=lgb_train,      # data to train on
                    num_boost_round=300,      # number of boosting rounds
                    verbose_eval=True)
    predict_test = gbm.predict(test_data.values)
    print('训练损失:',cal_log_loss(gbm.predict(train_data.values), train_Y))

    submission = pd.DataFrame({'instance_id':test_id,'predicted_score':predict_test})
    print('预测正样本比例:',len(submission.loc[submission.predicted_score>=0.5])/len(submission))

    # Create the output directory up front so a missing folder cannot discard
    # the results of both training runs.
    os.makedirs('../result', exist_ok=True)
    # NOTE(review): line_terminator='\r' writes CR-only line endings;
    # presumably required by the submission format - confirm.
    submission.to_csv(r'../result/lgb_{}.txt'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')),
                  index=False, sep=' ',line_terminator='\r')
    
    
    


train shap: (420717, 99)
cv shape (57421, 99)
test shape (18371, 99)
训练损失: 0.08466158484596635
测试损失: 0.08102384728166763
训练时间: 50.55836200714111
训练损失: 0.08539260479330965
预测正样本比例: 0.0


In [5]:
# Preview only the first rows of the submission frame instead of rendering
# the whole table.
submission.head(10)

Unnamed: 0,instance_id,predicted_score
0,2475218615076601065,0.019722
1,398316874173557226,0.011500
2,6586402638209028583,0.018503
3,1040996105851528465,0.038966
4,6316278569655873454,0.012318
5,868158305045921978,0.006318
6,5713520501786699854,0.003845
7,932945015407923184,0.011920
8,1919197847086752313,0.012116
9,304887065966615346,0.008582
