In [12]:
import numpy as np
import pandas as pd
import gensim
import time
import csv
import os
import sklearn
from sklearn import datasets
from sklearn.naive_bayes import ComplementNB
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import lightgbm as lgb


In [16]:
# Build a LightGBM model.    API: https://lightgbm.readthedocs.io/en/latest/Python-API.html
def make_lgb_model(train_file,label_file,outfile):
    """Train a binary LightGBM booster, save it, then print cross-validation results.

    Args:
        train_file: Path of the combined feature file; passed to ``lgb.Dataset`` unchanged.
        label_file: Path of a comma-delimited label file. The first field of
            every row is converted with ``int`` and used as that row's label.
        outfile: Path the trained booster is written to with ``save_model``.

    Returns:
        Always ``True``.
    """
    # Training parameters
    # NOTE(review): 'binary_loss' is not a metric name I can find in the LightGBM
    # parameter docs (the log-loss metric is 'binary_logloss', alias 'binary') --
    # confirm it is not being silently ignored.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': ['auc','binary_loss'],
        'num_leaves': 31,
        'max_depth' : 7,
        'learning_rate': 0.1,
        'feature_fraction': 1.0,
        # Disabled tuning knobs, kept for reference:
        #'bagging_fraction': 0.9,
        #'bagging_seed': 0,
        #'bagging_freq': 1,
        #'seed':1024,
        #'verbosity':10,
        #'first_metric_only':True
        #'lambda_l1': 0,
        #'lambda_l2': 0,
        #'ignore_column': [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
        #'ignore_column':[5,6,12,16,22,27,28]
    }

    # Read one integer label per row of the label file (tqdm shows progress)
    with open(label_file,'r') as label_handle:
        labels = [
            int(record[0])
            for record in tqdm(csv.reader(label_handle, delimiter=','), mininterval=1.0)
        ]

    # Training dataset for LightGBM; weight is the optional per-row weight (unused here)
    dataset = lgb.Dataset(train_file, labels, weight=None)

    # The timer covers training, saving and cross-validation, not label loading
    start = time.time()

    # Train for a fixed number of boosting rounds
    booster = lgb.train(params, dataset, num_boost_round = 300)
    # Number of features seen by the model
    print(booster.num_feature())
    # Per-feature importance
    print(booster.feature_importance())
    # Persist the trained model
    booster.save_model(outfile)
    print(booster)

    # 10-fold cross-validation, up to 300 rounds, early-stopping patience of 5 rounds
    cv_history = lgb.cv(params, dataset,nfold=10,num_boost_round=300,early_stopping_rounds=5)
    # Show the cross-validation scores
    print(cv_history)

    elapsed = time.time()-start
    print('Training lgb model, using time：'+str(elapsed))
    return True

# Train on the full feature file and save the booster to lgb_model.txt
make_lgb_model(train_file='all_feature_train.txt', label_file='lb_train.txt', outfile='lgb_model.txt')

17000it [00:00, 1420381.83it/s]


39
[ 82 310   9 280 188 224 215 236 176   0   0   0 248 367 357 339 216 336
 367 381  32 190 330 308 332 206 309 222 279 221 367 372 260   0   0   0
 301 344 413]
<lightgbm.basic.Booster object at 0x000002589D1DD400>
{'auc-mean': [0.597173135814497, 0.5999724964109644, 0.6032045710615601, 0.6049845426326261, 0.6066458832182299, 0.6080440507674544, 0.6086268569578404, 0.6088390205028582, 0.6102357954357108, 0.6107708835522598, 0.6103189737545087, 0.6102741194796231, 0.6106153310183962, 0.6113693457326643, 0.6110858450079879, 0.6114632302464094, 0.6112045656407605, 0.6118601675491917, 0.611849862219033, 0.6121526189903045, 0.6121497219056, 0.613010648926285], 'auc-stdv': [0.01412746081991307, 0.014378770505077444, 0.016061263969761364, 0.01566860077485317, 0.016536656044956837, 0.01669950691622316, 0.016871075630739085, 0.01693666458360434, 0.016575000783407928, 0.01595811892153831, 0.016044025180514728, 0.016797116105017922, 0.015876300438424595, 0.015684760426596164, 0.0160855302359042

True

In [17]:
# Query-level AUC (QAUC): group rows by query_id, compute roc_auc_score inside
# each group, then average the per-group scores.
def metric_auc(predict,file ,start, end):
    """Compute the mean per-query ROC AUC of ``predict`` against a truth table.

    Args:
        predict: 1-D sequence/array of predicted scores, one per row of ``file``
            and in the same row order, e.g. ``[0.79, 0.03, 0.56, ...]``.
        file: Path of a header-less CSV with exactly three columns, interpreted
            as ``query_id``, ``query_title_id``, ``label``.
        start: Accepted for backward compatibility; not used.
        end: Accepted for backward compatibility; not used.

    Returns:
        The mean of the per-query AUC values. Queries whose labels contain
        fewer than two distinct classes (AUC undefined) contribute 0.5; the
        number of such queries is printed.

    Raises:
        ValueError: If ``predict`` and the truth table differ in length, or if
            the truth table does not have exactly three columns.
    """
    # The table is small, so pandas is fine here
    true = pd.read_csv(file,header=None)

    scores = np.asarray(predict)
    # A silent length mismatch would be padded with NaN by concat and then
    # distort the metric, so fail loudly instead.
    if len(scores) != len(true):
        raise ValueError(
            'predict has %d rows but %s has %d rows' % (len(scores), file, len(true))
        )

    # Attach the predictions as a fourth column
    true = pd.concat([true.reset_index(drop=True),pd.DataFrame(scores)],axis=1)
    true.columns = ['query_id','query_title_id','label','predict']

    auc_score = []
    count = 0
    # One group per query_id
    for _, group in tqdm(true.groupby('query_id'),mininterval=1.0):
        # AUC is undefined when a query has a single label class; score it as 0.5.
        # Checking explicitly (rather than catching every exception) keeps
        # unrelated errors visible.
        if group['label'].nunique() < 2:
            auc_score.append(0.5)
            count+=1
            continue
        x = np.array(group['label'])
        y = np.array(group['predict'])
        auc_score.append(roc_auc_score(x,y))
    # Number of queries that fell back to 0.5
    print(count)
    return np.mean(auc_score)


# Load the LightGBM model saved by the training cell
bst = lgb.Booster(model_file='lgb_model.txt')
# Score the held-out feature file. The result is deliberately not called
# `predict`, because a function with that name is defined later in this notebook.
test_pred = bst.predict('all_feature_test.txt')
print(len(test_pred))
# Mean per-query AUC on the held-out set (left as the last expression so the notebook displays it)
metric_auc(test_pred,'test.csv',17000,20000)

3000


100%|██████████████████████████████████████████████████████████████████████████████| 444/444 [00:00<00:00, 1193.39it/s]


15


0.5134933418345764

In [18]:
# Submission step
def predict(lgb_model,feature_file,test_file,out_file):
    """Score a feature file with a saved LightGBM model and write a result CSV.

    Each output row is ``[row[0], row[2], prediction]`` where ``row`` is the
    matching row of ``test_file`` and ``prediction`` is the model output at the
    same position.

    Args:
        lgb_model: Path of a model file saved by LightGBM.
        feature_file: Path of the feature file to score; passed to ``Booster.predict``.
        test_file: Path of a comma-delimited CSV aligned row-for-row with
            ``feature_file``. Column 0 is written as the query id and column 2
            as the title id.
            NOTE(review): another cell of this notebook labels the columns of
            'test.csv' as query_id, query_title_id, label, which would make
            column 2 the label -- confirm the intended layout.
        out_file: Path of the CSV to create (overwritten if it exists).

    Raises:
        IndexError: If a row has fewer than three fields, or if there are more
            rows in ``test_file`` than predictions.
    """
    # Load the saved model
    bst = lgb.Booster(model_file=lgb_model)
    # Score every row of the feature file
    result = bst.predict(feature_file)
    # Open the input first so a missing test_file does not leave behind an empty
    # or truncated out_file. newline='' is what the csv module expects for both
    # reading and writing (it also avoids blank lines between rows on Windows).
    with open(test_file, newline='') as csv_file, \
            open(out_file, 'w', newline='') as output:
        # The timer covers only the CSV writing
        start_time = time.time()
        csv_reader = csv.reader(csv_file, delimiter=',')
        writer = csv.writer(output)

        # Indexing (not zip) so that too few predictions raises instead of
        # silently truncating the output.
        for line_count, row in enumerate(csv_reader):
            query_id = row[0]
            title_id = row[2]
            writer.writerow([query_id, title_id, result[line_count]])
    print("complete, "+"using time:"+str(time.time()-start_time))

        
# Score the test features with the saved model and write result.csv
predict(lgb_model='lgb_model.txt', feature_file='all_feature_test.txt', test_file='test.csv', out_file='result.csv')


complete, using time:0.007083892822265625
