In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import datetime
from deep_tools import f
from deep_tools import DataGenerator

# Load the four raw log files. Each one is tab-separated and has no header row,
# so the column names are supplied explicitly.

# One row per registered user.
register=pd.read_csv(
    './data/user_register_log.txt',
    sep='\t',
    names=['user_id','register_day','register_type','device_type'],
)

# One row per app launch.
launch=pd.read_csv(
    './data/app_launch_log.txt',
    sep='\t',
    names=['user_id','launch_day'],
)

# One row per video creation.
create=pd.read_csv(
    './data/video_create_log.txt',
    sep='\t',
    names=['user_id','create_day'],
)

# One row per user activity event.
activity=pd.read_csv(
    './data/user_activity_log.txt',
    sep='\t',
    names=['user_id','act_day','page','video_id','author_id','act_type'],
)

  from ._conv import register_converters as _register_converters


In [2]:
#Parameters
# Size of the last axis of the input placeholder `x` (features per time step).
n_features=12
# Number of GRU units; also the row count of the output weight matrix W_out.
n_hu=5
# Row count of the device embedding table (ids fed to it must be < n_device).
n_device=50
# Row count of the register-type embedding table.
n_register=7
# Row count of the date embedding table.
n_days=31

In [3]:
# Batch provider built from the four raw logs. Later cells call its
# `next_batch(strategy, n_obs)` and `get_set(strategy, split)` methods.
# NOTE(review): DataGenerator comes from the project-local `deep_tools` module;
# its feature construction is not visible in this notebook.
data_generator=DataGenerator(register,launch,create,activity)

In [4]:
#device_dict
# Rank device types by how many users registered with them (most frequent -> 0)
# and attach that rank to every user as `device_type_map`.
device_table=register.groupby(['device_type'],as_index=False).agg({'user_id':'count'})
device_table=device_table.sort_values(by=['user_id'],ascending=False)
device_table['device_type_map']=np.arange(len(device_table))
device_table=device_table.drop('user_id',axis=1)

# Drop any `device_type_map` left over from a previous run of this cell and join
# on `device_type` only, so re-running the cell cannot accidentally join on the
# stale rank column as well.
register=pd.merge(register.drop('device_type_map',axis=1,errors='ignore'),
                  device_table,
                  on='device_type')

# user_id -> device rank. Built by column name instead of `iterrows()` with
# positional `row[0]` / `row[-1]`: positional integer access on a label-indexed
# Series is deprecated in recent pandas, and the row-by-row loop is slow.
# NOTE(review): ranks are not capped here; confirm they stay below the row count
# of the embedding table they are later used to index.
device_dict=dict(zip(register['user_id'],register['device_type_map']))

#register_dict
# user_id -> register_type.
register_dict=dict(zip(register['user_id'],register['register_type']))

In [5]:
# Start from an empty default graph and fix the graph-level random seed so that
# variable initialisation is repeatable.
tf.reset_default_graph()
tf.set_random_seed(10)

#Variables
with tf.variable_scope('test4'):
    
    #Variables and inputs
    # Scalar learning rate, fed on every run so it can change between calls.
    lr=tf.placeholder(tf.float32,[],name='learning_rate')

    # Output projection: maps each n_hu-dimensional GRU output to one logit.
    W_out=tf.get_variable('W_out',[n_hu,1])
    b_out=tf.get_variable('b_out',[1])

    
    # x: [batch, time, n_features] features; y: [batch, time] targets.
    x=tf.placeholder(tf.float32,[None,None,n_features])
    y=tf.placeholder(tf.float32,[None,None])
    
    # Dynamic sizes, read from x at run time.
    batch_size=tf.shape(x)[0]
    seq_length=tf.shape(x)[1]

    # NOTE(review): PR_input is not referenced anywhere else in the visible cells.
    PR_input=tf.placeholder(tf.float32,[None,None,1])
    
    # Integer ids used to index the embedding tables below: one device id and one
    # register id per batch row, and one date id per time step.
    device_input=tf.placeholder(tf.int32,[None])
    register_input=tf.placeholder(tf.int32,[None])
    date_input=tf.placeholder(tf.int32,[None])
    
    # One scalar per id, initialised to zero; these act as additive intercepts on
    # the logits.
    # NOTE(review): ids fed to the lookups must be smaller than the table's row
    # count (n_device / n_register / n_days) - confirm against the fed values.
    device_embedding=tf.get_variable('device_embedding',[n_device,1],initializer=tf.zeros_initializer)
    register_embedding=tf.get_variable('register_embedding',[n_register,1],initializer=tf.zeros_initializer)
    date_embedding=tf.get_variable('date_embedding',[n_days,1],initializer=tf.zeros_initializer)
    
    #RNN layer
    # Single GRU layer unrolled dynamically over the time axis, starting from a
    # zero state.
    cell=tf.nn.rnn_cell.GRUCell(n_hu)
    initial_state = cell.zero_state(batch_size, dtype=tf.float32)
    outputs, state = tf.nn.dynamic_rnn(cell, x,
                                       initial_state=initial_state)
    
    #Output layer
    # Flatten to [batch*time, n_hu], project to one logit per step, then restore
    # the [batch, time] layout.
    outputs=tf.reshape(outputs,[-1,n_hu])
    logits=tf.matmul(outputs,W_out)+b_out
    logits=tf.reshape(logits,tf.stack([batch_size,seq_length]))
    
    # Lookups give [batch, 1] for device/register; the date lookup is reshaped to
    # [1, time], so date_input must contain exactly seq_length ids.
    device_intercept=tf.nn.embedding_lookup(device_embedding,device_input)
    register_intercept=tf.nn.embedding_lookup(register_embedding,register_input)
    date_intercept=tf.nn.embedding_lookup(date_embedding,date_input)
    date_intercept=tf.reshape(date_intercept,tf.stack([1,seq_length]))
    
    
    # Broadcast-add the per-row and per-step intercepts onto the [batch, time] logits.
    logits=logits+device_intercept+register_intercept+date_intercept

In [6]:
#local_train
# Local (offline validation) training uses every time step except the last 14.
logits_local_train=logits[:,:-14]
label_local_train=y[:,:-14]

# L2 penalty with scale 1e-5 over all trainable variables in the graph; it is
# added to both the local and the online objective.
regularizer=tf.contrib.layers.l2_regularizer(0.00001)
penalty=tf.contrib.layers.apply_regularization(regularizer,tf.trainable_variables())

# Sigmoid cross-entropy on the retained steps plus the L2 penalty.
obj_local=tf.losses.sigmoid_cross_entropy(label_local_train,logits_local_train)+penalty
optimizer=tf.train.AdamOptimizer(lr)
step_local=optimizer.minimize(obj_local)

#local_test
# Local evaluation reads the logit and the label at the 8th step from the end.
logits_local_test=logits[:,-8]
label_local_test=y[:,-8]

#online_train
# Online (submission) training uses every time step except the last 7.
logits_online_train=logits[:,:-7]
label_online_train=y[:,:-7]

obj_online=tf.losses.sigmoid_cross_entropy(label_online_train,logits_online_train)+penalty
# A second, independent Adam optimizer; the name `optimizer` is rebound here.
optimizer=tf.train.AdamOptimizer(lr)
step_online=optimizer.minimize(obj_online)

#online_test
# Online prediction reads the logit at the final time step; no label is used.
logits_online_test=logits[:,-1]

In [7]:
# Open a session on the default graph and initialise every global variable
# (model weights, embeddings and the Adam optimizers' internal variables).
sess=tf.Session()
sess.run(tf.global_variables_initializer())

In [8]:
def test(strategy='local'):
    """Evaluate the current model and return its predictions as a DataFrame.

    The function first sweeps the 'train' set from `data_generator` to report the
    observation-weighted mean objective and the accuracy (logit > 0 vs. label),
    then sweeps the 'test' set to collect one sigmoid probability per user.

    Parameters
    ----------
    strategy : str
        'local' selects the local objective/fetches (last 14 steps held out) and
        also collects labels and prints `f(result)`. Any other value selects the
        online objective/fetches (last 7 steps held out) with no labels.

    Returns
    -------
    pandas.DataFrame
        Built from the keys 'user_id', 'prob' and, for 'local' only, 'label'.
    """
    local_mode=strategy=='local'

    # `date_seq` holds the ids fed to `date_input`; only its last `length`
    # entries are used for a batch of sequence length `length`.
    if local_mode:
        n_NA=14
        date_seq=[31]+list(range(2,16))+[16]*15
        train_fetches=[obj_local,logits_local_train,label_local_train]
        test_fetches=[logits_local_test,label_local_test]
    else:
        n_NA=7
        date_seq=[31]+list(range(2,23))+[23]*8
        train_fetches=[obj_online,logits_online_train,label_online_train]
        test_fetches=logits_online_test

    def make_feed(seq_len,ids,batch_x,batch_y):
        # Same feed for both sweeps; the learning rate is fed although no
        # optimizer step is run here.
        return {x:batch_x,
                y:batch_y,
                device_input:[device_dict[u] for u in ids],
                register_input:[register_dict[u] for u in ids],
                date_input:date_seq[-seq_len:],
                lr:0.001}

    obs_count=cum_loss=correct=0
    user=[]
    prob=[]
    real=[]

    # Training-set loss and accuracy
    for seq_len,ids,batch_x,batch_y in zip(*data_generator.get_set(strategy,'train')):
        batch_loss,batch_logits,batch_labels=sess.run(
            train_fetches,
            feed_dict=make_feed(seq_len,ids,batch_x,batch_y))

        # Weight each batch by the number of (user, step) cells that are scored.
        obs_count+=(seq_len-n_NA)*len(ids)
        cum_loss+=batch_loss*(seq_len-n_NA)*len(ids)
        correct+=np.sum(1*(batch_logits>0)==batch_labels)

    # Test-set predictions
    for seq_len,ids,batch_x,batch_y in zip(*data_generator.get_set(strategy,'test')):
        fetched=sess.run(
            test_fetches,
            feed_dict=make_feed(seq_len,ids,batch_x,batch_y))
        if local_mode:
            test_logits,test_labels=fetched
            real.extend(test_labels)
        else:
            test_logits=fetched

        user.extend(ids)
        # Sigmoid of the flattened logits.
        prob.extend(1/(1+np.exp(-test_logits.reshape([-1]))))

    # Training summary
    print('train_loss',cum_loss/obs_count,correct/obs_count)

    # Test summary (only the local strategy has labels to score against)
    if not local_mode:
        return pd.DataFrame({'user_id':user,'prob':prob})

    result=pd.DataFrame({'user_id':user,'prob':prob,'label':real})
    print('test_score:',f(result))
    return result

In [9]:
def train(strategy='local',n_obs=1000,step=1000,lr_feed=0.01):
    """Run `step` optimizer updates on batches drawn from `data_generator`.

    Parameters
    ----------
    strategy : str
        'local' trains with the local objective; any other value trains with the
        online objective.
    n_obs : int
        Passed to `data_generator.next_batch` as the requested batch size.
    step : int
        Number of batches / optimizer updates to run.
    lr_feed : float
        Value fed to the learning-rate placeholder on every update.

    Returns
    -------
    None
    """
    # `date_seq` holds the ids fed to `date_input`; only its last `length`
    # entries are used for a batch of sequence length `length`.
    if strategy=='local':
        date_seq=[31]+list(range(2,16))+[16]*15
        fetches=[step_local,obj_local,label_local_train,logits_local_train]
    else:
        date_seq=[31]+list(range(2,23))+[23]*8
        fetches=[step_online,obj_online,label_online_train,logits_online_train]

    for _ in range(step):
        length,id_list,data_x,data_y=data_generator.next_batch(strategy,n_obs)
        feed={x:data_x,
              y:data_y,
              device_input:[device_dict[u] for u in id_list],
              register_input:[register_dict[u] for u in id_list],
              date_input:date_seq[-length:],
              lr:lr_feed}
        # The loss, labels and logits are fetched alongside the update op but
        # their values are not used.
        sess.run(fetches,feed_dict=feed)

In [10]:
# Re-run the initializer: resets every global variable, discarding any training
# done in this session so far.
sess.run(tf.global_variables_initializer())

In [14]:
def cos_annealing_local(epoch=5):
    """Train with the local strategy for `epoch` rounds and collect predictions.

    Each round runs 2000 updates at learning rate 0.01, then 2000 at 0.001,
    evaluates with `test('local')` and prints the current L2 penalty. The
    round's probability column is renamed to `prob<i>` and the rounds are joined
    on ('label', 'user_id').

    Parameters
    ----------
    epoch : int
        Number of train/evaluate rounds.

    Returns
    -------
    pandas.DataFrame or None
        Columns 'label', 'prob0', ..., 'user_id'; None when `epoch` is 0.
    """
    all_result=None
    for i in range(epoch):
        train('local',n_obs=1000,step=2000,lr_feed=0.01)
        train('local',n_obs=1000,step=2000,lr_feed=0.001)
        result=test('local')
        print(sess.run(penalty))

        # Rename by name, not by position. Assigning `result.columns=[...]`
        # assumed the alphabetical column order that old pandas produced for
        # dict input; current pandas keeps the dict's insertion order, which
        # would swap the 'user_id' and 'label' names. The explicit selection
        # keeps the historical column order.
        prob_col='prob%s'%i
        result=result.rename(columns={'prob':prob_col})[['label',prob_col,'user_id']]

        if all_result is None:
            all_result=result
        else:
            all_result=pd.merge(all_result,result,on=['label','user_id'])
    return all_result

def cos_annealing_online(epoch=5):
    """Train with the online strategy for `epoch` rounds and collect predictions.

    Each round runs 2000 updates at learning rate 0.01, then 2000 at 0.001,
    predicts with `test('online')` and prints the current L2 penalty. The
    round's probability column is renamed to `prob<i>` and the rounds are joined
    on 'user_id'.

    Parameters
    ----------
    epoch : int
        Number of train/predict rounds.

    Returns
    -------
    pandas.DataFrame or None
        Columns 'prob0', ..., 'user_id'; None when `epoch` is 0.
    """
    all_result=None
    for i in range(epoch):
        train('online',n_obs=1000,step=2000,lr_feed=0.01)
        train('online',n_obs=1000,step=2000,lr_feed=0.001)
        result=test('online')
        print(sess.run(penalty))

        # Rename by name, not by position. Assigning `result.columns=[...]`
        # assumed the alphabetical column order that old pandas produced for
        # dict input; current pandas keeps the dict's insertion order, which
        # would label the user ids as probabilities and vice versa. The explicit
        # selection keeps the historical column order.
        prob_col='prob%s'%i
        result=result.rename(columns={'prob':prob_col})[[prob_col,'user_id']]

        if all_result is None:
            all_result=result
        else:
            all_result=pd.merge(all_result,result,on='user_id')
    return all_result

In [12]:
#Offline (local) validation
# Timestamps are printed before and after so the wall-clock duration is visible.
print(datetime.datetime.now())
result=cos_annealing_local(5)
print(datetime.datetime.now())

2018-08-19 11:50:41.944068
train_loss 0.460530827227288 0.7841321877065432
test_score: [0.8052266444108289, 0.8053691275167785, 0.8060189211268348, 0.8061283995122726, 0.8058818828418445, 0.8054848817590187]
0.00060405146


KeyboardInterrupt: 

In [15]:
#Online submission
# Rebinds `result` to the online predictions; timestamps are printed before and
# after so the wall-clock duration is visible.
print(datetime.datetime.now())
result=cos_annealing_online(5)
print(datetime.datetime.now())

2018-08-19 11:56:54.080738
train_loss 0.44000142605272863 0.7969419126112822
0.0007574438
train_loss 0.4397496822694218 0.7969369818028514
0.00073891063
2018-08-19 12:01:13.835935


In [18]:
#Ensemble
# Average every per-round probability column (`prob0`, `prob1`, ...) instead of
# hard-coding exactly five of them, so the cell also works when a different
# number of rounds was run. The plain `prob` column is excluded by the digit
# check, which keeps the cell safe to re-run. Columns are summed in round order.
prob_cols=sorted(
    (c for c in result.columns if str(c).startswith('prob') and str(c)[4:].isdigit()),
    key=lambda c:int(str(c)[4:]))
if not prob_cols:
    raise ValueError('result has no prob<k> columns to average')
result['prob']=sum(result[c] for c in prob_cols)/len(prob_cols)

In [19]:
# Rank users from highest to lowest averaged probability and renumber the rows
# 0..n-1 so that row labels equal rank positions.
result=(
    result
    .sort_values(by='prob',ascending=False)
    .reset_index(drop=True)
)

In [20]:
# Write the user ids of the top-ranked rows, one per line, with no header or index.
# NOTE(review): `.loc` slicing includes the end label, so after the index reset
# this writes rows 0..24800, i.e. 24801 ids - confirm whether 24800 was intended.
# NOTE(review): the 'output/' directory is not created here and must already exist.
result.loc[:24800,['user_id']].to_csv('output/result.csv',header=False,index=False)