In [1]:
# 将所有特征串联起来，构成RS_Train.csv
#RS_Test.csv
#为最后推荐系统做准备
from __future__ import division

import pickle
import numpy as np
import scipy.io as sio
import scipy.sparse as ss
import math as ma
from numpy.random import random  
from collections import defaultdict

class RecommonderSystem:
  def __init__(self):
    # 读入数据做初始化
    
    #用户和活动新的索引
    self.userIndex = pickle.load(open("PE_userIndex.pkl", 'rb'))
    self.eventIndex = pickle.load(open("PE_eventIndex.pkl", 'rb'))
    self.n_users = len(self.userIndex)
    self.n_items = len(self.eventIndex)
    
    #用户-活动关系矩阵R
    #在train_SVD会重新从文件中读取,二者要求的格式不同，来不及统一了:(
    self.userEventScores = sio.mmread("PE_userEventScores").todense()
    
    #倒排表
    ##每个用户参加的事件
    self.itemsForUser = pickle.load(open("PE_eventsForUser.pkl", 'rb'))
    ##事件参加的用户
    self.usersForItem = pickle.load(open("PE_usersForEvent.pkl", 'rb'))
    
    #基于模型的协同过滤参数初始化,训练
    self.init_SVD()
    self.train_SVD(trainfile = "train.csv")
    
    #根据用户属性计算出的用户之间的相似度
    self.userSimMatrix = sio.mmread("US_userSimMatrix").todense()
    
    #根据活动属性计算出的活动之间的相似度
    self.eventPropSim = sio.mmread("EV_eventPropSim").todense()
    self.eventContSim = sio.mmread("EV_eventContSim").todense()
    
    #每个用户的朋友的数目
    self.numFriends = sio.mmread("UF_numFriends")
    #用户的每个朋友参加活动的分数对该用户的影响
    self.userFriends = sio.mmread("UF_userFriends").todense()
    
    #活动本身的热度
    self.eventPopularity = sio.mmread("EA_eventPopularity").todense()
    
#     for uid1 in self.userIndex:
#         for uid2 in self.userIndex:
#             if not uid1== uid2:
#                 sim = self.sim_cal_UserCF( uid1, uid2)

  def init_SVD(self, K=20):
    #初始化模型参数（for 基于模型的协同过滤SVD_CF）
    self.K = K  
    
    
    #init parameters
    #bias
    self.bi = np.zeros(self.n_items)  
    self.bu = np.zeros(self.n_users)  
     
    
    #the small matrix
    self.P = random((self.n_users,self.K))/10*(np.sqrt(self.K))
    self.Q = random((self.K, self.n_items))/10*(np.sqrt(self.K))  
    
    
                  
          
  def train_SVD(self,trainfile = 'train.csv', steps=100,gamma=0.04,Lambda=0.15):
    #训练SVD模型（for 基于模型的协同过滤SVD_CF）
    #gamma：为学习率
    #Lambda：正则参数
    
    #偷懒了，为了和原来的代码的输入接口一样，直接从训练文件中去读取数据
    print ("SVD Train...")
    ftrain = open(trainfile, 'r')
    ftrain.readline()
    self.mu = 0.0
    n_records = 0
    uids = []  #每条记录的用户索引
    i_ids = [] #每条记录的item索引
    #用户-Item关系矩阵R（内容同userEventScores相同），临时变量，训练完了R不再需要
    R = np.zeros((self.n_users, self.n_items))
    
    for line in ftrain:
        cols = line.strip().split(",")
        u = self.userIndex[cols[0]]  #用户
        i = self.eventIndex[cols[1]] #活动
        
        uids.append(u)
        i_ids.append(i)
        
        R[u,i] = int(cols[4])  #interested
        self.mu += R[u,i]
        n_records += 1
    
    ftrain.close()
    self.mu /= n_records
    
    # 请补充完整SVD模型训练过程
    self.X = R

    
    for step in range(steps):  
        print ('the ',step,'-th  step is running')  
        rmse_sum=0.0 

        #将训练样本打散顺序
        kk = np.random.permutation(self.X.shape[0])  
        print(kk)
        for j in range(self.X.shape[0]):  

            #每次一个训练样本
            i=kk[j]  
            uid=int(self.X[i][0])  
            i_id=int(self.X[i][1])  
            
            rat=self.X[i][2]  

            #预测残差
            eui=rat-self.pred_SVD(uid,i_id)  
            #残差平方和
            rmse_sum+=eui**2  

            #随机梯度下降，更新
            self.bu[uid]+=gamma*(eui-Lambda*self.bu[uid])  
            self.bi[i_id]+=gamma*(eui-Lambda*self.bi[i_id]) 

            temp=self.Q[:,i_id]  
            self.Q[:,i_id]+=gamma*(eui*self.P[uid,:]-Lambda*self.Q[:,i_id])  
            
            self.P[uid,:]+=gamma*(eui*temp-Lambda*self.P[uid,:])  

        #学习率递减
        gamma=gamma*0.93  
        print ("the rmse of this step on train data is ",np.sqrt(rmse_sum/self.X.shape[0]))  
        #self.test(test_data)  

    
    print ("SVD trained")
    
  def pred_SVD(self, uid, i_id):
    #根据当前参数，预测用户uid对Item（i_id）的打分  
    m = self.mu
    _bi = self.bi[i_id]
    _bu = self.bu[uid]
    nd = np.dot(self.P[uid,:],self.Q[:,i_id])
    ans = m+_bi+_bu+nd
#     ans=self.mu + self.bi[i_id] + self.bu[uid] + np.dot(self.P[uid,:],self.Q[:,i_id])  
        
    #将打分范围控制在0-1之间
    if ans>1:  
        return 1  
    elif ans<0:  
        return 0
    return ans  

  def sim_cal_UserCF(self, uid1, uid2 ):
    
    #请补充基于用户的协同过滤中的两个用户uid1和uid2之间的相似度（根据两个用户对item打分的相似度）
    similarity = 0.0
    
    '''
    获得两个用户对于所有事件的打分
    获得两个用户对于所有事件的平均打分
    套用相似度公式获得相似度
    '''
    uid1_ = uid1#self.userIndex[uid1]
    uid2_ = uid2#self.userIndex[uid2]
    
    items1 = self.userEventScores[uid1_,:].getA1()
    sum_items1_bar = np.sum(items1) / len(items1)
    items2 = self.userEventScores[uid2_,:].getA1()
    sum_items2_bar = np.sum(items2) / len(items2)
    
    rap_rabar = (items1 - sum_items1_bar)
    rbp_rbbar = (items2 - sum_items2_bar)
    
    up = np.sum(rap_rabar * rbp_rbbar)
    down1 = ma.sqrt(np.sum(rap_rabar* rap_rabar)) + 0.000001
    down2 = ma.sqrt(np.sum(rbp_rbbar* rbp_rbbar)) + 0.000001
    
    
    similarity = up / (down1 * down2)
    
    
    return similarity

  def userCFReco(self, userId, eventId):
    """
    根据User-based协同过滤，得到event的推荐度
    基本的伪代码思路如下：
    for item i
      for every other user v that has a preference for i
        compute similarity s between u and v
        incorporate v's preference for i weighted by s into running aversge
    return top items ranked by weighted average
    """
    #请补充完整代码
    ans = 0.0
    
    '''
    单纯的公式实现
    '''
    
    u = self.userIndex[userId]
    i = self.eventIndex[eventId]
  
    u_all = self.userEventScores[u,:].getA1()
    u_bar = sum(u_all) / len(u_all)
    
    up_sum = 0
    down_sum = 0
    
    for uid2 in self.userIndex:
        v = self.userIndex[uid2]
        v_all = self.userEventScores[v,:].getA1()
        v_bar = np.sum(u_all) / len(u_all)
        sim = self.sim_cal_UserCF(u,v)
        if not (u == v) :
            up_sum = up_sum + sim * (self.userEventScores[v,i] - v_bar)
            down_sum = down_sum + sim    
    
    ans = up_sum / (down_sum + 0.000001)
    return ans


  def sim_cal_ItemCF(self, i_id1, i_id2):
    #计算Item i_id1和i_id2之间的相似性
    #请补充完整代码
    #similarity = 0.0
    
    '''
    获得有效用户标记
    '''
    user_valid = {}
    for user in self.UserForItem[i_id1]: #对item1打过分
        if user in self.UserForItem[i_id2]: # 也对item2打过分
            user_valid[user] = 1
            
    #获得有效用户数
    num_valid = len(user_valid)
    if num_valid == 0: # 没有共同打分的用户，相似度为0
        return 0
    
    #所有有效用户对1的打分
    item1_all = np.array([self.UsersForItem[i_id1][u] for u in user_valid])
    #所有有效用户对2的打分
    item2_all = np.array([self.UsersForItem[i_id2][u] for u in user_valid])
    
    sum1=np.sum(item1_all)  
    sum2=np.sum(item2_all)  
    sum1Sq=np.sum(item1_all**2)  
    sum2Sq=np.sum(item2_all**2)  
    pSum=np.sum(item1_all*item2_all)  

    #分子
    num=pSum-(sum1*sum2/n)  

    #分母
    den=np.sqrt((sum1Sq-sum1**2/n)*(sum2Sq-sum2**2/n))  
    if den==0:  
        return 0  

    #return similarity     
    return num/den  
            
  def eventCFReco(self, userId, eventId):    
    """
    根据基于物品的协同过滤，得到Event的推荐度
    基本的伪代码思路如下：
    for item i 
        for every item j tht u has a preference for
            compute similarity s between i and j
            add u's preference for j weighted by s to a running average
    return top items, ranked by weighted average
    """
    #请补充完整代码
    #ans = 0.0
    
    sim_accumulate=0.0  
    rat_acc=0.0  

    for item in self.itemsForUser[userId]:  #用户uid打过分的所有Item
        sim = self.sim_cal_ItemCF(item,eventId)    #该Item与i_id之间的相似度
        if sim<0:continue  
        #print sim,self.user_movie[uid][item],sim*self.user_movie[uid][item]  

        rat_acc += sim * self.itemsForUser[userId][item]  
        sim_accumulate += sim  

    #print rat_acc,sim_accumulate  
    if sim_accumulate==0: #no same user rated,return average rates of the data  
        return  self.mu  
    return rat_acc/sim_accumulate  

  
    #return ans
    
  def svdCFReco(self, userId, eventId):
    #基于模型的协同过滤, SVD++/LFM
    u = self.userIndex[userId]
    i = self.eventIndex[eventId]

    return self.pred_SVD(u,i)

  def userReco(self, userId, eventId):
    """
    类似基于User-based协同过滤，只是用户之间的相似度由用户本身的属性得到，计算event的推荐度
    基本的伪代码思路如下：
    for item i
      for every other user v that has a preference for i
        compute similarity s between u and v
        incorporate v's preference for i weighted by s into running aversge
    return top items ranked by weighted average
    """
    i = self.userIndex[userId]
    j = self.eventIndex[eventId]

    vs = self.userEventScores[:, j]
    sims = self.userSimMatrix[i, :]

    prod = sims * vs

    try:
      return prod[0, 0] - self.userEventScores[i, j]
    except IndexError:
      return 0

  def eventReco(self, userId, eventId):
    """
    类似基于Item-based协同过滤，只是item之间的相似度由item本身的属性得到，计算Event的推荐度
    基本的伪代码思路如下：
    for item i 
      for every item j that u has a preference for
        compute similarity s between i and j
        add u's preference for j weighted by s to a running average
    return top items, ranked by weighted average
    """
    i = self.userIndex[userId]
    j = self.eventIndex[eventId]
    js = self.userEventScores[i, :]
    psim = self.eventPropSim[:, j]
    csim = self.eventContSim[:, j]
    pprod = js * psim
    cprod = js * csim
    
    pscore = 0
    cscore = 0
    try:
      pscore = pprod[0, 0] - self.userEventScores[i, j]
    except IndexError:
      pass
    try:
      cscore = cprod[0, 0] - self.userEventScores[i, j]
    except IndexError:
      pass
    return pscore, cscore

  def userPop(self, userId):
    """
    基于用户的朋友个数来推断用户的社交程度
    主要的考量是如果用户的朋友非常多，可能会更倾向于参加各种社交活动
    """
    if userId in self.userIndex:
      i = self.userIndex[userId]
      try:
        return self.numFriends[0, i]
      except IndexError:
        return 0
    else:
      return 0

  def friendInfluence(self, userId):
    """
    朋友对用户的影响
    主要考虑用户所有的朋友中，有多少是非常喜欢参加各种社交活动/event的
    用户的朋友圈如果都积极参与各种event，可能会对当前用户有一定的影响
    """
    nusers = np.shape(self.userFriends)[1]
    i = self.userIndex[userId]
    return (self.userFriends[i, :].sum(axis=0) / nusers)[0,0]

  def eventPop(self, eventId):
    """
    本活动本身的热度
    主要是通过参与的人数来界定的
    """
    i = self.eventIndex[eventId]
    return self.eventPopularity[i, 0]



In [2]:
def generateRSData(RS, train=True, header=True):
    """
    把前面user-based协同过滤 和 item-based协同过滤，以及各种热度和影响度作为特征组合在一起
    生成新的训练数据，用于分类器分类使用
    """
    fn = "train.csv" if train else "test.csv"
    fin = open(fn, 'rb')
    fout = open("RS_" + fn, 'wb')
    
    #忽略第一行（列名字）
    fin.readline().strip().decode().split(",")
    
    # write output header
    if header:
      ocolnames = ["invited", "userCF_reco", "evtCF_reco","svdCF_reco","user_reco", "evt_p_reco",
        "evt_c_reco", "user_pop", "frnd_infl", "evt_pop"]
      if train:
        ocolnames.append("interested")
        ocolnames.append("not_interested")
      j = ",".join(ocolnames) + "\n"
      fout.write(j.encode())
    
    ln = 0
    for line in fin:
      ln += 1
      if ln%10 == 0:
          print( "%s:%d (userId, eventId)=(%s, %s)" % (fn, ln, userId, eventId))
          #break;
      
      cols = line.strip().decode().split(",")
      userId = cols[0]
      eventId = cols[1]
      invited = cols[2]
      
      userCF_reco = RS.userCFReco(userId, eventId)
      itemCF_reco = RS.eventCFReco(userId, eventId)
      svdCF_reco = RS.svdCFReco(userId, eventId)
        
      user_reco = RS.userReco(userId, eventId)
      evt_p_reco, evt_c_reco = RS.eventReco(userId, eventId)
      user_pop = RS.userPop(userId)
     
      frnd_infl = RS.friendInfluence(userId)
      evt_pop = RS.eventPop(eventId)
      ocols = [invited, userCF_reco, itemCF_reco, svdCF_reco,user_reco, evt_p_reco,
        evt_c_reco, user_pop, frnd_infl, evt_pop]
      
      if train:
        ocols.append(cols[4]) # interested
        ocols.append(cols[5]) # not_interested
      fout.write((",".join(map(lambda x: str(x), ocols)) + "\n").encode())
    
    fin.close()
    fout.close()


In [3]:
RS = RecommonderSystem()
print( "生成训练数据...\n")
generateRSData(RS,train=True,  header=True)

print( "生成预测数据...\n")
generateRSData(RS, train=False, header=True)

SVD Train...
the  0 -th  step is running
[1446 1571 1432 ...  775 1146 1090]
the rmse of this step on train data is  0.04796344531680657
the  1 -th  step is running
[3058 1842 1505 ... 2768 2910 1491]
the rmse of this step on train data is  0.025594845209121104
the  2 -th  step is running
[1247  479 3025 ... 2686  422  953]
the rmse of this step on train data is  0.02557974321371307
the  3 -th  step is running
[ 945 1542  470 ...  343 2914 2994]
the rmse of this step on train data is  0.02556576901609378
the  4 -th  step is running
[1653 2516 1570 ... 1042 1779 1344]
the rmse of this step on train data is  0.025552833674691803
the  5 -th  step is running
[1068 2999 2311 ... 2403  853 1443]
the rmse of this step on train data is  0.025540855915730014
the  6 -th  step is running
[2282 2295  639 ...  857 1078 2762]
the rmse of this step on train data is  0.025529761382887803
the  7 -th  step is running
[2700  994 2774 ... 1056 2071 3049]
the rmse of this step on train data is  0.025519481

the rmse of this step on train data is  0.025376906459803224
the  69 -th  step is running
[ 158 2982  423 ... 1794 1421  874]
the rmse of this step on train data is  0.025431008460675798
the  70 -th  step is running
[ 337 3029  662 ...   86 2312 1538]
the rmse of this step on train data is  0.025347732906007177
the  71 -th  step is running
[2529 1591   61 ...  747 2977 2829]
the rmse of this step on train data is  0.02545508429723618
the  72 -th  step is running
[1569 1018  726 ... 1576 3374  872]
the rmse of this step on train data is  0.02530210909950871
the  73 -th  step is running
[3217 3341  779 ... 2363 1898  330]
the rmse of this step on train data is  0.025449391603433028
the  74 -th  step is running
[1401  655 2583 ... 1542  466  488]
the rmse of this step on train data is  0.025346312761399863
the  75 -th  step is running
[1155 1842 3209 ...   71 3314 2660]
the rmse of this step on train data is  0.025325008570159666
the  76 -th  step is running
[1128   79  942 ... 3311  905 

train.csv:780 (userId, eventId)=(210626346, 1718853706)
train.csv:790 (userId, eventId)=(220184636, 1421267283)
train.csv:800 (userId, eventId)=(221442949, 1390707377)
train.csv:810 (userId, eventId)=(223870698, 3863879024)
train.csv:820 (userId, eventId)=(238446352, 2764233627)
train.csv:830 (userId, eventId)=(240521211, 3123265999)
train.csv:840 (userId, eventId)=(242262473, 2891981561)
train.csv:850 (userId, eventId)=(244644122, 823015621)
train.csv:860 (userId, eventId)=(246547106, 1728390960)
train.csv:870 (userId, eventId)=(247769793, 1336517851)
train.csv:880 (userId, eventId)=(248805577, 2498491194)
train.csv:890 (userId, eventId)=(249577815, 862876550)
train.csv:900 (userId, eventId)=(249577815, 530842823)
train.csv:910 (userId, eventId)=(252627634, 4002702641)
train.csv:920 (userId, eventId)=(255311196, 2568338180)
train.csv:930 (userId, eventId)=(256692717, 2130616732)
train.csv:940 (userId, eventId)=(261990895, 2815171271)
train.csv:950 (userId, eventId)=(263599897, 1206554

train.csv:2230 (userId, eventId)=(588924229, 2153037761)
train.csv:2240 (userId, eventId)=(589199943, 1333696176)
train.csv:2250 (userId, eventId)=(593799836, 3458190779)
train.csv:2260 (userId, eventId)=(599144899, 3632072502)
train.csv:2270 (userId, eventId)=(601094182, 4243222986)
train.csv:2280 (userId, eventId)=(602317951, 2054614790)
train.csv:2290 (userId, eventId)=(603763318, 3816284328)
train.csv:2300 (userId, eventId)=(604530209, 3276213471)
train.csv:2310 (userId, eventId)=(605903903, 1532377761)
train.csv:2320 (userId, eventId)=(608310969, 1600413013)
train.csv:2330 (userId, eventId)=(609331956, 2554360190)
train.csv:2340 (userId, eventId)=(610133863, 3982727529)
train.csv:2350 (userId, eventId)=(612098420, 614606641)
train.csv:2360 (userId, eventId)=(613747359, 3376203612)
train.csv:2370 (userId, eventId)=(614599699, 2428145712)
train.csv:2380 (userId, eventId)=(616227847, 4140981574)
train.csv:2390 (userId, eventId)=(618927177, 1698790428)
train.csv:2400 (userId, eventId)

train.csv:3680 (userId, eventId)=(1030441254, 530021453)
train.csv:3690 (userId, eventId)=(1037260580, 673098017)
train.csv:3700 (userId, eventId)=(1038432983, 2144881822)
train.csv:3710 (userId, eventId)=(1039126403, 2498491194)
train.csv:3720 (userId, eventId)=(1044598291, 952519340)
train.csv:3730 (userId, eventId)=(1047741032, 3768548263)
train.csv:3740 (userId, eventId)=(1053832375, 2529072432)
train.csv:3750 (userId, eventId)=(1055981550, 1203717384)
train.csv:3760 (userId, eventId)=(1060557660, 3925512770)
train.csv:3770 (userId, eventId)=(1061545438, 2013513606)
train.csv:3780 (userId, eventId)=(1063337028, 2181144698)
train.csv:3790 (userId, eventId)=(1066906993, 622120837)
train.csv:3800 (userId, eventId)=(1067164735, 200354622)
train.csv:3810 (userId, eventId)=(1067164735, 72461116)
train.csv:3820 (userId, eventId)=(1067164735, 2384254802)
train.csv:3830 (userId, eventId)=(1067164735, 3649047558)
train.csv:3840 (userId, eventId)=(1067164735, 1105514937)
train.csv:3850 (userI

train.csv:5100 (userId, eventId)=(1452056386, 4202927804)
train.csv:5110 (userId, eventId)=(1459241748, 2193156408)
train.csv:5120 (userId, eventId)=(1463717359, 615488222)
train.csv:5130 (userId, eventId)=(1463808662, 327009161)
train.csv:5140 (userId, eventId)=(1474775097, 1361307272)
train.csv:5150 (userId, eventId)=(1482478167, 2498491194)
train.csv:5160 (userId, eventId)=(1484078818, 2806565970)
train.csv:5170 (userId, eventId)=(1486465606, 1269035551)
train.csv:5180 (userId, eventId)=(1493407039, 2977827251)
train.csv:5190 (userId, eventId)=(1507405031, 776341364)
train.csv:5200 (userId, eventId)=(1515828822, 3126487334)
train.csv:5210 (userId, eventId)=(1518400220, 1840365070)
train.csv:5220 (userId, eventId)=(1525918364, 2130616732)
train.csv:5230 (userId, eventId)=(1528359455, 1394487790)
train.csv:5240 (userId, eventId)=(1531475209, 4239152097)
train.csv:5250 (userId, eventId)=(1535643599, 353933726)
train.csv:5260 (userId, eventId)=(1537885461, 396853098)
train.csv:5270 (use

train.csv:6530 (userId, eventId)=(1926859556, 2187392019)
train.csv:6540 (userId, eventId)=(1931485787, 633659090)
train.csv:6550 (userId, eventId)=(1931818423, 440098296)
train.csv:6560 (userId, eventId)=(1933513142, 476754780)
train.csv:6570 (userId, eventId)=(1935452377, 616825218)
train.csv:6580 (userId, eventId)=(1935452377, 3266414126)
train.csv:6590 (userId, eventId)=(1946827878, 952519340)
train.csv:6600 (userId, eventId)=(1948333652, 635799819)
train.csv:6610 (userId, eventId)=(1949443459, 1205670273)
train.csv:6620 (userId, eventId)=(1951340806, 762779146)
train.csv:6630 (userId, eventId)=(1952851403, 689023473)
train.csv:6640 (userId, eventId)=(1954193338, 3317052947)
train.csv:6650 (userId, eventId)=(1954960931, 3177059558)
train.csv:6660 (userId, eventId)=(1955766052, 3213836603)
train.csv:6670 (userId, eventId)=(1959680039, 1549888315)
train.csv:6680 (userId, eventId)=(1964309262, 1600413013)
train.csv:6690 (userId, eventId)=(1966851184, 1484873489)
train.csv:6700 (userId

train.csv:7960 (userId, eventId)=(2329661605, 2114371180)
train.csv:7970 (userId, eventId)=(2329755329, 547383999)
train.csv:7980 (userId, eventId)=(2332715028, 2149464820)
train.csv:7990 (userId, eventId)=(2334852679, 1900681884)
train.csv:8000 (userId, eventId)=(2338481531, 2525447278)
train.csv:8010 (userId, eventId)=(2340199364, 2683439036)
train.csv:8020 (userId, eventId)=(2342735753, 2529072432)
train.csv:8030 (userId, eventId)=(2343762978, 3643004825)
train.csv:8040 (userId, eventId)=(2344133160, 907302600)
train.csv:8050 (userId, eventId)=(2348719537, 3801087697)
train.csv:8060 (userId, eventId)=(2350981164, 4002798045)
train.csv:8070 (userId, eventId)=(2353444142, 3047117217)
train.csv:8080 (userId, eventId)=(2360121322, 2274620725)
train.csv:8090 (userId, eventId)=(2375503792, 2098941988)
train.csv:8100 (userId, eventId)=(2378571713, 3854862695)
train.csv:8110 (userId, eventId)=(2381922770, 2129139974)
train.csv:8120 (userId, eventId)=(2381922770, 2129139974)
train.csv:8130 (

train.csv:9380 (userId, eventId)=(2766269208, 3576586766)
train.csv:9390 (userId, eventId)=(2766269208, 1510744133)
train.csv:9400 (userId, eventId)=(2768571751, 1570416070)
train.csv:9410 (userId, eventId)=(2771438232, 95759437)
train.csv:9420 (userId, eventId)=(2772609387, 2389263741)
train.csv:9430 (userId, eventId)=(2774351000, 261646692)
train.csv:9440 (userId, eventId)=(2780094512, 3342889803)
train.csv:9450 (userId, eventId)=(2787327616, 1617947886)
train.csv:9460 (userId, eventId)=(2787637778, 1269035551)
train.csv:9470 (userId, eventId)=(2788859591, 3256009463)
train.csv:9480 (userId, eventId)=(2789880308, 741936389)
train.csv:9490 (userId, eventId)=(2791418962, 3199229320)
train.csv:9500 (userId, eventId)=(2791418962, 4223848259)
train.csv:9510 (userId, eventId)=(2792856754, 52246840)
train.csv:9520 (userId, eventId)=(2794470551, 3768680031)
train.csv:9530 (userId, eventId)=(2796038131, 4063572110)
train.csv:9540 (userId, eventId)=(2804635189, 1164582156)
train.csv:9550 (user

train.csv:10790 (userId, eventId)=(3108480638, 608092517)
train.csv:10800 (userId, eventId)=(3115928657, 2149464820)
train.csv:10810 (userId, eventId)=(3117046697, 1902753965)
train.csv:10820 (userId, eventId)=(3118938050, 862976415)
train.csv:10830 (userId, eventId)=(3122183291, 1544421101)
train.csv:10840 (userId, eventId)=(3123500032, 2212723194)
train.csv:10850 (userId, eventId)=(3123712598, 1203717384)
train.csv:10860 (userId, eventId)=(3138635490, 875529449)
train.csv:10870 (userId, eventId)=(3142327997, 2054616566)
train.csv:10880 (userId, eventId)=(3147626282, 828104990)
train.csv:10890 (userId, eventId)=(3147626282, 2847152365)
train.csv:10900 (userId, eventId)=(3147626282, 3745079581)
train.csv:10910 (userId, eventId)=(3148555709, 1167949960)
train.csv:10920 (userId, eventId)=(3151272434, 3710909677)
train.csv:10930 (userId, eventId)=(3155504635, 1521958638)
train.csv:10940 (userId, eventId)=(3157328676, 703370904)
train.csv:10950 (userId, eventId)=(3165928866, 992561323)
tra

train.csv:12190 (userId, eventId)=(3466521212, 3777087288)
train.csv:12200 (userId, eventId)=(3467140948, 1532377761)
train.csv:12210 (userId, eventId)=(3467307298, 3756572732)
train.csv:12220 (userId, eventId)=(3471761709, 10609218)
train.csv:12230 (userId, eventId)=(3474900298, 734470389)
train.csv:12240 (userId, eventId)=(3477857993, 4233032010)
train.csv:12250 (userId, eventId)=(3481743878, 810535696)
train.csv:12260 (userId, eventId)=(3486057072, 1742345117)
train.csv:12270 (userId, eventId)=(3486057072, 767782490)
train.csv:12280 (userId, eventId)=(3486057072, 3438779574)
train.csv:12290 (userId, eventId)=(3488552280, 2298040175)
train.csv:12300 (userId, eventId)=(3500240032, 268233790)
train.csv:12310 (userId, eventId)=(3502990892, 3950589515)
train.csv:12320 (userId, eventId)=(3504006486, 3930077968)
train.csv:12330 (userId, eventId)=(3504868122, 907302600)
train.csv:12340 (userId, eventId)=(3505511916, 1451401793)
train.csv:12350 (userId, eventId)=(3507534894, 327075228)
train

train.csv:13590 (userId, eventId)=(3835637004, 1778753558)
train.csv:13600 (userId, eventId)=(3838242688, 1816813281)
train.csv:13610 (userId, eventId)=(3839562998, 3302719635)
train.csv:13620 (userId, eventId)=(3839562998, 3692401116)
train.csv:13630 (userId, eventId)=(3841524478, 771676713)
train.csv:13640 (userId, eventId)=(3841682652, 317715279)
train.csv:13650 (userId, eventId)=(3842862938, 524127618)
train.csv:13660 (userId, eventId)=(3843287866, 1532377761)
train.csv:13670 (userId, eventId)=(3848620961, 3849306291)
train.csv:13680 (userId, eventId)=(3850128249, 4114296702)
train.csv:13690 (userId, eventId)=(3851487388, 627338344)
train.csv:13700 (userId, eventId)=(3861589862, 3327757331)
train.csv:13710 (userId, eventId)=(3865016311, 2456139530)
train.csv:13720 (userId, eventId)=(3866177358, 955398943)
train.csv:13730 (userId, eventId)=(3867241412, 1255629030)
train.csv:13740 (userId, eventId)=(3870800241, 3069899537)
train.csv:13750 (userId, eventId)=(3877652693, 2082389230)
tr

train.csv:14990 (userId, eventId)=(4195660981, 2963144429)
train.csv:15000 (userId, eventId)=(4197193550, 1628057176)
train.csv:15010 (userId, eventId)=(4199616576, 400356575)
train.csv:15020 (userId, eventId)=(4199616576, 1826343214)
train.csv:15030 (userId, eventId)=(4200601044, 2784556347)
train.csv:15040 (userId, eventId)=(4202081674, 539032065)
train.csv:15050 (userId, eventId)=(4202112938, 753115138)
train.csv:15060 (userId, eventId)=(4205646791, 3928440935)
train.csv:15070 (userId, eventId)=(4206471336, 2149464820)
train.csv:15080 (userId, eventId)=(4206915288, 4157683270)
train.csv:15090 (userId, eventId)=(4213772349, 2558272675)
train.csv:15100 (userId, eventId)=(4217222631, 4130722320)
train.csv:15110 (userId, eventId)=(4217222631, 1673895160)
train.csv:15120 (userId, eventId)=(4217222631, 2860418664)
train.csv:15130 (userId, eventId)=(4220962429, 1124649470)
train.csv:15140 (userId, eventId)=(4222590338, 300004583)
train.csv:15150 (userId, eventId)=(4226575687, 914532885)
tr

test.csv:1080 (userId, eventId)=(473558899, 1076364848)
test.csv:1090 (userId, eventId)=(483141559, 2143656308)
test.csv:1100 (userId, eventId)=(485626279, 1922937308)
test.csv:1110 (userId, eventId)=(489332113, 600370874)
test.csv:1120 (userId, eventId)=(489332113, 1533039925)
test.csv:1130 (userId, eventId)=(491496715, 3680209340)
test.csv:1140 (userId, eventId)=(498762190, 1394911305)
test.csv:1150 (userId, eventId)=(502787831, 2428145712)
test.csv:1160 (userId, eventId)=(507590029, 87962584)
test.csv:1170 (userId, eventId)=(507841367, 4002702641)
test.csv:1180 (userId, eventId)=(511678471, 2601678976)
test.csv:1190 (userId, eventId)=(511678471, 1138354613)
test.csv:1200 (userId, eventId)=(515293671, 738484716)
test.csv:1210 (userId, eventId)=(516201949, 2806565970)
test.csv:1220 (userId, eventId)=(521893181, 4219488434)
test.csv:1230 (userId, eventId)=(524529500, 2767814706)
test.csv:1240 (userId, eventId)=(536673024, 1488068589)
test.csv:1250 (userId, eventId)=(541187878, 33351533

test.csv:2550 (userId, eventId)=(1034862978, 810618772)
test.csv:2560 (userId, eventId)=(1038960564, 1468446505)
test.csv:2570 (userId, eventId)=(1039053975, 2452291959)
test.csv:2580 (userId, eventId)=(1040473939, 4186611895)
test.csv:2590 (userId, eventId)=(1045779976, 1154290723)
test.csv:2600 (userId, eventId)=(1049481453, 1545807185)
test.csv:2610 (userId, eventId)=(1049843964, 2693086474)
test.csv:2620 (userId, eventId)=(1050303296, 675888033)
test.csv:2630 (userId, eventId)=(1060988803, 216378107)
test.csv:2640 (userId, eventId)=(1064453408, 2077865887)
test.csv:2650 (userId, eventId)=(1066251866, 2188481598)
test.csv:2660 (userId, eventId)=(1066251866, 4021240345)
test.csv:2670 (userId, eventId)=(1066372954, 1773230862)
test.csv:2680 (userId, eventId)=(1069845555, 64255702)
test.csv:2690 (userId, eventId)=(1071963586, 2073609284)
test.csv:2700 (userId, eventId)=(1073188652, 3232053960)
test.csv:2710 (userId, eventId)=(1078537007, 3838044081)
test.csv:2720 (userId, eventId)=(108

test.csv:4000 (userId, eventId)=(1623287180, 1626678328)
test.csv:4010 (userId, eventId)=(1626755496, 939765972)
test.csv:4020 (userId, eventId)=(1628915638, 979051762)
test.csv:4030 (userId, eventId)=(1637866930, 3830416827)
test.csv:4040 (userId, eventId)=(1638568594, 3460711091)
test.csv:4050 (userId, eventId)=(1644052705, 383090520)
test.csv:4060 (userId, eventId)=(1645299738, 3048578499)
test.csv:4070 (userId, eventId)=(1661089307, 2428145712)
test.csv:4080 (userId, eventId)=(1671510792, 728908337)
test.csv:4090 (userId, eventId)=(1683687851, 1269035551)
test.csv:4100 (userId, eventId)=(1686132519, 1641491432)
test.csv:4110 (userId, eventId)=(1696780091, 2073609284)
test.csv:4120 (userId, eventId)=(1697658257, 3168283375)
test.csv:4130 (userId, eventId)=(1699374488, 2710165417)
test.csv:4140 (userId, eventId)=(1712635579, 4149184300)
test.csv:4150 (userId, eventId)=(1727210805, 812516824)
test.csv:4160 (userId, eventId)=(1733597545, 1532377761)
test.csv:4170 (userId, eventId)=(173

test.csv:5450 (userId, eventId)=(2296487275, 3657281130)
test.csv:5460 (userId, eventId)=(2301083010, 3950589515)
test.csv:5470 (userId, eventId)=(2304075585, 3978621008)
test.csv:5480 (userId, eventId)=(2313021179, 3123256869)
test.csv:5490 (userId, eventId)=(2318198315, 1565715575)
test.csv:5500 (userId, eventId)=(2318415276, 2509151803)
test.csv:5510 (userId, eventId)=(2322320168, 2288198297)
test.csv:5520 (userId, eventId)=(2336501033, 2541362268)
test.csv:5530 (userId, eventId)=(2337182196, 2779241460)
test.csv:5540 (userId, eventId)=(2343176131, 639120581)
test.csv:5550 (userId, eventId)=(2346084629, 771676713)
test.csv:5560 (userId, eventId)=(2357605380, 2149464820)
test.csv:5570 (userId, eventId)=(2361587577, 601724573)
test.csv:5580 (userId, eventId)=(2366244106, 955398943)
test.csv:5590 (userId, eventId)=(2367420878, 1642783054)
test.csv:5600 (userId, eventId)=(2371436573, 794747940)
test.csv:5610 (userId, eventId)=(2377207090, 771676713)
test.csv:5620 (userId, eventId)=(2380

test.csv:6900 (userId, eventId)=(2883004291, 771676713)
test.csv:6910 (userId, eventId)=(2891771317, 1841145872)
test.csv:6920 (userId, eventId)=(2896196906, 1958166234)
test.csv:6930 (userId, eventId)=(2896254370, 2893435883)
test.csv:6940 (userId, eventId)=(2903534974, 3429958607)
test.csv:6950 (userId, eventId)=(2906437598, 797638314)
test.csv:6960 (userId, eventId)=(2910006718, 2805037878)
test.csv:6970 (userId, eventId)=(2911203191, 3634298859)
test.csv:6980 (userId, eventId)=(2926898853, 498238691)
test.csv:6990 (userId, eventId)=(2927241033, 3389640666)
test.csv:7000 (userId, eventId)=(2927772127, 1532377761)
test.csv:7010 (userId, eventId)=(2930631258, 2195745382)
test.csv:7020 (userId, eventId)=(2932877456, 4231641706)
test.csv:7030 (userId, eventId)=(2936934993, 782683695)
test.csv:7040 (userId, eventId)=(2943972367, 4125420656)
test.csv:7050 (userId, eventId)=(2945719859, 2598678508)
test.csv:7060 (userId, eventId)=(2954540407, 3696762643)
test.csv:7070 (userId, eventId)=(29

test.csv:8350 (userId, eventId)=(3556989632, 684002741)
test.csv:8360 (userId, eventId)=(3556989632, 602394192)
test.csv:8370 (userId, eventId)=(3562653363, 4203627753)
test.csv:8380 (userId, eventId)=(3573553441, 892071473)
test.csv:8390 (userId, eventId)=(3579660533, 3297564047)
test.csv:8400 (userId, eventId)=(3581835660, 185968523)
test.csv:8410 (userId, eventId)=(3582004219, 3083179290)
test.csv:8420 (userId, eventId)=(3594171699, 1076364848)
test.csv:8430 (userId, eventId)=(3598550965, 3981414613)
test.csv:8440 (userId, eventId)=(3601169721, 3684638133)
test.csv:8450 (userId, eventId)=(3601169721, 2739960670)
test.csv:8460 (userId, eventId)=(3601169721, 2473947326)
test.csv:8470 (userId, eventId)=(3601169721, 510692114)
test.csv:8480 (userId, eventId)=(3601169721, 1278199678)
test.csv:8490 (userId, eventId)=(3601169721, 2676714275)
test.csv:8500 (userId, eventId)=(3601169721, 154434302)
test.csv:8510 (userId, eventId)=(3601169721, 1077901228)
test.csv:8520 (userId, eventId)=(3601

test.csv:9800 (userId, eventId)=(4100120866, 3752251002)
test.csv:9810 (userId, eventId)=(4107865904, 1205670273)
test.csv:9820 (userId, eventId)=(4107865904, 1178194459)
test.csv:9830 (userId, eventId)=(4116824604, 3970983262)
test.csv:9840 (userId, eventId)=(4122118697, 2012765486)
test.csv:9850 (userId, eventId)=(4128087249, 245410232)
test.csv:9860 (userId, eventId)=(4128216168, 268233790)
test.csv:9870 (userId, eventId)=(4132156908, 1600413013)
test.csv:9880 (userId, eventId)=(4136255266, 2153037761)
test.csv:9890 (userId, eventId)=(4148656943, 176859368)
test.csv:9900 (userId, eventId)=(4155564936, 1190068601)
test.csv:9910 (userId, eventId)=(4156591803, 2272220944)
test.csv:9920 (userId, eventId)=(4157381568, 3378996021)
test.csv:9930 (userId, eventId)=(4159459029, 853521105)
test.csv:9940 (userId, eventId)=(4159956578, 41642879)
test.csv:9950 (userId, eventId)=(4165509784, 4242816413)
test.csv:9960 (userId, eventId)=(4166294065, 3860330975)
test.csv:9970 (userId, eventId)=(4168

时间、地点等特征都没有处理了，可以考虑用户看到event的时间与event开始时间的差、用户地点和event地点的差异。。。