In [1]:
import os
import pandas as pd 
import numpy as np

dataframe = pd.read_csv('train.csv')

for index in dataframe.columns:
    print('Missing value: ',dataframe[index].isnull().sum())
    print(dataframe[index].describe())
    print()

Missing value:  0
count                      416962
unique                     173711
top       2019-10-02 20:37:56 UTC
freq                          102
Name: event_time, dtype: object

Missing value:  0
count     416962
unique         4
top         view
freq      168854
Name: event_type, dtype: object

Missing value:  0
count    4.169620e+05
mean     5.498023e+06
std      1.268676e+06
min      3.752000e+03
25%      5.729011e+06
50%      5.809809e+06
75%      5.850305e+06
max      5.892800e+06
Name: product_id, dtype: float64

Missing value:  0
count    4.169620e+05
mean     1.556230e+18
std      1.644289e+17
min      1.490000e+18
25%      1.490000e+18
50%      1.490000e+18
75%      1.490000e+18
max      2.200000e+18
Name: category_id, dtype: float64

Missing value:  410798
count                              6164
unique                               10
top       appliances.environment.vacuum
freq                               2982
Name: category_code, dtype: object

Missing value:  16

In [2]:
dataframe['product_id'] = dataframe['product_id'].astype('object')
print(dataframe['product_id'].dtype)
print(dataframe['product_id'].describe())

object
count      416962
unique      32734
top       5892179
freq         1063
Name: product_id, dtype: int64


In [3]:
dataframe.fillna(value={'brand':'null'},inplace=True)
print(dataframe['brand'].isnull().sum())

0


In [4]:
import datetime

index_len = len(dataframe.index)
date_list = []

for i in range(index_len):
    date_str = dataframe['event_time'][i]
    date_list.append(datetime.datetime.strptime(date_str[0:19],'%Y-%m-%d %H:%M:%S'))

dataframe['event_time'] = pd.DataFrame(date_list)
print(dataframe['event_time'].dtype)
print(dataframe['event_time'].head())

datetime64[ns]
0   2019-10-01 00:00:00
1   2019-10-01 00:00:03
2   2019-10-01 00:00:07
3   2019-10-01 00:02:32
4   2019-10-01 00:02:40
Name: event_time, dtype: datetime64[ns]


In [5]:
print(dataframe[dataframe['price'] < 0].index)
dataframe = dataframe.drop(dataframe[dataframe['price'] < 0].index)
print(dataframe[dataframe['price'] < 0].index)

Int64Index([63196, 165348], dtype='int64')
Int64Index([], dtype='int64')


In [6]:
print(dataframe.dtypes)

event_time       datetime64[ns]
event_type               object
product_id               object
category_id             float64
category_code            object
brand                    object
price                   float64
user_id                   int64
user_session             object
dtype: object


In [7]:
print(dataframe.head())

           event_time event_type product_id   category_id category_code  \
0 2019-10-01 00:00:00       cart    5773203  1.490000e+18           NaN   
1 2019-10-01 00:00:03       cart    5773353  1.490000e+18           NaN   
2 2019-10-01 00:00:07       cart    5723490  1.490000e+18           NaN   
3 2019-10-01 00:02:32       cart    5857283  1.490000e+18           NaN   
4 2019-10-01 00:02:40       cart    5723523  1.490000e+18           NaN   

    brand  price  user_id                          user_session  
0  runail   2.62        1  26dd6e6e-4dac-4778-8d2c-92e149dab885  
1  runail   2.62        1  26dd6e6e-4dac-4778-8d2c-92e149dab885  
2  runail   2.62        1  26dd6e6e-4dac-4778-8d2c-92e149dab885  
3  runail   2.62        1  26dd6e6e-4dac-4778-8d2c-92e149dab885  
4  runail   2.62        1  26dd6e6e-4dac-4778-8d2c-92e149dab885  


任务分工说明：  
1. 产品，价格，品牌是否重叠？重叠度有多高 （王）  
2. 价格（连续变量）离散化以及brand空值处理  （王） 注：离群点的存在，合理划分区间  
3. 对购买之后的记录进行删除操作 （王）  
4. 剔除无购买行为的用户  （王）  
5. 构建品牌-价格字典 （张）  
6. 设计测试评价指标 （张）  
7. 设计个人行为模型，要考虑event,price,brand等变量，并且对event变量做好处理，通过购买产品进行监督训练 （邢）  
8. 考虑如何合成公共字典和个人行为模型，以概率的形式输出不同产品购买的可能性 （邢）  
  
具体要求：  
王：返回处理结束后的csv文件，并对重叠度进行基本描述  
张：基于新的数据集构建字典，返回字典格式的数据，其中字典中的值为对应产品的频数，实现测试评价指标的代码，输入为预测的概率序列（从高到低），输出为每个产品的召回率和整体的平均倒数排名  
邢：实现根据输入变量预测产品输出概率的模型，并设计好训练模式，实现将公共字典中的产品概率与浏览产品的预测概率结合的最终产品概率预测算法  


In [8]:
# import dataset after preprocess by wang
train_data = pd.read_csv('train_clear.csv')
print(train_data.dtypes)
print(train_data['brand'].isnull().sum())
print(train_data['price'].value_counts())
train_data.fillna(value={'brand':'null'},inplace=True)
print(train_data['brand'].isnull().sum())
print()
test_data = pd.read_csv('test_clear.csv')
print(test_data.dtypes)
print(test_data['brand'].isnull().sum())
print(test_data['price'].value_counts())
test_data.fillna(value={'brand':'null'},inplace=True)
print(test_data['brand'].isnull().sum())
test_data['price'] = test_data['price'].astype('int64')
print(test_data['price'].dtype)

Unnamed: 0         int64
event_time        object
event_type        object
product_id         int64
category_id      float64
category_code     object
brand             object
price            float64
user_id            int64
user_session      object
dtype: object
44357
1.0    30177
2.0    27638
3.0    21483
4.0    18852
5.0    14379
Name: price, dtype: int64
0

Unnamed: 0         int64
event_time        object
event_type        object
product_id         int64
category_id      float64
category_code     object
brand             object
price            float64
user_id            int64
user_session      object
dtype: object
426
1.0    342
2.0    328
4.0    195
3.0    182
5.0    167
Name: price, dtype: int64
0
int64


In [9]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,0,2019-10-01 00:00:00,cart,5773203,1.49e+18,,runail,2.0,1,26dd6e6e-4dac-4778-8d2c-92e149dab885
1,1,2019-10-01 00:00:03,cart,5773353,1.49e+18,,runail,2.0,1,26dd6e6e-4dac-4778-8d2c-92e149dab885
2,2,2019-10-01 00:00:07,cart,5723490,1.49e+18,,runail,2.0,1,26dd6e6e-4dac-4778-8d2c-92e149dab885
3,3,2019-10-01 00:02:32,cart,5857283,1.49e+18,,runail,2.0,1,26dd6e6e-4dac-4778-8d2c-92e149dab885
4,4,2019-10-01 00:02:40,cart,5723523,1.49e+18,,runail,2.0,1,26dd6e6e-4dac-4778-8d2c-92e149dab885


In [10]:
print(len(train_data.index))
print(len(test_data.index))
print(len(train_data['user_id'].unique()))
print(len(test_data['user_id'].unique()))

112529
1214
2636
49


In [11]:
import warnings

warnings.filterwarnings('ignore')
#build a dictionary based on brand and price
diction = {}
#print(len(dataframe['brand'].unique()))
a = train_data['brand'].value_counts()
price_level = [1,2,3,4,5] #Discretization
for key in a.keys():
    diction[key] = {}
    for val in price_level:
        diction[key][val] = {}




# basic assumption: purchase event is the most related event, frequency contributes little.
user_ids = train_data['user_id'].value_counts()

for user_id, count in enumerate(user_ids):
    record = train_data[train_data['user_id'] == user_id][train_data['event_type'] == 'purchase']
    #remove duplicate
    update_record = record.drop_duplicates(subset=['product_id'],keep='first',inplace=False)
    
    for i in update_record.index:
        product_id = update_record.loc[i,'product_id']
        brand = update_record.loc[i,'brand']
        price = update_record.loc[i,'price']
        if product_id not in diction[brand][price]:
            diction[brand][price][product_id] = 1
        else:
            diction[brand][price][product_id] += 1
        





In [12]:
print(diction['masura'])

{1: {5859462: 3, 5826988: 2, 5813058: 3, 5826989: 1, 5797965: 1, 5859468: 1, 5788524: 1, 5796976: 1, 5796983: 2, 5797967: 1, 5801770: 1, 5804167: 1, 5804168: 1, 5813060: 1, 5813068: 1, 5826996: 1, 5867965: 1, 5774877: 1, 5774872: 1, 5830548: 2, 5774871: 1, 5788522: 1, 5794153: 1, 5796995: 1, 5774893: 1, 5830534: 1, 5603537: 1, 5813483: 2, 5813064: 1, 5859478: 2, 5830561: 1, 5813482: 1, 5839699: 1, 5839706: 1, 5839703: 1, 5839702: 1, 5839697: 1, 5758509: 1, 5774897: 1, 5796998: 1, 5794155: 1}, 2: {5859410: 1, 5859411: 1, 5859413: 1, 5839677: 2, 5859407: 1, 5839667: 1, 5859420: 1, 5839676: 1, 5839672: 2, 5859402: 3, 5683351: 1, 5839656: 1}, 3: {5883344: 1, 5839690: 1, 5746386: 1, 5774863: 1, 5853684: 1}, 4: {5627420: 1, 5624537: 1, 5747404: 1}, 5: {5774857: 1, 27764: 1}}


In [13]:
#build a judgement calculation method
predict_1 = {5802428: 0.8, 5688124: 0.6, 5739989: 0.4, 5773353: 0.2}
truth_1 = [5688124, 5773353, 5385]
predict_2 = {5688124:0.7, 5385:0.5, 5555:0.9}
truth_2 = [5385,5688124]
predict = [predict_1,predict_2]
truth = [truth_1,truth_2]
#MRR
def MRR(p,t):
    Q = len(p)
    mrr = 0
    for i in range(Q):
        temp_p = p[i]
        temp_t = t[i]
        p_value = sorted(temp_p.items(),key= lambda item:item[1], reverse=True)
        p_len = len(p_value)
        find = False
        for j in range(p_len):
            k,v = p_value[j]
            if k in temp_t:
                rank = 1/(j+1)
                find = True
                break

        if find == False:
            #optimistic : the recommend algorithm will find truth product next.
            rank = 1/(p_len+1)

        mrr += rank
    mrr /= Q
    return mrr

print(MRR(predict,truth))

#Inclusion Rate: a numerical identifier to measure to what extent truth product is covered by predict product
def Inclusion(p,t):
    Q = len(p)
    inclusion = 0
    for i in range(Q):
        temp_p = p[i]
        temp_t = t[i]
        score = 0
        for member in temp_t:
            if member in temp_p.keys():
                score += temp_p[member]
        
        score /= len(temp_t)
        inclusion += score
    
    inclusion /= Q
    return inclusion

print(Inclusion(predict,truth))


0.5
0.43333333333333335


In [14]:
#定义品牌与事件的数组
brand_ser=train_data['brand'].value_counts()
brand=np.array(brand_ser.index)
event=['view','cart','remove_from_cart']

In [15]:
#将用户记录向量化，格式为[view,cart,remove_from_cart,price,brand]，其中价格是用该用户操作商品的平均价格，brand有198个元素。
def getinputvec(train_data,brand,event):    
    id=train_data['user_id'][0]
    q=0
    k=0
    input=np.zeros((len(train_data['user_id'].value_counts()),202))
    for i in range(len(train_data)):
        if train_data['event_type'][i]=='purchase': 
                if id!=train_data['user_id'][i]:
                    input[q][3]=input[q][3]/k
                    k=1
                    id=train_data['user_id'][i]
                    q=q+1
        if train_data['event_type'][i]!='purchase':    
            if id==train_data['user_id'][i]:
                k=k+1                
                for n in range(3):
                    if event[n]==train_data['event_type'][i]:
                        input[q][n]=input[q][n]+1
                input[q][3]=input[q][3]+train_data['price'][i]
                for n in range(len(brand)):
                    if brand[n]==train_data['brand'][i]:
                        input[q][n+4]=input[q][n+4]+1

            if id!=train_data['user_id'][i]:
                input[q][3]=input[q][3]/k
                k=1
                id=train_data['user_id'][i]
                q=q+1
             
                for n in range(3):
                    if event[n]==train_data['event_type'][i]:
                        input[q][n]=input[q][n]+1
                input[q][3]=input[q][3]+train_data['price'][i]
                for n in range(len(brand)):
                    if brand[n]==train_data['brand'][i]:
                        input[q][n+4]=input[q][n+4]+1
    input[len(train_data['user_id'].value_counts())-1][3]=input[len(train_data['user_id'].value_counts())-1][3]/k
    for i in range((len(train_data['user_id'].value_counts()))):
         for n in range(len(brand)):
            input[i][n+4]=input[i][n+4]/10
    for i in range((len(train_data['user_id'].value_counts()))):
         for n in range(3):
            input[i][n]=input[i][n]/10
    return input

In [16]:
#将用户购买结果向量化，格式为[purchase,price,brand]，其中purchase是用户的购买次数，价格是用该用户购买商品的平均价格，brand有198个元素。
def getoutputvec(train_data,brand,event):    
    id=train_data['user_id'][0]
    q=0
    k=0
    output=np.zeros((len(train_data['user_id'].value_counts()),200))
    for i in range(len(train_data)):
        if id==train_data['user_id'][i]:
            if train_data['event_type'][i]=='purchase': 
                k=k+1
                output[q][0]=output[q][0]+1
                output[q][1]=output[q][1]+train_data['price'][i]
                for n in range(len(brand)):
                    if brand[n]==train_data['brand'][i]:
                        output[q][n+2]=output[q][n+2]+1
            
        if id!=train_data['user_id'][i]:
            if train_data['event_type'][i]=='purchase': 
                output[q][1]=output[q][1]/k
                k=1
                id=train_data['user_id'][i]
                q=q+1
                output[q][0]=output[q][0]+1
                output[q][1]=output[q][1]+train_data['price'][i]
                for n in range(len(brand)):
                    if brand[n]==train_data['brand'][i]:
                        output[q][n+2]=output[q][n+2]+1            
    output[len(train_data['user_id'].value_counts())-1][1]=output[len(train_data['user_id'].value_counts())-1][1]/k
    for i in range((len(train_data['user_id'].value_counts()))):
         for n in range(len(brand)):
            output[i][n+2]=output[i][n+2]/5
    for i in range((len(train_data['user_id'].value_counts()))):
            output[i][0]=output[i][0]/5
    return output

In [17]:
#获得用于BP神经网络训练测试的输入输出向量
train_set_x= getinputvec(train_data,brand,event)
test_set_x=getinputvec(test_data,brand,event)
train_set_y_T= getoutputvec(train_data,brand,event)
test_set_y_T=getoutputvec(test_data,brand,event)

In [18]:
#将输出向量转置并处理成bool型数据，转换成BP网络可识别的形式。
train_set_y=np.transpose(train_set_y_T)
test_set_y=np.transpose(test_set_y_T)
for i in range(2,len(train_set_y)):
    for n in range(len(train_set_y[0])):
        if train_set_y[i][n]!=0:
            train_set_y[i][n]=1
for i in range(2,len(test_set_y)):
    for n in range(len(test_set_y[0])):
        if test_set_y[i][n]!=0:
            test_set_y[i][n]=1

In [19]:
#编写BP网络函数，层数为2，含200个神经元。
import os
import pandas as pd
from sklearn.neural_network import MLPClassifier

def BP(train_set_x,train_set_y,test_set_y):
    
    bp = MLPClassifier(solver='lbfgs',activation = 'tanh', alpha=1e-5,  hidden_layer_sizes=(200,2), random_state=1, batch_size=9, verbose=True)
    bp.fit(train_set_x, train_set_y)

    y_pred=bp.predict(train_set_x)
    y_pred
    number = len(y_pred)
    true = 0
    false = 0
    trainright=0
    for i in range(number):
        if y_pred[i] == train_set_y[i]:
            true = true + 1
        else :
            false = false + 1
    trainright=true/(true+false)
   # print("训练集正确:"+str(true))
   # print("训练集错误:"+str(false))
   # print('训练集准确率： %f' %(trainright))

    y_pred=bp.predict(test_set_x)
    number = len(y_pred)
    true = 0
    false = 0
    testright=0
    for i in range(number):
        if y_pred[i] == test_set_y[i]:
            true = true + 1
        else :
            false = false + 1
    testright=true/(true+false)
   # print("测试集正确:"+str(true))
   # print("测试集错误:"+str(false))
   # print('测试集准确率： %f' %(testright))
    return trainright,testright,y_pred

In [20]:
#预测所有的test集中的用户购买品牌，存入y_pred矩阵中，并计算对于所有用户购买品牌预测的平均准确率
trainright=0
testright=0
trainrighti=0
testrighti=0
y_predi=[]
y_pred=np.zeros((198,len(test_set_y[0])))
for i in range(2,len(train_set_y)):
    trainrighti,testrighti,y_predi=BP(train_set_x,train_set_y[i],test_set_y[i])
    trainright=trainright+trainrighti
    testright=testright+testrighti
    y_pred[i-2]=y_predi
trainright=trainright/len(train_set_y)
testright=testright/len(train_set_y)
print('训练集平均准确率： %f' %(trainright))
print('测试集平均准确率： %f' %(testright))

训练集平均准确率： 0.981978
测试集平均准确率： 0.981735


In [21]:
#将预测的y数组转置，第n行是第n位用户，n列是第n个品牌
y_predict=np.transpose(y_pred)
#按表格顺序获得userid的列表
k=test_data['user_id'][0]
userid=[]
userid.append(test_data['user_id'][0])
for i in range(len(test_data['user_id'])):
    if k!=test_data['user_id'][i]:
        userid.append(test_data['user_id'][i])
        k=test_data['user_id'][i]

In [24]:
#定义预测
def predict(id,y_predict,userid,brand,test_set_y):
    true=0
    false=0
    brand_predict=[]
    k=0
    q=0
    for n in range(len(userid)):
        if id==userid[n]:
            k=n
    for n in range(len(y_predict[0])):
        if y_predict[k][n]==1:
            brand_predict.append(brand[n])
        if y_predict[k][n]==test_set_y[n][k]:
            true=true+1
        else:
            false=false+1
    testright=true/(true+false)
    print('预测id为 %d的用户购买的商品品牌为：' %(id))
    print(brand_predict)
    print('预测准确率为：%f'%(testright))
            

In [25]:
#拿id为54066的用户为例，可以预测其购买的品牌，并给出了预测准确率
predict(54066,y_predict,userid,brand,test_set_y)

预测id为 54066的用户购买的商品品牌为：
['null', 'irisk', 'grattol', 'ingarden']
预测准确率为：0.969697
