In [2]:
import os
import pandas as pd 

dataframe = pd.read_csv('train.csv')

for index in dataframe.columns:
    print('Missing value: ',dataframe[index].isnull().sum())
    print(dataframe[index].describe())
    print()


Missing value:  0
count                      416962
unique                     173711
top       2019-10-02 20:37:56 UTC
freq                          102
Name: event_time, dtype: object

Missing value:  0
count     416962
unique         4
top         view
freq      168854
Name: event_type, dtype: object

Missing value:  0
count    4.169620e+05
mean     5.498023e+06
std      1.268676e+06
min      3.752000e+03
25%      5.729011e+06
50%      5.809809e+06
75%      5.850305e+06
max      5.892800e+06
Name: product_id, dtype: float64

Missing value:  0
count    4.169620e+05
mean     1.556230e+18
std      1.644289e+17
min      1.490000e+18
25%      1.490000e+18
50%      1.490000e+18
75%      1.490000e+18
max      2.200000e+18
Name: category_id, dtype: float64

Missing value:  410798
count                              6164
unique                               10
top       appliances.environment.vacuum
freq                               2982
Name: category_code, dtype: object

Missing value:  16

In [3]:
dataframe['product_id'] = dataframe['product_id'].astype('object')
print(dataframe['product_id'].dtype)
print(dataframe['product_id'].describe())

object
count      416962
unique      32734
top       5892179
freq         1063
Name: product_id, dtype: int64


In [4]:
dataframe.fillna(value={'brand':'null'},inplace=True)
print(dataframe['brand'].isnull().sum())


0


In [5]:
import datetime

index_len = len(dataframe.index)
date_list = []

for i in range(index_len):
    date_str = dataframe['event_time'][i]
    date_list.append(datetime.datetime.strptime(date_str[0:19],'%Y-%m-%d %H:%M:%S'))

dataframe['event_time'] = pd.DataFrame(date_list)
print(dataframe['event_time'].dtype)
print(dataframe['event_time'].head())

datetime64[ns]
0   2019-10-01 00:00:00
1   2019-10-01 00:00:03
2   2019-10-01 00:00:07
3   2019-10-01 00:02:32
4   2019-10-01 00:02:40
Name: event_time, dtype: datetime64[ns]


In [6]:
print(dataframe[dataframe['price'] < 0].index)
dataframe = dataframe.drop(dataframe[dataframe['price'] < 0].index)
print(dataframe[dataframe['price'] < 0].index)

Int64Index([63196, 165348], dtype='int64')
Int64Index([], dtype='int64')


In [7]:
print(dataframe.dtypes)

event_time       datetime64[ns]
event_type               object
product_id               object
category_id             float64
category_code            object
brand                    object
price                   float64
user_id                   int64
user_session             object
dtype: object


In [8]:
print(dataframe.head())

           event_time event_type product_id   category_id category_code  \
0 2019-10-01 00:00:00       cart    5773203  1.490000e+18           NaN   
1 2019-10-01 00:00:03       cart    5773353  1.490000e+18           NaN   
2 2019-10-01 00:00:07       cart    5723490  1.490000e+18           NaN   
3 2019-10-01 00:02:32       cart    5857283  1.490000e+18           NaN   
4 2019-10-01 00:02:40       cart    5723523  1.490000e+18           NaN   

    brand  price  user_id                          user_session  
0  runail   2.62        1  26dd6e6e-4dac-4778-8d2c-92e149dab885  
1  runail   2.62        1  26dd6e6e-4dac-4778-8d2c-92e149dab885  
2  runail   2.62        1  26dd6e6e-4dac-4778-8d2c-92e149dab885  
3  runail   2.62        1  26dd6e6e-4dac-4778-8d2c-92e149dab885  
4  runail   2.62        1  26dd6e6e-4dac-4778-8d2c-92e149dab885  


任务分工说明：  
1. 产品，价格，品牌是否重叠？重叠度有多高 （王）  
2. 价格（连续变量）离散化以及brand空值处理  （王） 注：离群点的存在，合理划分区间  
3. 对购买之后的记录进行删除操作 （王）  
4. 剔除无购买行为的用户  （王）  
5. 构建品牌-价格字典 （张）  
6. 设计测试评价指标 （张）  
7. 设计个人行为模型，要考虑event,price,brand等变量，并且对event变量做好处理，通过购买产品进行监督训练 （邢）  
8. 考虑如何合成公共字典和个人行为模型，以概率的形式输出不同产品购买的可能性 （邢）  
  
具体要求：  
王：返回处理结束后的csv文件，并对重叠度进行基本描述  
张：基于新的数据集构建字典，返回字典格式的数据，其中字典中的值为对应产品的频数，实现测试评价指标的代码，输入为预测的概率序列（从高到低），输出为每个产品的召回率和整体的平均倒数排名  
邢：实现根据输入变量预测产品输出概率的模型，并设计好训练模式，实现将公共字典中的产品概率与浏览产品的预测概率结合的最终产品概率预测算法  


In [24]:
#build a dictionary based on brand and price
toy_example = {}
#print(len(dataframe['brand'].unique()))
a = dataframe['brand'].value_counts()
price_level = [0,1,2,3,4,5,6,7,8,9] #Discretization
for key in a.keys():
    toy_example[key] = {}
    for val in price_level:
        toy_example[key][val] = {}


Train_length = dataframe['user_id'].max()
valid_event = ['purchase']
# basic assumption: purchase event is the most related event, frequency contributes little.
user_id = 1

while (user_id <= Train_length):
    record = dataframe[dataframe['user_id'] == user_id][dataframe['event_type'].isin(valid_event)]
    #remove duplicate
    update_record = record.drop_duplicates(subset=['product_id'],keep='first',inplace=False)
    
    for i in update_record.index:
        product_id = update_record.loc[i,'product_id']
        brand = update_record.loc[i,'brand']
        price = update_record.loc[i,'price']
        price = round(price)  # wait for Discretization
        if product_id not in toy_example[brand][price]:
            toy_example[brand][price][product_id] = 1
        else:
            toy_example[brand][price][product_id] += 1
        

    user_id += 1
    break

b = list(toy_example.items())
print(b[:5])

[('null', {0: {5802428: 1, 5688124: 1}, 1: {5853646: 1}, 2: {}, 3: {5739989: 1}, 4: {}, 5: {}, 6: {}, 7: {}, 8: {}, 9: {}}), ('runail', {0: {5700046: 1}, 1: {}, 2: {}, 3: {5857283: 1, 5773353: 1, 5773313: 1, 5773203: 1, 5773201: 1, 5723529: 1, 5723523: 1, 5723490: 1}, 4: {5385: 1}, 5: {}, 6: {}, 7: {}, 8: {}, 9: {}}), ('irisk', {0: {}, 1: {}, 2: {}, 3: {}, 4: {}, 5: {}, 6: {}, 7: {}, 8: {}, 9: {}}), ('masura', {0: {}, 1: {}, 2: {}, 3: {}, 4: {}, 5: {}, 6: {}, 7: {}, 8: {}, 9: {}}), ('bpw.style', {0: {}, 1: {}, 2: {}, 3: {}, 4: {}, 5: {}, 6: {}, 7: {}, 8: {}, 9: {}})]


In [35]:
#build a judgement calculation method
predict_1 = {5802428: 0.8, 5688124: 0.6, 5739989: 0.4, 5773353: 0.2}
truth_1 = [5688124, 5773353, 5385]
predict_2 = {5688124:0.7, 5385:0.5, 5555:0.9}
truth_2 = [5385,5688124]
predict = [predict_1,predict_2]
truth = [truth_1,truth_2]
#MRR
def MRR(p,t):
    Q = len(p)
    mrr = 0
    for i in range(Q):
        temp_p = p[i]
        temp_t = t[i]
        p_value = sorted(temp_p.items(),key= lambda item:item[1], reverse=True)
        p_len = len(p_value)
        find = False
        for j in range(p_len):
            k,v = p_value[j]
            if k in temp_t:
                rank = 1/(j+1)
                find = True
                break

        if find == False:
            #optimistic : the recommend algorithm will find truth product next.
            rank = 1/(p_len+1)

        mrr += rank
    mrr /= Q
    return mrr

print(MRR(predict,truth))

#Inclusion Rate: a numerical identifier to measure to what extent truth product is covered by predict product
def Inclusion(p,t):
    Q = len(p)
    inclusion = 0
    for i in range(Q):
        temp_p = p[i]
        temp_t = t[i]
        score = 0
        for member in temp_t:
            if member in temp_p.keys():
                score += temp_p[member]
        
        score /= len(temp_t)
        inclusion += score
    
    inclusion /= Q
    return inclusion

print(Inclusion(predict,truth))


0.5
0.43333333333333335
