# 整理数据集

In [1]:
import pickle
import pandas as pd
import numpy as np

## 加载查看数据

In [2]:
# Load the pickled review and metadata tables and keep only the columns used below.
# NOTE: pickle.load can execute arbitrary code; only open pickle files you produced yourself.
with open('../raw_data/reviews.pkl', 'rb') as f:
  reviews_df = pickle.load(f)
reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']]

with open('../raw_data/meta.pkl', 'rb') as f:
  meta_df = pickle.load(f)
# Take an explicit copy of the column subset so the assignment below writes to an
# independent frame instead of a slice of the loaded one (avoids SettingWithCopyWarning).
meta_df = meta_df[['asin', 'categories']].copy()
# A product may carry several categories; keep the last element of the last entry (x[-1][-1]) as its category.
meta_df['categories'] = meta_df['categories'].map(lambda x: x[-1][-1])

In [3]:
# Report the number of rows in the trimmed review table and in the metadata table.
print('all reviews: %d\t all meta:%d' % (len(reviews_df),len(meta_df)))

all reviews: 1689188	 all meta:63001


In [7]:
# Preview the first rows of the trimmed reviews table (reviewerID, asin, unixReviewTime).
reviews_df.head()

Unnamed: 0,reviewerID,asin,unixReviewTime
0,AO94DHGC771SJ,528881469,1370131200
1,AMO214LNFCEI4,528881469,1290643200
2,A3N7T0DY83Y4IG,528881469,1283990400
3,A1H8PY3QHMQQA0,528881469,1290556800
4,A24EV6RXELQZ63,528881469,1317254400


In [5]:
# Preview the first rows of the metadata table (asin, categories).
meta_df.head()

Unnamed: 0,asin,categories
0,528881469,Trucking GPS
1,594451647,Chargers & Adapters
2,594481813,Power Adapters
3,972683275,TV Ceiling & Wall Mounts
4,1400532620,eBook Readers & Accessories


In [11]:
# Number of reviews per user. Use the built-in group size rather than appending to a
# list from inside groupby().apply(): apply() is meant for functions that return a
# value, and older pandas versions may call the callback twice for the first group,
# which would count that user twice when the callback has side effects.
user_records = reviews_df.groupby('reviewerID').size().values
"用户平均记录数：%.2f" % (user_records.mean())

'用户平均记录数：8.78'

In [None]:
# Per-user filter predicate: keep a user only when they have strictly more than
# `threshold` records (with the default, users with 50 or fewer records are dropped).
def filter_user(x, threshold=50):
    """Return True when the group `x` has more than `threshold` entries in its 'asin' column.

    Intended as the callback for ``groupby(...).filter``; extra keyword arguments
    given to ``filter`` (e.g. ``threshold=20``) are forwarded here.

    NOTE(review): the earlier comment said users with fewer than 50 records are
    removed, but the strict ``>`` comparison also removes users with exactly 50.
    The comparison is kept unchanged so the resulting dataset does not change.

    Args:
        x: one group's rows; must support ``x['asin']`` returning a sized object.
        threshold: record count a user must exceed to be kept. Defaults to 50.

    Returns:
        bool: whether the group should be kept.
    """
    return len(x['asin']) > threshold

In [None]:
# Keep only the reviews of users for which filter_user returns True.
valid_user = reviews_df.groupby(['reviewerID']).filter(filter_user)

In [None]:
# Restrict the metadata to items that occur in the review table and renumber the rows from 0.
# NOTE(review): the filter uses every item in reviews_df, not only the items in valid_user;
# confirm this is intended, since the item mapping is later built from this frame.
valid_meta_df = (
    meta_df[meta_df['asin'].isin(reviews_df['asin'].unique())]
    .reset_index(drop=True)
)

In [None]:
# Report the number of rows in the filtered review table and in the filtered metadata table.
print('sub records:%d\tsub meta:%d' %(len(valid_user),len(valid_meta_df)))

In [None]:
def build_map(df, col_name):
  """Re-encode a column as dense integer indices, in place.

  The distinct values of ``df[col_name]`` are sorted and numbered from 0;
  the column is then overwritten with those numbers.

  Args:
    df: DataFrame whose column is rewritten (mutated in place).
    col_name: name of the column to encode.

  Returns:
    (m, key): ``m`` maps each original value to its index, and ``key`` is the
    sorted list of original values, so ``key[m[v]] == v``.
  """
  key = df[col_name].unique().tolist()
  key.sort()
  m = {value: index for index, value in enumerate(key)}
  df[col_name] = df[col_name].map(lambda value: m[value])
  return m, key

In [None]:
# Replace raw ids with dense integer indices. Each build_map call overwrites the named
# column of the given frame in place and returns (value -> index dict, sorted original values).
# Only valid_meta_df's 'asin' column is rewritten here; valid_user['asin'] is remapped in a later cell.
asin_map, asin_key = build_map(valid_meta_df, 'asin')
cate_map, cate_key = build_map(valid_meta_df, 'categories')
revi_map, revi_key = build_map(valid_user, 'reviewerID')

In [None]:
# Summary sizes of the remapped dataset: distinct users, items and categories,
# plus the number of review rows kept.
user_count = len(revi_map)
item_count = len(asin_map)
cate_count = len(cate_map)
example_count = valid_user.shape[0]
print('user_count: %d\titem_count: %d\tcate_count: %d\texample_count: %d' %
      (user_count, item_count, cate_count, example_count))

In [None]:
# Order the metadata by (already integer-encoded) item id and renumber its rows from 0.
valid_meta_df = valid_meta_df.sort_values('asin').reset_index(drop=True)

# Translate the raw item ids in the review table to their integer indices.
# This cell is not idempotent: running it a second time looks up the already
# mapped integers in asin_map.
valid_user['asin'] = valid_user['asin'].map(lambda raw_asin: asin_map[raw_asin])
# Order each user's reviews chronologically, renumber the rows and fix the column order.
valid_user = valid_user.sort_values(['reviewerID', 'unixReviewTime']).reset_index(drop=True)
valid_user = valid_user[['reviewerID', 'asin', 'unixReviewTime']]



In [None]:
# cate_list[i] must be the category index of item i. That only holds when row i of
# valid_meta_df describes item i, so check the alignment explicitly: duplicate or
# missing item ids would otherwise shift the rows and silently mislabel items.
assert np.array_equal(valid_meta_df['asin'].values, np.arange(len(asin_map))), \
    "valid_meta_df rows are not aligned with item indices 0..len(asin_map)-1"
# Vectorized extraction instead of one Python-level label lookup per item.
cate_list = valid_meta_df['categories'].values.astype(np.int32)

In [None]:
# Write four consecutive pickles to one file; readers must call pickle.load in the same order:
#   1. valid_user  - review rows with integer user / item ids
#   2. cate_list   - category index per item, positioned by item index
#   3. (user_count, item_count, cate_count, example_count)
#   4. (asin_key, cate_key, revi_key) - original values for each integer index
with open('../raw_data_sub/remap.pkl', 'wb') as f:
  for payload in (
      valid_user,
      cate_list,
      (user_count, item_count, cate_count, example_count),
      (asin_key, cate_key, revi_key),
  ):
    pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)