In [29]:
from collections import defaultdict
import os
import torch
import random
import numpy as np
import pandas as pd
import json
import pickle
import gzip
from tqdm import tqdm
import jsonlines
import re

def load_pickle(filename):
    """Read the pickle file at ``filename`` and return the object it contains."""
    # NOTE(review): unpickling can run arbitrary code; only load files you trust.
    with open(filename, mode="rb") as handle:
        payload = pickle.Unpickler(handle).load()
    return payload


def save_pickle(data, filename):
    """Write ``data`` to ``filename`` as a pickle, using the highest protocol available."""
    with open(filename, mode="wb") as sink:
        writer = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        writer.dump(data)


def load_json(file_path):
    """Parse the JSON document stored at ``file_path`` and return the decoded value."""
    with open(file_path, "r") as source:
        raw_text = source.read()
    return json.loads(raw_text)


def ReadLineFromFile(path):
    """Return the lines of the text file at ``path`` with trailing newlines removed.

    Only ``'\\n'`` characters are stripped from the right; other trailing
    whitespace is kept.
    """
    with open(path, 'r') as fd:
        return [raw_line.rstrip('\n') for raw_line in fd]


def parse(path):
    """Decode a gzip-compressed file holding one JSON document per line.

    Each line is stripped of surrounding whitespace and passed to
    ``json.loads``; the decoded objects are returned in file order.
    """
    with gzip.open(path) as stream:
        return [json.loads(raw.strip()) for raw in stream]

def replace_multiple_spaces(text):
    """Collapse every run of whitespace (spaces, tabs, newlines, ...) into one space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

In [30]:
def get_amazon_data(dataset_name, rating_score, data_dir='/common/home/km1558/amazon_data/'):
    '''
    Load Amazon review interactions as (reviewerID, asin, unixReviewTime) tuples.

    The reviews are read with ``parse`` from
    ``<data_dir>/reviews_<dataset_name>_5.json.gz``.

    Args:
        dataset_name: category name used to build the file name, e.g. "Beauty".
        rating_score: reviews whose ``overall`` rating is less than or equal
            to this value are skipped.
        data_dir: directory holding the review files. Defaults to the
            location this notebook was originally run against.

    Returns:
        A list of ``(user, item, time)`` tuples in file order, with ``time``
        converted to ``int``.

    Fields of each review record:

    reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
    asin - ID of the product, e.g. 0000013714
    reviewerName - name of the reviewer
    helpful - helpfulness rating of the review, e.g. 2/3
    --"helpful": [2, 3],
    reviewText - text of the review
    --"reviewText": "I bought this for my husband who plays the piano. ..."
    overall - rating of the product
    --"overall": 5.0,
    summary - summary of the review
    --"summary": "Heavenly Highway Hymns",
    unixReviewTime - time of the review (unix time)
    --"unixReviewTime": 1252800000,
    reviewTime - time of the review (raw)
    --"reviewTime": "09 13, 2009"
    '''
    datas = []
    # older Amazon
    data_file = os.path.join(data_dir, 'reviews_' + dataset_name + '_5.json.gz')
    # latest Amazon
    # data_file = '/home/hui_wang/data/new_Amazon/' + dataset_name + '.json.gz'
    parsed_data = parse(data_file)

    for inter in parsed_data:
        # Drop reviews rated at or below the threshold.
        if float(inter['overall']) <= rating_score:
            continue
        user = inter['reviewerID']
        item = inter['asin']
        time = inter['unixReviewTime']
        datas.append((user, item, int(time)))
    return datas

In [31]:
# Category whose review file get_amazon_data should read.
dataset_name = "Beauty"

# Reviews rated at or below rating_score are skipped, so a threshold of 0
# only filters out non-positive ratings.
data = get_amazon_data(
    dataset_name,
    rating_score=0,
)
print(data[:10])

[('A1YJEY40YUW4SE', '7806397051', 1391040000), ('A60XNB876KYML', '7806397051', 1397779200), ('A3G6XNM240RMWA', '7806397051', 1378425600), ('A1PQFP6SAJ6D80', '7806397051', 1386460800), ('A38FVHZTNQ271F', '7806397051', 1382140800), ('A3BTN14HIZET6Z', '7806397051', 1365984000), ('A1Z59RFKN0M5QL', '7806397051', 1376611200), ('AWUO9P6PL1SY8', '7806397051', 1378252800), ('A3LMILRM9OC3SA', '9759091062', 1405209600), ('A30IP88QK3YUIO', '9759091062', 1388102400)]


In [32]:
mode = "user"

def get_interaction(datas):
    """Group interaction tuples into one shuffled list per key.

    Each element of ``datas`` is a 3-tuple whose third field (the timestamp)
    is ignored. The module-level ``mode`` decides which field is the key:

    * ``mode == "item"``: key is the first field, values are the second field.
    * ``mode == "user"``: key is the second field, values are the first field.

    Args:
        datas: iterable of 3-tuples, e.g. the ``(reviewerID, asin, time)``
            tuples produced by ``get_amazon_data``.

    Returns:
        dict mapping each key to the list of its partner values. Every list
        is shuffled with ``random.shuffle``; no seed is set here, so the order
        varies between runs unless the caller seeds ``random`` beforehand.

    Raises:
        ValueError: if ``mode`` is neither ``"item"`` nor ``"user"``.
    """
    # Fail early: an unknown mode would otherwise leave the key/value names unbound.
    if mode not in ("item", "user"):
        raise ValueError('mode must be "item" or "user", got %r' % (mode,))
    key_is_second_field = mode == "user"

    user_seq = {}
    for first, second, _time in datas:
        if key_is_second_field:
            key, value = second, first
        else:
            key, value = first, second
        user_seq.setdefault(key, []).append(value)

    # Order within each list is deliberately randomised; timestamps are not used.
    for values in user_seq.values():
        random.shuffle(values)
    return user_seq
    
# Group the loaded interaction tuples into one shuffled list per key; which
# tuple field becomes the key is controlled by the module-level `mode`.
user_items = get_interaction(data)

In [33]:
def check_Kcore(user_items, user_core, item_core):
    """Count interactions per user and per item and test the K-core condition.

    Args:
        user_items: dict mapping each user to a list of items.
        user_core: minimum number of list entries every user must have.
        item_core: minimum number of occurrences every item must have.

    Returns:
        ``(user_count, item_count, is_kcore)`` where the counts are
        ``defaultdict(int)`` instances and ``is_kcore`` is True only when
        every user and every item reaches its threshold. Users whose list is
        empty are counted with 0, so they fail the check whenever
        ``user_core > 0``.
    """
    user_count = defaultdict(int)
    item_count = defaultdict(int)
    for user, items in user_items.items():
        # Adding len(items) registers the user even when the list is empty.
        user_count[user] += len(items)
        for item in items:
            item_count[item] += 1

    users_ok = all(num >= user_core for num in user_count.values())
    items_ok = all(num >= item_core for num in item_count.values())
    return user_count, item_count, users_ok and items_ok  # True: K-core already satisfied


# Iteratively filter down to the K-core
def filter_Kcore(user_items, user_core, item_core):  # each user maps to all of its items
    """Prune ``user_items`` in place until it satisfies the K-core condition.

    In every pass, users with fewer than ``user_core`` entries are removed
    and, for the remaining users, items that occur fewer than ``item_core``
    times are dropped from their lists. Passes repeat until ``check_Kcore``
    reports success.

    Args:
        user_items: dict mapping each user to a list of items; it is modified
            in place (users popped, lists filtered).
        user_core: minimum number of entries per user.
        item_core: minimum number of occurrences per item.

    Returns:
        The same ``user_items`` dict, after pruning.
    """
    user_count, item_count, isKcore = check_Kcore(user_items, user_core, item_core)

    while not isKcore:
        for user, num in user_count.items():
            if num < user_core:  # drop the user outright
                user_items.pop(user)
            else:
                # Rebuild via slice assignment: removing from a list while
                # iterating over it skips the element after each removal.
                user_items[user][:] = [
                    item for item in user_items[user]
                    if item_count[item] >= item_core
                ]
        user_count, item_count, isKcore = check_Kcore(user_items, user_core, item_core)
    return user_items

# user_items = filter_Kcore(user_items, 5, 5)

# idx = 0
# for user in user_items.keys():
#     if idx >= 10:
#         break
#     idx += 1
#     print(user)
#     print(user_items[user])

In [34]:
def id_map(user_items, swap_roles=True):  # user_items dict
    """Replace raw keys and values with randomly assigned, unique id strings.

    Keys of ``user_items`` receive ids drawn without replacement from
    ``1..number_of_keys``; the distinct values found in its lists receive ids
    drawn without replacement from ``1..number_of_distinct_values``. All ids
    are stored as strings.

    Args:
        user_items: dict mapping each raw key to a list of raw values.
        swap_roles: controls how the lookup tables are labelled in the
            returned ``data_maps``. When True (the default, matching the
            previous behaviour) the key-side tables are stored under
            ``'item2id'`` / ``'id2item'`` and the value-side tables under
            ``'user2id'`` / ``'id2user'``. That labelling is right when
            ``get_interaction`` ran with ``mode == "user"``, where the dict
            keys come from the second tuple field. Pass False to label the
            key-side tables as user tables instead.

    Returns:
        ``(final_data, num_keys, num_values, data_maps)`` where
        ``final_data`` maps each key id to the list of value ids (list order
        preserved), ``num_keys`` / ``num_values`` are the numbers of distinct
        keys and values that received an id, and ``data_maps`` holds the four
        lookup tables.
    """
    user2id = {}  # raw key -> id
    item2id = {}  # raw value -> id
    id2user = {}  # id -> raw key
    id2item = {}  # id -> raw value
    final_data = {}

    random_user_list = list(user_items.keys())
    random.shuffle(random_user_list)

    # Only the number of distinct values is needed, so a set replaces the
    # quadratic "item not in list" scan.
    distinct_items = set()
    for user in random_user_list:
        distinct_items.update(user_items[user])

    # Pools of unused ids; each draw pops one entry at a random position.
    user_ids = list(range(1, len(random_user_list) + 1))
    item_ids = list(range(1, len(distinct_items) + 1))

    for user in random_user_list:
        uid = str(user_ids.pop(random.randint(0, len(user_ids) - 1)))
        user2id[user] = uid
        id2user[uid] = user

        iids = []  # value ids for this key
        for item in user_items[user]:
            if item not in item2id:
                iid = str(item_ids.pop(random.randint(0, len(item_ids) - 1)))
                item2id[item] = iid
                id2item[iid] = item
            iids.append(item2id[item])
        final_data[uid] = iids

    if swap_roles:
        data_maps = {
            'user2id': item2id,
            'item2id': user2id,
            'id2user': id2item,
            'id2item': id2user
        }
    else:
        data_maps = {
            'user2id': user2id,
            'item2id': item2id,
            'id2user': id2user,
            'id2item': id2item
        }

    # Report how many ids were handed out; the last id drawn is random and
    # says nothing about the totals.
    return final_data, len(user2id), len(item2id), data_maps

# Replace raw keys and values with randomly assigned id strings; data_maps
# holds the lookup tables between raw values and ids.
final_data, user_id, item_id, data_maps = id_map(user_items)



In [35]:
# Show the first ten entries of final_data: the key id, then its list of ids.
idx = 0
for idx, user in enumerate(final_data, start=1):
    if idx > 10:
        break
    print(user)
    print(final_data[user])

1
['1', '2', '3', '4', '5', '6', '5235', '3228']
2
['7', '8', '9', '10', '11', '12', '13', '14', '15', '10305', '14488']
3
['16', '17', '18', '7998', '20296']
4
['19', '20', '21', '22', '7491', '5649']
5
['23', '24', '25', '4753', '3606']
6
['26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '4475', '9950']
7
['46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '14932', '8550']
8
['67', '49', '68', '290', '14270']
9
['69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '81', '1115']
10
['80', '81', '82', '70', '71', '2654', '13796']


In [36]:
base_save_dir = "/common/home/km1558/amazon_data/data"

import os

task = "beauty"

# Output folder name is derived from the grouping mode, e.g. "user_CF_indices".
indexing = mode + "_CF_indices"

save_dir = os.path.join(base_save_dir, task, indexing)

# exist_ok=True creates the directory tree when missing and is a no-op when
# it already exists, without a separate check-then-create step.
os.makedirs(save_dir, exist_ok=True)

data_file = os.path.join(save_dir, "data.txt")

# One line per entry of final_data: the key id followed by its ids, space-separated.
with open(data_file, 'w') as out:
    for user, items in final_data.items():
        out.write(user + ' ' + ' '.join(items) + '\n')

datamaps_file = os.path.join(save_dir, mode + "_CF_datamaps.json")

# Store the raw <-> id lookup tables next to the data file.
json_str = json.dumps(data_maps, indent=2)
with open(datamaps_file, 'w') as out:
    out.write(json_str)