## Shuffling P5 dataset

In [None]:
import json
import os
import pickle
import random
import shutil

In [None]:
# Directory to read the dataset from, and the RNG seed used for the ID shuffles.
source_dataset = "beauty"
seed = 1127
# Output goes to a directory named after the source with a "_shuffled" suffix.
target_dataset = f"{source_dataset}_shuffled"

# Create the output directory up front; an existing directory is not an error.
os.makedirs(target_dataset, exist_ok=True)

### Load files

In [None]:
# Load the original ID maps and the user-id -> name table from the source
# dataset.  Each file is opened in a context manager so its handle is closed
# as soon as the content has been read.
with open(os.path.join(source_dataset, "datamaps.json"), "r", encoding="utf-8") as f:
    datamaps = json.load(f)
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this against a dataset directory you trust.
with open(os.path.join(source_dataset, "user_id2name.pkl"), "rb") as f:
    user_id2name = pickle.load(f)

In [None]:
# Read both text files as lists of raw lines (newline characters included).
# Context managers make sure the handles are closed after reading.
with open(os.path.join(source_dataset, "negative_samples.txt"), "r", encoding="utf-8") as f:
    negative_samples = f.readlines()
with open(os.path.join(source_dataset, "sequential_data.txt"), "r", encoding="utf-8") as f:
    sequential_data = f.readlines()

In [None]:
# Keys of datamaps["user2id"], in the order they appear in the loaded mapping.
amazon_user_ids = list(datamaps["user2id"])
# Show how many there are plus a small sample.
(len(amazon_user_ids), amazon_user_ids[:5])

In [None]:
# Keys of datamaps["item2id"], in the order they appear in the loaded mapping.
amazon_item_ids = list(datamaps["item2id"])
# Show how many there are plus a small sample.
(len(amazon_item_ids), amazon_item_ids[:5])

### Shuffle IDs

In [None]:
# Shuffle both ID lists in place.  The generator is re-seeded before each
# shuffle, so the resulting order of a list depends only on `seed` and that
# list, not on which list happens to be shuffled first.
for _id_list in (amazon_user_ids, amazon_item_ids):
    random.seed(seed)
    random.shuffle(_id_list)

In [None]:
# New IDs are the 1-based positions in the shuffled lists, stored as strings.
_new_user_ids = [str(pos) for pos in range(1, len(amazon_user_ids) + 1)]
_new_item_ids = [str(pos) for pos in range(1, len(amazon_item_ids) + 1)]

# Forward (original id -> new id) and reverse (new id -> original id) lookups.
new_user2id = dict(zip(amazon_user_ids, _new_user_ids))
new_id2user = dict(zip(_new_user_ids, amazon_user_ids))
new_item2id = dict(zip(amazon_item_ids, _new_item_ids))
new_id2item = dict(zip(_new_item_ids, amazon_item_ids))

In [None]:
# Bundle the four lookup tables under the keys that are written to datamaps.json.
new_datamaps = {}
new_datamaps["user2id"] = new_user2id
new_datamaps["id2user"] = new_id2user
new_datamaps["item2id"] = new_item2id
new_datamaps["id2item"] = new_id2item

In [None]:
# Write the shuffled ID maps.  The context manager closes the file when the
# block ends, so the JSON is fully flushed to disk before later cells run.
with open(os.path.join(target_dataset, "datamaps.json"), "w", encoding="utf-8") as f:
    json.dump(new_datamaps, f, indent=1, ensure_ascii=False)

In [None]:
# Integer old-id -> new-id translation tables.  Entries are inserted in the
# shuffled order of the ID lists, which is also the order they are dumped in.
_old_user2id = datamaps["user2id"]
_old_item2id = datamaps["item2id"]
old_user_to_new_user_map = {
    int(_old_user2id[user_key]): int(new_user2id[user_key])
    for user_key in amazon_user_ids
}
old_item_to_new_item_map = {
    int(_old_item2id[item_key]): int(new_item2id[item_key])
    for item_key in amazon_item_ids
}

In [None]:
# Persist the old -> new translation tables next to the shuffled dataset.
# Note that JSON object keys are always strings, so the integer keys are
# written as strings.  Context managers close (and flush) each output file.
with open(os.path.join(target_dataset, "old_user_to_new_user_map.json"), "w", encoding="utf-8") as f:
    json.dump(old_user_to_new_user_map, f, indent=1, ensure_ascii=False)
with open(os.path.join(target_dataset, "old_item_to_new_item_map.json"), "w", encoding="utf-8") as f:
    json.dump(old_item_to_new_item_map, f, indent=1, ensure_ascii=False)

### negative_samples.txt

In [None]:
# Turn each raw text line into a list of ints (fields are separated by single spaces).
negative_samples = [
    [int(token) for token in raw_line.strip().split(" ")]
    for raw_line in negative_samples
]
# Show the first parsed row.
negative_samples[0]

In [None]:
# Translate every row: the first field goes through the user map, all
# remaining fields go through the item map.
negative_samples = [
    [old_user_to_new_user_map[row[0]], *(old_item_to_new_item_map[item] for item in row[1:])]
    for row in negative_samples
]
# Show the first remapped row.
negative_samples[0]

In [None]:
# Order the rows by their first field (the new user id); the sort is stable.
negative_samples = sorted(negative_samples, key=lambda row: row[0])
# Show the row that now comes first.
negative_samples[0]

In [None]:
# Write one space-separated row per line to the shuffled dataset directory.
with open(os.path.join(target_dataset, "negative_samples.txt"), "w", encoding="utf-8") as out:
    out.writelines(
        " ".join(str(value) for value in row) + "\n"
        for row in negative_samples
    )

### sequential_data.txt

In [None]:
# Turn each raw text line into a list of ints (fields are separated by single spaces).
sequential_data = [
    [int(token) for token in raw_line.strip().split(" ")]
    for raw_line in sequential_data
]
# Show the first parsed row.
sequential_data[0]

In [None]:
# Translate every row: the first field goes through the user map, all
# remaining fields go through the item map.
sequential_data = [
    [old_user_to_new_user_map[row[0]], *(old_item_to_new_item_map[item] for item in row[1:])]
    for row in sequential_data
]
# Show the first remapped row.
sequential_data[0]

In [None]:
# Order the rows by their first field (the new user id); the sort is stable.
sequential_data = sorted(sequential_data, key=lambda row: row[0])
# Show the row that now comes first.
sequential_data[0]

In [None]:
# Write one space-separated row per line to the shuffled dataset directory.
with open(os.path.join(target_dataset, "sequential_data.txt"), "w", encoding="utf-8") as out:
    out.writelines(
        " ".join(str(value) for value in row) + "\n"
        for row in sequential_data
    )

### user_id2name.pkl

In [None]:
# Display the loaded user_id2name mapping for inspection (no state is changed).
user_id2name

In [None]:
# Display the entry stored under key "1" before remapping, for later comparison.
user_id2name["1"]

In [None]:
# Split the (id, name) pairs into two parallel tuples.
_id_name_pairs = list(user_id2name.items())
ids, names = zip(*_id_name_pairs)

In [None]:
# Peek at the first five original keys.
ids[:5]

In [None]:
# Translate each original key (string -> int -> new int id -> string).
new_ids = [str(old_user_to_new_user_map[int(old_id)]) for old_id in ids]

In [None]:
# Peek at the first five translated keys.
new_ids[:5]

In [None]:
# Rebuild the table keyed by the new ids; names keep their original pairing.
user_id2name = dict(zip(new_ids, names))

In [None]:
# Display the entry now stored under key "1" after remapping.
user_id2name["1"]

In [None]:
# Save the remapped table.  The context manager closes the file when the
# block ends, so the pickle is completely written before the notebook moves on.
with open(os.path.join(target_dataset, "user_id2name.pkl"), "wb") as f:
    pickle.dump(user_id2name, f)

### Copy rest of the files

In [None]:
# Ask for confirmation before copying the remaining files.
a = input("Copy rest of the files? (y/n)")

# Accept "y" regardless of letter case or surrounding whitespace, so an
# answer such as "Y" or "y " is not treated as a refusal.  Any other answer
# stops here.
if a.strip().lower() != "y":
    exit()

In [None]:
# Files that are copied to the target directory without modification.
untouched_files = [
    "exp_splits.pkl",
    "meta.json.gz",
    "rating_splits_augmented.pkl",
    "review_splits.pkl",
    "zeroshot_exp_splits.pkl",
]

In [None]:
# Copy each unmodified file from the source directory into the target directory.
for filename in untouched_files:
    src_path = os.path.join(source_dataset, filename)
    dst_path = os.path.join(target_dataset, filename)
    shutil.copy(src_path, dst_path)