In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# DATASET = "Sports_and_Outdoors"
# DATASET = "Toys_and_Games"
# DATASET = "Grocery_and_Gourmet_Food"
# DATASET = "Video_Games"
DATASET = "Office_Products"
SAVE_PATH = "/content/drive/MyDrive/Capstone/Dataset"
SEED = 2107
MAX_NEGATIVE_ITEMS = 5000

COLD_ITEMS_PROPORTION = 0.2
TEST_WARM_INTERACTIONs_PROPORTION = 0.1
MIN_INTERACTIONS_PER_USER = 5

In [13]:
!mkdir {SAVE_PATH}/{DATASET}

mkdir: cannot create directory ‘/content/drive/MyDrive/Capstone/Dataset/Office_Products’: File exists


In [14]:
!pip install -U sentence-transformers



In [15]:
![ ! -f reviews_{DATASET}.json ] && wget https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_{DATASET}.json.gz
![ ! -f meta_{DATASET}.json ] && wget https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_{DATASET}.json.gz
![ ! -f image_features_{DATASET}.b ] && wget https://snap.stanford.edu/data/amazon/productGraph/image_features/categoryFiles/image_features_{DATASET}.b

--2024-05-26 09:02:52--  https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Office_Products.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 249771721 (238M) [application/x-gzip]
Saving to: ‘reviews_Office_Products.json.gz’


2024-05-26 09:03:01 (27.9 MB/s) - ‘reviews_Office_Products.json.gz’ saved [249771721/249771721]

--2024-05-26 09:03:01--  https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Office_Products.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47605216 (45M) [application/x-gzip]
Saving to: ‘meta_Office_Products.json.gz’


2024-05-26 09:03:04 (14.2 MB/s) - ‘meta_Office_Products.json.gz’ saved [47605216/47605216]


In [16]:
![ ! -f reviews_{DATASET}.json ] && gzip -d reviews_{DATASET}.json.gz
![ ! -f meta_{DATASET}.json ] && gzip -d meta_{DATASET}.json.gz

In [17]:
import os
import gc
import re
import gzip
import json
import array
import numpy as np
import pandas as pd
import torch
import requests
import random
from tqdm import tqdm
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sentence_transformers import SentenceTransformer
# from torchvision.models import resnet50, ResNet50_Weights
# from torchvision.models import convnext_base, ConvNeXt_Base_Weights
# from torchvision.models.feature_extraction import get_graph_node_names
# from torchvision.models.feature_extraction import create_feature_extractor

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [18]:
TEXT_FEATURES = 384
IMAGE_FEATURES = 4096
TEXT_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

#  Process Raw Data

In [19]:
def combine_text(row):
    context = ""

    if 'categories' in row:
        string = ""
        for cates in row['categories']:
            for cate in cates:
                string += cate + ' '
        row['categories'] = string.strip()

    # print(row)
    for key in ['title', 'categories', 'description', 'brand', 'price']:
        if key in row:
            context += str(row[key]) + ' '
    context = context.strip()
    context = re.sub(r'\s+', ' ', context)
    # print(context)
    return context

In [20]:
n_items = 0
items_id = {}
contexts = []
img_urls = []
item_categories = []
categories_id = {}
item_brands = []
brands_id = {}
cnt_repeated_item = 0

with open(f"meta_{DATASET}.json") as f:
    idx = 0
    for line in f:
        idx += 1
        data = eval(line)
        if (data['asin'] in items_id):
            cnt_repeated_item += 1
            continue

        if 'brand' in data and data['brand'].lower() != "unknown":
            if data['brand'] not in brands_id:
                brands_id[data['brand']] = len(brands_id)
            item_brands.append(brands_id[data['brand']])
        else:
            item_brands.append(-1)

        item_categories.append([])
        # if (len(data['categories']) > 1):
        #     print(data['asin'], data['categories'])
        for category in data['categories'][0]:
            if category not in categories_id:
                categories_id[category] = len(categories_id)
            item_categories[-1].append(categories_id[category])

        items_id[data['asin']] = n_items
        contexts.append(combine_text(data))
        img_urls.append(data['imUrl'] if 'imUrl' in data else None)
        n_items += 1

        # if idx <= 10:
        #     print(line)
        #     break

print(n_items)
print(cnt_repeated_item)
print(len(categories_id))

134838
0
793


In [21]:
print(list(brands_id.keys())[:10])
print(len(set(item_brands)))
print(sum(x == -1 for x in item_brands))
print(sum(x > -1 for x in item_brands))

['Scholastic', 'Scholastic Teacher&#39;s Friend', 'Estee Lauder', 'Rand McNally', 'Scholastic Classroom Resources', 'RAI Publishing', 'Crane', 'GALLOPADE', 'Punch Studio', 'Alpha Omega Publications']
6345
78125
56713


In [22]:
n_users = 0
users_id = {}
users_cnt = []
interactions = []

error_cnt = 0
with open(f"reviews_{DATASET}.json") as f:
    for line in f:
        item = eval(line)

        prod_id = item['asin']
        reviewer_id = item['reviewerID']
        timestamp = item['unixReviewTime']

        if reviewer_id not in users_id:
            users_id[reviewer_id] = n_users
            users_cnt.append(0)
            n_users += 1

        try:
            user_id = users_id[reviewer_id]
            item_id = items_id[prod_id]

            interactions.append((user_id, item_id, timestamp))
            users_cnt[user_id] += 1
        except:
            error_cnt += 1

print(len(interactions))
print(n_users, n_items)

1243186
909314 134838


In [23]:
print(error_cnt)

0


## Filter user have greater or equal to 5 interactions

In [24]:
users_cnt = np.array(users_cnt)
print(users_cnt)

[2 1 4 ... 1 1 1]


In [25]:
print(np.sum(users_cnt))

1243186


In [26]:
users_greater_than_5_cnt = np.where(users_cnt >= MIN_INTERACTIONS_PER_USER)[0]
set_users_greater_than_5_cnt = set(users_greater_than_5_cnt)
print(np.sum(users_cnt[users_greater_than_5_cnt]))
print(len(users_greater_than_5_cnt))
print(len(set_users_greater_than_5_cnt))

145141
16772
16772


In [27]:
n_filtered_users = 0
n_filtered_items = 0
filtered_users_id = {}
filtered_items_id = {}
filtered_interactions = []

for interaction in interactions:
    if interaction[0] in set_users_greater_than_5_cnt:
        if interaction[0] not in filtered_users_id:
            filtered_users_id[interaction[0]] = n_filtered_users
            n_filtered_users += 1
        if interaction[1] not in filtered_items_id:
            filtered_items_id[interaction[1]] = n_filtered_items
            n_filtered_items += 1
        filtered_interactions.append((filtered_users_id[interaction[0]], filtered_items_id[interaction[1]], interaction[2]))

print(n_filtered_users, n_filtered_items)
print(n_filtered_items * 0.3 * 0.5)
print(np.min([i[0] for i in filtered_interactions]), np.max([i[0] for i in filtered_interactions]))
print(np.min([i[1] for i in filtered_interactions]), np.max([i[1] for i in filtered_interactions]))

16772 37956
5693.4
0 16771
0 37955


In [28]:
filtered_items_contexts = [""] * n_filtered_items
filtered_items_img_urls = [None] * n_filtered_items
filtered_items_categories = [[]] * n_filtered_items
filtered_items_brands = [-1] * n_filtered_items
for key, value in filtered_items_id.items():
    filtered_items_contexts[value] = contexts[key]
    filtered_items_img_urls[value] = img_urls[key]
    filtered_items_categories[value] = item_categories[key]
    filtered_items_brands[value] = item_brands[key]

## Visualize Filtered Dataset

In [29]:
# # prompt: plot distribution of users count for only user greater than 5 interaction and smaller than 100

# bins = np.arange(5, 100, 5)
# plt.hist(users_cnt[users_greater_than_5_cnt], bins=bins, edgecolor='black')
# plt.title('Distribution of Users Count (5 <= Count < 100)')
# plt.xlabel('Users Count')
# plt.ylabel('Frequency')
# plt.show()


In [30]:
# # prompt: plot distribution of items count after filter

# bins = np.arange(5, 40, 1)
# items_cnt = {}
# for interaction in interactions:
#     if interaction[1] not in items_cnt:
#         items_cnt[interaction[1]] = 0
#     items_cnt[interaction[1]] += 1

# print(min(items_cnt.values()), max(items_cnt.values()), np.mean(list(items_cnt.values())))

# plt.hist(list(items_cnt.values()), bins=bins, edgecolor='black')
# plt.title('Distribution of Items Count (5 <= Count < 100)')
# plt.xlabel('Items Count')
# plt.ylabel('Frequency')
# plt.show()


# Split Data

In [31]:
warm_items, cold_items = train_test_split(np.arange(n_filtered_items), test_size=COLD_ITEMS_PROPORTION, random_state=SEED, shuffle=True)
val_cold_items, test_cold_items = train_test_split(cold_items, test_size=0.5, random_state=SEED, shuffle=True)

set_warm_items, set_cold_items = set(warm_items), set(cold_items)
set_val_cold_items, set_test_cold_items = set(val_cold_items), set(test_cold_items)

print(len(set_warm_items), len(set_val_cold_items), len(set_test_cold_items))
assert len(set_warm_items) + len(set_val_cold_items) + len(set_test_cold_items) == n_filtered_items

30364 3796 3796


In [32]:
print(len(set_cold_items))
print(type(set_cold_items))
print(set_cold_items)

print(len(set_warm_items))
print(type(set_warm_items))
print(set_warm_items)

assert len(set_cold_items.intersection(set_warm_items)) == 0

7592
<class 'set'>
{0, 32772, 32775, 32778, 32782, 32784, 17, 20, 32791, 32793, 27, 32796, 32, 37, 32806, 32809, 48, 49, 50, 52, 32822, 59, 63, 66, 68, 69, 73, 75, 76, 77, 80, 83, 32852, 86, 88, 32859, 32861, 32862, 95, 97, 32867, 99, 32870, 102, 32873, 32875, 111, 32881, 115, 124, 32895, 128, 32897, 32900, 134, 142, 148, 149, 150, 32918, 32920, 160, 161, 32932, 167, 168, 32942, 177, 32946, 178, 32949, 32951, 32955, 32956, 32958, 192, 194, 32963, 198, 200, 202, 203, 32972, 32974, 208, 32977, 210, 211, 32983, 217, 32986, 32989, 226, 32995, 32996, 227, 32998, 232, 33001, 234, 33004, 237, 241, 33012, 244, 246, 248, 33017, 250, 257, 261, 264, 265, 33034, 266, 33046, 33050, 283, 287, 33056, 33057, 33069, 33070, 33078, 33083, 317, 322, 33096, 328, 33099, 332, 33103, 337, 341, 33114, 351, 33125, 360, 364, 33134, 33137, 33139, 372, 33141, 374, 376, 33145, 379, 33147, 33148, 382, 33155, 33156, 33159, 395, 33164, 33163, 33166, 33168, 401, 33171, 405, 416, 435, 439, 33209, 33210, 445, 447, 33215,

In [33]:
warm_interactions = [interaction for interaction in filtered_interactions if interaction[1] in set_warm_items]
train_warm_interactions, val_warm_interactions = train_test_split(warm_interactions, test_size=TEST_WARM_INTERACTIONs_PROPORTION, random_state=SEED, shuffle=True)
val_warm_interactions, test_warm_interactions = train_test_split(val_warm_interactions, test_size=0.5, random_state=SEED, shuffle=True)
print(len(train_warm_interactions), len(val_warm_interactions), len(test_warm_interactions))

val_cold_interactions = [interaction for interaction in filtered_interactions if interaction[1] in set_val_cold_items]
test_cold_interactions = [interaction for interaction in filtered_interactions if interaction[1] in set_test_cold_items]
print(len(val_cold_interactions), len(test_cold_interactions))

val_interactions = val_warm_interactions + val_cold_interactions
test_interactions = test_warm_interactions + test_cold_interactions
print(len(val_interactions), len(test_interactions))

assert len(val_cold_interactions) + len(test_cold_interactions) + len(warm_interactions) == len(filtered_interactions)

104695 5816 5817
13516 15297
19332 21114


# User and Item Dictionary

In [34]:
users_items_interactions = {}
items_users_interactions = {}

for interaction in warm_interactions:
    if interaction[0] not in users_items_interactions:
        users_items_interactions[interaction[0]] = set()
    users_items_interactions[interaction[0]].add(interaction[1])
    if interaction[1] not in items_users_interactions:
        items_users_interactions[interaction[1]] = set()
    items_users_interactions[interaction[1]].add(interaction[0])

# Negative User


In [35]:
items_users_set = {}
for interaction in filtered_interactions:
    if interaction[1] not in items_users_set:
        items_users_set[interaction[1]] = set()
    items_users_set[interaction[1]].add(interaction[0])

In [36]:
all_users = np.arange(n_filtered_users)
warm_interactions_negative_users = []
for id, interaction in enumerate(tqdm(warm_interactions)):
    user_id = interaction[0]
    item_id = interaction[1]

    np.random.seed(SEED + id)
    while (user_id in items_users_set[item_id]):
        neg_user_id = np.random.choice(all_users)
        if neg_user_id not in items_users_set[item_id]:
            break

    warm_interactions_negative_users.append([user_id, item_id, neg_user_id])

print(len(warm_interactions_negative_users))
print(warm_interactions_negative_users[:10])

100%|██████████| 116328/116328 [00:03<00:00, 29253.05it/s]

116328
[[1, 1, 2355], [2, 2, 3298], [3, 2, 7481], [4, 3, 3921], [3, 4, 11873], [5, 5, 5481], [6, 5, 1244], [7, 5, 4396], [8, 5, 5083], [9, 6, 13238]]





In [37]:
print(len(warm_interactions))
print(len(warm_interactions_negative_users))
print(warm_interactions_negative_users[np.random.randint(len(warm_interactions_negative_users))])

116328
116328
[343, 37474, 4256]


In [38]:
np.save(os.path.join(SAVE_PATH, DATASET, "train_all_warm_interactions_negative_users.npy"), warm_interactions_negative_users)

# Onehot Features Vector

In [39]:
onehot_features = np.zeros((n_filtered_items, len(categories_id)))
for item_id in range(n_filtered_items):
    onehot_features[item_id][filtered_items_categories[item_id]] = 1

In [40]:
print(onehot_features.shape)
print(onehot_features[0])

(37956, 793)
[1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

# Create Interactions with Provider

In [41]:
test_cold_interactions_provider = []
for interaction in test_cold_interactions:
    user_id = interaction[0]
    item_id = interaction[1]
    timestamp = interaction[2]
    provider_id = filtered_items_brands[item_id]
    test_cold_interactions_provider.append([user_id, item_id, timestamp, provider_id])

print(len(test_cold_interactions_provider))
print(test_cold_interactions_provider[:10])

15297
[[18, 17, 1388016000, 0], [21, 20, 1381276800, 0], [22, 20, 1299888000, 0], [9, 20, 1289433600, 0], [23, 20, 1404259200, 0], [24, 20, 1377129600, 0], [25, 20, 1358294400, 0], [26, 20, 1354060800, 0], [27, 20, 1356739200, 0], [46, 37, 1388361600, 1]]


In [42]:
formatted_test_cold_item_id = {item_id: i for i, item_id in enumerate(set_test_cold_items)}
test_cold_interactions_provider_formated = []
for interaction in test_cold_interactions_provider:
    user_id = interaction[0]
    item_id = formatted_test_cold_item_id[interaction[1]]
    timestamp = interaction[2]
    provider_id = interaction[3]
    test_cold_interactions_provider_formated.append([user_id, item_id, timestamp, provider_id])

print(len(set_test_cold_items))
print(len(test_cold_interactions_provider_formated))
print(test_cold_interactions_provider_formated)

3796
15297
[[18, 12, 1388016000, 0], [21, 14, 1381276800, 0], [22, 14, 1299888000, 0], [9, 14, 1289433600, 0], [23, 14, 1404259200, 0], [24, 14, 1377129600, 0], [25, 14, 1358294400, 0], [26, 14, 1354060800, 0], [27, 14, 1356739200, 0], [46, 25, 1388361600, 1], [57, 30, 1398124800, 8], [58, 30, 1385510400, 8], [59, 30, 1400025600, 8], [63, 31, 1319846400, -1], [70, 35, 1389484800, -1], [90, 40, 1397606400, -1], [91, 40, 1397606400, -1], [90, 41, 1397606400, -1], [93, 41, 1405728000, -1], [92, 43, 1397606400, -1], [93, 43, 1405728000, -1], [92, 45, 1397606400, -1], [91, 45, 1397606400, -1], [95, 46, 1252454400, -1], [102, 50, 1329091200, 27], [103, 50, 1305331200, 27], [104, 54, 1372723200, 27], [105, 55, 1391472000, -1], [106, 55, 1388275200, -1], [125, 61, 1043625600, -1], [140, 64, 1364428800, 45], [144, 65, 1350518400, -1], [158, 70, 1399248000, 52], [187, 77, 1336176000, 57], [188, 82, 1390435200, -1], [214, 97, 1320883200, 10], [216, 98, 1359676800, 10], [220, 100, 1362096000, 10],

In [43]:
item_providers = set((inter[1], inter[3]) for inter in test_cold_interactions_provider_formated)
print(len(np.unique([inter[1] for inter in test_cold_interactions_provider_formated])))
print(len(np.unique([inter[2] for inter in test_cold_interactions_provider_formated])))
print(len(item_providers))
print(item_providers)
print(np.sum(np.array(list(item_providers))[:, 1] == -1))

3796
2400
3796
{(342, -1), (1160, 890), (435, 305), (2257, 1840), (685, 108), (3203, -1), (1812, -1), (1079, -1), (421, -1), (555, -1), (2470, 492), (1577, 131), (3282, -1), (1072, 224), (3529, 1187), (2287, -1), (3416, -1), (3193, 972), (1547, 108), (1030, -1), (2159, -1), (1608, 4176), (3667, 1157), (2008, 174), (768, -1), (372, -1), (1023, 862), (3143, 742), (3251, 275), (2238, -1), (847, -1), (1543, 488), (585, -1), (3021, -1), (1496, -1), (1630, -1), (2239, 228), (2045, 97), (239, -1), (1953, 130), (2055, -1), (2451, -1), (3100, -1), (1315, 108), (2704, -1), (3514, 132), (1931, 111), (689, 91), (3234, -1), (798, -1), (1843, -1), (3371, 2447), (1447, -1), (1709, -1), (1581, -1), (56, -1), (3019, 3573), (190, -1), (109, 10), (3393, 665), (2585, 83), (960, 145), (3794, 1354), (551, 1554), (734, 98), (2789, -1), (2393, -1), (1264, -1), (1398, -1), (1002, -1), (3438, -1), (1798, 197), (3388, 177), (141, 227), (2675, 1144), (1190, 234), (3212, 711), (1477, -1), (2606, -1), (2288, 97), (

# Extract Text Vector Features

In [None]:
BATCH_SIZE = 128
model = SentenceTransformer(TEXT_MODEL_NAME, device=DEVICE)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
text_features = np.zeros((n_filtered_items, TEXT_FEATURES))
for i in tqdm(range(0, n_filtered_items, BATCH_SIZE)):
    s, e = i, min(n_filtered_items, i + BATCH_SIZE)
    text_features[s:e] = model.encode(filtered_items_contexts[s:e], batch_size=e-s)

100%|██████████| 148/148 [00:52<00:00,  2.80it/s]


In [None]:
x = 1
y = 5
sim = np.dot(text_features[x], text_features[y]) / (np.linalg.norm(text_features[x]) * np.linalg.norm(text_features[y]))
print(contexts[x])
print(contexts[y])
print(sim)

123GetInk -14-pack 5-black 3-cyan 3-magenta 3-yellow Epson refurbished t0691-t0694 t069 t069120 t069420 ink cartridges Office Products Office & School Supplies Printer Ink & Toner Inkjet Printer Ink High quality inkjet cartridges use high-density, dye-base ink to produce crisp, sharp characters and deliver great-looking results time after time. quality professional printing , Ideal for office and home users. Package includes: 5 x T069120 (Black) 3 x T069220 (Cyan) 3 x T069320 (Magenta) 3 x T069420 (Yellow) T069120 (T0691) T069220 (T0692) T069320 (T0693) T069420 (T0694) Compatible with: Epson All-in-One Machines: Stylus CX5000, Stylus CX6000, Stylus CX7000F, Stylus CX7400, Stylus CX7450, Stylus CX8400, Stylus CX9400F, Stylus CX9475F, Stylus NX100, Stylus NX105, Stylus NX400, Stylus NX415, Stylus NX515, WorkForce 30, WorkForce 310, WorkForce 40, WorkForce 500, WorkForce 600
Scholastic TF3078 Happy Easter! Bulletin Board Office Products Office & School Supplies Education & Crafts Arts & C

# Extract Image Vector Features


In [None]:
idx = 0
has_features = []
image_features = np.zeros((n_filtered_items, 4096))
with open(f"image_features_{DATASET}.b", "rb") as f:
    while True:
        asin = f.read(10)
        if asin == '': break

        try:
            a = array.array('f')
            a.fromfile(f, 4096)
        except Exception as e:
            print(e)
            break

        idx += 1
        asin = asin.decode('utf-8')
        features = a.tolist()
        if items_id[asin] in filtered_items_id:
            has_features.append(asin)
            image_features[filtered_items_id[items_id[asin]]] = features

print(idx)
print(len(has_features))
print(image_features.shape)

read() didn't return enough bytes
133871
18693
(18823, 4096)


# Verify Consistency

In [44]:
warm_interaction_old = np.load(os.path.join(SAVE_PATH, DATASET, "train_all_warm_interactions.npy"))
assert np.array_equal(warm_interaction_old, np.array(warm_interactions))

val_cold_interactions_old = np.load(os.path.join(SAVE_PATH, DATASET, "val_cold_interactions.npy"))
assert np.array_equal(val_cold_interactions_old, np.array(val_cold_interactions))

test_cold_interactions_old = np.load(os.path.join(SAVE_PATH, DATASET, "test_cold_interactions.npy"))
assert np.array_equal(test_cold_interactions_old, np.array(test_cold_interactions))

In [45]:
categories_id_old = np.load(os.path.join(SAVE_PATH, DATASET, "categories_id.npy"), allow_pickle=True)
assert np.array_equal(categories_id_old, categories_id)

In [46]:
items_categories_old = np.load(os.path.join(SAVE_PATH, DATASET, "items_categories.npy"), allow_pickle=True)
for i in range(len(items_categories_old)):
    assert np.array_equal(items_categories_old[i], filtered_items_categories[i])

In [47]:
onehot_features_old = np.load(os.path.join(SAVE_PATH, DATASET, "onehot_features.npy"))
assert np.array_equal(onehot_features_old, onehot_features)

In [38]:
test_cold_interactions_provider_old = np.load(os.path.join(SAVE_PATH, DATASET, "test_cold_interactions_provider.npy"))
assert np.array_equal(test_cold_interactions_provider_old, test_cold_interactions_provider[:, :4])

TypeError: list indices must be integers or slices, not tuple

In [None]:
assert np.array_equal(test_cold_interactions_provider_old[:, :2], test_cold_interactions_old[:, :2])

In [None]:
images_features_old = np.load(os.path.join(SAVE_PATH, DATASET, "v_features.npy"))
assert np.array_equal(images_features_old, image_features)

# Save Data

In [43]:
metadata = {}
metadata["n_users"] = n_filtered_users
metadata["n_items"] = n_filtered_items
metadata["n_text_features"] = TEXT_FEATURES
metadata["n_image_features"] = IMAGE_FEATURES
metadata["n_categories"] = len(categories_id)

metadata["n_warm_items"] = len(warm_items)
metadata["n_cold_items"] = len(cold_items)
metadata["n_val_cold_items"] = len(val_cold_items)
metadata["n_test_cold_items"] = len(test_cold_items)

metadata["n_train_interactions"] = len(train_warm_interactions)
metadata["n_val_interactions"] = len(val_interactions)
metadata["n_test_interactions"] = len(test_interactions)

metadata["n_val_warm_interactions"] = len(val_warm_interactions)
metadata["n_val_cold_interactions"] = len(val_cold_interactions)
metadata["n_test_warm_interactions"] = len(test_warm_interactions)
metadata["n_test_cold_interactions"] = len(test_cold_interactions)

# np.save(os.path.join(SAVE_PATH, DATASET, "metadata.npy"), metadata)
print(metadata)

{'n_users': 31027, 'n_items': 33899, 'n_text_features': 384, 'n_image_features': 4096, 'n_categories': 413, 'n_warm_items': 27119, 'n_cold_items': 6780, 'n_val_cold_items': 3390, 'n_test_cold_items': 3390, 'n_train_interactions': 216727, 'n_val_interactions': 41766, 'n_test_interactions': 41510, 'n_val_warm_interactions': 12040, 'n_val_cold_interactions': 29726, 'n_test_warm_interactions': 12041, 'n_test_cold_interactions': 29469}


In [None]:
# np.save(os.path.join(SAVE_PATH, DATASET, "train_interactions.npy"), train_warm_interactions)
# np.save(os.path.join(SAVE_PATH, DATASET, "train_all_warm_interactions.npy"), warm_interactions)
# np.save(os.path.join(SAVE_PATH, DATASET, "train_all_warm_interactions_negative_users.npy"), warm_interactions_negative_users)

# np.save(os.path.join(SAVE_PATH, DATASET, "val_interactions.npy"), val_interactions)
# np.save(os.path.join(SAVE_PATH, DATASET, "val_warm_interactions.npy"), val_warm_interactions)
# np.save(os.path.join(SAVE_PATH, DATASET, "val_cold_interactions.npy"), val_cold_interactions)

# np.save(os.path.join(SAVE_PATH, DATASET, "test_interactions.npy"), test_interactions)
# np.save(os.path.join(SAVE_PATH, DATASET, "test_warm_interactions.npy"), test_warm_interactions)
# np.save(os.path.join(SAVE_PATH, DATASET, "test_cold_interactions.npy"), test_cold_interactions)

# np.save(os.path.join(SAVE_PATH, DATASET, "warm_items.npy"), set_warm_items)
# np.save(os.path.join(SAVE_PATH, DATASET, "cold_items.npy"), set_cold_items)
# np.save(os.path.join(SAVE_PATH, DATASET, "val_cold_items.npy"), set_val_cold_items)
# np.save(os.path.join(SAVE_PATH, DATASET, "test_cold_items.npy"), set_test_cold_items)

# np.save(os.path.join(SAVE_PATH, DATASET, "categories_id.npy"), categories_id)
# np.save(os.path.join(SAVE_PATH, DATASET, "items_categories.npy"), np.asarray(filtered_items_categories, dtype=object))

In [None]:
# np.save(os.path.join(SAVE_PATH, DATASET, "onehot_features.npy"), onehot_features)

In [None]:
# np.save(os.path.join(SAVE_PATH, DATASET, "item_brands.npy"), item_brands)
# np.save(os.path.join(SAVE_PATH, DATASET, "item_brands_id.npy"), brands_id)

In [48]:
np.save(os.path.join(SAVE_PATH, DATASET, "test_cold_interactions_provider.npy"), test_cold_interactions_provider)
np.save(os.path.join(SAVE_PATH, DATASET, "test_cold_interactions_provider_formated.npy"), test_cold_interactions_provider_formated)

In [None]:
# np.save(os.path.join(SAVE_PATH, DATASET, "t_features.npy"), text_features)

In [None]:
# np.save(os.path.join(SAVE_PATH, DATASET, "v_features.npy"), image_features)

In [None]:
test = np.load(os.path.join(SAVE_PATH, DATASET, "train_all_warm_interactions.npy"), allow_pickle=True)
print(test.shape)
assert test.shape[0] == len(train_warm_interactions) + len(val_warm_interactions) + len(test_warm_interactions)

(45575, 3)


# Terminate

In [None]:
from google.colab import runtime
runtime.unassign()