In [2]:
class Argument:
    """Notebook stand-in for command-line arguments.

    The values are plain class attributes; the rest of the notebook reads
    them through the ``args`` instance created below.
    """

    # Path to the JSON experiment configuration file.
    config = '../../config/beauty/graph_reasoning/UPGPR.json'
    # Domain name; used to build the user-preference file name.
    domain = 'Beauty'
    # Not read by any of the visible notebook cells.
    preference = 'positive'
    # Seed value copied into the loaded configuration.
    seed = 0


# Instance exposing the default values defined on the class.
args = Argument()

In [6]:
from __future__ import absolute_import, division, print_function

import os
import json
import argparse
import torch
import torch.optim as optim
import numpy as np
from copy import deepcopy
from tqdm.auto import tqdm
from utils import *
from easydict import EasyDict as edict


# Read the JSON experiment configuration named by args.config and wrap it in
# an EasyDict so that entries can be accessed as attributes.
with open(args.config, "r") as f:
    config = edict(json.load(f))

# Set the seed on the configuration from the notebook arguments.
config.seed = args.seed
# config.TRAIN_EMBEDS.epochs = args.epochs
# config.TRAIN_EMBEDS.min_epochs = args.min_epochs

# Work on the TRAIN_EMBEDS section and copy the two top-level relation flags
# onto it.
transe_config = config.TRAIN_EMBEDS
transe_config.use_user_relations = config.use_user_relations
transe_config.use_entity_relations = config.use_entity_relations

# Sanity check on the epoch settings before anything expensive runs.
assert (
    transe_config.min_epochs <= transe_config.epochs
), "Minimum number of epochs should be lower than total number of epochs."

# NOTE(review): `wandb` is not imported in the visible cells; presumably it is
# provided by `from utils import *` — confirm, otherwise this branch raises
# NameError when config.use_wandb is true.
if config.use_wandb:
    wandb.init(
        project=config.wandb_project_name, name=config.wandb_run_name, config=config
    )

# Restrict which GPUs CUDA can see.
# NOTE(review): os.environ only accepts str values — confirm that
# transe_config.gpu is stored as a string in the JSON config.
os.environ["CUDA_VISIBLE_DEVICES"] = transe_config.gpu

# The device is a torch.device when CUDA is available and the plain string
# "cpu" otherwise (two different types).
transe_config.device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

# Split name; later passed to InitalUserEmbedding and used to index the
# cold-start user/item dictionaries.
set_name = "test"

In [10]:
# Set the processed-data directory; later cells read the cold-start JSON files
# and the user-preference file from this location.
config.processed_data_dir = '../../data/beauty/Amazon_Beauty_01_01'

In [11]:
from make_cold_start_kg import InitalUserEmbedding, UserPreferences

In [15]:
# NOTE(review): in the visible notebook `embeds` is first assigned in the cell
# below (execution count 12), so this cell depends on out-of-order execution;
# move it after that cell so a fresh top-to-bottom run works.
embeds["user"].shape

(22363, 100)

In [12]:
# Build the embedding helper for the selected split; its `embeds` attribute
# holds the embedding arrays that are modified below.
init_embed = InitalUserEmbedding(
    set_name=set_name,
    config=config
)
embeds = init_embed.embeds

transe_config = config.TRAIN_EMBEDS

# load cold start users
# (context managers close the file handles instead of leaking them)
cold_users_path = os.path.join(config.processed_data_dir, "cold_start_users.json")
with open(cold_users_path, "r") as cold_users_file:
    cold_users = json.load(cold_users_file)

# load cold start items
cold_items_path = os.path.join(config.processed_data_dir, "cold_start_items.json")
with open(cold_items_path, "r") as cold_items_file:
    cold_items = json.load(cold_items_file)

# set all cold start users embeddings to 0
# (a dedicated name keeps this "test + validation" list distinct from the
# per-split list stored in tmp_cold_users below)
all_cold_users = cold_users["test"] + cold_users["validation"]
embeds["user"][all_cold_users] = 0

# # set all cold start items embeddings to 0
# tmp_cold_items = cold_items["test"] + cold_items["validation"]
# embeds["item"][tmp_cold_items] = 0

# cold start users / items of the split selected by set_name
tmp_cold_users = cold_users[set_name]
tmp_cold_items = cold_items[set_name]
# making a copy of the embeddings to avoid using the modified cold start embeddings in the next iteration
tmp_embeds = deepcopy(embeds)

# counter and helper object referenced by the (currently disabled) aggregation cell
nb_relations = 0
user_preferences = UserPreferences()

Load embedding: ../../data/beauty/Amazon_Beauty_01_01/test_transe_embed.pkl


In [14]:
def load_user_pref(path, domain):
    """Load the user-preference JSON file for one domain.

    Args:
        path: Directory containing ``user_preference_{domain}.json``.
        domain: Domain name substituted into the file name.

    Returns:
        The object decoded from the JSON file.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    user_pref_path = os.path.join(path, f'user_preference_{domain}.json')
    # The context manager closes the handle even when JSON parsing fails;
    # passing a bare open() to json.load leaves the file unclosed.
    with open(user_pref_path, 'r') as user_pref_file:
        return json.load(user_pref_file)

domain = args.domain
user_pref = load_user_pref(config.processed_data_dir, domain)
print('user_pref', len(user_pref))

# Maps each user index to the preference dict built from that user's records;
# a user with several records gets the list-valued fields merged.
cold_start_uids = {}
for idx in tqdm(range(len(user_pref))):
    # Records are looked up by their position rendered as a string key.
    record = user_pref[str(idx)]

    user_preferred = init_embed.user_preference_config(
        user_acc_feature=record['user_acc_feature'],
        user_rej_feature=record['user_rej_feature'],
        user_rej_items=record['user_rej_items'],
    )

    user_key = record['idx_user']
    if user_key not in cold_start_uids:
        cold_start_uids[user_key] = user_preferred
        continue

    # Merge list-valued fields into the existing entry and drop duplicates.
    # dict.fromkeys keeps first-seen order, whereas list(set(...)) would leave
    # the merged list in arbitrary set order.
    merged = cold_start_uids[user_key]
    for key, value in user_preferred.items():
        if isinstance(value, list):
            merged[key] = list(dict.fromkeys(merged[key] + value))

print('cold_start_uids', len(cold_start_uids))

user_pref 2251


  0%|          | 0/2251 [00:00<?, ?it/s]

cold_start_uids 2079


In [None]:
# # Accessing items in the dictionary:
# for idx, user in enumerate(cold_start_uids):
#     # for relation, entity in dataset.data_args.item_relation.items():
#     for relation, entity in user_preferences.items():
#         # print(f'RELATION : {relation.ljust(16)} | ENTITY : {entity}')
#         if relation == 'disinterested_in':
#             relation = 'interested_in'
#             continue
#         entities = user_preferred[relation]
#         all_related_emb = (
#             embeds[entity[1]][entities] - embeds[relation][0]
#         )
#         nb_relations += all_related_emb.shape[0]
#         # sum all related entities embeddings
#         if relation in ['interested_in', 'like', 'dislike']:
#             tmp_embeds["user"][user] += all_related_emb.sum(axis=0)
#         # elif relation in ['disinterested_in']:
#         #     zero_embeds["user"] -= all_related_emb.sum(axis=0)
#     # divide by the number of relations to get the average
#     if nb_relations > 0:
#         tmp_embeds["user"][user] /= nb_relations 
    
# # save the embeddings
# save_embed(
#     config.processed_data_dir, f"{set_name}_cold_start_transe_embed.pkl", tmp_embeds
# )

In [28]:
import numpy as np

# Toy data: one query vector and a small matrix of candidate rows.
A = np.random.rand(100)
B = np.random.rand(20, 100)  # 20 rows, 100 dimensions each
# Scale the query vector and every row of B to unit L2 norm.
A_norm = A / np.linalg.norm(A)
B_norm = B / np.linalg.norm(B, axis=1, keepdims=True)
# With unit-norm operands the matrix-vector product is the cosine similarity
# between A and each row of B.
similarity = B_norm @ A_norm
print(similarity)

[0.7700074  0.7353711  0.75064622 0.7338551  0.80718175 0.77831027
 0.79847464 0.73965569 0.7497185  0.73536031 0.81858419 0.76926914
 0.79451204 0.76544339 0.73352902 0.81425411 0.72757791 0.78073842
 0.71517224 0.7387626 ]


In [40]:
import numpy as np

def top_k_argmax(A, B, k):
    """Return the k rows of B most cosine-similar to A, best match first.

    Args:
        A: 1-D query vector of length d.
        B: 2-D array of shape (n, d); each row is a candidate vector.
        k: Number of rows requested. Values above n are clamped to n and
            values <= 0 yield empty results.

    Returns:
        (indices, values): row indices into B ordered by decreasing cosine
        similarity, and the matching similarity values.

    Notes:
        A zero-norm vector has no direction, so its similarity is defined as
        0 here instead of NaN. NaN sorts last and would otherwise be reported
        as the best match.
    """
    A = np.asarray(A)
    B = np.asarray(B)

    # Replace zero norms by 1 so the division is safe; a zero vector then
    # keeps a similarity of exactly 0.
    a_norm = np.linalg.norm(A)
    if a_norm == 0:
        a_norm = 1.0
    row_norms = np.linalg.norm(B, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0
    similarity = (B / row_norms) @ (A / a_norm)

    # Slicing with [-0:] would return every row, so k <= 0 is handled up front.
    k = min(int(k), similarity.shape[0])
    if k <= 0:
        return np.empty(0, dtype=np.intp), similarity[:0]

    # argpartition isolates the k largest entries; only those are then sorted.
    candidates = np.argpartition(similarity, -k)[-k:]
    top_k_max_indices = candidates[np.argsort(similarity[candidates])[::-1]]
    top_k_max_values = similarity[top_k_max_indices]

    return top_k_max_indices, top_k_max_values

def top_k_argmin(A, B, k):
    """Return the k rows of B least cosine-similar to A, worst match first.

    Args:
        A: 1-D query vector of length d.
        B: 2-D array of shape (n, d); each row is a candidate vector.
        k: Number of rows requested. Values above n are clamped to n and
            values <= 0 yield empty results.

    Returns:
        (indices, values): row indices into B ordered by increasing cosine
        similarity, and the matching similarity values.

    Notes:
        A zero-norm vector has no direction, so its similarity is defined as
        0 here instead of NaN.
    """
    A = np.asarray(A)
    B = np.asarray(B)

    # Replace zero norms by 1 so the division is safe; a zero vector then
    # keeps a similarity of exactly 0.
    a_norm = np.linalg.norm(A)
    if a_norm == 0:
        a_norm = 1.0
    row_norms = np.linalg.norm(B, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0
    similarity = (B / row_norms) @ (A / a_norm)

    k = min(int(k), similarity.shape[0])
    if k <= 0:
        return np.empty(0, dtype=np.intp), similarity[:0]

    # Partitioning around position k - 1 (not k) keeps kth in bounds when
    # k equals the number of rows, while still placing the k smallest first.
    candidates = np.argpartition(similarity, k - 1)[:k]
    top_k_min_indices = candidates[np.argsort(similarity[candidates])]
    top_k_min_values = similarity[top_k_min_indices]

    return top_k_min_indices, top_k_min_values

# Example usage: two random query vectors against a random candidate matrix.
# The draws happen in the order A, C, B.
A, C = np.random.rand(100), np.random.rand(100)
B = np.random.rand(20000, 100)  # 20000 candidate rows, 100 dimensions each
k = 1000

top_k_max_indices, top_k_max_values = top_k_argmax(A, B, k)
top_k_min_indices, top_k_min_values = top_k_argmin(C, B, k)

# Rows that are among the k most similar to A and also among the k least
# similar to C.
overlap_indices = np.intersect1d(top_k_max_indices, top_k_min_indices)

# Only indices are printed; the *_values arrays remain available for inspection.
for label, shown in (
    ("Top-k maximum similarity indices:", top_k_max_indices),
    ("Top-k minimum similarity indices:", top_k_min_indices),
    ("Overlapping indices:", overlap_indices),
):
    print(label, shown)


Top-k maximum similarity indices: [14303  6842  7624  7128  8609  8178  4333  3354  4245  3406 16142  3703
 10244  3624 16943 10652  1867  6799  9411  5805  8386 15607  2963 17509
   236 10772   146  2218 18630 17511  9964    72   726 12140 14050 14695
 14251 12646  5105  5518  3812 11910  9369  3125  3730  7912 11579  7839
  4111  9958 12253 17350 13398  3372  3877 15618  2925 15193  1092 10013
  5032 14283 11864 11622  4207  7641   249 15549  6663  8183 18045 14760
  9454 16849  3017 19661 12483  1280  5500 14786 15624  4482 11573  2605
  6755 13768 10410  8507 12758  2943  7058  5679  4980 19465 12571  9834
  6236 14511  2399  3334 11024  6696  7551  4100  5678  5629  7707  1059
  9408  3897 14926 12869 14124  9384  9890 19756 14203  7844 11823 17023
 19326 12979 13333 14310  7948 10086  1009  3453 14879  7734  6398 14692
  3749 13249  8760 15390  4406  3306  3789 11329 12771 13186  7348  5581
 17469  2729 15072 13962 16735 19820 11876 10679 11993 17566 13672  3117
  3889  2661  686

In [44]:
# Display the row indices found in both the top-k maximum and top-k minimum
# similarity sets.
overlap_indices

array([  204,  1179,  1584,  2021,  2195,  3310,  4307,  5697,  7169,
        7271,  8268,  8507,  8588, 10826, 11254, 16491, 16943, 17330,
       19770])

In [45]:
# NOTE(review): this cell repeats the previous one and shows the same array;
# one of the two can be removed.
overlap_indices

array([  204,  1179,  1584,  2021,  2195,  3310,  4307,  5697,  7169,
        7271,  8268,  8507,  8588, 10826, 11254, 16491, 16943, 17330,
       19770])