In [289]:
import pickle
import scipy.io as scio
import scipy.sparse as sp
import random
from collections import defaultdict
import numpy as np

In [290]:
# Raw MATLAB exports for the two real datasets; the matrix is read from the key indexed on each line.
ciao_ratings_mat = scio.loadmat("./sample_data/ciao/rating.mat")["rating"]
ciao_trustnetwork_mat = scio.loadmat("./sample_data/ciao/trustnetwork.mat")["trustnetwork"]
epinions_ratings_mat = scio.loadmat("./sample_data/epinions/rating.mat")["rating"]
epinions_trustnetwork_mat = scio.loadmat("./sample_data/epinions/trustnetwork.mat")["trustnetwork"]

# Toy dataset: a pickled sequence of twelve objects, unpacked in the order they were stored.
# Opened through a context manager so the file handle is closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open("./sample_data/toy_dataset.pickle", "rb") as toy_dataset_file:
    (
        history_u_lists, history_ur_lists, history_v_lists, history_vr_lists,
        train_u, train_v, train_r,
        test_u, test_v, test_r,
        social_adj_lists, ratings_list,
    ) = pickle.load(toy_dataset_file)

# Toy dataset

In [291]:
# Check for 0 values
# Print the distinct rating values of the toy train and test splits; a 0 would show up here.
toy_rating_values = set(train_r + test_r)
print(toy_rating_values)

{0.5, 1.0, 2.5, 2.0, 3.0, 4.0, 3.5, 1.5}


In [292]:
# Check for duplicates
# Pair every user with the item they rated, across the train and test splits together.
zipped_lists = list(zip(train_u + test_u, train_v + test_v))

# A set keeps one copy of each pair, so a size mismatch means some pair is repeated.
has_duplicates = len(set(zipped_lists)) != len(zipped_lists)
print("There are duplicates!" if has_duplicates else "No duplicates found!")

No duplicates found!


In [293]:
# Check if undirected connections
# Walk every (source, target) edge of the toy social graph, record both endpoints in
# users_with_connections, and verify that the reverse edge is present as well.
users_with_connections = set()
is_undirected = True
for source, targets in social_adj_lists.items():
    users_with_connections.add(source)
    for target in targets:
        users_with_connections.add(target)
        # .get() treats a target without an adjacency entry as "no reverse edge" instead of
        # raising KeyError (or inserting a key into a defaultdict while it is being iterated).
        if source not in social_adj_lists.get(target, ()):
            is_undirected = False
print(f'Undirected: {is_undirected}')

Undirected: True


In [294]:
# Check if users with ratings but no connections
# users_with_connections comes from the undirected-connections cell.
users_with_ratings = set(history_u_lists)
difference = users_with_ratings.difference(users_with_connections)

print("Users with ratings but no connections:", difference)

Users with ratings but no connections: set()


In [295]:
# Check if users with connections but no ratings
# users_with_connections comes from the undirected-connections cell; subtracting the rated
# users leaves exactly the users that only appear in the social graph.
users_with_ratings = set(history_u_lists.keys())
difference = users_with_connections - users_with_ratings

print("Users with connections but no ratings:", difference)

Users with ratings but no connections: set()


In [296]:
# Check if users in test_u but not in train_u
# Collapse both splits to their distinct user ids.
train_set = set(train_u)
test_set = set(test_u)

# Users that are evaluated on but never seen during training.
missing_users = test_set.difference(train_set)

print("Users in test_u but not in train_u:", missing_users)

Users in test_u but not in train_u: set()


In [297]:
# Check if items in test_v but not in train_v
# Collapse both splits to their distinct item ids.
train_set = set(train_v)
test_set = set(test_v)

# Items that are evaluated on but never seen during training.
missing_items = test_set.difference(train_set)

print("Items in test_v but not in train_v:", missing_items)

Items in test_v but not in train_v: set()


In [298]:
# Check for empty user and item profiles
# A profile is the list stored for one user (or item); flag whether any of them has length 0.
empty_user_profiles = any(len(profile) == 0 for profile in history_u_lists.values())
empty_item_profiles = any(len(profile) == 0 for profile in history_v_lists.values())

print(f'{empty_user_profiles} {empty_item_profiles}')

False False


# Epinions

In [299]:
# Preview the raw Epinions rating matrix. Later cells read column 0 as the user,
# column 1 as the item, column 2 as the item category and column 3 as the rating.
epinions_ratings_mat

array([[    1,     1,     3,     2],
       [    1,     2,     2,     2],
       [    1,     3,     3,     2],
       ...,
       [22166, 43538,     5,     5],
       [22166, 38711,     3,     4],
       [22166, 41790,     5,     3]], dtype=int32)

In [300]:
# Preview the raw Epinions trust edges. Later cells read column 0 as the source user
# and column 1 as the target user.
epinions_trustnetwork_mat

array([[15373,  9831],
       [ 4247,  9831],
       [ 4644,  9831],
       ...,
       [13181, 15645],
       [  897,  8000],
       [ 8000,   897]], dtype=uint16)

In [301]:
# Column views into the raw Epinions matrices: user, item and rating come from columns
# 0, 1 and 3 of the rating matrix (column 2 is not needed here).
epinions_users, epinions_items, epinions_ratings = (
    epinions_ratings_mat[:, column] for column in (0, 1, 3)
)
# Trust edges: column 0 is the source user, column 1 the target user.
epinions_source, epinions_target = (
    epinions_trustnetwork_mat[:, column] for column in (0, 1)
)
# Filled by the following checks with the user ids to drop before splitting.
users_to_remove = []

In [302]:
# Check for 0 values
# Print the distinct Epinions rating values; a 0 would show up here.
epinions_rating_values = set(epinions_ratings)
print(epinions_rating_values)

{1, 2, 3, 4, 5}


In [303]:
# Check for duplicates
# One (user, item) pair per rating row.
zipped_lists = list(zip(epinions_users, epinions_items))

# Rows beyond the first occurrence of a pair are the duplicates.
duplicate_count = len(zipped_lists) - len(set(zipped_lists))
if duplicate_count != 0:
    print(f"There are {duplicate_count} duplicates!")
else:
    print("No duplicates found!")

There are 9826 duplicates!


In [304]:
# Check if users with ratings but no connections
# A user counts as connected when the id appears as either endpoint of a trust edge.
users_with_ratings = set(epinions_users)
users_with_connections = set(epinions_source) | set(epinions_target)
difference = users_with_ratings.difference(users_with_connections)
# Queue these users for removal in the later filtering cell.
users_to_remove.extend(difference)
print("Users with ratings but no connections:", len(difference))

Users with ratings but no connections: 4067


In [305]:
# Check if users with connections but no ratings
# Reuses the two user sets built by the previous cell, with the operands swapped.
difference = users_with_connections.difference(users_with_ratings)
# Queue these users for removal as well.
users_to_remove.extend(difference)
print("Users with connections but no ratings:", len(difference))

Users with connections but no ratings: 1


In [306]:
# Check if undirected connections
# Every (source, target) edge must also be present as (target, source); stop at the first miss.
# NOTE(review): this clears users_with_connections and the loop below never refills it —
# confirm that no later cell relies on the earlier value.
users_with_connections = set()
is_undirected = True
zipped_list = list(zip(epinions_source, epinions_target))
# Hash-based lookup of the reverse edge; scanning the list instead costs a full pass per edge.
edge_lookup = set(zipped_list)
for entry in zipped_list:
  # Echo each edge as it is examined: when the check fails, the last edge printed is the
  # first one that has no reverse edge.
  print(entry)
  if (entry[1], entry[0]) not in edge_lookup:
    is_undirected = False
    break
print(f'Undirected: {is_undirected}')

(15373, 9831)
Undirected: False


In [307]:
# Filter the Epinions ratings in three steps (zero ratings, unwanted users, duplicate pairs)
# and print the number of rows left after each step.
zipped_ratings = list(zip(list(epinions_users), list(epinions_items), list(epinions_ratings)))
print(len(zipped_ratings))
# Remove zeros
filtered_ratings = [(user, item, rating) for user, item, rating in zipped_ratings if rating != 0]
user_item_pairs = [(user, item) for user, item, rating in filtered_ratings]
print(len(user_item_pairs))
# Remove unwanted users
# users_to_remove is a list; a set makes each membership test a hash lookup instead of a scan.
excluded_users = set(users_to_remove)
filtered_ratings = [(user, item, rating) for user, item, rating in filtered_ratings if user not in excluded_users]
filtered_user_item_pairs = [(user, item) for user, item, rating in filtered_ratings]
print(len(filtered_user_item_pairs))
# Remove duplicates
filtered_user_item_pairs = list(set(filtered_user_item_pairs))
print(len(filtered_user_item_pairs))
# When a (user, item) pair was rated more than once, the last rating in row order wins.
ratings_dict = {(user, item): rating for user, item, rating in filtered_ratings}
print(len(ratings_dict))

922267
922267
764693
756678
756678


In [308]:
# Per-user 80/20 split of the de-duplicated (user, item) pairs, written to two TSV files.
TRAIN_FRACTION = 0.8  # share of each user's items that goes to the training split
SPLIT_SEED = 42       # fixed seed so the shuffles are repeatable for the same input order

# A dedicated generator keeps the split reproducible without touching the global random state.
split_rng = random.Random(SPLIT_SEED)

# Create a dictionary of users and their items
user_to_items = defaultdict(list)
for user, item in filtered_user_item_pairs:
    user_to_items[user].append(item)

train_data = []
test_data = []

# For each user, shuffle their items and cut the list at TRAIN_FRACTION
for user, items in user_to_items.items():
    split_rng.shuffle(items)
    if len(items) == 1:
        # int(0.8 * 1) == 0 would leave the user without training data, so the single
        # item stays in the training split.
        train_items, test_items = items, []
    else:
        split_idx = int(TRAIN_FRACTION * len(items))
        train_items, test_items = items[:split_idx], items[split_idx:]

    train_data.extend((user, item) for item in train_items)
    test_data.extend((user, item) for item in test_items)

# Ensure that all users in the test set also appear in the train set
assert set(user for user, _ in test_data).issubset(set(user for user, _ in train_data))

# Write the training data to a file (one "user<TAB>item" line per pair)
with open("epinions_filtered_ratings_train.tsv", "w") as f:
    for user, item in train_data:
        f.write(f"{user}\t{item}\n")

# Write the test data to a file (same format)
with open("epinions_filtered_ratings_test.tsv", "w") as f:
    for user, item in test_data:
        f.write(f"{user}\t{item}\n")

print(len(train_data))
print(len(test_data))

597820
158858


In [309]:
# Build the Epinions top-k dataset from the train/test pairs and pickle it.
# Reads from earlier cells: train_data / test_data ((user, item) pairs), ratings_dict,
# epinions_trustnetwork_mat and epinions_ratings_mat.

# Attach the stored rating to every pair of both splits.
train_ratings = [(user, item, ratings_dict[(user, item)]) for user, item in train_data if (user, item) in ratings_dict]
test_ratings = [(user, item, ratings_dict[(user, item)]) for user, item in test_data if (user, item) in ratings_dict]
train_u = [user for user, item, rating in train_ratings]
train_v = [item for user, item, rating in train_ratings]
train_r = [rating for user, item, rating in train_ratings]

test_u = [user for user, item, rating in test_ratings]
test_v = [item for user, item, rating in test_ratings]
test_r = [rating for user, item, rating in test_ratings]

combined_test_data = list(zip(test_u, test_v, test_r))
# Split the held-out triples into two equal parts: first half -> test, second half -> validation.
# The halves get their own names so that test_data keeps holding (user, item) pairs and this
# cell can be re-run without failing on tuple unpacking.
# NOTE(review): the held-out pairs were appended user by user, so this positional halving sends
# different users to test and validation — confirm that this is intended.
split_index = len(combined_test_data) // 2
test_triples, valid_data = combined_test_data[:split_index], combined_test_data[split_index:]

# Extract the data
test_u = [user for user, item, rating in test_triples]
test_v = [item for user, item, rating in test_triples]
test_r = [rating for user, item, rating in test_triples]

valid_u = [user for user, item, rating in valid_data]
valid_v = [item for user, item, rating in valid_data]
valid_r = [rating for user, item, rating in valid_data]

# Map every distinct rating value to its position in ascending order. valid_r is included so
# that a value occurring only in the validation split still gets an index.
unique_ratings = sorted(set(train_r + test_r + valid_r))
ratings_list = {rating: i for i, rating in enumerate(unique_ratings)}

# Initialize default dictionaries
history_u_lists = defaultdict(list)
history_ur_lists = defaultdict(list)
history_v_lists = defaultdict(list)
history_vr_lists = defaultdict(list)

# Fill the per-user and per-item histories from the training split only; the *_r lists hold
# rating indices (see ratings_list), aligned position by position with the id lists.
for u, v, r in zip(train_u, train_v, train_r):
    rating_index = ratings_list[r]
    history_u_lists[u].append(v)
    history_ur_lists[u].append(rating_index)
    history_v_lists[v].append(u)
    history_vr_lists[v].append(rating_index)

# Convert back to regular dictionaries
history_u_lists = dict(history_u_lists)
history_ur_lists = dict(history_ur_lists)
history_v_lists = dict(history_v_lists)
history_vr_lists = dict(history_vr_lists)

# Keep only trust edges whose two endpoints both have training ratings.
valid_users = set(history_u_lists.keys())
filtered_network = [row for row in epinions_trustnetwork_mat if row[0] in valid_users and row[1] in valid_users]

# Convert back to an array, then to nested lists of plain Python ints
filtered_network = np.array(filtered_network)
filtered_network_list = filtered_network.tolist()

# Every valid user gets an entry, even without outgoing edges; edges are stored in the
# source -> target direction only.
social_adj_lists = {user_id: [] for user_id in valid_users}
for source_user_id, target_user_id in filtered_network_list:
    social_adj_lists[source_user_id].append(target_user_id)

# Item -> category (column 2 of the rating matrix); the last row seen for an item wins.
item_categories = {row[1]: row[2] for row in epinions_ratings_mat}
# valid_v is included so that items occurring only in the validation split keep their category.
valid_items = set(train_v + test_v + valid_v)
filtered_item_categories = {item: category for item, category in item_categories.items() if item in valid_items}

# Define the data to be pickled
pickle_data = [history_u_lists, history_ur_lists, history_v_lists, history_vr_lists, train_u, train_v, train_r, test_u, test_v, test_r, valid_u, valid_v, valid_r, social_adj_lists, ratings_list, filtered_item_categories]

# Open the file in write-binary mode and dump the data
with open('top_k_epinions.pickle', 'wb') as file:
    pickle.dump(pickle_data, file)

# Ciao

In [310]:
# Column views into the raw Ciao matrices: user, item and rating come from columns
# 0, 1 and 3 of the rating matrix.
ciao_users, ciao_items, ciao_ratings = (
    ciao_ratings_mat[:, column] for column in (0, 1, 3)
)
# Trust edges: column 0 is the source user, column 1 the target user.
ciao_source, ciao_target = (
    ciao_trustnetwork_mat[:, column] for column in (0, 1)
)
# Filled by the following checks with the user ids to drop before splitting.
users_to_remove = []

In [311]:
# Check for 0 values
# Print the distinct Ciao rating values, then count how many rows carry a rating of 0.
ciao_rating_values = set(ciao_ratings)
print(ciao_rating_values)
zero_rating_count = sum(1 for rating in ciao_ratings if rating == 0)
print(f"0 ratings: {zero_rating_count}")

{0, 1, 2, 3, 4, 5}
0 ratings: 34


In [312]:
# Check for duplicates
# One (user, item) pair per rating row.
zipped_lists = list(zip(ciao_users, ciao_items))

# Rows beyond the first occurrence of a pair are the duplicates.
duplicate_count = len(zipped_lists) - len(set(zipped_lists))
if duplicate_count != 0:
    print(f"There are {duplicate_count} duplicates!")
else:
    print("No duplicates found!")

There are 1436 duplicates!


In [313]:
# Check if users with ratings but no connections
# A user counts as connected when the id appears as either endpoint of a trust edge.
users_with_ratings = set(ciao_users)
users_with_connections = set(ciao_source) | set(ciao_target)
difference = users_with_ratings.difference(users_with_connections)

print("Users with ratings but no connections:", len(difference))
# Queue these users for removal in the later filtering cell.
users_to_remove.extend(difference)

Users with ratings but no connections: 58


In [314]:
# Check if users with connections but no ratings
# Reuses the two user sets built by the previous cell, with the operands swapped.
difference = users_with_connections.difference(users_with_ratings)

print("Users with connections but no ratings:", len(difference))
# Queue these users for removal as well.
users_to_remove.extend(difference)

Users with connections but no ratings: 0


In [315]:
# Check if undirected connections
# Every (source, target) edge must also be present as (target, source); stop at the first miss.
# NOTE(review): this clears users_with_connections and the loop below never refills it —
# confirm that no later cell relies on the earlier value.
users_with_connections = set()
is_undirected = True
zipped_list = list(zip(ciao_source, ciao_target))
# Hash-based lookup of the reverse edge; scanning the list instead costs a full pass per edge.
edge_lookup = set(zipped_list)
for entry in zipped_list:
  # Echo each edge as it is examined: when the check fails, the last edge printed is the
  # first one that has no reverse edge.
  print(entry)
  if (entry[1], entry[0]) not in edge_lookup:
    is_undirected = False
    break
print(f'Undirected: {is_undirected}')

(1, 2)
(1, 3)
Undirected: False


In [316]:
# Filter the Ciao ratings in three steps (zero ratings, unwanted users, duplicate pairs)
# and print the number of rows left after each step.
zipped_ratings = list(zip(list(ciao_users), list(ciao_items), list(ciao_ratings)))
print(len(zipped_ratings))
# Remove zeros
filtered_ratings = [(user, item, rating) for user, item, rating in zipped_ratings if rating != 0]
user_item_pairs = [(user, item) for user, item, rating in filtered_ratings]
print(len(user_item_pairs))
# Remove unwanted users
# users_to_remove is a list; a set makes each membership test a hash lookup instead of a scan.
excluded_users = set(users_to_remove)
filtered_ratings = [(user, item, rating) for user, item, rating in filtered_ratings if user not in excluded_users]
filtered_user_item_pairs = [(user, item) for user, item, rating in filtered_ratings]
print(len(filtered_user_item_pairs))
# Remove duplicates
filtered_user_item_pairs = list(set(filtered_user_item_pairs))
print(len(filtered_user_item_pairs))
# When a (user, item) pair was rated more than once, the last rating in row order wins.
ratings_dict = {(user, item): rating for user, item, rating in filtered_ratings}
print(len(ratings_dict))

284086
284052
283286
281867
281867


In [317]:
# Per-user 80/20 split of the de-duplicated (user, item) pairs, written to two TSV files.
TRAIN_FRACTION = 0.8  # share of each user's items that goes to the training split
SPLIT_SEED = 42       # fixed seed so the shuffles are repeatable for the same input order

# A dedicated generator keeps the split reproducible without touching the global random state.
split_rng = random.Random(SPLIT_SEED)

# Create a dictionary of users and their items
user_to_items = defaultdict(list)
for user, item in filtered_user_item_pairs:
    user_to_items[user].append(item)

train_data = []
test_data = []

# For each user, shuffle their items and cut the list at TRAIN_FRACTION
for user, items in user_to_items.items():
    split_rng.shuffle(items)
    if len(items) == 1:
        # int(0.8 * 1) == 0 would leave the user without training data, so the single
        # item stays in the training split.
        train_items, test_items = items, []
    else:
        split_idx = int(TRAIN_FRACTION * len(items))
        train_items, test_items = items[:split_idx], items[split_idx:]

    train_data.extend((user, item) for item in train_items)
    test_data.extend((user, item) for item in test_items)

# Ensure that all users in the test set also appear in the train set
assert set(user for user, _ in test_data).issubset(set(user for user, _ in train_data))

# Write the training data to a file (one "user<TAB>item" line per pair)
with open("ciao_filtered_ratings_train.tsv", "w") as f:
    for user, item in train_data:
        f.write(f"{user}\t{item}\n")

# Write the test data to a file (same format)
with open("ciao_filtered_ratings_test.tsv", "w") as f:
    for user, item in test_data:
        f.write(f"{user}\t{item}\n")

print(len(train_data))
print(len(test_data))

222434
59433


In [318]:
# Build the Ciao top-k dataset from the train/test pairs and pickle it.
# Reads from earlier cells: train_data / test_data ((user, item) pairs), ratings_dict,
# ciao_trustnetwork_mat and ciao_ratings_mat.

# Attach the stored rating to every pair of both splits.
train_ratings = [(user, item, ratings_dict[(user, item)]) for user, item in train_data if (user, item) in ratings_dict]
test_ratings = [(user, item, ratings_dict[(user, item)]) for user, item in test_data if (user, item) in ratings_dict]
train_u = [user for user, item, rating in train_ratings]
train_v = [item for user, item, rating in train_ratings]
train_r = [rating for user, item, rating in train_ratings]

test_u = [user for user, item, rating in test_ratings]
test_v = [item for user, item, rating in test_ratings]
test_r = [rating for user, item, rating in test_ratings]

combined_test_data = list(zip(test_u, test_v, test_r))
# Split the held-out triples into two equal parts: first half -> test, second half -> validation.
# The halves get their own names so that test_data keeps holding (user, item) pairs and this
# cell can be re-run without failing on tuple unpacking.
# NOTE(review): the held-out pairs were appended user by user, so this positional halving sends
# different users to test and validation — confirm that this is intended.
split_index = len(combined_test_data) // 2
test_triples, valid_data = combined_test_data[:split_index], combined_test_data[split_index:]

# Extract the data
test_u = [user for user, item, rating in test_triples]
test_v = [item for user, item, rating in test_triples]
test_r = [rating for user, item, rating in test_triples]

valid_u = [user for user, item, rating in valid_data]
valid_v = [item for user, item, rating in valid_data]
valid_r = [rating for user, item, rating in valid_data]

# Map every distinct rating value to its position in ascending order. valid_r is included so
# that a value occurring only in the validation split still gets an index.
unique_ratings = sorted(set(train_r + test_r + valid_r))
ratings_list = {rating: i for i, rating in enumerate(unique_ratings)}

# Initialize default dictionaries
history_u_lists = defaultdict(list)
history_ur_lists = defaultdict(list)
history_v_lists = defaultdict(list)
history_vr_lists = defaultdict(list)

# Fill the per-user and per-item histories from the training split only; the *_r lists hold
# rating indices (see ratings_list), aligned position by position with the id lists.
for u, v, r in zip(train_u, train_v, train_r):
    rating_index = ratings_list[r]
    history_u_lists[u].append(v)
    history_ur_lists[u].append(rating_index)
    history_v_lists[v].append(u)
    history_vr_lists[v].append(rating_index)

# Convert back to regular dictionaries
history_u_lists = dict(history_u_lists)
history_ur_lists = dict(history_ur_lists)
history_v_lists = dict(history_v_lists)
history_vr_lists = dict(history_vr_lists)

# Keep only trust edges whose two endpoints both have training ratings.
valid_users = set(history_u_lists.keys())
filtered_network = [row for row in ciao_trustnetwork_mat if row[0] in valid_users and row[1] in valid_users]

# Convert back to an array, then to nested lists of plain Python ints
filtered_network = np.array(filtered_network)
filtered_network_list = filtered_network.tolist()

# Every valid user gets an entry, even without outgoing edges; edges are stored in the
# source -> target direction only.
social_adj_lists = {user_id: [] for user_id in valid_users}
for source_user_id, target_user_id in filtered_network_list:
    social_adj_lists[source_user_id].append(target_user_id)

# Item -> category (column 2 of the rating matrix); the last row seen for an item wins.
item_categories = {row[1]: row[2] for row in ciao_ratings_mat}
# valid_v is included so that items occurring only in the validation split keep their category.
valid_items = set(train_v + test_v + valid_v)
filtered_item_categories = {item: category for item, category in item_categories.items() if item in valid_items}

# Define the data to be pickled
pickle_data = [history_u_lists, history_ur_lists, history_v_lists, history_vr_lists, train_u, train_v, train_r, test_u, test_v, test_r, valid_u, valid_v, valid_r, social_adj_lists, ratings_list, filtered_item_categories]

# Open the file in write-binary mode and dump the data
with open('top_k_ciao.pickle', 'wb') as file:
    pickle.dump(pickle_data, file)