In [1]:
# Colab-only step: attach Google Drive so files stored under it can be read and written.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [2]:
# import required packages
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

In [3]:
# Project root inside the mounted Drive; every input/output path below is built from it.
base_path = "/content/drive/MyDrive/Colab Notebooks/CRS"

# Dataset sub-folder to process. Values listed in the original note:
#   conv_rec_sys, conv_rec_sys_amazon, conv_rec_sys_lastfm
domain = "conv_rec_sys_lastfm"

In [4]:
# Load the filtered ratings table of the selected domain into a DataFrame.
ratings_path = f"{base_path}/{domain}/data/ratings_filter.csv"
dataset = pd.read_csv(ratings_path)

In [5]:
# Per-user interaction counts: value_counts() returns one entry per distinct user_id.
user_dist = dataset['user_id'].value_counts()  # original note: user
num_users = len(user_dist)

# Summary statistics, printed one per line as "<label><value>".
for label, value in (
    ('No. users: ', num_users),
    ('Mean books per user: ', user_dist.mean()),
    ('Min books per user: ', user_dist.min()),
    ('Max books per user: ', user_dist.max()),
):
    print(label + str(value))

No. users: 1801
Mean books per user: 42.5835646862854
Min books per user: 20
Max books per user: 50


In [6]:
# Per-item interaction counts: one entry per distinct business_id.
item_col = 'business_id'  # original note: asin
item_dist = dataset[item_col].value_counts()
num_items = len(item_dist)
print('No. items: ' + str(num_items))

No. items: 7123


In [35]:
# Popularity of an item = its interaction count divided by the total number of rows
# in the ratings table.
total_interactions = len(dataset)
item_popularity = item_dist / total_interactions

# Turn the Series into a two-column frame (item id, popularity) and name the columns
# explicitly, since reset_index() picks its own default column names.
item_popularity_df = pd.DataFrame(item_popularity).reset_index()
item_popularity_df.columns = ["business_id", "popularity"]
print(item_popularity_df.columns)

# Persist the table next to the other per-domain data files (no index column).
popularity_csv = f'{base_path}/{domain}/data/item_popularity.csv'
item_popularity_df.to_csv(popularity_csv, index=False)

Index(['business_id', 'popularity'], dtype='object')


In [None]:
# Select the most popular items: the first `top_fraction` share of the distinct items,
# ranked by interaction count.
top_fraction = 0.2
num_top = int(top_fraction * num_items)  # int() truncates, so the share is rounded down

# value_counts() returns counts sorted in descending order by default, so the first
# num_top rows are the most popular items. Use .iloc so the slice is always positional:
# a bare item_dist[:num_top] can be read as a label slice on some pandas versions when
# the item ids form a float-typed index.
top_item_dist = item_dist.iloc[:num_top]
print('No. top items: ' + str(num_top))

No. top items: 3990


In [None]:
# Ids of the popular items as a set, for fast membership tests and intersections.
# Series.keys() is an alias for the index, which is read directly here.
top_item_set = set(top_item_dist.index)

In [None]:
# Save the ids of the popular items: an 'item_id' header line, then one id per line.
# `with` closes the file even if a write raises.
with open(f'{base_path}/{domain}/data/popular_items.txt', 'w') as popular_items:
    popular_items.write('item_id' + '\n')
    # Iterate the index of top_item_dist (the same ids top_item_set was built from)
    # instead of the set: the set's iteration order is arbitrary, whereas the index
    # keeps the most-popular-first order and makes the file reproducible across runs.
    for item_id in top_item_dist.index:
        popular_items.write(str(item_id) + '\n')

In [None]:
# Build the mapping user id -> set of item ids the user interacted with.
#
# Columns are taken by POSITION: column 0 is read as the user id and column 1 as the
# item id. Per the original note, the amazon file stores the item id first; swap the
# two .iloc selections below when processing that dataset.
user_ids = dataset.iloc[:, 0]
item_ids = dataset.iloc[:, 1]

user_books = defaultdict(set)
# Iterate the two id columns directly rather than dataset.iterrows():
#   * iterrows() builds one Series per row and upcasts mixed int/float rows, so integer
#     ids could come back as floats (and later be written to disk as e.g. '1.0');
#   * integer-key lookups such as row[0] on a labelled Series are deprecated positional
#     access in recent pandas.
# defaultdict(set) creates the empty set on first access, so no membership test is
# needed. The third column, which the original loop read but never used, is no longer
# touched.
for uid, iid in tqdm(zip(user_ids, item_ids), total=len(dataset)):
    user_books[uid].add(iid)

841754it [00:43, 19458.96it/s]


In [None]:
# For every user: the fraction of their items that belong to the popular-item set.
user_pop_book_ratio = {}

for user, books in tqdm(user_books.items()):
    # `books` and `top_item_set` are already sets, so intersect them directly. The
    # original re-created both sets for every user, copying the whole popular set each
    # iteration. Dict keys are unique, so no "already seen" check is needed either.
    # NOTE(review): a user with an empty item set would raise ZeroDivisionError, as in
    # the original; the construction loop never creates such an entry.
    user_pop_book_ratio[user] = len(books & top_item_set) / len(books)

100%|██████████| 68864/68864 [00:06<00:00, 10307.11it/s]


In [None]:
# Rank users from the highest to the lowest popular-item ratio. Dicts preserve insertion
# order, so iterating the result yields the users in ranked order; sorted() is stable,
# so users with equal ratios keep their original relative order.
ranked_pairs = sorted(user_pop_book_ratio.items(), key=lambda pair: pair[1], reverse=True)
sorted_user_pop_book_ratio = dict(ranked_pairs)

In [None]:
# Size of the top (and bottom) user group: a `top_fraction` share of the ranked users,
# rounded down by int().
num_user = len(sorted_user_pop_book_ratio)
# Derive the group size from the ranking that is actually sliced below. The original
# read `num_users` from an earlier cell, leaving `num_user` unused and letting the
# group size drift out of sync with the ranking if that cell was run on other data.
num_top_users = int(top_fraction * num_user)
num_top_users

13772

In [None]:
# Output files for the three user groups. The groups are slices of
# sorted_user_pop_book_ratio, i.e. users ranked by the share of popular items in their
# history (not by number of check-ins): "active" = top of the ranking, "inactive" =
# bottom, "medium" = everything in between.
# Each file starts with a 'user_id' header line. The handles stay open on purpose:
# the following cells write the ids and close them.
inactive_users = open(f'{base_path}/{domain}/data/inactive_users.txt', 'w')
inactive_users.write('user_id' + '\n')

# Path uses a single '/' like its two siblings (the original had a stray 'data//').
medium_users = open(f'{base_path}/{domain}/data/medium_users.txt', 'w')
medium_users.write('user_id' + '\n')

active_users = open(f'{base_path}/{domain}/data/active_users.txt', 'w')
active_users.write('user_id' + '\n')

8

In [None]:
# Top of the ranking: the first num_top_users users (highest popular-item ratio) are
# written to the active_users file, one id per line.
# Using the already-open handle as a context manager closes it when the block ends,
# including when a write raises (the original close() was skipped on error).
with active_users:
    for uid in list(sorted_user_pop_book_ratio)[:num_top_users]:
        active_users.write(str(uid) + '\n')

In [None]:
# Middle of the ranking: everyone between the first and the last num_top_users users is
# written to the medium_users file, one id per line.
# Using the already-open handle as a context manager closes it when the block ends,
# including when a write raises (the original close() was skipped on error).
with medium_users:
    ranked_uids = list(sorted_user_pop_book_ratio)
    for uid in ranked_uids[num_top_users:len(ranked_uids) - num_top_users]:
        medium_users.write(str(uid) + '\n')

In [None]:
# Bottom of the ranking: the last num_top_users users (lowest popular-item ratio) are
# written to the inactive_users file, one id per line.
# The start index is computed as len - num_top_users (rather than a negative index) so
# that num_top_users == 0 yields an empty slice instead of the whole list.
# Using the already-open handle as a context manager closes it when the block ends,
# including when a write raises (the original close() was skipped on error).
with inactive_users:
    ranked_uids = list(sorted_user_pop_book_ratio)
    for uid in ranked_uids[len(ranked_uids) - num_top_users:]:
        inactive_users.write(str(uid) + '\n')