In [1]:
import os, sys
import json
from collections import defaultdict

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM


In [7]:
# Seed torch's global random number generator so seeded torch operations repeat across runs.
torch.manual_seed(303)
# Hugging Face model identifier; only its tokenizer is loaded in this cell.
model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
# Folder holding the preprocessed dataset files and the file-name prefix they share.
data_folder = "/home/doyooni303/experiments/LLMRec/data/amazon/Books"
fname = "Books"
# Item metadata, read as {field name: {item id: value}}.
# A context manager is used so the file handle is closed as soon as parsing ends.
with open(os.path.join(data_folder, f"{fname}_meta_name_dict.json"), "r") as meta_file:
    meta_name_dict = json.load(meta_file)

In [18]:
# Peek at the first four (item id, title) pairs of the 'title' metadata field.
list(meta_name_dict['title'].items())[:4]

[('1', 'Get Untamed: The Journal (How to Quit Pleasing and Start Living)'),
 ('2',
  'Cute & Easy Crochet with Flowers: 35 beautiful projects using floral motifs'),
 ('3', 'The Pop Manga Sketchbook: A Guided Drawing Journal'),
 ('4',
  'Haiku Knits: 25 Serenely Beautiful Patterns Inspired by Japanese Design')]

In [9]:
# Similar-user table keyed by user id.
# A context manager is used so the file handle is closed as soon as parsing ends.
with open(os.path.join(data_folder, f"{fname}_similar_users.json"), "r") as similar_users_file:
    similar_users = json.load(similar_users_file)

In [10]:
# Keys of the similar-user table, i.e. the users that have an entry in it.
sim_users = list(similar_users)
# Displayed as the cell output: how many users have an entry.
len(sim_users)

547534

In [11]:
import sys

# Make the project's dataset helpers importable. The membership check keeps
# sys.path from growing by one duplicate entry every time this cell is re-run.
dataset_src_dir = "/home/doyooni303/experiments/LLMRec/ReLLMRec/src/dataset"
if dataset_src_dir not in sys.path:
    sys.path.append(dataset_src_dir)
from utils import data_partition

# Split the interaction data into train / valid / test parts.
train, valid, test = data_partition(fname, data_folder)

splitting data by user: 100%|██████████| 1188598/1188598 [00:02<00:00, 403198.23it/s]


In [12]:
# Number of entries in the training split (it is keyed by user id further down in this notebook).
len(train)

928205

In [13]:
import sys
sys.path.append("/home/doyooni303/experiments/LLMRec/ReLLMRec/src/dataset")
import torch
from torch.utils.data import Dataset, DataLoader
import json
import random
import numpy as np
from easydict import EasyDict as edict
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
from utils import data_partition

class AmazonDataset(Dataset):
    """Per-user prompt dataset built from a partitioned interaction file.

    Each sample belongs to one user and contains two tokenized text prompts
    (the user's own recent items and the item titles of similar users) plus
    the id and title of the target item, which is the last item of the user's
    sequence.

    Args:
        config: Mapping with the keys
            ``path`` (folder holding the data files),
            ``fname`` (file-name prefix of the data files),
            ``max_items`` (maximum number of history items put in a prompt),
            ``min_items`` (forwarded to ``data_partition``),
            ``model_name`` (identifier passed to ``AutoTokenizer.from_pretrained``),
            ``max_length`` (token length every prompt is padded/truncated to).
        flag: ``"valid"`` appends each user's validation entry to the training
            sequence, ``"test"`` appends the validation and the test entry.
            Any other value (including ``"train"`` and ``None``) uses the
            training sequences unchanged.
    """

    def __init__(self, config, flag: Optional[str] = None) -> None:
        self.config = config
        self.path = config['path']
        self.fname = config['fname']
        self.max_items = config['max_items']
        self.min_items = config['min_items']
        self.flag = flag
        self.tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
        # Pad with the EOS token; padding="max_length" in __getitem__ needs a pad token
        # (presumably the tokenizer does not define one itself -- TODO confirm).
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self._load_data()

    @staticmethod
    def _load_json(path: str, name: str) -> Dict:
        """Parse ``<path>/<name>`` as JSON, closing the file afterwards."""
        with open(os.path.join(path, name), "r") as handle:
            return json.load(handle)

    @staticmethod
    def _merge_splits(train: Dict, valid: Dict, test: Dict, flag: Optional[str]) -> Dict:
        """Extend the training sequences in place according to ``flag`` and return them.

        NOTE(review): ``valid[user_id]`` / ``test[user_id]`` are appended as single
        elements, so they are assumed to be one item id each -- confirm against
        ``data_partition``.
        """
        # The flag is checked once instead of once per user.
        if flag == "valid":
            for user_id, sequence in train.items():
                sequence.append(valid[user_id])
        elif flag == "test":
            for user_id, sequence in train.items():
                sequence.extend([valid[user_id], test[user_id]])
        return train

    def _load_data(self) -> None:
        """Load metadata, similar users and the per-user item sequences."""
        self.meta_name_dict = self._load_json(self.path, f"{self.fname}_meta_name_dict.json")
        self.similar_users = self._load_json(self.path, f"{self.fname}_similar_users.json")
        train, valid, test = data_partition(self.fname, self.path, self.min_items)
        self.data = self._merge_splits(train, valid, test, self.flag)
        # Positional dataset index -> user id, in the iteration order of self.data.
        self.usermap = dict(enumerate(self.data))

    def _get_item_metadata(self, meta_name_dict: dict, item_id: int) -> Dict:
        """Return ``{field: value}`` for one item; item ids are looked up as strings."""
        return {key: values[str(item_id)] for key, values in meta_name_dict.items()}

    def _get_item_text(self, item_id: int, metadata: dict) -> str:
        """Render one item as an ``<Item id>`` header followed by one ``key: value`` line per field."""
        item_text = f"<Item {item_id}>\n"
        item_text += "".join(f"{key}: {value}\n" for key, value in metadata.items())
        return item_text

    def _format_item_list_query(self, user_id: str, data: dict, meta_name_dict: dict) -> str:
        """Build the prompt that lists the user's most recent history items.

        The history is the (at most) ``max_items`` entries that precede the last
        element of the user's sequence; the last element is the target and is
        left out of the prompt.
        """
        query = f"Here is a item list of <user {user_id}> that shows the preference of <user {user_id}> in a time-order, so the last item is the most recent item.\n"
        query += "[Item List]\n"
        history_items = data[user_id][-(self.max_items+1):-1]
        for i, item_id in enumerate(history_items):
            metadata = self._get_item_metadata(meta_name_dict, item_id)
            # _get_item_text already ends with a newline, so entries are simply concatenated.
            query += f"{i}.\n{self._get_item_text(item_id, metadata)}"

        query += f"Please select the most related items based on the <user {user_id}>'s history item list."
        return query

    def _format_similar_users_query(self, user_id: str, data: dict, meta_name_dict: dict, similar_users: dict) -> str:
        """Build the prompt that lists similar users together with their item titles.

        ``similar_users`` is looked up with ``str(user_id)``; the first element of
        every entry is taken as the similar user's id. For each similar user the
        titles of the last ``max_items + 1`` items of their sequence are listed.
        """
        query = f"Here is a list of similar users of <user {user_id}> and their history item list. Items are represented with a title and the first user is the most similar user to <user {user_id}>.\n"
        query += "[Similar Users]\n"

        similar_user_list = [entry[0] for entry in similar_users[str(user_id)]]
        for i, sim_user_id in enumerate(similar_user_list):
            title_list = [meta_name_dict['title'][str(item_id)] for item_id in data[sim_user_id][-(self.max_items+1):]]
            # Well-formed "<user N>" tag, matching the tag used in the rest of the prompts.
            query += f"{i}. <user {sim_user_id}>: {title_list}\n"

        query += f"Please select the most related items based on the similar users of <user {user_id}>."
        return query

    def _tokenize(self, text: str):
        """Tokenize one prompt, padded/truncated to ``config['max_length']`` tokens."""
        return self.tokenizer(text, return_tensors="pt", padding="max_length", max_length=self.config['max_length'], truncation=True)

    def __len__(self) -> int:
        """Number of users in the dataset."""
        return len(self.data)

    def __getitem__(self, idx: int) -> Dict:
        """Return the tokenized prompts and the target item for the ``idx``-th user."""
        user_id = self.usermap[idx]
        user_item_query = self._format_item_list_query(user_id, self.data, self.meta_name_dict)
        similar_users_query = self._format_similar_users_query(user_id, self.data, self.meta_name_dict, self.similar_users)

        # The last element of the user's sequence is the prediction target.
        target_item_id = self.data[user_id][-1]
        target_item_title = self.meta_name_dict['title'][str(target_item_id)]

        tokenized_ui_query = self._tokenize(user_item_query)
        tokenized_su_query = self._tokenize(similar_users_query)

        # squeeze(0) drops only the leading batch dimension added by return_tensors="pt";
        # a bare squeeze() would also collapse the sequence dimension when it has size 1.
        return {
            'user_id': user_id,
            'ui_query_input_ids': tokenized_ui_query['input_ids'].squeeze(0),
            'ui_query_attention_mask': tokenized_ui_query['attention_mask'].squeeze(0),
            'su_query_input_ids': tokenized_su_query['input_ids'].squeeze(0),
            'su_query_attention_mask': tokenized_su_query['attention_mask'].squeeze(0),
            'target_item_id': target_item_id,
            'target_item_title': target_item_title
        }
         


In [14]:
# Settings consumed by AmazonDataset.
config = {
    # Folder that holds the data files and the file-name prefix they share.
    'path': "/home/doyooni303/experiments/LLMRec/data/amazon/Books",
    'fname': "Books",
    # Upper bound on the number of history items placed in a prompt.
    'max_items': 15,
    # Forwarded to data_partition as its third argument.
    'min_items': 5,
    # Identifier handed to AutoTokenizer.from_pretrained.
    'model_name': "meta-llama/Llama-3.1-8B-Instruct",
    # Every prompt is padded or truncated to this many tokens.
    'max_length': 4096,
}

In [15]:
# Build the training dataset and draw one shuffled batch of four samples as a smoke test.
trainset = AmazonDataset(config, flag="train")
trainloader = DataLoader(trainset, batch_size=4, shuffle=True)
next(iter(trainloader))

splitting data by user: 100%|██████████| 1188598/1188598 [00:03<00:00, 338983.72it/s]


{'user_id': tensor([393856,  71642, 355763,  48404]),
 'ui_query_input_ids': tensor([[128000,   8586,    374,  ..., 128009, 128009, 128009],
         [128000,   8586,    374,  ..., 128009, 128009, 128009],
         [128000,   8586,    374,  ..., 128009, 128009, 128009],
         [128000,   8586,    374,  ...,     17,    198,  20922]]),
 'ui_query_attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'su_query_input_ids': tensor([[128000,   8586,    374,  ..., 128009, 128009, 128009],
         [128000,   8586,    374,  ..., 128009, 128009, 128009],
         [128000,   8586,    374,  ..., 128009, 128009, 128009],
         [128000,   8586,    374,  ..., 128009, 128009, 128009]]),
 'su_query_attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'target_item_id': tensor([ 91337,  8577

In [16]:
# Build the validation dataset and draw one batch of four samples as a smoke test.
validset = AmazonDataset(config, flag="valid")
# The loader must wrap validset; the original wrapped trainset, so validation
# batches were silently drawn from the training data.
validloader = DataLoader(validset, batch_size=4, shuffle=True)
next(iter(validloader))

splitting data by user: 100%|██████████| 1188598/1188598 [00:04<00:00, 280163.39it/s]


{'user_id': tensor([461387, 110853, 218861, 422622]),
 'ui_query_input_ids': tensor([[128000,   8586,    374,  ..., 128009, 128009, 128009],
         [128000,   8586,    374,  ..., 128009, 128009, 128009],
         [128000,   8586,    374,  ..., 128009, 128009, 128009],
         [128000,   8586,    374,  ..., 128009, 128009, 128009]]),
 'ui_query_attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'su_query_input_ids': tensor([[128000,   8586,    374,  ..., 128009, 128009, 128009],
         [128000,   8586,    374,  ..., 128009, 128009, 128009],
         [128000,   8586,    374,  ..., 128009, 128009, 128009],
         [128000,   8586,    374,  ..., 128009, 128009, 128009]]),
 'su_query_attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'target_item_id': tensor([ 29515,  7075