In [1]:
import os, sys

sys.path.append("/home/doyooni303/experiments/LLMRec/ReLLMRec")
import argparse
import yaml
import hashlib
import torch
from torch.optim import AdamW

from src.models import CandiRec
from src.train import Trainer
from src.utils import set_seed

In [2]:
# Run from the project root so relative paths used later (e.g. "./configs") resolve against it.
# NOTE(review): this absolute path is machine-specific — adjust it when running elsewhere.
os.chdir("/home/doyooni303/experiments/LLMRec/ReLLMRec")

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Command-line style options. They are parsed from an empty argument list below,
# so inside the notebook every option takes the default declared here.
parser = argparse.ArgumentParser()
parser.add_argument("--fname", type=str, default="Books")
parser.add_argument("--gpu", type=int, default=0)
parser.add_argument("--top_k", type=int, default=10)
parser.add_argument("--batch_size", type=int, default=2)
parser.add_argument("--max_epochs", type=int, default=10)
parser.add_argument("--max_steps", type=int, default=100)
parser.add_argument("--train_mode", type=str, default="epochs")

args = parser.parse_args([])

# Load the per-dataset YAML config; the context manager guarantees the file
# handle is closed even if parsing fails.
yaml_path = os.path.join("./configs", f"{args.fname}.yaml")
with open(yaml_path, "r") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Parsed arguments take precedence over same-named keys from the YAML file.
config.update(vars(args))

# Each training mode is budgeted by a different config key. Fail loudly on an
# unknown mode instead of hitting a NameError on `name` further down.
budget_keys = {"epochs": "max_epochs", "steps": "max_steps"}
train_mode = config["train_mode"]
if train_mode not in budget_keys:
    raise ValueError(
        f"Unsupported train_mode {train_mode!r}; expected one of {sorted(budget_keys)}"
    )
budget_key = budget_keys[train_mode]

# Run name, e.g. "top_k_10-max_epochs_10"; also used as the output sub-directory.
name = "-".join(f"{key}_{config[key]}" for key in ("top_k", budget_key))

# Evaluate roughly ten times per run. Clamp to 1 so a budget below 10 does not
# yield an interval of 0.
# NOTE(review): assumes the Trainer treats eval_interval as a positive period — confirm.
config["eval_interval"] = max(1, config[budget_key] // 10)

save_path = os.path.join(config["output"], config["fname"], name)
os.makedirs(save_path, exist_ok=True)
config["save_path"] = save_path

In [4]:
# Apply the configured seed (via the project's set_seed helper) before the
# model is constructed.
set_seed(config["seed"])

# Use the configured GPU index when CUDA is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device(f'cuda:{config["gpu"]}')
else:
    device = torch.device("cpu")

# Build the model and move it to the target device as float16.
model = CandiRec(config)
model = model.to(device=device, dtype=torch.float16)

# The learning rate is cast explicitly in case the config stores it as a string.
optimizer = AdamW(model.parameters(), lr=float(config["lr"]))

trainer = Trainer(model, optimizer, config)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

splitting data by user: 100%|██████████| 1188598/1188598 [00:03<00:00, 343013.97it/s]
splitting data by user: 100%|██████████| 1188598/1188598 [00:03<00:00, 347540.70it/s]


### Skip (one-off extraction of the last training batch into `last_batch.json`)

In [5]:
from tqdm import tqdm

# Walk the training loader and stop on its final batch, so that `batch` (and `i`)
# still refer to that last batch once the loop exits.
num_batches = len(trainer.trainloader)
for i, batch in tqdm(enumerate(trainer.trainloader), total=num_batches):
    if i + 1 == num_batches:
        break

100%|█████████▉| 464102/464103 [29:30<00:00, 262.19it/s] 


In [10]:
# Split `batch` into two views: the first two entries of every field
# (`normal_batch`) and the final entry of every field (`last_batch`).
# Only fields whose key mentions "input_ids" or "attention_mask" are moved to
# the device; every other field is sliced and stored as-is.
normal_batch, last_batch = {}, {}
for field, value in batch.items():
    head = value[:2]
    if ("input_ids" in field) or ("attention_mask" in field):
        normal_batch[field] = head.to(device)
        last_batch[field] = value[-1,].to(device)
        continue
    normal_batch[field] = head
    last_batch[field] = value[-1]

In [9]:
# Replace every tensor-valued field of `batch` with a copy on the target device;
# non-tensor fields are left untouched.
batch.update(
    {
        field: value.to(device)
        for field, value in batch.items()
        if isinstance(value, torch.Tensor)
    }
)

In [12]:
import json

# JSON cannot encode tensors, so convert each tensor field to nested Python
# lists; non-tensor fields are stored unchanged.
# detach() lets this work on tensors that require grad, and Tensor.tolist()
# removes the need to go through NumPy.
dict_ = {
    k: v.detach().cpu().tolist() if isinstance(v, torch.Tensor) else v
    for k, v in batch.items()
}

# Write inside a context manager so the file is flushed and closed
# deterministically, even if serialisation raises.
with open(
    "/home/doyooni303/experiments/LLMRec/ReLLMRec/notebook/last_batch.json", "w"
) as json_file:
    json.dump(dict_, json_file)

### Load (cached last batch from `last_batch.json`)

In [5]:
import json

# Read the cached batch back; the context manager closes the file handle.
with open(
    "/home/doyooni303/experiments/LLMRec/ReLLMRec/notebook/last_batch.json", "r"
) as json_file:
    batch = json.load(json_file)

# JSON stores tensors as nested lists, so rebuild tensors on the target device
# for the fields whose key matches one of the names below; other fields keep
# the plain Python values produced by json.load.
# Reassigning existing keys while iterating is safe because the dict's size
# never changes.
for k, v in batch.items():
    if ('input_ids' in k) or ('attention_mask' in k) or ('target_item_id' in k):
        batch[k] = torch.tensor(v).to(device)
    

In [6]:
# Put the model in training mode, then run a single forward pass on `batch`.
model.train()
# The call's result is unpacked into two values; the first is discarded and the
# second is kept as `loss1`.
# NOTE(review): presumably the second value is the training loss — confirm against CandiRec.forward.
_, loss1 = model(batch)