In [1]:
import os,sys
sys.path.append("/home/doyooni303/experiments/LLMRec/ReLLMRec")
import json
from collections import defaultdict
from tqdm import tqdm
from src.utils import open_jsonl


In [2]:
# Explicitly set TOKENIZERS_PARALLELISM for this process (the huggingface/tokenizers
# library reads this variable; see its fork warning for the accepted values).
os.environ.update(TOKENIZERS_PARALLELISM="true")

In [2]:
import json
folder = "/home/doyooni303/experiments/LLMRec/data/amazon/Books"
fname = "Books"


def _load_json(path):
    """Parse the JSON file at ``path`` and return the resulting object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes instead of being left to the garbage collector.
    """
    with open(path, "r", encoding="utf-8") as fp:
        return json.load(fp)


# itemmap is iterated later as (item id, index) pairs.
itemmap = _load_json(os.path.join(folder, f"{fname}_itemmap.json"))
# meta_name_dict is read later as meta_name_dict[field][str(index)].
meta_name_dict = _load_json(os.path.join(folder, f"{fname}_meta_name_dict.json"))

In [3]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

# Pick the compute device: the second GPU when the host has more than one,
# otherwise the first GPU, otherwise the CPU.
if torch.cuda.is_available():
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f"cuda:{gpu_index}")
    # set_device only accepts CUDA devices, so it is confined to this branch;
    # calling it with a CPU device raises.
    torch.cuda.set_device(device)
else:
    device = torch.device("cpu")
print(device)

# Load the Longformer checkpoint and its tokenizer, then switch to inference mode.
model_name = "allenai/longformer-base-4096"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()

cuda:1


LongformerModel(
  (embeddings): LongformerEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (position_embeddings): Embedding(4098, 768, padding_idx=1)
  )
  (encoder): LongformerEncoder(
    (layer): ModuleList(
      (0-11): 12 x LongformerLayer(
        (attention): LongformerAttention(
          (self): LongformerSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (query_global): Linear(in_features=768, out_features=768, bias=True)
            (key_global): Linear(in_features=768, out_features=768, bias=True)
            (value_global): Linear(in_features=768, out_features=768, bias=True)
          )
    

In [5]:
# Debug cap on how many items are turned into text; set to None to use every item.
MAX_ITEMS = 1000

iidx_list, text_list = [], []
meta_keys = list(meta_name_dict.keys())
n_items = len(itemmap) if MAX_ITEMS is None else min(len(itemmap), MAX_ITEMS)
for i, (iid, idx) in enumerate(tqdm(itemmap.items(), total=n_items)):
    # Check the cap before appending so exactly n_items entries are collected
    # (checking after the append would keep one extra item).
    if i >= n_items:
        break
    # One "FIELD: value" fragment per metadata field, joined with ", ".
    item_text = ", ".join(f"{key.upper()}: {meta_name_dict[key][str(idx)]}" for key in meta_keys)
    iidx_list.append(idx)
    text_list.append(item_text)

1000it [00:00, 30371.50it/s]


In [5]:
# Collect the untruncated, unpadded token count of every text, tokenizing
# 1000 texts per call.
lengths = []
for start in tqdm(range(0, len(text_list), 1000)):
    stop = start + 1000
    encoded = tokenizer(text_list[start:stop], padding=False, truncation=False)
    lengths.extend(len(ids) for ids in encoded['input_ids'])

100%|██████████| 970/970 [03:37<00:00,  4.46it/s]


In [6]:
import numpy as np
# 99.7th percentile of the token counts gathered in `lengths`.
np.percentile(lengths, q=99.7) # roughly 99.7% of the texts are 1925 tokens or fewer

1925.0

In [None]:
torch.random.manual_seed(303)

dim = 768
# The list starts with one random vector of size `dim`; every later entry is
# the pooled output of one batch. ParameterList expects a flat iterable of
# parameters, so the initial Parameter is passed without extra nesting.
embeddings = nn.ParameterList([nn.Parameter(torch.randn(dim))])

batch_size = 16
max_length = 2048

# Inference only: no_grad stops autograd from recording the forward pass,
# which would otherwise hold on to intermediate state for every batch.
with torch.no_grad():
    for i in tqdm(range(0, len(iidx_list), batch_size)):
        batch_texts = text_list[i:i+batch_size]

        # Every text is padded/truncated to exactly max_length tokens.
        inputs = tokenizer(batch_texts, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length)
        outputs = model(**inputs.to(device)).pooler_output.detach().cpu()
        embeddings.append(nn.Parameter(outputs))
        torch.cuda.empty_cache()



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

class TextDataset(Dataset):
    """Dataset that tokenizes one text on demand and pairs it with its index.

    Each sample is a dict with ``input_ids`` and ``attention_mask`` (the
    tokenizer output with its leading batch axis squeezed away) and ``idx``
    (the matching entry of ``idx_list`` wrapped in a tensor).
    """

    def __init__(self, text_list, idx_list, tokenizer, max_length):
        self.texts, self.idxs = text_list, idx_list
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        raw_text, item_idx = self.texts[i], self.idxs[i]

        # Tokenize lazily, padding/truncating to the configured length.
        encoded = self.tokenizer(
            raw_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Drop the leading batch axis; the DataLoader re-adds one when batching.
        sample = {field: encoded[field].squeeze(0) for field in ('input_ids', 'attention_mask')}
        sample['idx'] = torch.tensor(item_idx)
        return sample

def collate_fn(batch):
    """Stack per-sample tensors into batch tensors.

    ``batch`` is a list of dicts carrying ``input_ids``, ``attention_mask`` and
    ``idx``; the result holds the stacked tensors under ``input_ids``,
    ``attention_mask`` and ``idxs``.
    """
    def _stack(field):
        return torch.stack([sample[field] for sample in batch])

    return {
        'input_ids': _stack('input_ids'),
        'attention_mask': _stack('attention_mask'),
        'idxs': _stack('idx'),
    }

# Fix the RNG state
torch.random.manual_seed(303)

# Output table: one 768-wide row per index, with one more row than there are texts
dim = 768
all_embeddings = torch.zeros(len(text_list) + 1, dim)

# Wrap the texts in a dataset and read it in order, 128 samples per batch
dataset = TextDataset(text_list, iidx_list, tokenizer, max_length=2048)
loader_options = dict(
    batch_size=128,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    collate_fn=collate_fn,
)
dataloader = DataLoader(dataset, **loader_options)

# Inference mode for the model
model.eval()

# Forward every batch without autograd and scatter the pooled outputs by index
with torch.no_grad():
    for batch in tqdm(dataloader):
        # Only the model inputs go to the device; the indices are used as-is below
        inputs = {name: batch[name].to(device) for name in ('input_ids', 'attention_mask')}

        outputs = model(**inputs).pooler_output

        # Row position comes from each sample's idx, not from batch order
        all_embeddings[batch['idxs']] = outputs.cpu()

# Wrap the filled table as a Parameter
embeddings = nn.Parameter(all_embeddings)

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.cuda.amp import autocast
import gc

class TextDataset(Dataset):
    """Dataset that tokenizes every text once, up front, and serves tensor rows.

    Args:
        text_list: Texts to tokenize in a single tokenizer call.
        idx_list: Integer index for each text, in the same order as ``text_list``.
        tokenizer: Callable invoked as ``tokenizer(text_list, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` whose result
            is indexed by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every text is padded/truncated to. Defaults to 1925.

    Raises:
        ValueError: If ``text_list`` and ``idx_list`` differ in length.
    """

    def __init__(self, text_list, idx_list, tokenizer, max_length=1925):
        # A mismatch would silently pair texts with the wrong indices.
        if len(text_list) != len(idx_list):
            raise ValueError(
                f"text_list has {len(text_list)} entries but idx_list has {len(idx_list)}"
            )
        print("Pre-tokenizing all texts...")
        self.encodings = tokenizer(
            text_list,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )
        self.idxs = torch.tensor(idx_list, dtype=torch.long)
        
    def __len__(self):
        return len(self.idxs)
    
    def __getitem__(self, i):
        # Row i of the pre-computed encodings plus the matching index.
        return {
            'input_ids': self.encodings['input_ids'][i],
            'attention_mask': self.encodings['attention_mask'][i],
            'idx': self.idxs[i]
        }

def process_large_dataset(text_list, iidx_list, model, tokenizer, device, 
                         batch_size=128, chunk_size=50000):
    """Embed every text with ``model`` and return the pooled outputs as one table.

    Texts are processed in chunks of ``chunk_size``; each chunk is wrapped in a
    ``TextDataset`` and fed through a ``DataLoader``. The ``pooler_output`` of
    each sample is written to the row given by its entry in ``iidx_list``.

    Args:
        text_list: Texts to embed.
        iidx_list: Integer row index for each text, aligned with ``text_list``.
        model: Model whose forward output exposes ``pooler_output``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        device: Device (``torch.device`` or string) the inputs are moved to.
        batch_size: Number of samples per forward pass.
        chunk_size: Number of texts tokenized and held in memory at once.

    Returns:
        ``nn.Parameter`` of dtype float32. It has ``len(text_list)`` rows, or
        ``max(iidx_list) + 1`` rows when an index would not otherwise fit.
        Rows whose index never appears in ``iidx_list`` stay zero. Values are
        accumulated in float16 and cast to float32 at the end.

    Raises:
        ValueError: If ``text_list`` and ``iidx_list`` differ in length.
    """
    import contextlib

    if len(text_list) != len(iidx_list):
        raise ValueError(
            f"text_list has {len(text_list)} entries but iidx_list has {len(iidx_list)}"
        )

    total_size = len(text_list)
    # Take the width from the model config when it has one; 768 is the fallback.
    dim = getattr(getattr(model, "config", None), "hidden_size", 768)
    # Rows are addressed by item index, so the table must reach the largest
    # index even when indices are 1-based or not contiguous.
    num_rows = max(total_size, max((int(idx) for idx in iidx_list), default=-1) + 1)
    # Accumulate in float16 to halve the host memory of the table.
    all_embeddings = torch.zeros((num_rows, dim), dtype=torch.float16)

    use_cuda = torch.device(device).type == "cuda"
    num_chunks = (total_size - 1) // chunk_size + 1
    
    for chunk_start in range(0, total_size, chunk_size):
        chunk_end = min(chunk_start + chunk_size, total_size)
        
        print(f"Processing chunk {chunk_start//chunk_size + 1} of {num_chunks}")
        
        chunk_dataset = TextDataset(
            text_list[chunk_start:chunk_end],
            iidx_list[chunk_start:chunk_end],
            tokenizer
        )
        
        dataloader = DataLoader(
            chunk_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=4,
            # Pinned host memory only pays off for host-to-GPU copies.
            pin_memory=use_cuda
        )
        
        # Mixed precision applies to CUDA only; elsewhere run at full precision.
        if use_cuda:
            amp_context = torch.autocast(device_type="cuda", dtype=torch.float16)
        else:
            amp_context = contextlib.nullcontext()

        model.eval()
        with torch.no_grad(), amp_context:
            for batch in tqdm(dataloader):
                inputs = {
                    'input_ids': batch['input_ids'].to(device, non_blocking=True),
                    'attention_mask': batch['attention_mask'].to(device, non_blocking=True)
                }
                
                outputs = model(**inputs).pooler_output
                # Cast explicitly: indexed assignment needs matching dtypes, and
                # the outputs are float32 whenever autocast is inactive.
                all_embeddings[batch['idx']] = outputs.to(device="cpu", dtype=all_embeddings.dtype)
        
        # Release the chunk's tokenized tensors before the next chunk is built.
        del chunk_dataset, dataloader
        gc.collect()
        if use_cuda:
            torch.cuda.empty_cache()
    
    # Hand back float32 values.
    all_embeddings = all_embeddings.float()
    return nn.Parameter(all_embeddings)

# Main execution: seed the RNG and set the backend flags before embedding.
torch.random.manual_seed(303)
torch.backends.cudnn.benchmark = True
for tf32_backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
    tf32_backend.allow_tf32 = True

# Embed all texts; the result is the Parameter returned by process_large_dataset.
embeddings = process_large_dataset(
    text_list,
    iidx_list,
    model,
    tokenizer,
    device,
    batch_size=128,
    chunk_size=50000,
)

Processing chunk 1 of 1
Pre-tokenizing all texts...


  0%|          | 0/8 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling pa

In [8]:
import torch
import torch.nn as nn

# Read the saved item-embedding object back from disk and show its type and shape.
emb_path = "/home/doyooni303/experiments/LLMRec/data/amazon/Books/Books_item_embeddings.pt"
emb = torch.load(emb_path)
(type(emb), emb.shape)

(torch.nn.parameter.Parameter, torch.Size([969146, 768]))