# Dataset Setup

In [None]:
import os
import json
import pandas as pd
import random
import numpy as np
from tqdm import tqdm
import argparse
import sys

# Make the project's top-level packages importable from this notebook.
# NOTE: '__file__' is a string literal here, not the __file__ variable, so
# abspath() resolves it against the current working directory; current_dir is
# therefore the working directory (presumably because notebooks do not define
# __file__ -- confirm the notebook is launched from its own folder).
current_dir = os.path.dirname(os.path.abspath('__file__'))
# The parent of the working directory is treated as the project root.
project_root = os.path.dirname(current_dir)
# Position 0 gives the project root priority over installed packages when
# resolving the `utils` and `datasets` imports below.
sys.path.insert(0, project_root)

from utils.parser import get_report_generation_args
# Replace the process arguments before building `args`.
# NOTE(review): presumably get_report_generation_args() parses sys.argv and
# would otherwise see the notebook kernel's own flags -- confirm in utils.parser.
sys.argv = ['run.py']
args = get_report_generation_args()
# Override the pre-processing variant for this notebook run.
args.pre_processed_type = "one_kidney_tc_v3"

from datasets.data_loader_RenalCLIP_retrieval import ImageCaptionDataset
from torch.utils.data import DataLoader

# Set for the whole process before any tokenizer is used.
# NOTE(review): presumably meant to avoid Hugging Face tokenizers' parallelism
# warning when DataLoader workers are used -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Precompute and save LLM2Vec text embeddings offline

In [None]:
import torch
from llm2vec import LLM2Vec
from tqdm import tqdm

In [None]:
# Root directory holding the locally stored language-model checkpoints; the
# two names below are paths relative to it.
TEXT_PRETRAINED_DIR = r"/cpfs01/projects-HDD/cfff-bb5d866c17c2_HDD/taoyuhui/RenalCLIP/pretrained_models/language_family"
llm2vec_base_name = "hub/Meta-Llama-3-8B-Instruct-radiology-ext-long"
llm2vec_peft_name = "hub/Meta-Llama-3-8B-Instruct-radiology-simcse/checkpoint-1000"

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    _l2v_device = torch.device('cuda')
else:
    _l2v_device = torch.device('cpu')

# Keyword options forwarded unchanged to LLM2Vec.from_pretrained.
_l2v_load_options = {
    "peft_model_name_or_path": os.path.join(TEXT_PRETRAINED_DIR, llm2vec_peft_name),
    "device_map": _l2v_device,
    "torch_dtype": torch.bfloat16,
    "local_files_only": True,
    "pooling_mode": "mean",
    "max_length": 224,
}

# Text encoder: base checkpoint plus the PEFT checkpoint named above.
l2v = LLM2Vec.from_pretrained(
    os.path.join(TEXT_PRETRAINED_DIR, llm2vec_base_name),
    **_l2v_load_options,
)

In [None]:
# Export one L2-normalised caption embedding per patient.
# Output layout: <TEXT_EMBEDDINGS_DIR>/<patient_id>/text_embedding.npy, each
# file holding an array with a leading axis of length 1.
TEXT_EMBEDDINGS_DIR = r"/cpfs01/projects-SSD/cfff-bb5d866c17c2_SSD/public/RenalCLIP/retrieval_text_embeddings/llm2vec-rad"
os.makedirs(TEXT_EMBEDDINGS_DIR, exist_ok=True)

hospitals = ["internal", "瑞金", "山东", "张掖", "厦门"]
for hospital in hospitals:
    dataset = ImageCaptionDataset(args, hospital=hospital)
    # shuffle=False and drop_last=False: iterate the dataset in order and keep
    # the final partial batch, so no sample is skipped.
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=8, drop_last=False)
    # The first element of each batch is not needed for text embeddings.
    for _, captions, patient_ids in tqdm(dataloader):

        text_embeddings = l2v.encode(captions, show_progress_bar=False).float()
        # L2-normalise along the last axis. F.normalize clamps the norm at a
        # small eps, so an all-zero embedding stays zero instead of becoming
        # NaN; rows with a non-zero norm get the same result as x / ||x||.
        text_embeddings = torch.nn.functional.normalize(text_embeddings, p=2, dim=-1)
        text_embeddings = text_embeddings.numpy()
        for text_embedding, patient_id in zip(text_embeddings, patient_ids):
            # Add a leading axis of length 1 before saving.
            text_embedding = np.expand_dims(text_embedding, axis=0)

            # NOTE(review): folders are keyed by patient_id only, so identical
            # ids from different hospitals (or repeated ids within one) would
            # overwrite each other -- confirm ids are globally unique.
            save_dir = os.path.join(TEXT_EMBEDDINGS_DIR, patient_id)
            os.makedirs(save_dir, exist_ok=True)
            embedding_file = os.path.join(save_dir, "text_embedding.npy")
            np.save(embedding_file, text_embedding)