In [None]:
import os
import json
import pandas as pd
import random
import numpy as np
import torch
from llm2vec import LLM2Vec
from tqdm import tqdm
import sys

sys.path.append("..")

from utils.data_util import *

In [None]:
def _kidney_texts(split_findings, split_impression, side, label):
    """Return the cleaned ``(findings, impression)`` texts for one kidney.

    Args:
        split_findings: Mapping of section name -> findings text (the
            ``'Report'`` part of the parsed translation).
        split_impression: Mapping of section name -> impression text (the
            ``'Diagnosis'`` part of the parsed translation).
        side: ``'Left'`` or ``'Right'``; selects the ``'<side> Kidney'`` key.
        label: Numeric class label for this kidney; ``0`` selects the
            "no abnormalities" placeholder when findings are empty.

    Returns:
        Tuple ``(findings, impression)`` after ``preprocess_text``.
    """
    key = f'{side} Kidney'
    raw_findings = split_findings.get(key)
    raw_impression = split_impression.get(key)

    # A JSON null value is parsed to None and would bypass a `.get(key, '')`
    # default, so missing and null entries are both normalised to ''.
    findings = preprocess_text('' if raw_findings is None else raw_findings)
    impression = preprocess_text('' if raw_impression is None else raw_impression)

    # Empty findings get a placeholder sentence chosen by the label.
    if len(findings) == 0:
        if label == 0:
            findings = f'No abnormalities in the {side.lower()} kidney.'
        else:
            findings = f'{side} kidney assessment unavailable.'

    return findings, impression


def load_text_data(df):
    """Build one text record per row of the report DataFrame.

    Reads the columns ``'医技号'`` (used as the patient id), ``'left_label'``,
    ``'right_label'`` and ``'拆分翻译'`` (a JSON string with ``'Report'`` and
    ``'Diagnosis'`` mappings keyed by ``'Left Kidney'`` / ``'Right Kidney'``).

    Args:
        df: DataFrame providing the columns listed above.

    Returns:
        List of dicts with keys ``patient_id``, ``left_kidney_findings``,
        ``left_kidney_impression``, ``left_label``, ``right_label``,
        ``right_kidney_findings`` and ``right_kidney_impression``.
    """
    records = []
    for row in df.itertuples():
        patient_id = getattr(row, '医技号')
        left_cls = float(getattr(row, 'left_label'))
        right_cls = float(getattr(row, 'right_label'))

        split_translation = json.loads(getattr(row, '拆分翻译'))
        split_findings = split_translation['Report']
        split_impression = split_translation['Diagnosis']

        left_kidney_findings, left_kidney_impression = _kidney_texts(
            split_findings, split_impression, 'Left', left_cls)
        right_kidney_findings, right_kidney_impression = _kidney_texts(
            split_findings, split_impression, 'Right', right_cls)

        records.append({
            'patient_id': patient_id,
            'left_kidney_findings': left_kidney_findings,
            'left_kidney_impression': left_kidney_impression,
            'left_label': left_cls,
            'right_label': right_cls,
            'right_kidney_findings': right_kidney_findings,
            'right_kidney_impression': right_kidney_impression,
        })

    return records

In [None]:
def _process_single_report(report, shuffle=False):
    """Clean one report string, optionally shuffling it before restoration.

    Args:
        report: Raw report text.
        shuffle: When true, the cleaned text is passed through
            ``sentence_shuffling`` before ``restore_special_cases``.

    Returns:
        The result of ``restore_special_cases`` applied to the cleaned
        (and optionally shuffled) text.
    """
    cleaned = preprocess_text(report)
    reordered = sentence_shuffling(cleaned) if shuffle else cleaned
    return restore_special_cases(reordered)

In [None]:
# Read the 'internal' sheet of the RenalCLIP workbook into a DataFrame.
file_path = r"/cpfs01/projects-SSD/cfff-bb5d866c17c2_SSD/public/RenalCLIP/text_files/RenalCLIP.xlsx"
df_internal = pd.read_excel(file_path, sheet_name='internal')

# Convert every row into a per-patient dict of left/right kidney texts and labels.
filenames = load_text_data(df_internal)

In [None]:
# Root directory holding the locally stored language models, plus the relative
# locations of the LLM2Vec base checkpoint and its PEFT adapter.
TEXT_PRETRAINED_DIR = r"/cpfs01/projects-HDD/cfff-bb5d866c17c2_HDD/taoyuhui/RenalCLIP/pretrained_models/language_family"
llm2vec_base_name = "hub/Meta-Llama-3-8B-Instruct-radiology-ext-long"
llm2vec_peft_name = "hub/Meta-Llama-3-8B-Instruct-radiology-simcse/checkpoint-1000"

llm2vec_base_path = os.path.join(TEXT_PRETRAINED_DIR, llm2vec_base_name)
llm2vec_peft_path = os.path.join(TEXT_PRETRAINED_DIR, llm2vec_peft_name)

# Use the GPU when one is visible, otherwise fall back to the CPU.
llm2vec_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the encoder from local files only, in bfloat16, with mean pooling and
# inputs limited to max_length=224.
l2v = LLM2Vec.from_pretrained(
    llm2vec_base_path,
    peft_model_name_or_path=llm2vec_peft_path,
    device_map=llm2vec_device,
    torch_dtype=torch.bfloat16,
    local_files_only=True,
    pooling_mode="mean",
    max_length=224,
)

In [None]:
# Encode every patient's left/right kidney report with LLM2Vec and store the
# embeddings as .npy files under <save_root>/<patient_id>/.
save_root = fr"/cpfs01/projects-SSD/cfff-bb5d866c17c2_SSD/public/RenalCLIP/llm2vec_features/radiology"
for info in tqdm(filenames):
    # IDs read from Excel may be numeric; os.path.join only accepts str, so
    # cast explicitly (the later 'internal_pretrain' cell does the same).
    patient_id = str(info["patient_id"])
    save_path = os.path.join(save_root, patient_id)

    # Variant 0 keeps the original text; variants 1-4 use shuffled text.
    # NOTE(review): no random seed is set in this notebook, so the shuffled
    # variants presumably differ between runs - confirm this is acceptable.
    for i in range(5):
        shuffle = i > 0

        left_kidney_findings = _process_single_report(info['left_kidney_findings'], shuffle)
        left_kidney_impression = _process_single_report(info['left_kidney_impression'], shuffle)
        right_kidney_findings = _process_single_report(info['right_kidney_findings'], shuffle)
        right_kidney_impression = _process_single_report(info['right_kidney_impression'], shuffle)

        # NOTE(review): 'Impression:' has no trailing space while 'Findings: '
        # does; kept as-is so the encoded prompt text stays unchanged.
        left_report_full = ''.join(
            ['Findings: ', left_kidney_findings, '\n', 'Impression:', left_kidney_impression])
        right_report_full = ''.join(
            ['Findings: ', right_kidney_findings, '\n', 'Impression:', right_kidney_impression])

        left_report_feature = l2v.encode([left_report_full], show_progress_bar=False).numpy()
        right_report_feature = l2v.encode([right_report_full], show_progress_bar=False).numpy()

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(save_path, exist_ok=True)
        np.save(os.path.join(save_path, f"left_kidney_llm2vec_{i}.npy"), left_report_feature)
        np.save(os.path.join(save_path, f"right_kidney_llm2vec_{i}.npy"), right_report_feature)
        
        

In [None]:
# Read the 'internal_pretrain' sheet of the 4.0 workbook and build the
# per-patient text records.
file_path = r"/cpfs01/projects-SSD/cfff-bb5d866c17c2_SSD/public/RenalCLIP/text_files/RenalCLIP_4.0.xlsx"
df_internal = pd.read_excel(file_path, sheet_name='internal_pretrain')

filenames = load_text_data(df_internal)

# Encode each patient's reports and store the embeddings under
# <save_root>/<patient_id>/, skipping patients whose output is complete.
save_root = fr"/cpfs01/projects-SSD/cfff-bb5d866c17c2_SSD/public/RenalCLIP/llm2vec_features/radiology"
num_variants = 5  # variant 0 = original text, variants 1-4 = shuffled text
for info in tqdm(filenames):
    patient_id = str(info["patient_id"])
    save_path = os.path.join(save_root, patient_id)

    # Skip only when every expected file is present. Testing merely for a
    # non-empty directory would permanently skip a patient whose earlier run
    # was interrupted after writing only some of the variants.
    expected_files = [
        os.path.join(save_path, f"{side}_kidney_llm2vec_{i}.npy")
        for i in range(num_variants)
        for side in ("left", "right")
    ]
    if all(os.path.exists(path) for path in expected_files):
        continue

    for i in range(num_variants):
        shuffle = i > 0

        left_kidney_findings = _process_single_report(info['left_kidney_findings'], shuffle)
        left_kidney_impression = _process_single_report(info['left_kidney_impression'], shuffle)
        right_kidney_findings = _process_single_report(info['right_kidney_findings'], shuffle)
        right_kidney_impression = _process_single_report(info['right_kidney_impression'], shuffle)

        left_report_full = ''.join(
            ['Findings: ', left_kidney_findings, '\n', 'Impression:', left_kidney_impression])
        right_report_full = ''.join(
            ['Findings: ', right_kidney_findings, '\n', 'Impression:', right_kidney_impression])

        left_report_feature = l2v.encode([left_report_full], show_progress_bar=False).numpy()
        right_report_feature = l2v.encode([right_report_full], show_progress_bar=False).numpy()

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(save_path, exist_ok=True)

        np.save(os.path.join(save_path, f"left_kidney_llm2vec_{i}.npy"), left_report_feature)
        np.save(os.path.join(save_path, f"right_kidney_llm2vec_{i}.npy"), right_report_feature)