In [18]:
import json
import random
import pandas as pd

dataset = "Chest-X-ray"

# convert csv to jsonl
data = pd.read_csv(f"../data/eval/{dataset}.csv")

# 将DataFrame转换为字典列表
data_list = data.to_dict(orient='records')

# 固定随机种子并打乱数据
random.seed(1234)
random.shuffle(data_list)

# 计算切分点，80%用于训练，10%用于验证，10%用于测试
train_split = int(len(data_list) * 0.8)
valid_split = int(len(data_list) * 0.9)  # 0.8 + 0.1 = 0.9

new_data = []
for idx, item in enumerate(data_list):
    # 提取img_path和Finding Label
    img_path = item.get("img_path")
    finding_label = item.get("Finding Label").lower()

    if isinstance(finding_label, str):
        finding_label = finding_label.replace('|', ', ')
        
    # 创建新的数据项
    new_item = {
        "image": img_path,
        "label": finding_label,
        "split": "train" if idx < train_split else ("valid" if idx < valid_split else "test")
    }
    
    # 添加到new_data列表中
    new_data.append(new_item)

# 打乱数据
random.shuffle(new_data)

# 将数据保存为JSONL文件
jsonl_file = f"../data/eval/{dataset}.jsonl"
with open(jsonl_file, mode='w', encoding='utf-8') as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")

print(f"数据已成功保存到 {jsonl_file} 文件中。")


数据已成功保存到 ../data/eval/Chest-X-ray.jsonl 文件中。


In [19]:
import json
import random
import pandas as pd

dataset = "Chest-X-ray"
labels = set()

jsonl_file = f"../../data/eval/{dataset}.jsonl"
with open(jsonl_file, mode='w', encoding='utf-8') as f:
    for item in new_data:
        label = item.get("label")
        if label:  # 如果 label 存在
            split_labels = label.split(",")  # 根据逗号拆分标签
            split_labels = [lbl.strip() for lbl in split_labels]  # 去掉每个类别的前后空格
            labels.update(split_labels)  # 更新到集合中

# 将集合转换为列表并保存为 JSON 文件
labels_list = list(labels)
output_file = f"../../data/eval/{dataset}_classes.json"

with open(output_file, mode='w', encoding='utf-8') as f:
    json.dump(labels_list, f, ensure_ascii=False, indent=4)

print(f"Labels saved to {output_file}")

FileNotFoundError: [Errno 2] No such file or directory: '../../data/eval/Chest-X-ray.jsonl'

In [None]:
import json
import random

dataset = "Chest-X-ray"

# 读取 JSONL 文件数据
data = [json.loads(line) for line in open(f"../data/eval/{dataset}.jsonl")]

disease_list = [
    "fibrosis", "edema", "pneumothorax", "cardiomegaly", "atelectasis", "nodule",
    "emphysema", "no finding", "mass", "pleural_thickening", "effusion", "infiltration",
    "pneumonia", "hernia", "consolidation"
]

label_mapping = {disease: chr(65 + i) for i, disease in enumerate(disease_list)}

new_data = []
for idx, item in enumerate(data):
    # 处理 item["label"]，确保是正确的格式
    labels = [label.strip().lower() for label in item["label"].split(",")]  # 处理逗号分隔的字符串
    mapped_labels = ",".join([label_mapping[disease] for disease in labels if disease in label_mapping])
    
    new_item = {
        "image": item["image"].replace("/srv/lby/", ""),  # 修改路径
        "text": "What disease is indicated by the chest X-ray?\nA. fibrosis\nB. edema\nC. pneumothorax\nD. cardiomegaly\nE. atelectasis\nF. nodule\nG. emphysema\nH. no finding\nI. mass\nJ. pleural_thickening\nK. effusion\nL. infiltration\nM. pneumonia\nN. hernia\nO. consolidation\nChoose A, B, C, D, E, F, G, H, I, J, K, L, M, N, or O.",
        "category": "conv",
        "label": mapped_labels,
        "question_id": f'{idx}-{mapped_labels}',
    }
    new_data.append(new_item)

# 打乱数据
random.shuffle(new_data)

# 保存处理后的数据为新的JSONL文件
jsonl_file = f"../data/chest_xray/{dataset}_llava_origin_val.jsonl"
with open(jsonl_file, mode='w', encoding='utf-8') as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")

print(f"数据已成功保存到 {jsonl_file} 文件中。")


数据已成功保存到 ../data/chest_xray/Chest-X-ray_llava_origin_val.jsonl 文件中。


In [9]:
import json
import pandas as pd
import numpy as np
from PIL import Image
from torchvision import transforms
import os

# Initialize image paths
img_root = '/srv/lby/'

# Define the disease label categories
disease_labels = [
    'no finding', 'enlarged cardiomediastinum', 'cardiomegaly', 
    'lung opacity', 'lung lesion', 'edema', 'consolidation', 
    'pneumonia', 'atelectasis', 'pneumothorax', 'pleural effusion', 
    'pleural other', 'fracture', 'support devices'
]

# Read CSV data
csv_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/chexpertv1/filter_test_labels.csv'  # Update with your actual CSV path
data_info = pd.read_csv(csv_path)

# Prepare an empty list to store the new data items
new_data = []

# Loop through each row in the CSV file
for idx, row in data_info.iterrows():
    img_path = img_root + str(row[0])  # Assuming the first column is the image filename
    # Get the labels (multi-class)
    class_label = row[1:].values  # Assuming labels are in the columns starting from index 1
    disease_label = {disease_labels[i]: class_label[i] for i in range(len(disease_labels))}
    
    # Fixed category for all samples
    category = "conv"

    # Generate the text question for the image
    # text = "What disease is indicated by the chest X-ray?\nA. no finding\nB. enlarged cardiomediastinum\nC. cardiomegaly\nD. lung opacity\nE. lung lesion\nF. edema\nG. consolidation\nH. pneumonia\nI. atelectasis\nJ. pneumothorax\nK. pleural effusion\nL. pleural other\nM. fracture\nN. support devices\nChoose A, B, C, D, E, F, G, H, I, J, K, L, M, or N."
    question = "What disease is indicated by the chest X-ray? Choose the relevant disease from the given list: ['no finding', 'enlarged cardiomediastinum', 'cardiomegaly', 'lung opacity', 'lung lesion', 'edema', 'consolidation', 'pneumonia', 'atelectasis', 'pneumothorax', 'pleural effusion', 'pleural other', 'fracture', 'support devices'] Output only disease for the given list."
    # build the chat-style record
    new_item = {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": str(row[0]),
            },
            {
                "type": "text",
                "text": question,
            },
        ],
        "label": disease_label
    }

    new_data.append(new_item)


# Save the data to a JSONL file
output_path = '/home/lby/qwen_radz/Qwen2.5-VL/qwen-vl-finetune/eval_data/chexpert/chexpert_llava_origin_val.jsonl'  # Update with the desired output file path
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")  # Write each item as a separate line

print(f"Data saved to {output_path}")


Data saved to /home/lby/qwen_radz/Qwen2.5-VL/qwen-vl-finetune/eval_data/chexpert/chexpert_llava_origin_val.jsonl


  img_path = img_root + str(row[0])  # Assuming the first column is the image filename
  "image": str(row[0]),


In [None]:
import os
import json
import pandas as pd
from PIL import Image
import torch
from torchvision import transforms

img_root = '/srv/lby/COVIDx_CXR-4/'

disease_labels = ['covid19','non-covid19']
label_mapping = {disease: chr(65 + i) for i, disease in enumerate(disease_labels)}
# Read CSV data
csv_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/covidx-cxr2/test_split.csv'  # Update with the actual path
data_info = pd.read_csv(csv_path, header=None, names=["id", "image_path", "class", "source"], delimiter=' ')

# Prepare an empty list to store the new data items
new_data = []

# Loop through each row in the CSV file
for idx, row in data_info.iterrows():
    img_path = 'COVIDx_CXR-4/test/' + str(row[1])   # Assuming 2nd column has the image filename
    class_label = row[2]   # Assuming labels are in the columns starting from index 1

    mapped_labels = label_mapping["covid19"] if class_label == "positive" else label_mapping["non-covid19"]

    
    # Generate the text question for the image
    text = "What disease is indicated by the chest X-ray? Choose covid19 or non-covid19."

    # Create the new JSON-like item
    new_item = {
        "image": img_path,  # Convert tensor to list to store in JSON
        "text": text,
        "category": "conv",  # Fixed category for all samples
        "label": mapped_labels,
    }

    # Append the new item to the list
    new_data.append(new_item)

# Save the data to a JSON file
output_path = '../data/COVIDx_CXR/COVIDx_CXR_llava_origin_val.jsonl'  # Update with the desired output file path
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")  # Write each item as a separate line

print(f"Data saved to {output_path}")

  img_path = 'COVIDx_CXR-4/test/' + str(row[1])   # Assuming 2nd column has the image filename
  class_label = row[2]   # Assuming labels are in the columns starting from index 1


Data saved to ../data/COVIDx_CXR/COVIDx_CXR_llava_origin_val.jsonl


In [None]:
import json
import pandas as pd
import numpy as np
from PIL import Image
from torchvision import transforms
import torch
import os


# Read CSV data
csv_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/rsna/new_test.csv'  # Update with the actual path
data_info = pd.read_csv(csv_path)

# Prepare an empty list to store the new data items
new_data = []
disease_labels = ['pneumonia','normal']
# Loop through each row in the CSV file
for idx, row in data_info.iterrows():
    img_path = str(row[1]).replace("/srv/lby/", "")

    # Open and transform the image
    class_label = row[3]  # 获取标签
    # disease_label = {"pneumonia": class_label}  # 设置疾病标签为字典
    mapped_labels = label_mapping["pneumonia"] if class_label == 1 else label_mapping["normal"]
    # Fixed category for all samples
    category = "conv"

    # Generate the text question for the image
    question = "What disease is indicated by the chest X-ray? Choose pneumonia or normal. Only output the required result. Do not include any additional text."
    # text = "What disease is indicated by the chest X-ray?\nA. pneumonia\nB. normal\nChoose A or B."

    # build the chat-style record
    new_item = {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": img_path,
            },
            {
                "type": "text",
                "text": question,
            },
        ],
        "label": mapped_labels
    }

    new_data.append(new_item)

# Save the data to a JSON file
output_path = '/home/lby/qwen_radz/Qwen2.5-VL/qwen-vl-finetune/eval_data/rsna/rsna_pneumonia_llava_origin_val.jsonl'  # Update with the desired output file path
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")  # Write each item as a separate line

print(f"Data saved to {output_path}")


  img_path = str(row[1]).replace("/srv/lby/", "")
  class_label = row[3]  # 获取标签


NameError: name 'label_mapping' is not defined

In [None]:
import os
import json
import pandas as pd
from PIL import Image
import torch
from torchvision import transforms

img_root = '/srv/lby/SIIM_Pneumothorax/processed-images-train/'
seg_root = '/srv/lby/SIIM_Pneumothorax/train_mask/'

# Define transforms for image and segmentation map
normalize = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
transform = transforms.Compose([
    transforms.Resize([336, 336]),
    transforms.ToTensor(),
    normalize,
])

seg_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize([336, 336]),
])

# Read CSV data
csv_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/siimacr/test.csv'  # Update with the actual path
data_info = pd.read_csv(csv_path)

# Prepare an empty list to store the new data items
new_data = []
disease_labels = ['pneumothorax','non-pneumothorax']
label_mapping = {disease: chr(65 + i) for i, disease in enumerate(disease_labels)}
# Loop through each row in the CSV file
for idx, row in data_info.iterrows():
    img_path = img_root + str(row[0]) + '.png'  # Assuming 2nd column has the image filename
    seg_path = seg_root + str(row[0]) + '.gif'  # Assuming 2nd column has the same name for segmentation file

    # Open and transform the image
    img = Image.open(img_path).convert('RGB')
    image = transform(img)

    # Open and transform the segmentation map
    seg_map = Image.open(seg_path)
    seg_map = seg_transform(seg_map)
    seg_map = (seg_map > 0).type(torch.int)
    
    # Binary classification: if segmentation map has positive area, set label to 1 (pneumothorax)
    class_label = int(torch.sum(seg_map) > 0)
    
    # Set the disease label as a dictionary
    # disease_label = {"pneumothorax": class_label}
    mapped_labels = label_mapping["pneumothorax"] if class_label == 1 else label_mapping["non-pneumothorax"]
    # disease_label = {"pneumonia": class_label}  # 设置疾病标签为字典
    # Generate the text question for the image
    text = "What disease is indicated by the chest X-ray? Choose pneumothorax or non-pneumothorax."

    # Create the new JSON-like item
    new_item = {
        "image": 'SIIM_Pneumothorax/processed-images-train/' +  str(row[0]) + '.png',  # Convert tensor to list to store in JSON
        "text": text,
        "category": "conv",  # Fixed category for all samples
        "label": mapped_labels,
    }

    # Append the new item to the list
    new_data.append(new_item)

# Save the data to a JSON file
output_path = '../data/SIIM_Pneumothorax/SIIM_Pneumothorax_llava_origin_val.jsonl'  # Update with the desired output file path
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")  # Write each item as a separate line

print(f"Data saved to {output_path}")

  img_path = img_root + str(row[0]) + '.png'  # Assuming 2nd column has the image filename
  seg_path = seg_root + str(row[0]) + '.gif'  # Assuming 2nd column has the same name for segmentation file
  "image": 'SIIM_Pneumothorax/processed-images-train/' +  str(row[0]) + '.png',  # Convert tensor to list to store in JSON


Data saved to ../data/SIIM_Pneumothorax/SIIM_Pneumothorax_llava_origin_val.jsonl


In [None]:
import json
import os
import pandas as pd

disease_labels = [
    "fibrosis","edema","pneumothorax","cardiomegaly","atelectasis","nodule",
    "emphysema","no finding","mass","pleural_thickening","effusion",
    "infiltration","pneumonia","hernia","consolidation"
]
img_root = '/srv/lby/'

# Read CSV data
csv_path = '/home/lby/qwen_radz/Qwen2.5-VL/qwen-vl-finetune/eval_data/process_data/xray14/official_test.csv'
data_info = pd.read_csv(csv_path)

# Prepare an empty list to store the new data items
new_data = []

for idx, row in data_info.iterrows():
    # fix up the image path
    img_path = str(row[0]).replace(
        "/DATA/Chestxray/ChestXray8",
        "Chest-X-ray-dataset"
    )
    # multi-label string like "Pneumonia|Effusion"
    diseases = str(row[2]).split('|')
    diseases_lower = [d.strip().lower() for d in diseases]

    # build the zero/one label dict
    disease_dict = {lbl: 0 for lbl in disease_labels}
    for d in diseases_lower:
        if d in disease_dict:
            disease_dict[d] = 1

    # fixed question prompt
    question = "What disease is indicated by the chest X-ray? Choose the relevant disease categories from the given list: ['no finding', 'enlarged cardiomediastinum', 'cardiomegaly', 'lung opacity', 'lung lesion', 'edema', 'consolidation', 'pneumonia', 'atelectasis', 'pneumothorax', 'pleural effusion', 'pleural other', 'fracture', 'support devices'] Output only disease for the given list. Do not include any additional text."

    # build the chat-style record
    new_item = {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": img_path,
            },
            {
                "type": "text",
                "text": question,
            },
        ],
        "label": disease_dict
    }

    new_data.append(new_item)

# write out as JSONL
output_path = '/home/lby/qwen_radz/Qwen2.5-VL/qwen-vl-finetune/eval_data/chest_xray/Chest-X-ray_llava_origin_val.jsonl'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")

print(f"Reformatted data saved to {output_path}")



  img_path = str(row[0]).replace(
  diseases = str(row[2]).split('|')


Reformatted data saved to /home/lby/qwen_radz/Qwen2.5-VL/qwen-vl-finetune/eval_data/chest_xray/Chest-X-ray_llava_origin_val.jsonl


In [None]:
# 随机遍历jsonl文件1000行，获取每一行的question_id和text内容，question_id
from sklearn.metrics import roc_auc_score, precision_recall_curve, accuracy_score
import numpy as np
import torch
import json
import random

# jsonl_file = "/home/lby/llava_med/LLaVA-Med/llava/run/data/train/sft_data/modified_file.json"
jsonl_file = "/home/lby/llava_med/LLaVA-Med/llava/run/data/chest_xray/new_classify_mimic_file_clip.json"
with open(jsonl_file, "r", encoding="utf-8") as f:
    for i in range(20):  # 只读取前 10 行看看是否有问题
        print(f.readline())

# with open(jsonl_file, 'r', encoding='utf-8') as f:
#     data = json.load(f)
        
# for i, item in enumerate(data[:20]):
#     print(f"Item {i + 1}: {item}")


# import csv

# csv_file = "/srv/lby/llava_med/other_data/cleaned_file.csv"

# # 打开CSV文件
# with open(csv_file, 'r', encoding='utf-8') as f:
#     reader = csv.DictReader(f)  # 使用DictReader读取CSV文件，这样每一行会被解析成字典

#     # 遍历前20行数据
#     for i, item in enumerate(reader):
#         if i >= 20:
#             break  # 只处理前20行
#         print(f"Item {i + 1}: {item}")

[

    {

        "id": 0,

        "image": "p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg",

        "conversations": [

            {

                "from": "human",

                "value": "What disease is indicated by the chest X-ray?\n<image>"

            },

            {

                "from": "gpt",

                "value": "there is no focal consolidation pleural effusion or pneumothorax . bilateral nodular opacities that most likely represent nipple shadows. the cardiomediastinal silhouette is normal. clips project over the left lung potentially within the breast. the imaged upper abdomen is unremarkable.chronic deformity of the posterior left sixth and seventh ribs are noted. no acute cardio pulmonary process."

            }

        ]

    },

    {

        "id": 1,

        "image": "p10/p10000032/s53189527/2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab.jpg",

        "conversations": [

            {



In [None]:
import os
import json
import pandas as pd
from PIL import Image
import torch
from torchvision import transforms

img_root = '/srv/lby/SIIM_Pneumothorax/processed-images-train/'
seg_root = '/srv/lby/SIIM_Pneumothorax/train_mask/'

# Define transforms for image and segmentation map
normalize = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
transform = transforms.Compose([
    transforms.Resize([336, 336]),
    transforms.ToTensor(),
    normalize,
])

seg_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize([336, 336]),
])

# Read CSV data
csv_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/siimacr/test.csv'  # Update with the actual path
data_info = pd.read_csv(csv_path)

# Prepare an empty list to store the new data items
new_data = []

# Loop through each row in the CSV file
for idx, row in data_info.iterrows():
    img_path = img_root + str(row[0]) + '.png'  # Assuming 2nd column has the image filename
    seg_path = seg_root + str(row[0]) + '.gif'  # Assuming 2nd column has the same name for segmentation file

    # Open and transform the image
    img = Image.open(img_path).convert('RGB')
    image = transform(img)

    # Open and transform the segmentation map
    seg_map = Image.open(seg_path)
    seg_map = seg_transform(seg_map)
    seg_map = (seg_map > 0).type(torch.int)
    
    # Binary classification: if segmentation map has positive area, set label to 1 (pneumothorax)
    class_label = int(torch.sum(seg_map) > 0)
    
    # Set the disease label as a dictionary
    # disease_label = {"pneumothorax": class_label}

    # disease_label = {"pneumonia": class_label}  # 设置疾病标签为字典
    if class_label == 1:
        disease_label = {"pneumothorax": 1, "normal": 0}  # Set pneumonia to 1 if class_label is 1
    else:
        disease_label = {"pneumothorax": 0, "normal": 1} 
   
    # Generate the text question for the image
    question = "What disease is indicated by the chest X-ray? Choose pneumothorax or normal. Only output the required result. Do not include any additional text."

    # build the chat-style record
    new_item = {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": 'SIIM_Pneumothorax/processed-images-train/' +  str(row[0]) + '.png',
            },
            {
                "type": "text",
                "text": question,
            },
        ],
        "label": disease_label
    }

    new_data.append(new_item)


# Save the data to a JSON file
output_path = '/home/lby/qwen_radz/Qwen2.5-VL/qwen-vl-finetune/eval_data/SIIM_Pneumothorax/SIIM_Pneumothorax_llava_origin_val.jsonl'  # Update with the desired output file path
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")  # Write each item as a separate line

print(f"Data saved to {output_path}")

  img_path = img_root + str(row[0]) + '.png'  # Assuming 2nd column has the image filename
  seg_path = seg_root + str(row[0]) + '.gif'  # Assuming 2nd column has the same name for segmentation file
  "image": 'SIIM_Pneumothorax/processed-images-train/' +  str(row[0]) + '.png',


In [None]:
import json
import pandas as pd
import numpy as np
from PIL import Image
from torchvision import transforms
import os

# Initialize image paths
img_root = '/srv/lby/'

# Define the disease label categories
disease_labels = [
    'no finding', 'enlarged cardiomediastinum', 'cardiomegaly', 
    'lung opacity', 'lung lesion', 'edema', 'consolidation', 
    'pneumonia', 'atelectasis', 'pneumothorax', 'pleural effusion', 
    'pleural other', 'fracture', 'support devices'
]

# Read CSV data
csv_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/chexpertv1/filter_test_labels.csv'  # Update with your actual CSV path
data_info = pd.read_csv(csv_path)

# Prepare an empty list to store the new data items
new_data = []

# Loop through each row in the CSV file
for idx, row in data_info.iterrows():
    img_path = img_root + str(row[0])  # Assuming the first column is the image filename
    # Get the labels (multi-class)
    class_label = row[1:].values  # Assuming labels are in the columns starting from index 1
    disease_label = {disease_labels[i]: class_label[i] for i in range(len(disease_labels))}
    
    # Fixed category for all samples
    category = "conv"

    # Generate the text question for the image
    text = "What disease is indicated by the chest X-ray?"
    # Create the new JSON-like item
    new_item = {
        "image": str(row[0]),  # Image filename
        "text": text,
        "category": category,
        "label": disease_label
    }

    # Append the new item to the list
    new_data.append(new_item)

# Save the data to a JSONL file
output_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/chexpert/chexpert_llava_val.jsonl'  # Update with the desired output file path
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")  # Write each item as a separate line

print(f"Data saved to {output_path}")


Data saved to /home/lby/llava_med/LLaVA-Med/llava/run/data/chexpert/chexpert_llava_val.jsonl


  img_path = img_root + str(row[0])  # Assuming the first column is the image filename
  "image": str(row[0]),  # Image filename


In [12]:
import json
import pandas as pd
import numpy as np
from PIL import Image
from torchvision import transforms
import torch
import os


# Read CSV data
csv_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/rsna/new_test.csv'  # Update with the actual path
data_info = pd.read_csv(csv_path)

# Prepare an empty list to store the new data items
new_data = []

# Loop through each row in the CSV file
for idx, row in data_info.iterrows():
    img_path = str(row[1]).replace("/srv/lby/", "")

    # Open and transform the image
    class_label = row[3]  # 获取标签
    # disease_label = {"pneumonia": class_label}  # 设置疾病标签为字典
    if class_label == 1:
        disease_label = {"pneumonia": 1, "normal": 0}  # Set pneumonia to 1 if class_label is 1
    else:
        disease_label = {"pneumonia": 0, "normal": 1} 
    # Fixed category for all samples
    category = "conv"

    question = "What disease is indicated by the chest X-ray? Choose pneumonia or normal. Only output the required result. Do not include any additional text."
    # text = "What disease is indicated by the chest X-ray?\nA. pneumonia\nB. normal\nChoose A or B."

    # build the chat-style record
    new_item = {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": img_path,
            },
            {
                "type": "text",
                "text": question,
            },
        ],
        "label": disease_label
    }

    new_data.append(new_item)

# Save the data to a JSON file
output_path = '/home/lby/qwen_radz/Qwen2.5-VL/qwen-vl-finetune/eval_data/rsna/rsna_pneumonia_llava_origin_val.jsonl'  # Update with the desired output file path
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")  # Write each item as a separate line

print(f"Data saved to {output_path}")


  img_path = str(row[1]).replace("/srv/lby/", "")
  class_label = row[3]  # 获取标签


Data saved to /home/lby/qwen_radz/Qwen2.5-VL/qwen-vl-finetune/eval_data/rsna/rsna_pneumonia_llava_origin_val.jsonl


In [None]:
import json
import pandas as pd
import numpy as np
from PIL import Image
from torchvision import transforms
import os

# Initialize image paths
img_root = '/srv/lby/'

# Define transforms for image
normalize = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
transform = transforms.Compose([
    transforms.Resize([224, 224]),
    transforms.ToTensor(),
    normalize,
])

# Define the disease label categories
# disease_labels = [
#     "Normal", "Pulmonary Fibrosis", "Chronic Changes", "Kyphosis", "Pseudonodule", 
#     "Ground Glass Pattern", "Unchanged", "Alveolar Pattern", "Interstitial Pattern", 
#     "Laminar Atelectasis", "Pleural Effusion", "Apical Pleural Thickening", "Suture Material", 
#     "Sternotomy", "Endotracheal Tube", "Infiltrates", "Heart Insufficiency", "Hemidiaphragm Elevation", 
#     "Superior Mediastinal Enlargement", "Aortic Elongation", "Scoliosis", "Sclerotic Bone Lesion", 
#     "Supra Aortic Elongation", "Vertebral Degenerative Changes", "Goiter", "COPD Signs", 
#     "Air Trapping", "Descendent Aortic Elongation", "Aortic Atheromatosis", "Metal", "Hypoexpansion Basal", 
#     "Abnormal Foreign Body", "Central Venous Catheter via Subclavian Vein", "Central Venous Catheter", 
#     "Vascular Hilar Enlargement", "Pacemaker", "Atelectasis", "Vertebral Anterior Compression", 
#     "Hiatal Hernia", "Pneumonia", "Diaphragmatic Eventration", "Consolidation", "Calcified Densities", 
#     "Cardiomegaly", "Fibrotic Band", "Tuberculosis Sequelae", "Volume Loss", "Bronchiectasis", 
#     "Single Chamber Device", "Emphysema", "Vertebral Compression", "Bronchovascular Markings", 
#     "Bullas", "Hilar Congestion", "Exclude", "Axial Hyperostosis", "Aortic Button Enlargement", 
#     "Calcified Granuloma", "Clavicle Fracture", "Pulmonary Mass", "Dual Chamber Device", "Increased Density", 
#     "Surgery Neck", "Osteosynthesis Material", "Costochondral Junction Hypertrophy", "Segmental Atelectasis", 
#     "Costophrenic Angle Blunting", "Calcified Pleural Thickening", "Hyperinflated Lung", "Callus Rib Fracture", 
#     "Pleural Thickening", "Mediastinal Mass", "Nipple Shadow", "Surgery Heart", "Pulmonary Artery Hypertension", 
#     "Central Vascular Redistribution", "Tuberculosis", "Nodule", "Cavitation", "Granuloma", "Osteopenia", 
#     "Lobar Atelectasis", "Surgery Breast", "NSG Tube", "Hilar Enlargement", "Gynecomastia", "Atypical Pneumonia", 
#     "Cervical Rib", "Mediastinal Enlargement", "Major Fissure Thickening", "Surgery", "Azygos Lobe", "Adenopathy", 
#     "Miliary Opacities", "Suboptimal Study", "DAI", "Mediastinic Lipomatosis", "Surgery Lung", "Mammary Prosthesis", 
#     "Humeral Fracture", "Calcified Adenopathy", "Reservoir Central Venous Catheter", "Vascular Redistribution", 
#     "Hypoexpansion", "Heart Valve Calcified", "Pleural Mass", "Loculated Pleural Effusion", "Pectum Carinatum", 
#     "Subacromial Space Narrowing", "Central Venous Catheter via Jugular Vein", "Vertebral Fracture", "Osteoporosis", 
#     "Bone Metastasis", "Lung Metastasis", "Cyst", "Humeral Prosthesis", "Artificial Heart Valve", "Mastectomy", 
#     "Pericardial Effusion", "Lytic Bone Lesion", "Subcutaneous Emphysema", "Edema", "Flattened Diaphragm", 
#     "Asbestosis Signs", "Multiple Nodules", "Prosthesis", "Pulmonary Hypertension", "Soft Tissue Mass", 
#     "Tracheostomy Tube", "Endoprosthesis", "Post Radiotherapy Changes", "Air Bronchogram", "Pectum Excavatum", 
#     "Calcified Mediastinal Adenopathy", "Central Venous Catheter via Umbilical Vein", "Thoracic Cage Deformation", 
#     "Obesity", "Tracheal Shift", "External Foreign Body", "Atelectasis Basal", "Aortic Endoprosthesis", 
#     "Rib Fracture", "Calcified Fibroadenoma", "Pneumothorax", "Reticulonodular Interstitial Pattern", 
#     "Reticular Interstitial Pattern", "Chest Drain Tube", "Minor Fissure Thickening", "Fissure Thickening", 
#     "Hydropneumothorax", "Breast Mass", "Blastic Bone Lesion", "Respiratory Distress", "Azygoesophageal Recess Shift", 
#     "Ascendent Aortic Elongation", "Lung Vascular Paucity", "Kerley Lines", "Electrical Device", 
#     "Artificial Mitral Heart Valve", "Artificial Aortic Heart Valve", "Total Atelectasis", 
#     "Non Axial Articular Degenerative Changes", "Pleural Plaques", "Calcified Pleural Plaques", 
#     "Lymphangitis Carcinomatosa", "Lepidic Adenocarcinoma", "Mediastinal Shift", "Ventriculoperitoneal Drain Tube", 
#     "Esophagic Dilatation", "Dextrocardia", "End On Vessel", "Right Sided Aortic Arch", "Chilaiditi Sign", 
#     "Aortic Aneurysm", "Loculated Fissural Effusion", "Fracture", "Air Fluid Level", "Round Atelectasis", 
#     "Mass", "Double J Stent", "Pneumoperitoneo", "Abscess", "Pulmonary Artery Enlargement", "Bone Cement", 
#     "Pneumomediastinum", "Catheter", "Surgery Humeral", "Empyema", "Nephrostomy Tube", 
#     "Sternoclavicular Junction Hypertrophy", "Pulmonary Venous Hypertension", "Gastrostomy Tube", "Lipomatosis"
# ]


disease_labels = [
    "normal", "pulmonary fibrosis", "chronic changes", "kyphosis", "pseudonodule", 
    "ground glass pattern", "unchanged", "alveolar pattern", "interstitial pattern", 
    "laminar atelectasis", "pleural effusion", "apical pleural thickening", "suture material", 
    "sternotomy", "endotracheal tube", "infiltrates", "heart insufficiency", "hemidiaphragm elevation", 
    "superior mediastinal enlargement", "aortic elongation", "scoliosis", "sclerotic bone lesion", 
    "supra aortic elongation", "vertebral degenerative changes", "goiter", "copd signs", 
    "air trapping", "descendent aortic elongation", "aortic atheromatosis", "metal", "hypoexpansion basal", 
    "abnormal foreign body", "central venous catheter via subclavian vein", "central venous catheter", 
    "vascular hilar enlargement", "pacemaker", "atelectasis", "vertebral anterior compression", 
    "hiatal hernia", "pneumonia", "diaphragmatic eventration", "consolidation", "calcified densities", 
    "cardiomegaly", "fibrotic band", "tuberculosis sequelae", "volume loss", "bronchiectasis", 
    "single chamber device", "emphysema", "vertebral compression", "bronchovascular markings", 
    "bullas", "hilar congestion", "exclude", "axial hyperostosis", "aortic button enlargement", 
    "calcified granuloma", "clavicle fracture", "pulmonary mass", "dual chamber device", "increased density", 
    "surgery neck", "osteosynthesis material", "costochondral junction hypertrophy", "segmental atelectasis", 
    "costophrenic angle blunting", "calcified pleural thickening", "hyperinflated lung", "callus rib fracture", 
    "pleural thickening", "mediastinal mass", "nipple shadow", "surgery heart", "pulmonary artery hypertension", 
    "central vascular redistribution", "tuberculosis", "nodule", "cavitation", "granuloma", "osteopenia", 
    "lobar atelectasis", "surgery breast", "nsg tube", "hilar enlargement", "gynecomastia", "atypical pneumonia", 
    "cervical rib", "mediastinal enlargement", "major fissure thickening", "surgery", "azygos lobe", "adenopathy", 
    "miliary opacities", "suboptimal study", "dai", "mediastinic lipomatosis", "surgery lung", "mammary prosthesis", 
    "humeral fracture", "calcified adenopathy", "reservoir central venous catheter", "vascular redistribution", 
    "hypoexpansion", "heart valve calcified", "pleural mass", "loculated pleural effusion", "pectum carinatum", 
    "subacromial space narrowing", "central venous catheter via jugular vein", "vertebral fracture", "osteoporosis", 
    "bone metastasis", "lung metastasis", "cyst", "humeral prosthesis", "artificial heart valve", "mastectomy", 
    "pericardial effusion", "lytic bone lesion", "subcutaneous emphysema", "edema", "flattened diaphragm", 
    "asbestosis signs", "multiple nodules", "prosthesis", "pulmonary hypertension", "soft tissue mass", 
    "tracheostomy tube", "endoprosthesis", "post radiotherapy changes", "air bronchogram", "pectum excavatum", 
    "calcified mediastinal adenopathy", "central venous catheter via umbilical vein", "thoracic cage deformation", 
    "obesity", "tracheal shift", "external foreign body", "atelectasis basal", "aortic endoprosthesis", 
    "rib fracture", "calcified fibroadenoma", "pneumothorax", "reticulonodular interstitial pattern", 
    "reticular interstitial pattern", "chest drain tube", "minor fissure thickening", "fissure thickening", 
    "hydropneumothorax", "breast mass", "blastic bone lesion", "respiratory distress", "azygoesophageal recess shift", 
    "ascendent aortic elongation", "lung vascular paucity", "kerley lines", "electrical device", 
    "artificial mitral heart valve", "artificial aortic heart valve", "total atelectasis", 
    "non axial articular degenerative changes", "pleural plaques", "calcified pleural plaques", 
    "lymphangitis carcinomatosa", "lepidic adenocarcinoma", "mediastinal shift", "ventriculoperitoneal drain tube", 
    "esophagic dilatation", "dextrocardia", "end on vessel", "right sided aortic arch", "chilaiditi sign", 
    "aortic aneurysm", "loculated fissural effusion", "fracture", "air fluid level", "round atelectasis", 
    "mass", "double j stent", "pneumoperitoneo", "abscess", "pulmonary artery enlargement", "bone cement", 
    "pneumomediastinum", "catheter", "surgery humeral", "empyema", "nephrostomy tube", 
    "sternoclavicular junction hypertrophy", "pulmonary venous hypertension", "gastrostomy tube", "lipomatosis"
]


# Read CSV data
csv_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/padchest/test.csv'  # Update with your actual CSV path
data_info = pd.read_csv(csv_path)

# Prepare an empty list to store the new data items
new_data = []

# Loop through each row in the CSV file
for idx, row in data_info.iterrows():
    img_path = img_root + str(row[0])  # Assuming the first column is the image filename

    # Open and transform the image

    # Get the labels (multi-class)
    class_label = row[3:].values  # Assuming labels are in the columns starting from index 1
    disease_label = {disease_labels[i]: class_label[i] for i in range(len(disease_labels))}
    
    # Fixed category for all samples
    category = "conv"

    # Generate the text question for the image
    text = "What disease is indicated by the chest X-ray?"

    # Create the new JSON-like item
    new_item = {
        "image": 'PadChest/' + str(row[0]),  # Image filename
        "text": text,
        "category": category,
        "label": disease_label
    }

    # Append the new item to the list
    new_data.append(new_item)

# Save the data to a JSONL file
output_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/padchest/padchest_llava_val.jsonl'  # Update with the desired output file path
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")  # Write each item as a separate line

print(f"Data saved to {output_path}")


  img_path = img_root + str(row[0])  # Assuming the first column is the image filename
  "image": 'PadChest/' + str(row[0]),  # Image filename


Data saved to /home/lby/llava_med/LLaVA-Med/llava/run/data/padchest/padchest_llava_val.jsonl


In [11]:
import os
import json
import pandas as pd
from PIL import Image
import torch
from torchvision import transforms

img_root = '/srv/lby/COVIDx_CXR-4/'



# Read CSV data
csv_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/covidx-cxr2/test_split.csv'  # Update with the actual path
data_info = pd.read_csv(csv_path, header=None, names=["id", "image_path", "class", "source"], delimiter=' ')

# Prepare an empty list to store the new data items
new_data = []

# Loop through each row in the CSV file
for idx, row in data_info.iterrows():
    img_path = 'COVIDx_CXR-4/test/' + str(row[1])   # Assuming 2nd column has the image filename
    class_label = row[2]   # Assuming labels are in the columns starting from index 1
    
    if class_label == 'positive':
        disease_label = {"covid-19": 1, "normal": 0}  # Set pneumonia to 1 if class_label is 1
    else:
        disease_label = {"covid-19": 0, "normal": 1} 
    
    
    # Generate the text question for the image
    question = "What disease is indicated by the chest X-ray? Choose covid-19 or normal. Only output the required result. Do not include any additional text."

    # build the chat-style record
    new_item = {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": img_path,
            },
            {
                "type": "text",
                "text": question,
            },
        ],
        "label": disease_label
    }

    new_data.append(new_item)

# Save the data to a JSON file
output_path = '/home/lby/qwen_radz/Qwen2.5-VL/qwen-vl-finetune/eval_data/COVIDx_CXR/COVIDx_CXR_llava_origin_val.jsonl'  # Update with the desired output file path
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")  # Write each item as a separate line

print(f"Data saved to {output_path}")

Data saved to /home/lby/qwen_radz/Qwen2.5-VL/qwen-vl-finetune/eval_data/COVIDx_CXR/COVIDx_CXR_llava_origin_val.jsonl


  img_path = 'COVIDx_CXR-4/test/' + str(row[1])   # Assuming 2nd column has the image filename
  class_label = row[2]   # Assuming labels are in the columns starting from index 1


In [None]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, accuracy_score
import numpy as np
import torch
import json
import random

def compute_AUCs(gt, pred, n_class):
    """计算每个标签的 AUC"""
    AUROCs = []
    gt_np = gt.cpu().numpy()
    pred_np = pred.cpu().numpy()
    for i in range(n_class):
        try:
            AUROCs.append(roc_auc_score(gt_np[:, i], pred_np[:, i]))
        except ValueError:
            AUROCs.append(np.nan)  # 当 AUC 计算失败时，填充 NaN
    return AUROCs

def get_acc_and_auc():
    # 读取数据
    output_path = '../data/eval/test_prompt/Chest-X-ray_llava_val_ans.jsonl'
    answers = [json.loads(line) for line in open(output_path)]
    disease_list = ['fibrosis', 'edema', 'pneumothorax', 'cardiomegaly', 'atelectasis', 'nodule', 'emphysema', 'no finding', 'mass', 'pleural_thickening', 'effusion', 'infiltration', 'pneumonia', 'hernia', 'consolidation']

    print(f"Total number of answers: {len(answers)}")
    
    # 映射字典：将A, B, C, D...映射到疾病列表
    label_mapping = {chr(65 + i): disease for i, disease in enumerate(disease_list)}
    # 手动映射两个列表中的疾病

    # 随机选择 1000 行
    random.shuffle(answers)
    selected_answers = answers[:1000]

    # 初始化 ground truth (gt) 和 predictions (pred)
    n_classes = len(disease_list)
    gt = torch.zeros((len(selected_answers), n_classes), dtype=torch.float32)
    pred = torch.zeros((len(selected_answers), n_classes), dtype=torch.float32)

    error_count = 0
    error_question_ids = []

    # 遍历每个 answer，提取 labels 和预测类别
    for idx, item in enumerate(selected_answers):
        # 获取标签（label），labels 可能包含多个标签
        labels = ["-".join(item["question_id"].split("-")[1:])]  # 获取 label
        labels = [label.lower() for label in labels]

        # 获取预测的 text 并映射到疾病
        text = item["text"].strip().upper()

        # 设置 ground truth
        for label in labels:
            for i, disease in enumerate(disease_list):
                if disease in label:
                    gt[idx, i] = 1  # 对应疾病的 ground truth 置为 1

        # 获取预测的类别 (A, B, C, D...)
        for char in text:
            if char in label_mapping:
                disease = label_mapping[char]
                disease_idx = disease_list.index(disease)
                pred[idx, disease_idx] = 1  # 对应疾病的预测值置为 1

        # 检查是否没有预测到任何类别
        if torch.sum(pred[idx]) == 0:
            error_count += 1
            error_question_ids.append(item["question_id"])

    # 计算多标签 AUC
    AUROCs = compute_AUCs(gt, pred, n_classes)
    AUROC_avg = np.nanmean(AUROCs)  # 计算 AUC 的平均值，忽略 NaN

    # 计算每个类别的准确率
    accs = []
    gt_np_all = gt.cpu().numpy()
    pred_np_all = pred.cpu().numpy()

    for i in range(n_classes):
        gt_np = gt_np_all[:, i]
        pred_np = pred_np_all[:, i]
        
        # print(f"Class {i} - Ground truth (sample): {gt_np[:10]}")
        # print(f"Class {i} - Predictions (sample): {pred_np[:10]}")
        acc = accuracy_score(gt_np, pred_np)
        accs.append(acc)
    
    # 计算类别平均准确率
    acc_avg = np.mean(accs)

    # 计算 F1 分数
    f1s = []

    for i in range(n_classes):
        gt_np = gt_np_all[:, i]
        pred_np = pred_np_all[:, i]

        precision, recall, thresholds = precision_recall_curve(gt_np, pred_np)
        if len(precision) > 1:
            numerator = 2 * recall * precision
            denom = recall + precision
            f1_scores = np.divide(numerator, denom, out=np.zeros_like(denom), where=(denom != 0))
            max_f1 = np.max(f1_scores)
            f1s.append(max_f1)
        else:
            f1s.append(np.nan)

    f1_avg = np.nanmean(f1s)  # 平均 F1

    # 输出结果
    print(f"Total labels: {len(selected_answers)}")
    print(f"Class distribution in ground truth: {np.sum(gt_np_all, axis=0)}")
    print(f"Class distribution in predictions: {np.sum(pred_np_all, axis=0)}")
    print(f"Average AUROC: {AUROC_avg:.4f}")
    print(f"Average F1: {f1_avg:.4f}")
    print(f"Average Accuracy: {acc_avg:.4f}")
    print(f"Number of errors: {error_count}")
    print(f"Error question IDs: {error_question_ids}")

# 调用函数进行计算
get_acc_and_auc()


Total number of answers: 1044
Total labels: 1000
Class distribution in ground truth: [ 13.  42.  96.  45. 134.  66.  37. 365.  64.  43. 191. 267.  28.   5.
  70.]
Class distribution in predictions: [296. 227.   0. 113.   0.   0.   2. 302. 421.   0.   3.  58. 320.   0.
   0.]
Average AUROC: 0.5012
Average F1: 0.1736
Average Accuracy: 0.8067
Number of errors: 0
Error question IDs: []


In [None]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, accuracy_score
import numpy as np
import torch
import json
import random

def compute_AUCs(gt, pred, n_class):
    """计算每个标签的 AUC"""
    AUROCs = []
    gt_np = gt.cpu().numpy()
    pred_np = pred.cpu().numpy()
    for i in range(n_class):
        if np.sum(gt_np[:, i]) == 0:  # 跳过全 0 的类别
            AUROCs.append(np.nan)
        else:
            try:
                AUROCs.append(roc_auc_score(gt_np[:, i], pred_np[:, i]))
            except ValueError:
                AUROCs.append(np.nan)  # 当 AUC 计算失败时，填充 NaN
    return AUROCs

def get_acc_and_auc():
    # 读取数据
    output_path = '../data/eval/test_prompt/Chest-X-ray_llava_val_ans.jsonl'
    answers = [json.loads(line) for line in open(output_path)]

    disease_list = ['fibrosis', 'edema', 'pneumothorax', 'cardiomegaly', 'atelectasis', 'nodule', 'emphysema', 'no finding', 'mass', 'pleural_thickening', 'effusion', 'infiltration', 'pneumonia', 'hernia', 'consolidation']
    print(f"Total number of answers: {len(answers)}")
    
    # 手动映射两个列表中的疾病
    # disease_mapping = {
    #     'atelectasis': 'Atelectasis',
    #     'cardiomegaly': 'Cardiomegaly',
    #     'consolidation': 'Consolidation',
    #     'edema': 'Edema',
    #     'pneumothorax': 'Pneumothorax',
    #     'effusion': 'Pleural Effusion',
    #     'pneumonia': 'Pneumonia',
    #     'no finding': 'Normal',
    #     'pleural_thickening': 'Pleural Other',
    #     'mass': 'Lung Lesion',
    #     'nodule': 'Lung Lesion',
    #     'infiltration': 'Lung Opacity',
    #     'fibrosis': None,      # 未映射的疾病
    #     'emphysema': None,     # 未映射的疾病
    #     'hernia': None         # 未映射的疾病
    # }

    # 随机选择 1000 行
    random.shuffle(answers)
    selected_answers = answers[:1000]

    # 初始化 ground truth (gt) 和 predictions (pred)
    n_classes = len(disease_list)
    gt = torch.zeros((len(selected_answers), n_classes), dtype=torch.float32)
    pred = torch.zeros((len(selected_answers), n_classes), dtype=torch.float32)

    error_count = 0
    error_question_ids = []

    # 遍历每个 answer，提取 labels 和预测类别
    for idx, item in enumerate(selected_answers):
        # 获取标签（label），labels 可能包含多个标签
        labels = ["-".join(item["question_id"].split("-")[1:])]  # 获取 label
        labels = [label.lower() for label in labels]

        # 设置 ground truth
        for label in labels:
            if label in disease_list:
                disease_idx = disease_list.index(label)
                gt[idx, disease_idx] = 1  # 对应疾病的 ground truth 置为 1

        # 获取预测的 text 并映射到疾病
        text = item["text"].strip().lower()

        # 设置预测的类别，映射预测类别为 ground truth 中的疾病
        for disease in disease_list:
            # mapped_disease = disease_mapping.get(disease)
            mapped_disease = disease
            if mapped_disease and mapped_disease.lower() in text:  # 确保映射后的疾病出现在预测中
                disease_idx = disease_list.index(disease)
                pred[idx, disease_idx] = 1  # 对应疾病的预测值置为 1

        # 检查是否没有预测到任何类别
        if torch.sum(pred[idx]) == 0:
            error_count += 1
            error_question_ids.append(item["question_id"])

    # 计算多标签 AUC
    AUROCs = compute_AUCs(gt, pred, n_classes)
    AUROC_avg = np.nanmean(AUROCs)  # 计算 AUC 的平均值，忽略 NaN

    # 计算每个类别的准确率
    accs = []
    gt_np_all = gt.cpu().numpy()
    pred_np_all = pred.cpu().numpy()

    for i in range(n_classes):
        gt_np = gt_np_all[:, i]
        pred_np = pred_np_all[:, i]
        
        if np.sum(gt_np) == 0:  # 跳过全 0 的类别
            accs.append(np.nan)
        else:
            acc = accuracy_score(gt_np, pred_np)
            accs.append(acc)
    
    for i, acc in enumerate(accs):
        print(f"{disease_list[i]}: {acc:.4f}")
    
    # 计算类别平均准确率
    acc_avg = np.nanmean(accs)

    # 计算 F1 分数
    f1s = []

    for i in range(n_classes):
        gt_np = gt_np_all[:, i]
        pred_np = pred_np_all[:, i]

        if np.sum(gt_np) == 0:  # 跳过全 0 的类别
            f1s.append(np.nan)
        else:
            precision, recall, thresholds = precision_recall_curve(gt_np, pred_np)
            if len(precision) > 1:
                numerator = 2 * recall * precision
                denom = recall + precision
                f1_scores = np.divide(numerator, denom, out=np.zeros_like(denom), where=(denom != 0))
                max_f1 = np.max(f1_scores)
                f1s.append(max_f1)
            else:
                f1s.append(np.nan)

    f1_avg = np.nanmean(f1s)  # 平均 F1
    # 获取预测为 0 的类别索引
    zero_pred_indices = np.where(np.sum(pred_np_all, axis=0) == 0)[0]

    # 输出预测为 0 的类别
    zero_pred_diseases = [disease_list[i] for i in zero_pred_indices]
    print(f"Predicted as 0 for the following diseases: {zero_pred_diseases}")


    # 输出结果
    print(f"Total labels: {len(selected_answers)}")
    print(f"Class distribution in ground truth: {np.sum(gt_np_all, axis=0)}")
    print(f"Class distribution in predictions: {np.sum(pred_np_all, axis=0)}")
    print(f"Average AUROC: {AUROC_avg:.4f}")
    print(f"Average F1: {f1_avg:.4f}")
    print(f"Average Accuracy: {acc_avg:.4f}")
    print(f"Number of errors: {error_count}")
    print(f"Error question IDs: {error_question_ids}")

# 调用函数进行计算
get_acc_and_auc()


Total number of answers: 306
fibrosis: 0.9935
edema: 0.9902
pneumothorax: 0.8235
cardiomegaly: 0.9837
atelectasis: 0.0621
nodule: 0.9739
emphysema: 0.9967
no finding: 0.6340
mass: 0.7614
pleural_thickening: 0.9869
effusion: 0.0621
infiltration: 0.0980
pneumonia: 0.9967
hernia: 0.9935
consolidation: 0.8562
Predicted as 0 for the following diseases: ['fibrosis', 'edema', 'emphysema', 'no finding', 'pleural_thickening', 'pneumonia', 'hernia']
Total labels: 306
Class distribution in ground truth: [  2.   3.  12.   1.   8.   7.   1. 112.  11.   4.  18.  29.   1.   2.
   4.]
Class distribution in predictions: [  0.   0.  48.   6. 293.   1.   0.   0.  70.   0. 305. 305.   0.   0.
  42.]
Average AUROC: 0.5418
Average F1: 0.1019
Average Accuracy: 0.7475
Number of errors: 0
Error question IDs: []


In [None]:
import json
import random
import pandas as pd
from collections import defaultdict

dataset = "Chest-X-ray"
category_samples = defaultdict(list)

# 从 JSONL 文件中加载数据
data = [json.loads(line) for line in open(f"../data/eval/{dataset}.jsonl")]

# 目标疾病列表
disease_list = ['fibrosis', 'edema', 'pneumothorax', 'cardiomegaly', 'atelectasis', 'nodule', 'emphysema', 'no finding', 'mass', 'pleural_thickening', 'effusion', 'infiltration', 'pneumonia', 'hernia', 'consolidation']

# 收集每个类别的样本
for item in data:
    labels = item["label"]  # 假设每个数据项有一个 "label" 字段
    split = item["split"]  # 假设每个数据项有一个 "split" 字段
    if isinstance(labels, list) and split == "train":  # 如果标签是列表形式，说明可能多标签分类
        for label in labels:
            if label in disease_list:
                category_samples[label].append(item)
    else:
        if labels in disease_list and split == "train":
            category_samples[labels].append(item)

# 构建 few-shot 数据集，确保每个类别包含最多 10 个样本
few_shot_dataset = []

for category, samples in category_samples.items():
    few_shot_samples = random.sample(samples, min(10, len(samples)))  # 随机选择最多 10 个样本
    few_shot_dataset.extend(few_shot_samples)

# 将结果转换为 DataFrame 或者直接输出
df_few_shot = pd.DataFrame(few_shot_dataset)

# 保存 few-shot 数据集到 JSONL 文件
few_shot_file = f"../data/eval/{dataset}_few_shot.jsonl"
df_few_shot.to_json(few_shot_file, orient='records', lines=True)

print(f"Few-shot dataset saved to {few_shot_file}")

# import json
# import random
# import pandas as pd
# from collections import defaultdict

# dataset = "Chest-X-ray"
# category_samples = defaultdict(list)

# # 从 JSONL 文件中加载数据
# data = [json.loads(line) for line in open(f"../data/eval/{dataset}.jsonl")]

# disease_list = ['fibrosis', 'edema', 'pneumothorax', 'cardiomegaly', 'atelectasis', 'nodule', 'emphysema', 'no finding', 'mass', 'pleural_thickening', 'effusion', 'infiltration', 'pneumonia', 'hernia', 'consolidation']


# for item in data:
#     labels = item["label"]  # 假设每个数据项有一个 "label" 字段
#     split = item["split"]  # 假设每个数据项有一个 "split" 字段
#     if isinstance(labels, list) and split == "train":  # 如果标签是列表形式，说明可能多标签分类
#         for label in labels:
#             category_samples[label].append(item)
#     else:
#         category_samples[labels].append(item)

# # 构建 few-shot 数据集，确保每个类别包含 5 个样本
# few_shot_dataset = []

# for category, samples in category_samples.items():

#     few_shot_samples = random.sample(samples, min(10, len(samples)))
#     few_shot_dataset.extend(few_shot_samples)

# # 将结果转换为 DataFrame 或者直接输出
# df_few_shot = pd.DataFrame(few_shot_dataset)

# # 输出结果
# # print(df_few_shot)
# # 保存 few-shot 数据集
# few_shot_file = f"../data/eval/{dataset}_few_shot.jsonl"
# df_few_shot.to_json(few_shot_file, orient='records', lines=True)

# print(f"Few-shot dataset saved to {few_shot_file}")



Few-shot dataset saved to ../data/eval/Chest-X-ray_few_shot.jsonl


In [None]:
from transformers import AutoTokenizer

# 假设你使用的是某个特定模型的 tokenizer，例如 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained('/srv/lby/llava_med/llava-med-v1.5-mistral-7b')

# 你的 token ids
token_ids = [1, 851, 349, 264, 8118, 1500, 28733, 919, 8102]

# 使用 tokenizer 的 decode 方法还原
decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
print(decoded_text)

This is a chest X-ray showing


In [None]:
import json


# 创建一个空列表以存储结果
formatted_data = []

# 遍历 DataFrame 中的每一行
for idx, row in df_few_shot.iterrows():
    labels = row['label']
    # 提取需要的字段
    item = {
        'id': idx,
        'image': row['image'],  # 假设该列名为 'image'
        'conversations': [
            {
                'from': 'human',
                'value': "<image>\n Fill in the blank: this is a chest X-ray showing {}"
            },
            {
                'from': 'gpt',
                'value': f"This is a chest X-ray showing {labels}"  # 假设该列名为 'label'，对应疾病名称
            }
        ]
    }
    
    # 将构建的字典添加到列表中
    formatted_data.append(item)

# 将结果保存为 JSON 格式
output_json_path = "../data/train/sft_data/Chest-X-ray_few_shot_formatted.json"
with open(output_json_path, 'w') as json_file:
    json.dump(formatted_data, json_file, indent=4)

print(f"Formatted dataset saved to {output_json_path}")

Formatted dataset saved to ../data/train/sft_data/Chest-X-ray_few_shot_formatted.json


In [None]:

import torch

# 加载权重文件
file_path = '/home/lby/llava_med/LLaVA-Med/llava/run/checkpoints/llava-lora-new-clip-v1/non_lora_trainables.bin'
non_lora_trainables = torch.load(file_path, map_location='cpu')

# 打印所有权重的键名（模块名称）
print("Keys in non_lora_trainables:")
for key in non_lora_trainables.keys():
    print(key)

# 如果想统计权重的形状
print("\nWeight shapes:")
for key, value in non_lora_trainables.items():
    print(f"{key}: {value.shape}")

Keys in non_lora_trainables:
base_model.model.model.mm_projector.0.weight
base_model.model.model.mm_projector.0.bias
base_model.model.model.mm_projector.2.weight
base_model.model.model.mm_projector.2.bias
base_model.model.mis_mlp.out_mlp.0.weight
base_model.model.mis_mlp.out_mlp.0.bias
base_model.model.mis_mlp.out_mlp.1.weight
base_model.model.mis_mlp.out_mlp.1.bias
base_model.model.mis_mlp.out_mlp.3.weight
base_model.model.mis_mlp.out_mlp.3.bias

Weight shapes:
base_model.model.model.mm_projector.0.weight: torch.Size([4096, 1024])
base_model.model.model.mm_projector.0.bias: torch.Size([4096])
base_model.model.model.mm_projector.2.weight: torch.Size([4096, 4096])
base_model.model.model.mm_projector.2.bias: torch.Size([4096])
base_model.model.mis_mlp.out_mlp.0.weight: torch.Size([4096])
base_model.model.mis_mlp.out_mlp.0.bias: torch.Size([4096])
base_model.model.mis_mlp.out_mlp.1.weight: torch.Size([1024, 4096])
base_model.model.mis_mlp.out_mlp.1.bias: torch.Size([1024])
base_model.mode

In [None]:
from huggingface_hub import list_repo_files

# 仓库ID
repo_id = "liuhaotian/llava-v1.6-mistral-7b"

# 列出所有文件
file_list = list_repo_files(repo_id=repo_id)

# 保存文件URL到本地
base_url = f"https://huggingface.co/{repo_id}/resolve/main/"
with open("files.txt", "w") as f:
    for file in file_list:
        f.write(f"{base_url}{file}\n")

print("File list saved to files.txt")


File list saved to files.txt


In [None]:

import json
import os
from torch.nn.utils.rnn import pad_sequence
from PIL import Image
from tqdm import tqdm
import random
import numpy as np
from llava.constants import (
    DEFAULT_IM_END_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IMAGE_TOKEN,
    IMAGE_TOKEN_INDEX,
)
from llava.conversation import SeparatorStyle, conv_templates
from llava.mm_utils import (
    get_model_name_from_path,
    process_images,
    tokenizer_image_token,
    # eval_tokenizer_image_token,
)
from llava.model.clip_llava_builder import load_pretrained_model
from llava.utils import disable_torch_init
from dataclasses import dataclass
from dataclasses import asdict
from transformers import HfArgumentParser
from sklearn.metrics import accuracy_score, auc, precision_recall_curve, recall_score, f1_score, roc_auc_score


@dataclass
class SparseArguments:
    Imgcls_count: int = 4
    Txtcls_count: int = 4
    hidden_dim: int = 1024
    output_dim: int = 512
    img_mlp_type: int = 1
    txt_mlp_type: int = 1
    knowledge_mlp_type: int = 1
    loss_threshold: float = 0.5
    temperature: float = 0.05
    use_local_loss: bool = False
    feature_layer: int = 1
    special_tokens_mlp_type: int = 1
    use_ca_loss: bool = True
    inference_type: int = 2
    use_cat: bool = True
    use_prompt: bool = True
    Book_choice: int = 1

# List of diseases
disease_list = [
    'fibrosis', 'edema', 'pneumothorax', 'cardiomegaly', 'atelectasis', 'nodule',
    'emphysema', 'no finding', 'mass', 'pleural_thickening', 'effusion', 
    'infiltration', 'pneumonia', 'hernia', 'consolidation'
]

# Prompt template with Chain-of-Thought (CoT) strategy
PROMPT_TEMPLATE = """You are a senior medical imaging expert. Based on your expertise, please provide a detailed description of the disease "{disease}" from the following four aspects to ensure structured, comprehensive content that serves as a semantic anchor aligning medical images and textual reports:

[Step 1: Background Overview]
- Briefly define "{disease}", explain its pathological mechanism, and discuss its clinical significance.
- Describe common causes, pathogenesis, and their impact on patient health.

[Step 2: Imaging Characteristics]
- Describe the typical visual features of "{disease}" in medical imaging.
- Include details such as lesion location, shape, boundary characteristics, density variations, and common abnormal findings.

[Step 3: Text Report Correlation]
- Analyze common clinical symptoms, signs, and diagnostic indicators related to "{disease}" in textual reports.
- Explain how key information in these reports supports the diagnosis of "{disease}".

[Step 4: Comprehensive Reasoning & Expert Conclusion]
- Integrate the above information to systematically infer the diagnostic criteria for "{disease}".
- Provide a structured, detailed disease description covering background, imaging features, and textual report correlation to ensure alignment between multimodal information.

Please strictly follow the above reasoning steps and ensure the output maintains professional accuracy and authority.
"""

# Initialize LLaVA model
model_path = '/srv/lby/llava_med/llava-med-v1.5-mistral-7b'
model_path = os.path.expanduser(model_path)
model_name = get_model_name_from_path(model_path)

tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path, args.model_base, model_name, sparse_args, device_map='cuda:0'
)
model = LlavaModel("path/to/your/model")  # Adjust path to your LLaVA model

# Dictionary to store disease descriptions
disease_descriptions = {}

for disease in disease_list:
    prompt = PROMPT_TEMPLATE.format(disease=disease)
    generated_text = model.generate(prompt)  # Generate text using LLaVA
    disease_descriptions[disease] = generated_text.strip()

# Save results to JSON file
output_path = "mimic_disease_descriptions.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(disease_descriptions, f, ensure_ascii=False, indent=4)

print(f"Disease descriptions saved to {output_path}")


In [None]:
import os
import shutil
import pandas as pd

def extract_test_images(csv_path, src_root, dst_root):
    """
    将test.csv中指定的图片从all文件夹提取到test文件夹
    
    参数:
    csv_path : str   - test.csv文件路径
    src_root : str   - 原始图片所在的all文件夹根目录
    dst_root : str   - 目标test文件夹根目录
    """
    # 读取CSV文件
    try:
        df = pd.read_csv(csv_path, header=None, 
                        names=["id", "image_path", "class", "source"],
                        delimiter=' ')
    except Exception as e:
        print(f"CSV文件读取失败: {str(e)}")
        return

    # 创建目标文件夹
    os.makedirs(dst_root, exist_ok=True)
    
    # 统计变量
    success_count = 0
    missing_files = []
    
    # 遍历CSV中的每个图片路径
    for idx, row in df.iterrows():
        src_path = os.path.join(src_root, "all", row["image_path"])
        dst_path = os.path.join(dst_root, os.path.basename(row["image_path"]))
        
        # 检查源文件是否存在
        if not os.path.exists(src_path):
            missing_files.append(row["image_path"])
            continue
            
        # 执行文件复制
        try:
            shutil.copy2(src_path, dst_path)
            success_count += 1
        except Exception as e:
            print(f"复制失败 {row['image_path']}: {str(e)}")
            missing_files.append(row["image_path"])

    # 输出统计结果
    print(f"\n操作完成:")
    print(f"CSV总条目数: {len(df)}")
    print(f"成功复制文件: {success_count}")
    print(f"缺失文件数: {len(missing_files)}")
    
    # 如果存在缺失文件
    if len(missing_files) > 0:
        print("\n以下文件未找到:")
        for f in missing_files[:5]:  # 最多显示前5个缺失文件
            print(f" - {f}")
        if len(missing_files) > 5:
            print(f"（共{len(missing_files)}个缺失文件，仅显示前5个）")
    else:
        print("所有CSV中的图片已成功复制到test文件夹！")

# 使用示例
if __name__ == "__main__":
    extract_test_images(
        csv_path="/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/covidx-cxr2/test_split.csv",         # 替换为实际CSV路径
        src_root="/srv/lby/COVIDx_CXR-4",         # 保持与Dataset类相同的根目录
        dst_root="/srv/lby/COVIDx_CXR-4/test"     # 目标test文件夹路径
    )



操作完成:
CSV总条目数: 2969
成功复制文件: 2968
缺失文件数: 1

以下文件未找到:
 - 4c8dee3d9fe81567b98ed1b0b2b4c6_jumbo.jpg


In [None]:
import json

# 读取原始 JSON 文件
with open('/home/lby/llava_med/LLaVA-Med/llava/run/data/full_disease.json', 'r') as f:
    original_data = json.load(f)

# 转换格式：将数组内容用逗号拼接成字符串
processed_data = {
    key: ", ".join(value) 
    for key, value in original_data.items()
}

# 输出处理后的 JSON (indent=4 保持可读性)
with open('/home/lby/llava_med/LLaVA-Med/llava/run/data/new_full_disease.json.json', 'w') as f:
    json.dump(processed_data, f, indent=4)

# 若要直接查看结果可打印：
print(json.dumps(processed_data, indent=4))

{
    "normal": "border: clear and smooth, with the edge of the lung tissue appearing as a thin, curved line against the ribs., fluid: no fluid or effusion accumulation., location: fills the chest cavity, from just below the collarbones to just above the diaphragm., opacity: balanced, neither too opaque (white) nor too transparent (dark)., other: symmetric appearance between two chest's sides; clear visibility of the heart, ribs, spine, and diaphragm; bronchial tubes and blood vessels are visible as white lines or tree-branch patterns against the darker lung tissue., patterns: no cloudy or patchy areas, no concentrated white or black spots., shape: lungs appear as two large, oval or triangular areas on either side of the heart., texture: uniform with small, branching white lines representing the bronchi and blood vessels.",
    "clear": "border: clear, sharp borders of the heart, lungs, and diaphragm., fluid: no fluid or effusion accumulation., location: no abnormal shadows or spots in

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

def process_rsna_data():
    # 文件路径
    train_labels_path = "/srv/lby/RSNA_Pneumonia/stage_2_train_labels.csv"
    test_ids_path = "/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/rsna/new_test.csv"
    output_dir = "/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/rsna/"
    image_base_path = "/srv/lby/RSNA_Pneumonia/stage_2_train_images/"
    
    # 读取数据
    print("Loading train labels...")
    df = pd.read_csv(train_labels_path)
    print(f"Original training data size: {len(df)}")
    
    print("Loading test IDs to filter...")
    test_ids = pd.read_csv(test_ids_path)["ID"].tolist()
    print(f"Number of test IDs to filter out: {len(test_ids)}")
    
    # 过滤掉 test_ids 中的 patientId
    df_filtered = df[~df["patientId"].isin(test_ids)]
    print(f"Filtered training data size: {len(df_filtered)}")
    
    # 重新组织数据格式
    df_filtered = df_filtered.rename(columns={"patientId": "ID", "Target": "classes"})
    df_filtered["img_path"] = df_filtered["ID"].apply(lambda x: os.path.join(image_base_path, f"{x}.dcm"))
    df_filtered = df_filtered[["ID", "img_path", "classes"]]
    
    # 拆分训练集和验证集
    train_df, valid_df = train_test_split(df_filtered, test_size=0.2, random_state=42)
    
    print(f"Final train set size: {len(train_df)}")
    print(f"Final valid set size: {len(valid_df)}")
    
    # 保存结果
    train_df.to_csv(os.path.join(output_dir, "new_train.csv"), index=False)
    valid_df.to_csv(os.path.join(output_dir, "new_valid.csv"), index=False)
    
    print("Processing completed! Files saved.")

# 执行处理
process_rsna_data()

Loading train labels...
Original training data size: 30227
Loading test IDs to filter...
Number of test IDs to filter out: 5337
Filtered training data size: 24161
Final train set size: 19328
Final valid set size: 4833
Processing completed! Files saved.


In [None]:
import pandas as pd

# 文件路径
test_file = "/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/rsna/new_test.csv"
train_file = "/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/rsna/new_train.csv"

# 读取数据
test_ids = set(pd.read_csv(test_file)["ID"])
train_ids = set(pd.read_csv(train_file)["ID"])

# 计算交集
overlap_ids = test_ids.intersection(train_ids)

if overlap_ids:
    print(f"发现 {len(overlap_ids)} 个 ID 在 new_test 和 new_train 之间重叠！")
else:
    print("未发现重叠 ID，数据集划分正确。")


未发现重叠 ID，数据集划分正确。


In [34]:
import json
from dataclasses import dataclass
from glob import glob
from pathlib import Path
import random
from typing import Dict, List, Tuple

import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset
from transformers import AutoProcessor

import os

# 读取CSV文件
csv_file_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/rsna/new_train.csv'  # 替换为实际的CSV文件路径
df = pd.read_csv(csv_file_path)

# 采样比例
data_ratios = [0.01, 0.1, 1.0]  # 1%、10%、100%
json_base_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/fine_tuning/rsna'

# 创建必要的目录
os.makedirs(json_base_path, exist_ok=True)

disease_columns = [
    "normal", "pneumonia"
]
prefix = "/srv/lby/"

for ratio in data_ratios:
    sampled_df = df.sample(frac=ratio, random_state=42)  # 采样
    json_data = []

    for index, row in sampled_df.iterrows():
        label = disease_columns[row['classes']]
        conversations = [
            {"from": "human", "value": "What disease is indicated by the chest X-ray?\n<image>"},
            {"from": "gpt", "value": "This is a chest X-ray showing " + label}
        ]

        path = row["img_path"]
        entry = {
            "id": index,
            "image": path[len(prefix):] if path.startswith(prefix) else path,
            "conversations": conversations
        }
        json_data.append(entry)

    json_file_path = os.path.join(json_base_path, f'rsna_train_{int(ratio * 100)}.json')
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, ensure_ascii=False, indent=4)
    
    print(f" 采样 {ratio * 100:.1f}% 数据完成，数据量: {len(sampled_df)}，保存至: {json_file_path}")

print(" 所有采样数据已处理完成！")


 采样 1.0% 数据完成，数据量: 193，保存至: /home/lby/llava_med/LLaVA-Med/llava/run/data/fine_tuning/rsna/rsna_train_1.json
 采样 10.0% 数据完成，数据量: 1933，保存至: /home/lby/llava_med/LLaVA-Med/llava/run/data/fine_tuning/rsna/rsna_train_10.json
 采样 100.0% 数据完成，数据量: 19328，保存至: /home/lby/llava_med/LLaVA-Med/llava/run/data/fine_tuning/rsna/rsna_train_100.json
 所有采样数据已处理完成！


In [1]:
import os
import json
import pandas as pd
from PIL import Image
import torch
from torchvision import transforms

img_root = '/srv/lby/SIIM_Pneumothorax/processed-images-train/'
seg_root = '/srv/lby/SIIM_Pneumothorax/train_mask/'

# Define transforms for image and segmentation map
normalize = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
transform = transforms.Compose([
    transforms.Resize([336, 336]),
    transforms.ToTensor(),
    normalize,
])

seg_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize([336, 336]),
])

# Read CSV data
csv_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/siimacr/train.csv'
data_info = pd.read_csv(csv_path)

def process_and_save_data(sample_ratio, output_file):
    sampled_data = data_info.sample(frac=sample_ratio, random_state=42)
    new_data = []
    
    for idx, row in sampled_data.iterrows():
        img_path = img_root + str(row[0]) + '.png'
        seg_path = seg_root + str(row[0]) + '.gif'

        # Open and transform the image
        img = Image.open(img_path).convert('RGB')
        image = transform(img)

        # Open and transform the segmentation map
        seg_map = Image.open(seg_path)
        seg_map = seg_transform(seg_map)
        seg_map = (seg_map > 0).type(torch.int)
        
        # Binary classification: if segmentation map has positive area, set label to 1 (pneumothorax)
        class_label = int(torch.sum(seg_map) > 0)
        disease_label = "pneumothorax" if class_label == 1 else "normal"

        conversations = [
            {"from": "human", "value": "What disease is indicated by the chest X-ray?\n<image>"},
            {"from": "gpt", "value": "This is a chest X-ray showing " + disease_label}
        ]

        entry = {
            "id": idx,
            "image": 'SIIM_Pneumothorax/processed-images-train/' + str(row[0]) + '.png',
            "conversations": conversations
        }
        new_data.append(entry)

    # Ensure output directory exists
    json_dir = os.path.dirname(output_file)
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)

    # Save as JSON file
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(new_data, json_file, ensure_ascii=False, indent=4)
    
    print(f"采样比例: {sample_ratio * 100}% | 采样数据量: {len(new_data)} | 保存路径: {output_file}")

# Process and save data for 1%, 10%, and 100% samples
process_and_save_data(0.01, '/home/lby/llava_med/LLaVA-Med/llava/run/data/fine_tuning/siim/siim_train_1.json')
process_and_save_data(0.1, '/home/lby/llava_med/LLaVA-Med/llava/run/data/fine_tuning/siim/siim_train_10.json')
process_and_save_data(1.0, '/home/lby/llava_med/LLaVA-Med/llava/run/data/fine_tuning/siim/siim_train_100.json')

print("所有采样数据处理完成！")

  img_path = img_root + str(row[0]) + '.png'
  seg_path = seg_root + str(row[0]) + '.gif'
  "image": 'SIIM_Pneumothorax/processed-images-train/' + str(row[0]) + '.png',


采样比例: 1.0% | 采样数据量: 69 | 保存路径: /home/lby/llava_med/LLaVA-Med/llava/run/data/fine_tuning/siim/siim_train_1.json
采样比例: 10.0% | 采样数据量: 695 | 保存路径: /home/lby/llava_med/LLaVA-Med/llava/run/data/fine_tuning/siim/siim_train_10.json
采样比例: 100.0% | 采样数据量: 6949 | 保存路径: /home/lby/llava_med/LLaVA-Med/llava/run/data/fine_tuning/siim/siim_train_100.json
所有采样数据处理完成！


In [None]:
import os
import json
import pandas as pd
import random

# 设置随机种子以保证可复现性
random.seed(42)

# 图片根目录
img_root = '/srv/lby/COVIDx_CXR-4/'

# 读取 CSV 数据
csv_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/covidx-cxr2/train_split.csv'
data_info = pd.read_csv(csv_path, header=None, names=["id", "image_path", "class", "source"], delimiter=' ')

# 采样比例
sample_ratios = [0.01, 0.1, 1.0]  # 1%、10%、100%

# 遍历不同的采样比例
for ratio in sample_ratios:
    sampled_data = data_info.sample(frac=ratio, random_state=42).reset_index(drop=True)
    sampled_json_data = []
    
    # 处理数据
    for idx, row in sampled_data.iterrows():
        img_path = 'COVIDx_CXR-4/test/' + str(row["image_path"])   # 处理图片路径
        class_label = row["class"]   # 获取分类标签
        
        # 设定疾病标签
        disease_label = "covid-19" if class_label == 'positive' else "normal"
        
        # 创建对话数据
        conversations = [
            {"from": "human", "value": "What disease is indicated by the chest X-ray?\n<image>"},
            {"from": "gpt", "value": "This is a chest X-ray showing " + disease_label}
        ]
        
        entry = {
            "id": idx,
            "image": img_path,
            "conversations": conversations
        }
        
        sampled_json_data.append(entry)
    
    # 检查目录路径是否存在，如果不存在则创建
    json_file_path = f'/home/lby/llava_med/LLaVA-Med/llava/run/data/fine_tuning/covid/covid_train_{int(ratio*100)}.json'
    json_dir = os.path.dirname(json_file_path)
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    
    # 保存 JSON 文件
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(sampled_json_data, json_file, ensure_ascii=False, indent=4)
    
    print(f"采样比例: {ratio*100:.1f}% | 采样数据量: {len(sampled_json_data)} | 保存路径: {json_file_path}")

print("所有采样数据处理完成！")

  img_path = 'COVIDx_CXR-4/test/' + str(row[1])   # Assuming 2nd column has the image filename
  class_label = row[2]   # Assuming labels are in the columns starting from index 1


COVIDx_CXR-4/test/A517635-01-11-1901-NA-CHEST_AP_PORTABLE-16479-2.000000-AP-26259-1-1.jpg
COVIDx_CXR-4/test/3d505bb2-544e-4e2d-8859-ef33bf31c49f.png
COVIDx_CXR-4/test/17c749dc-e97a-4c5c-b4f4-c68b135a1a64.png
COVIDx_CXR-4/test/6a66b5bb-54f3-46b0-9675-9d137cb3f895.png
COVIDx_CXR-4/test/e17bacff-85d3-47b3-b2cb-2a3e123a6912.png
COVIDx_CXR-4/test/COVID(257).png
COVIDx_CXR-4/test/29a2fb4c-481c-4147-89ad-5b6d1e343903.png
COVIDx_CXR-4/test/88070db5-6752-437e-ad22-3f0f2451180a.png
COVIDx_CXR-4/test/A358050-01-21-1901-NA-CHEST_AP_PORT-10491-2.000000-AP-31581-1-1.jpg
COVIDx_CXR-4/test/A525539-12-31-1900-NA-CHEST_AP_PORT-95216-2.000000-AP-09880-1-1.jpg
COVIDx_CXR-4/test/A727037-01-27-1901-NA-CHEST_AP_PORT-24596-3.000000-AP-74839-1-1.jpg
COVIDx_CXR-4/test/b9a05d0d-8f3f-40db-8114-c3fd1a4792b0.png
COVIDx_CXR-4/test/99af6822-9ace-4a71-9e1e-4de315370ad4.png
COVIDx_CXR-4/test/A274721-01-24-1901-NA-CHEST_AP_PORT-51406-1.000000-AP-55736-1-1.jpg
COVIDx_CXR-4/test/A413445-01-09-1901-NA-CHEST_AP_PORT-58344-3

In [None]:
import json
import random
import os
import pandas as pd

# 设定随机种子，保证可复现性
random.seed(42)

# 定义疾病标签
disease_labels = [
    "fibrosis", "edema", "pneumothorax", "cardiomegaly", "atelectasis", "nodule",
    "emphysema", "no finding", "mass", "pleural_thickening", "effusion", "infiltration",
    "pneumonia", "hernia", "consolidation"
]

# 读取 CSV 数据
csv_path = '/home/lby/llava_med/LLaVA-Med/llava/run/data/process_data/xray14/official_train.csv'
data_info = pd.read_csv(csv_path)

# 设定采样比例
sampling_ratios = {"1": 0.01, "10": 0.1, "100": 1.0}
output_dir = '/home/lby/llava_med/LLaVA-Med/llava/run/data/fine_tuning/chest_xray'
os.makedirs(output_dir, exist_ok=True)

for ratio_name, ratio in sampling_ratios.items():
    sampled_data = data_info.sample(frac=ratio, random_state=42)
    new_data = []
    
    for idx, row in sampled_data.iterrows():
        img_path = str(row[0]).replace("/DATA/Chestxray/ChestXray8", "Chest-X-ray-dataset")
        diseases = [d.strip().lower() for d in row[2].split('|')]
        
        conversations = [
            {"from": "human", "value": "What disease is indicated by the chest X-ray?\n<image>"},
            {"from": "gpt", "value": "This is a chest X-ray showing " + ", ".join(diseases) + "."}
        ]
        
        entry = {"id": idx, "image": img_path, "conversations": conversations}
        new_data.append(entry)
    
    # 保存 JSON 文件
    json_file_path = os.path.join(output_dir, f'chest_xray_train_{ratio_name}.json')
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(new_data, json_file, ensure_ascii=False, indent=4)
    
    print(f"采样比例: {ratio_name}%，采样数据量: {len(new_data)}，文件已保存至: {json_file_path}")


  img_path = str(row[0]).replace("/DATA/Chestxray/ChestXray8","Chest-X-ray-dataset")  # Assuming the first column is the image filename
  disease_str = row[2]  # Assuming labels are in the columns starting from index 1


采样数据前5条: [{'id': 19327, 'image': 'Chest-X-ray-dataset/images_001/images/00000001_001.png', 'conversations': [{'from': 'human', 'value': 'What disease is indicated by the chest X-ray?\n<image>'}, {'from': 'gpt', 'value': 'This is a chest X-ray showing cardiomegaly, emphysema.'}]}, {'id': 19327, 'image': 'Chest-X-ray-dataset/images_001/images/00000001_002.png', 'conversations': [{'from': 'human', 'value': 'What disease is indicated by the chest X-ray?\n<image>'}, {'from': 'gpt', 'value': 'This is a chest X-ray showing cardiomegaly, effusion.'}]}, {'id': 19327, 'image': 'Chest-X-ray-dataset/images_001/images/00000002_000.png', 'conversations': [{'from': 'human', 'value': 'What disease is indicated by the chest X-ray?\n<image>'}, {'from': 'gpt', 'value': 'This is a chest X-ray showing no finding.'}]}, {'id': 19327, 'image': 'Chest-X-ray-dataset/images_001/images/00000004_000.png', 'conversations': [{'from': 'human', 'value': 'What disease is indicated by the chest X-ray?\n<image>'}, {'from