In [73]:
import json
import random
import pandas as pd

dataset = "Chest-X-ray"

# convert csv to jsonl
data = pd.read_csv(f"../../data/eval/{dataset}.csv")

# 将DataFrame转换为字典列表
data_list = data.to_dict(orient='records')

# 固定随机种子并打乱数据
random.seed(1234)
random.shuffle(data_list)
split_point = int(len(data_list) * 0.75)

new_data = []
for idx, item in enumerate(data_list):
    # 提取img_path和Finding Label
    img_path = item.get("img_path")
    finding_label = item.get("Finding Label").lower()

    if isinstance(finding_label, str):
        finding_label = finding_label.replace('|', ', ')
        
    # 创建新的数据项
    new_item = {
        "image": img_path,
        "label": finding_label,
        "split": "train" if idx < split_point else "test" 
    }
    
    # 添加到new_data列表中
    new_data.append(new_item)
random.shuffle(new_data)
# 将数据保存为JSONL文件
jsonl_file = f"../../data/eval/{dataset}.jsonl"
with open(jsonl_file, mode='w', encoding='utf-8') as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")

print(f"数据已成功保存到 {jsonl_file} 文件中。")

数据已成功保存到 ../../data/eval/Chest-X-ray.jsonl 文件中。


In [72]:
import json
import random
import pandas as pd

dataset = "Chest-X-ray"
labels = set()

jsonl_file = f"../../data/eval/{dataset}.jsonl"
with open(jsonl_file, mode='w', encoding='utf-8') as f:
    for item in new_data:
        label = item.get("label")
        if label:  # 如果 label 存在
            split_labels = label.split(",")  # 根据逗号拆分标签
            split_labels = [lbl.strip() for lbl in split_labels]  # 去掉每个类别的前后空格
            labels.update(split_labels)  # 更新到集合中

# 将集合转换为列表并保存为 JSON 文件
labels_list = list(labels)
output_file = f"../../data/eval/{dataset}_classes.json"

with open(output_file, mode='w', encoding='utf-8') as f:
    json.dump(labels_list, f, ensure_ascii=False, indent=4)

print(f"Labels saved to {output_file}")

Labels saved to ../../data/eval/Chest-X-ray_classes.json


In [1]:
import json
import random
import pandas as pd

dataset = "Chest-X-ray"

data = [json.loads(line) for line in open(f"../data/eval/{dataset}.jsonl")]


new_data = []
for idx, item in enumerate(data):
    new_item = {
        "image": item["image"].replace("/srv/lby/", ""),
        "text": "What type of disease is shown in this chest x-ray image? Here is the given list: ['fibrosis', 'edema', 'pneumothorax', 'cardiomegaly', 'atelectasis', 'nodule', 'emphysema', 'no finding', 'mass', 'pleural_thickening', 'effusion', 'infiltration', 'pneumonia', 'hernia', 'consolidation']. \nAnswer the question using a single word or phrase from the given list directly.", 
        "category": "conv",
        "label": item["label"],
        "question_id": f'{idx}-{item["label"]}',
    }
    new_data.append(new_item)

random.shuffle(new_data)

jsonl_file = f"../data/eval/test_prompt/{dataset}_llava_val.jsonl"
with open(jsonl_file, mode='w', encoding='utf-8') as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")
        
print(f"数据已成功保存到 {jsonl_file} 文件中。")


# You are now acting as a knowledgeable radiologist. Please analyze the provided medical image and identify the most appropriate disease category or categories present in the patient. The diagnosis can involve one or more conditions. You must select the relevant categories from the following list: ['atelectasis', 'cardiomegaly', 'pleural effusion', 'infiltration', 'lung mass', 'lung nodule', 'pneumonia', 'pneumothorax', 'consolidation', 'edema', 'emphysema', 'fibrosis', 'pleural thicken', 'hernia', 'no finding']. Remember, you should only output the categories from the list, and no additional content.
## 只输出类别的prompt
# You are now acting as a knowledgeable radiologist. Please analyze the provided medical image and identify the most appropriate disease category or categories present in the patient. You can only output the corresponding index number of the disease in the list from 0 to 14 from the given list: ['atelectasis', 'cardiomegaly', 'pleural effusion', 'infiltration', 'lung mass', 'lung nodule', 'pneumonia', 'pneumothorax', 'consolidation', 'edema', 'emphysema', 'fibrosis', 'pleural thicken', 'hernia', 'no finding'].
##ABCD尝试 
# What type of disease is shown in this chest x-ray image? Choose one from A. Atelectasis.\n B. Cardiomegaly.\n C. Pleural Effusion.\nD. Infiltration.
# ['fibrosis', 'edema', 'pneumothorax', 'cardiomegaly', 'atelectasis', 'nodule', 'emphysema', 'no finding', 'mass', 'pleural_thickening', 'effusion', 'infiltration', 'pneumonia', 'hernia', 'consolidation']


数据已成功保存到 ../data/eval/test_prompt/Chest-X-ray_llava_val.jsonl 文件中。


In [None]:
# 随机遍历jsonl文件1000行，获取每一行的question_id和text内容，question_id

In [55]:
import json
import re
import numpy as np

# 读取数据
output_path = '/home/lby/llava_med/LLaVA-Med/data/eval/test_fine_tuning/Chest-X-ray_llava_val_ans.jsonl'
answers = [json.loads(line) for line in open(output_path)]

disease_list = ['fibrosis', 'edema', 'pneumothorax', 'cardiomegaly', 'atelectasis', 'nodule', 'emphysema', 'no finding', 'mass', 'pleural_thickening', 'effusion', 'infiltration', 'pneumonia', 'hernia', 'consolidation']

# 随机选择 1000 行
random.shuffle(answers)
selected_answers = answers[:1000]

# 初始化变量
correct_predictions = 0
total_predictions = len(selected_answers)
error_count = 0
error_question_ids = []

# 遍历每个 answer，提取 labels 和预测类别
for item in selected_answers:
    # 获取标签（label），labels 可能包含多个标签，以逗号或其他符号分隔
    labels = ["-".join(item["question_id"].split("-")[1:])]  # 获取 label
    labels = [label.lower() for label in labels]  # 转为小写以方便比较

    # 获取预测的 text
    text = item["text"].lower()

    # 尝试在 text 中找到疾病列表中的元素作为预测结果
    predicted_categories = [disease for disease in disease_list if disease in text]

    if predicted_categories:
        predicted_category = predicted_categories[0]  # 假设预测类别为匹配到的第一个疾病
    else:
        # 如果无法提取预测类别，统计为出错
        error_count += 1
        error_question_ids.append(item["question_id"])
        continue  # 跳过此项

    # 检查预测类别是否在 labels 列表中
    if any(predicted_category in label for label in labels):
        correct_predictions += 1
    else:
        # 如果预测错误，统计出错信息
        error_count += 1
        error_question_ids.append(item["question_id"])

# 计算准确率
accuracy = (correct_predictions / total_predictions) * 100

# 输出结果
print(f"Total labels: {total_predictions}")
print(f"Accuracy: {accuracy:.2f}%")
print(f"Number of errors: {error_count}")
print(f"Error question IDs: {error_question_ids}")

Total labels: 1000
Accuracy: 6.10%
Number of errors: 939
Error question IDs: ['16031-no finding', '303-no finding', '15387-atelectasis', '2569-infiltration, pneumothorax', '15203-effusion, infiltration, pneumothorax', '9561-effusion, infiltration', '6817-no finding', '19636-infiltration, mass', '22920-infiltration, pneumothorax', '14558-no finding', '9379-no finding', '4594-no finding', '25223-atelectasis, infiltration', '15018-no finding', '22358-no finding', '2495-no finding', '3063-no finding', '22065-no finding', '21779-no finding', '19239-no finding', '21500-nodule', '5753-atelectasis, cardiomegaly', '7966-no finding', '7273-no finding', '25244-atelectasis, infiltration, pneumothorax', '21851-mass, nodule', '15219-no finding', '21672-infiltration', '15312-infiltration', '10924-no finding', '11146-nodule', '17668-no finding', '1376-infiltration, pneumothorax', '14631-effusion', '7867-no finding', '13664-infiltration', '7149-no finding', '11532-atelectasis', '246-mass', '15859-effus