In [1]:
import sys

# Local checkout of the histocartography repository; it is prepended to sys.path below.
# NOTE(review): these are absolute, machine-specific paths — adjust them before running elsewhere.
HISTO_PATH = "/data/mn27889/pathrag-synthetic-data-dpo/histocartography"
# Root of the PathVQA data; later cells read "images/val" and "qas/val/val_qa.pkl" beneath it.
PVQA_DATA_PATH = "/data/mn27889/pathrag-synthetic-data-dpo/data/pvqa"
# Directory that the selected evaluation images are copied into.
PVQA_EVAL_DATA_PATH = "/data/mn27889/pathrag-synthetic-data-dpo/path-vqa/evaluation_data/pvqa_evaluation/images"

# Put the checkout first on the import path so the `histocartography` import resolves to it.
sys.path.insert(0, HISTO_PATH)

In [2]:
from PIL import Image
import numpy as np
import os
import pickle
from histocartography.preprocessing import NucleiExtractor
import shutil
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
################################################################################
The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a
future torchdata release! Please see https://github.com/pytorch/data/issues/1196
to learn more and leave feedback.
################################################################################



### Nuclei Detector to identify the H&E stained images

In [3]:
# Build the histocartography nuclei extractor with its default arguments.
# The cell output shows it resolving the pannuke.pt checkpoint on construction.
nuclei_detector = NucleiExtractor()

File already downloaded.
/data/mn27889/pathrag-synthetic-data-dpo/histocartography/histocartography/preprocessing/../../checkpoints/pannuke.pt




### PathVQA Evaluation Dataset

In [4]:
# Locations of the validation split: the QA pickle and the image folder.
qas_val_path = os.path.join(PVQA_DATA_PATH, "qas/val/val_qa.pkl")
img_val_path = os.path.join(PVQA_DATA_PATH, "images/val")

# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open(qas_val_path, mode='rb') as qa_handle:
    pvqa_val_qas = pickle.load(qa_handle)

In [5]:
# Keep only the open-ended pairs, i.e. drop every pair answered with yes/no
qas_general = [entry for entry in pvqa_val_qas if entry['answer'] not in ('yes', 'no')]

# Order the remaining pairs by image name (in place; the list was freshly built above)
qas_general.sort(key=lambda entry: entry['image'])

# Unique image labels, ordered by their string form
img_general = sorted({entry['image'] for entry in qas_general}, key=str)

# Full path of every image inside the validation image folder
img_general_path = [os.path.join(img_val_path, label + '.jpg') for label in img_general]

### Extracting all the H&E stained images

To do this, use the nuclei count: only images in which more than 5 nuclei are detected are treated as H&E stained images. Note that the nuclei check is currently disabled in the cell below, so every image is kept.

In [6]:
# Switch for the nuclei-based H&E filter. It is off by default, which keeps
# every image: the same selection the previous unconditional branch produced.
USE_NUCLEI_FILTER = False
# An image counts as H&E stained only when more than this many nuclei are
# detected. The kNN graph is built with 5 neighbours, and images with fewer
# nuclei are mostly not pathology related.
MIN_NUCLEI_COUNT = 5

he_img_general = []
he_img_general_path = []

for img_name, img_path in zip(img_general, img_general_path):
    if USE_NUCLEI_FILTER:
        # Decode the image only when it is actually going to be analysed;
        # with the filter off there is no reason to read the pixels at all.
        with Image.open(img_path) as query_img:
            image = np.array(query_img.convert(mode="RGB"))
        _, nuclei_centers = nuclei_detector.process(image)
        # Too few nuclei: skip this image.
        if nuclei_centers.shape[0] <= MIN_NUCLEI_COUNT:
            continue

    he_img_general.append(img_name)
    he_img_general_path.append(img_path)

Now extract all the question–answer pairs that belong to the H&E stained images.

In [7]:
# Build the lookup once: testing membership against the list would rescan it
# for every question/answer pair. The labels are hashable (they were already
# de-duplicated through a set earlier in the notebook).
he_img_lookup = set(he_img_general)
he_qas_general = [qas for qas in qas_general if qas['image'] in he_img_lookup]

Collect the image IDs, questions, and answers into three parallel lists.

In [8]:
# One list per output column, all aligned with the order of he_qas_general
image_id = [record['image'] for record in he_qas_general]
question = [record['question'] for record in he_qas_general]
answer = [record['answer'] for record in he_qas_general]

Copy the selected image files into the evaluation image directory.

In [9]:
# Make sure the destination folder exists before copying into it.
os.makedirs(PVQA_EVAL_DATA_PATH, exist_ok=True)

# Source paths that were not found on disk; reported after the loop.
missing_images = []

for img_name, img_path in zip(he_img_general, he_img_general_path):
    # The copy must happen only for sources that exist. Copying outside this
    # guard would reuse the previous iteration's paths for a missing file
    # (or fail with NameError when the very first file is missing).
    if not os.path.isfile(img_path):
        missing_images.append(img_path)
        continue

    eval_img_path = os.path.join(PVQA_EVAL_DATA_PATH, img_name + '.jpg')
    shutil.copyfile(img_path, eval_img_path)

# Surface skipped files instead of dropping them silently.
if missing_images:
    print("Skipped {} missing image(s): {}".format(len(missing_images), missing_images))

Combine all the data into a single DataFrame and save it as an Excel file.

In [10]:
# Assemble the evaluation table: one row per question/answer pair.
final_df = pd.DataFrame(
    {
        'Image ID': image_id,
        'Question': question,
        'Answer': answer,
    }
)

In [11]:
# Display the assembled table; the rendered output below is truncated in the middle.
final_df

Unnamed: 0,Image ID,Question,Answer
0,val_0000,What has pseudohyphae and budding yeasts?,candida organism
1,val_0000,What does candida organism have?,pseudohyphae and budding yeasts
2,val_0000,What shows septate hyphae with acute-angle bra...,gomori methenamine-silver (gms) stain
3,val_0000,What does gomori methenamine-silver (GMS) stai...,septate hyphae with acute-angle branching
4,val_0000,How does gomori methenamine-silver (GMS) stain...,with acute-angle branching
...,...,...,...
3139,val_0990,What are present?,extremities
3140,val_0990,What are present ?,extremities
3141,val_0991,What does this image show?,disseminated intravascular coagulation dic
3142,val_0991,What is present?,eye


In [12]:
# Write the table to an Excel workbook; the relative path resolves against the current working directory.
# NOTE(review): to_excel is called with its defaults, so the DataFrame index is presumably
# written as an extra leading column — confirm whether downstream readers expect it.
final_df.to_excel('pvqa-evaluation.xlsx')