### PathVQA Data Filtering for Histopathology Images

This notebook filters the PathVQA dataset down to its histopathology images and the questions that refer to them. PathVQA contains both gross and histopathology images, but we only want to run the evaluation on histopathology images.

In [None]:
import sys
from PIL import Image
import pandas as pd
import os
import pickle
from collections import Counter
import numpy as np
import shutil

sys.path.append(os.path.join(os.getcwd(), 'histocartography'))
from histocartography.preprocessing import NucleiExtractor

#### Defining a function that detects the nuclei in an image and uses their count to flag histopathology images

In [None]:
# Cell Graph Generation Definitions
# A single NucleiExtractor instance is created here and reused for every image.
nuclei_detector = NucleiExtractor()

def detect_histopathology_images(img_path, min_nuclei=5):
    """Decide whether the image at ``img_path`` looks like a histopathology image.

    The image is converted to RGB, passed through ``nuclei_detector`` and
    classified by the number of nuclei centers the detector reports.

    Args:
        img_path: Path of the image file to inspect.
        min_nuclei: The image is accepted only when strictly more than this
            many nuclei are detected. The default of 5 matches the number of
            neighbors used by the downstream kNN graph construction.

    Returns:
        bool: True when more than ``min_nuclei`` nuclei were detected,
        False otherwise.
    """
    # Open through a context manager so the file handle is released as soon
    # as the pixel data has been converted, instead of waiting for garbage
    # collection while looping over thousands of files.
    with Image.open(img_path) as query_img:
        image = np.array(query_img.convert(mode="RGB"))

    # Only the nuclei centers are needed here; the nuclei map is discarded.
    _, nuclei_centers = nuclei_detector.process(image)

    # Strictly more than `min_nuclei` detections are required, since kNN needs
    # enough nuclei to form a graph with that many neighbors. Images with
    # fewer detections are mostly not pathology related.
    return bool(nuclei_centers.shape[0] > min_nuclei)

#### Defining the paths for PVQA dataset

In [None]:
# Root of the original PathVQA data. The location can be overridden through the
# PVQA_DATA_PATH environment variable; when it is unset the original path is used.
pvqa_data_path = os.environ.get("PVQA_DATA_PATH", "/data/mn27889/pvqa")
pvqa_images = os.path.join(pvqa_data_path, "images")  # holds one image folder per split
pvqa_qas = os.path.join(pvqa_data_path, "qas")  # holds one QA folder per split

#### Defining the destination paths where the histopathology-only subset of PathVQA (images and questions) is written

In [None]:
# Root of the filtered (histopathology-only) copy of PathVQA. The location can be
# overridden through the PVQA_HISTO_DATA_PATH environment variable; when it is
# unset the original path is used.
pvqa_histo_data_path = os.environ.get(
    "PVQA_HISTO_DATA_PATH", "/data/mn27889/path-open-data/pathvqa-histopathology"
)
pvqa_histo_images = os.path.join(pvqa_histo_data_path, "images")
pvqa_histo_qas = os.path.join(pvqa_histo_data_path, "qas")
# Create both destination folders up front; exist_ok keeps re-runs harmless.
os.makedirs(pvqa_histo_images, exist_ok=True)
os.makedirs(pvqa_histo_qas, exist_ok=True)

#### 1. Filtering the images/qas from `train` subset

Defining the source directories

In [None]:
# Split being processed, and the folders that hold its source images and QA file.
pvqa_subset = "train"
pvqa_images_subset_path, pvqa_qas_subset_path = (
    os.path.join(source_root, pvqa_subset) for source_root in (pvqa_images, pvqa_qas)
)

Defining the destination directories

In [None]:
# Mirror the split layout under the filtered dataset root.
pvqa_histo_images_subset_path, pvqa_histo_qas_subset_path = (
    os.path.join(destination_root, pvqa_subset)
    for destination_root in (pvqa_histo_images, pvqa_histo_qas)
)
# exist_ok=True lets this cell be re-run without failing on existing folders.
os.makedirs(pvqa_histo_images_subset_path, exist_ok=True)
os.makedirs(pvqa_histo_qas_subset_path, exist_ok=True)

Detecting all the histopathology images

In [None]:
# Run the nuclei check on every entry of the split folder and collect the names that pass.
image_list = os.listdir(pvqa_images_subset_path)
histo_images = []
for image in image_list:
    is_image_histo = detect_histopathology_images(os.path.join(pvqa_images_subset_path, image))
    print(image)  # progress trace, printed once the image has been processed
    if not is_image_histo:
        continue
    histo_images.append(image)

Filtering all the questions related to histopathology images

In [None]:
file_name = "train_qa.pkl"
qas_file_path = os.path.join(pvqa_qas_subset_path, file_name)
# NOTE: pickle.load can execute arbitrary code; only load QA files from a trusted source.
with open(qas_file_path, 'rb') as file:
    pvqa_qas_subset = pickle.load(file)

# QA samples are matched on the image file name without its extension.
histo_images_without_extension = [os.path.splitext(img_name)[0] for img_name in histo_images]
# Test membership against a set so the filter stays linear in the number of QA
# samples instead of rescanning the whole image list for every sample.
histo_image_ids = set(histo_images_without_extension)
pvqa_histo_qas_subset = [qa_sample for qa_sample in pvqa_qas_subset if qa_sample['image'] in histo_image_ids]

Total Samples: 19755


Copying all the histo images to the destination directory

In [None]:
# Copy every accepted image into the destination split folder; the source files stay in place.
for image_name in histo_images:
    shutil.copy(os.path.join(pvqa_images_subset_path, image_name), pvqa_histo_images_subset_path)

Saving the filtered qa_samples to the destination directory

In [None]:
# Write the filtered QA samples under the same file name as the source pickle.
pvqa_histo_qas_subset_path_file = os.path.join(pvqa_histo_qas_subset_path, file_name)
with open(pvqa_histo_qas_subset_path_file, 'wb') as qa_output:
    pickle.dump(pvqa_histo_qas_subset, qa_output)

#### 2. Filtering the images/qas from `test` subset

Defining the source directories

In [None]:
# Split being processed, and the folders that hold its source images and QA file.
pvqa_subset = "test"
pvqa_images_subset_path, pvqa_qas_subset_path = (
    os.path.join(source_root, pvqa_subset) for source_root in (pvqa_images, pvqa_qas)
)

Defining the destination directories

In [None]:
# Mirror the split layout under the filtered dataset root.
pvqa_histo_images_subset_path, pvqa_histo_qas_subset_path = (
    os.path.join(destination_root, pvqa_subset)
    for destination_root in (pvqa_histo_images, pvqa_histo_qas)
)
# exist_ok=True lets this cell be re-run without failing on existing folders.
os.makedirs(pvqa_histo_images_subset_path, exist_ok=True)
os.makedirs(pvqa_histo_qas_subset_path, exist_ok=True)

Detecting all the histopathology images

In [None]:
# Run the nuclei check on every entry of the split folder and collect the names that pass.
image_list = os.listdir(pvqa_images_subset_path)
histo_images = []
for image in image_list:
    is_image_histo = detect_histopathology_images(os.path.join(pvqa_images_subset_path, image))
    print(image)  # progress trace, printed once the image has been processed
    if not is_image_histo:
        continue
    histo_images.append(image)

Filtering all the questions related to histopathology images

In [None]:
file_name = "test_qa.pkl"
qas_file_path = os.path.join(pvqa_qas_subset_path, file_name)
# NOTE: pickle.load can execute arbitrary code; only load QA files from a trusted source.
with open(qas_file_path, 'rb') as file:
    pvqa_qas_subset = pickle.load(file)

# QA samples are matched on the image file name without its extension.
histo_images_without_extension = [os.path.splitext(img_name)[0] for img_name in histo_images]
# Test membership against a set so the filter stays linear in the number of QA
# samples instead of rescanning the whole image list for every sample.
histo_image_ids = set(histo_images_without_extension)
pvqa_histo_qas_subset = [qa_sample for qa_sample in pvqa_qas_subset if qa_sample['image'] in histo_image_ids]

Total Samples: 19755


Copying all the histo images to the destination directory

In [None]:
# Copy every accepted image into the destination split folder; the source files stay in place.
for image_name in histo_images:
    shutil.copy(os.path.join(pvqa_images_subset_path, image_name), pvqa_histo_images_subset_path)

Saving the filtered qa_samples to the destination directory

In [None]:
# Write the filtered QA samples under the same file name as the source pickle.
pvqa_histo_qas_subset_path_file = os.path.join(pvqa_histo_qas_subset_path, file_name)
with open(pvqa_histo_qas_subset_path_file, 'wb') as qa_output:
    pickle.dump(pvqa_histo_qas_subset, qa_output)

#### 3. Filtering the images/qas from `val` subset

Defining the source directories

In [None]:
# Split being processed, and the folders that hold its source images and QA file.
pvqa_subset = "val"
pvqa_images_subset_path, pvqa_qas_subset_path = (
    os.path.join(source_root, pvqa_subset) for source_root in (pvqa_images, pvqa_qas)
)

Defining the destination directories

In [None]:
# Mirror the split layout under the filtered dataset root.
pvqa_histo_images_subset_path, pvqa_histo_qas_subset_path = (
    os.path.join(destination_root, pvqa_subset)
    for destination_root in (pvqa_histo_images, pvqa_histo_qas)
)
# exist_ok=True lets this cell be re-run without failing on existing folders.
os.makedirs(pvqa_histo_images_subset_path, exist_ok=True)
os.makedirs(pvqa_histo_qas_subset_path, exist_ok=True)

Detecting all the histopathology images

In [None]:
# Run the nuclei check on every entry of the split folder and collect the names that pass.
image_list = os.listdir(pvqa_images_subset_path)
histo_images = []
for image in image_list:
    is_image_histo = detect_histopathology_images(os.path.join(pvqa_images_subset_path, image))
    print(image)  # progress trace, printed once the image has been processed
    if not is_image_histo:
        continue
    histo_images.append(image)

Filtering all the questions related to histopathology images

In [None]:
file_name = "val_qa.pkl"
qas_file_path = os.path.join(pvqa_qas_subset_path, file_name)
# NOTE: pickle.load can execute arbitrary code; only load QA files from a trusted source.
with open(qas_file_path, 'rb') as file:
    pvqa_qas_subset = pickle.load(file)

# QA samples are matched on the image file name without its extension.
histo_images_without_extension = [os.path.splitext(img_name)[0] for img_name in histo_images]
# Test membership against a set so the filter stays linear in the number of QA
# samples instead of rescanning the whole image list for every sample.
histo_image_ids = set(histo_images_without_extension)
pvqa_histo_qas_subset = [qa_sample for qa_sample in pvqa_qas_subset if qa_sample['image'] in histo_image_ids]

Total Samples: 19755


Copying all the histo images to the destination directory

In [None]:
# Copy every accepted image into the destination split folder; the source files stay in place.
for image_name in histo_images:
    shutil.copy(os.path.join(pvqa_images_subset_path, image_name), pvqa_histo_images_subset_path)

Saving the filtered qa_samples to the destination directory

In [None]:
# Write the filtered QA samples under the same file name as the source pickle.
pvqa_histo_qas_subset_path_file = os.path.join(pvqa_histo_qas_subset_path, file_name)
with open(pvqa_histo_qas_subset_path_file, 'wb') as qa_output:
    pickle.dump(pvqa_histo_qas_subset, qa_output)