### PathVQA Data Preparation

This code will prepare the PathVQA data for evaluation. We need to evaluate PathVQA as follows:

Close-Ended: 50 \
Open-Ended: 550
- What
- Where
- How
- When
- Why

Since we want only the questions/answers displaying sensitive knowledge, we will only consider the data samples for which both the question and the answer are at least 5 words long. If we are not able to find the required number of samples, we can also consider those data samples for which either the answer or the question has at least 5 words.

In [None]:
import sys
from PIL import Image
import pandas as pd
import os
import pickle
from collections import Counter
import numpy as np
import shutil

#### Defining the paths for PVQA dataset

In [None]:
# Root directory of the PathVQA histopathology dataset.
pvqa_data_path = "/data/mn27889/path-open-data/pathvqa-histopathology"
# Images and question/answer files sit in sibling sub-directories of the root.
pvqa_images, pvqa_qas = (
    os.path.join(pvqa_data_path, sub_dir) for sub_dir in ("images", "qas")
)

#### Considering images/qas from `train` subset

In [None]:
# Name of the dataset split used for building the evaluation set.
pvqa_subset = "train"
# Resolve the split-specific image and QA directories in one pass.
pvqa_images_subset_path, pvqa_qas_subset_path = (
    os.path.join(root_dir, pvqa_subset) for root_dir in (pvqa_images, pvqa_qas)
)

#### Reading the QAS

In [None]:
# Load the pickled question/answer records of the selected split.
file_name = "train_qa.pkl"
qas_file_path = os.path.join(pvqa_qas_subset_path, file_name)
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted copy of the dataset.
with open(qas_file_path, mode='rb') as file:
    pvqa_qas_subset = pickle.load(file)

print('Total Samples:', len(pvqa_qas_subset))

In [None]:
# Peek at the first five records to check their structure.
pvqa_qas_subset[:5]

#### Differentiating the open-ended and close-ended questions

In [None]:
# A sample is close-ended when its answer is exactly "yes" or "no"
# (compared case-insensitively).
pvqa_qas_close_ended = [
    qa for qa in pvqa_qas_subset
    if qa['answer'].lower() in ('yes', 'no')
]
len(pvqa_qas_close_ended)

In [None]:
# Open-ended samples are all samples whose answer is not a bare yes/no.
# The answer is tested directly instead of using
# `sample not in pvqa_qas_close_ended`: that membership test scans the whole
# close-ended list for every sample (quadratic), while this predicate selects
# the same samples in a single pass.
pvqa_qas_open_ended = [
    qa for qa in pvqa_qas_subset
    if qa['answer'].lower() not in ('yes', 'no')
]
len(pvqa_qas_open_ended)

In [None]:
# Sanity check: the close-ended and open-ended lists together account for
# every sample of the subset.
assert len(pvqa_qas_subset) == len(pvqa_qas_close_ended) + len(pvqa_qas_open_ended)

#### Finding the samples for which question and answer are equal to or more than 5 words.
- For close-ended questions, only check question text since answer would be 'yes' or 'no'
- For open-ended questions, check both the question and answer

In [None]:
# Keep close-ended samples whose question has at least 5 whitespace-separated
# words; the answer is not checked here.
valid_close_ended_samples = [
    qa for qa in pvqa_qas_close_ended
    if len(qa['question'].split()) >= 5
]
len(valid_close_ended_samples)

In [None]:
# Keep open-ended samples where both the question and the answer have at
# least 5 whitespace-separated words.
valid_open_ended_samples = [
    qa for qa in pvqa_qas_open_ended
    if len(qa['question'].split()) >= 5
    and len(qa['answer'].split()) >= 5
]
len(valid_open_ended_samples)

#### From open-ended questions, separating the questions starting with the following words:
- What
- Where
- How
- When
- Why

In [None]:
# Tally the lower-cased first word of every valid open-ended question to see
# which question words occur and how often.
first_word = [
    qa['question'].split(maxsplit=1)[0].lower()
    for qa in valid_open_ended_samples
]
counts = Counter(first_word)
print(counts)

In [None]:
def _questions_with_prefix(prefix):
    """Return the valid open-ended samples whose lower-cased question starts with *prefix*."""
    # NOTE(review): this is a plain prefix match, so 'how' also matches a
    # question beginning with e.g. "however" - confirm this is acceptable.
    return [
        qa for qa in valid_open_ended_samples
        if qa['question'].lower().startswith(prefix)
    ]


# One list per question word.
what_question_samples = _questions_with_prefix('what')
how_question_samples = _questions_with_prefix('how')
why_question_samples = _questions_with_prefix('why')
where_question_samples = _questions_with_prefix('where')
when_question_samples = _questions_with_prefix('when')

# Report how many samples fall into each group.
print('Total What Questions:', len(what_question_samples))
print('Total How Questions:', len(how_question_samples))
print('Total Why Questions:', len(why_question_samples))
print('Total Where Questions:', len(where_question_samples))
print('Total When Questions:', len(when_question_samples))

### Compiling the data into a single dataframe

Selecting the Top 50 samples of Close-Ended

In [None]:
# Evaluation table for close-ended questions, built from the first 50 valid
# samples. All rows are collected first and passed to pandas once, instead of
# growing the frame one row at a time with `.loc[len(df)] = ...`.
pvqa_eval_data_close_ended = pd.DataFrame(
    [
        {
            # Images are stored as "<image id>.jpg" inside the subset folder.
            'image_path': os.path.join(pvqa_images_subset_path, sample['image'] + '.jpg'),
            'question': sample['question'],
            'answer': sample['answer'],
            'question_type': 'close-ended',
        }
        for sample in valid_close_ended_samples[0:50]
    ],
    columns=['image_path', 'question', 'answer', 'question_type'],
)

pvqa_eval_data_close_ended.head()

Selecting the Top 400 samples of What Open-Ended

In [None]:
# Evaluation table for open-ended "what" questions, built from the first 400
# matching samples. All rows are collected first and passed to pandas once,
# instead of growing the frame one row at a time with `.loc[len(df)] = ...`.
pvqa_eval_data_open_what = pd.DataFrame(
    [
        {
            # Images are stored as "<image id>.jpg" inside the subset folder.
            'image_path': os.path.join(pvqa_images_subset_path, sample['image'] + '.jpg'),
            'question': sample['question'],
            'answer': sample['answer'],
            'question_type': 'open-what',
        }
        for sample in what_question_samples[0:400]
    ],
    columns=['image_path', 'question', 'answer', 'question_type'],
)

pvqa_eval_data_open_what.head()

Selecting all samples of How Open-Ended

In [None]:
# Evaluation table for all open-ended "how" questions. All rows are collected
# first and passed to pandas once, instead of growing the frame one row at a
# time with `.loc[len(df)] = ...`.
pvqa_eval_data_open_how = pd.DataFrame(
    [
        {
            # Images are stored as "<image id>.jpg" inside the subset folder.
            'image_path': os.path.join(pvqa_images_subset_path, sample['image'] + '.jpg'),
            'question': sample['question'],
            'answer': sample['answer'],
            'question_type': 'open-how',
        }
        for sample in how_question_samples
    ],
    columns=['image_path', 'question', 'answer', 'question_type'],
)

pvqa_eval_data_open_how.head()

Selecting all samples of Why Open-Ended

In [None]:
# Evaluation table for all open-ended "why" questions. All rows are collected
# first and passed to pandas once, instead of growing the frame one row at a
# time with `.loc[len(df)] = ...`.
pvqa_eval_data_open_why = pd.DataFrame(
    [
        {
            # Images are stored as "<image id>.jpg" inside the subset folder.
            'image_path': os.path.join(pvqa_images_subset_path, sample['image'] + '.jpg'),
            'question': sample['question'],
            'answer': sample['answer'],
            'question_type': 'open-why',
        }
        for sample in why_question_samples
    ],
    columns=['image_path', 'question', 'answer', 'question_type'],
)

pvqa_eval_data_open_why.head()

Selecting all samples of Where Open-Ended

In [None]:
# Evaluation table for all open-ended "where" questions. All rows are
# collected first and passed to pandas once, instead of growing the frame one
# row at a time with `.loc[len(df)] = ...`.
pvqa_eval_data_open_where = pd.DataFrame(
    [
        {
            # Images are stored as "<image id>.jpg" inside the subset folder.
            'image_path': os.path.join(pvqa_images_subset_path, sample['image'] + '.jpg'),
            'question': sample['question'],
            'answer': sample['answer'],
            'question_type': 'open-where',
        }
        for sample in where_question_samples
    ],
    columns=['image_path', 'question', 'answer', 'question_type'],
)

pvqa_eval_data_open_where.head()

Selecting all samples of When Open-Ended

In [None]:
# Evaluation table for all open-ended "when" questions. All rows are collected
# first and passed to pandas once, instead of growing the frame one row at a
# time with `.loc[len(df)] = ...`.
pvqa_eval_data_open_when = pd.DataFrame(
    [
        {
            # Images are stored as "<image id>.jpg" inside the subset folder.
            'image_path': os.path.join(pvqa_images_subset_path, sample['image'] + '.jpg'),
            'question': sample['question'],
            'answer': sample['answer'],
            'question_type': 'open-when',
        }
        for sample in when_question_samples
    ],
    columns=['image_path', 'question', 'answer', 'question_type'],
)

pvqa_eval_data_open_when.head()

### Uploading all the images to Google Drive and get the drive links

Since we will be using the Google Form for the evaluation, we need to upload all the images to a specific Google Drive Folder. Then we need to get the drive link of each image and provide it to evaluators.

1. Copy all the selected PathVQA images from the server into a specific folder
2. Upload the Folder to google drive
3. Prepare a Google App Script to get the name and links (URL) of those files from the google drive folder in a google sheet
4. Map the names from Google Sheet and Dataframes to get the URLs of each image onto Google Drive
5. The resulting dataframes will be the final csv files which will be provided to evaluators

First, copy all the images for all question types into a specific folder to be uploaded to Google Drive

In [None]:
# Gather the distinct image paths of every question-type table, then
# de-duplicate across tables (an image can appear in more than one table).
unique_images_path_all = np.concatenate([
    frame['image_path'].unique()
    for frame in (
        pvqa_eval_data_close_ended,
        pvqa_eval_data_open_what,
        pvqa_eval_data_open_how,
        pvqa_eval_data_open_why,
        pvqa_eval_data_open_where,
        pvqa_eval_data_open_when,
    )
])
unique_images_path_all = np.unique(unique_images_path_all)
print('Total Unique Images for Evaluation:', len(unique_images_path_all))

Copying all these images into a folder

In [None]:
# Destination folder (relative to the working directory) that will later be
# uploaded to Google Drive.
pvqa_eval_images_dir = 'PathVQA_Eval_Images'
os.makedirs(pvqa_eval_images_dir, exist_ok=True)

# Copy (not move) every selected image; the source files stay in place.
for image_path in unique_images_path_all:
    shutil.copy(src=image_path, dst=pvqa_eval_images_dir)

Now upload this folder to Google Drive. Then run the following script in Apps Script (script.google.com)

In [None]:
# The lines below are Google Apps Script (JavaScript), kept commented out so
# this cell does nothing when run as Python. The script looks up the Drive
# folder by name, creates a new spreadsheet, and appends one row per file
# holding the file's name and URL.
# function listFolderContents2() {
#   var foldername = 'PathVQA_Eval_Images';
#   var folderlisting = 'File Names and Links - '+ foldername;

#   var folders = DriveApp.getFoldersByName(foldername);
#   var folder = folders.next();
#   var contents = folder.getFiles();

#   var ss = SpreadsheetApp.create(folderlisting);
#   var sheet = ss.getActiveSheet();
#   sheet.appendRow(['name','link']);

#   var file;
#   var name;
#   var link;
#   var row;

#   while(contents.hasNext()) {
#     file = contents.next();
#     name = file.getName();
#     link = file.getUrl();
#     sheet.appendRow([name,link]);
#   }
# };

After running the above script, a new Google Sheet will be created with the names and Google Drive links of the files. That sheet needs to be downloaded as a CSV file and mapped back to all the individual question sets to finalize the image URLs in Google Drive

In [None]:
# Folder holding the evaluation artefacts, including the downloaded sheet of
# file names and Drive links.
data_eval_dir = 'data_eval'
pathvqa_drive_links_file = os.path.join(
    data_eval_dir, "file_names_and_links_PathVQA_Eval_Images.csv"
)
# Columns 'name' and 'link' are read from this file by the following cells.
pathvqa_drive_links = pd.read_csv(filepath_or_buffer=pathvqa_drive_links_file)
pathvqa_drive_links.head()

Changing the name of each file to complete path for correct mapping later on

In [None]:
# Rebuild the full server-side path of each file so the links can be joined
# to the question tables on 'image_path'.
pathvqa_drive_links['image_path'] = pathvqa_drive_links['name'].apply(
    lambda file_name: os.path.join(pvqa_images_subset_path, file_name)
)
# The image id is the file name without its extension. os.path.splitext
# strips only the final extension, whereas split('.')[0] would cut the id at
# the first dot if a file name contained more than one.
pathvqa_drive_links['image_id'] = pathvqa_drive_links['name'].apply(
    lambda file_name: os.path.splitext(file_name)[0]
)
pathvqa_drive_links.head()

### Mapping the Google Drive Links with each question set separately

In [None]:
# Left join: keep every close-ended question row and attach the Drive link
# columns of its image.
pvqa_eval_data_close_ended = pvqa_eval_data_close_ended.merge(
    pathvqa_drive_links, how='left', on='image_path'
)
pvqa_eval_data_close_ended.head()

In [None]:
# Left join: keep every "what" question row and attach the Drive link columns
# of its image.
pvqa_eval_data_open_what = pvqa_eval_data_open_what.merge(
    pathvqa_drive_links, how='left', on='image_path'
)
pvqa_eval_data_open_what.head()

In [None]:
# Left join: keep every "how" question row and attach the Drive link columns
# of its image.
pvqa_eval_data_open_how = pvqa_eval_data_open_how.merge(
    pathvqa_drive_links, how='left', on='image_path'
)
pvqa_eval_data_open_how.head()

In [None]:
# Left join: keep every "where" question row and attach the Drive link columns
# of its image.
pvqa_eval_data_open_where = pvqa_eval_data_open_where.merge(
    pathvqa_drive_links, how='left', on='image_path'
)
pvqa_eval_data_open_where.head()

In [None]:
# Left join: keep every "why" question row and attach the Drive link columns
# of its image.
pvqa_eval_data_open_why = pvqa_eval_data_open_why.merge(
    pathvqa_drive_links, how='left', on='image_path'
)
pvqa_eval_data_open_why.head()

In [None]:
# Left join: keep every "when" question row and attach the Drive link columns
# of its image.
pvqa_eval_data_open_when = pvqa_eval_data_open_when.merge(
    pathvqa_drive_links, how='left', on='image_path'
)
pvqa_eval_data_open_when.head()

### Creating the final dataset



Joining all these datasets into one dataframe to extract the final dataset to be used for evaluation of PathVQA

In [None]:
# Stack the complete per-question-type tables into the final evaluation set.
# The full frames are concatenated: passing `.head()` of each frame would keep
# only the first five rows per question type.
vqa_data_pathvqa = pd.concat([pvqa_eval_data_close_ended,
                            pvqa_eval_data_open_what,
                            pvqa_eval_data_open_how,
                            pvqa_eval_data_open_why,
                            pvqa_eval_data_open_where,
                            pvqa_eval_data_open_when]).reset_index(drop=True)

# Preview only; the full frame is kept in `vqa_data_pathvqa`.
vqa_data_pathvqa.head()

In [None]:
# Write the evaluator-facing columns, in this order, to the final CSV file.
vqa_data_pathvqa.loc[
    :, ['image_id', 'link', 'question_type', 'question', 'answer']
].to_csv('data_eval/pathvqa_data.csv')