In [1]:
import os
import csv
import random

In [2]:
# Input files (semicolon-delimited CSVs, read by the task cells below)
paper_metadata_file = "papers.csv"
figure_metadata_file = "figures.csv"
table_metadata_file = "tables.csv"

# Output files (one multiple-choice QA file per task)
output_task2 = "qa_pairs_task2.csv"
output_task3 = "qa_pairs_task3.csv"

# Fixed values
# Per-row probability that a figure / table is sampled into the task 2 file
# (task 3 uses twice these values).
FIXED_PROBABILITY_FIGURE = 0.009
FIXED_PROBABILITY_TABLE = 0.04
# Task 3 only keeps text mentions with at most this many whitespace-separated words.
FIXED_MAX_WORD_COUNT = 100

# Reproducibility: every selection below goes through the `random` module, so
# seed it once here to make "Restart Kernel -> Run All" produce the same files.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

In [3]:
# Script for task 2
# Randomly samples figures/tables, looks up the title of the paper each one
# belongs to, and writes one row per object:
#   object id; correct title; distractor title 1; distractor title 2; distractor title 3
input_files = [figure_metadata_file, table_metadata_file]
probabilities = [FIXED_PROBABILITY_FIGURE, FIXED_PROBABILITY_TABLE]
object_ids = set()
output_dict = {}

# Selecting tables and figures
for input_file, probability in zip(input_files, probabilities):
    # newline="" is what the csv module expects for files it reads.
    with open(input_file, "r", newline="", encoding="utf-8") as file:
        csv_reader = csv.reader(file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

        for row in csv_reader:
            # Blank or truncated lines carry no (object id, paper id) pair.
            if len(row) < 2:
                continue
            if random.random() < probability:
                object_ids.add((row[0], row[1]))

# Obtaining paper titles
paper_dict = {}
with open(paper_metadata_file, "r", newline="", encoding="utf-8") as file:
    csv_reader = csv.reader(file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

    for row in csv_reader:
        if len(row) < 2:
            continue
        paper_dict[row[0]] = row[1]

# Putting both together
# Sorted so the row order (and hence the random draws below) does not depend
# on the arbitrary iteration order of a set.
for object_id, paper_id in sorted(object_ids):
    if paper_id in paper_dict:
        output_dict[object_id] = paper_dict[paper_id]

# Distractor pool: every distinct title once, in file order, so a row can never
# contain the same wrong answer twice.
paper_name_list = list(paper_dict.values())
distractor_pool = list(dict.fromkeys(paper_name_list))
if output_dict and len(distractor_pool) < 4:
    # Checked before the output file is opened so a failure does not truncate it.
    raise ValueError(
        f"Need at least 4 distinct paper titles to build 3 distractors, found {len(distractor_pool)}."
    )

# Storing in csv file
with open(output_task2, "w", newline="", encoding="utf-8") as file:
    csv_writer = csv.writer(file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

    for key, correct_title in output_dict.items():
        # Draw 4 distinct titles and drop the correct one if it was drawn; the
        # first 3 survivors are the distractors. Always terminates, unlike
        # retrying until the sample happens to miss the correct title.
        candidates = random.sample(distractor_pool, 4)
        random_selections = [title for title in candidates if title != correct_title][:3]
        csv_writer.writerow([key, correct_title, *random_selections])

# Print message
# Report the rows actually written: sampled objects whose paper id is missing
# from the paper metadata are not part of the output.
print(f"csv file with {len(output_dict)} tables/figures was successfully created.")

csv file with 2172 tables/figures was successfully created.


In [4]:
# Script for task 3
# Randomly samples figures/tables whose text mention (last CSV column) is short
# enough, and writes one row per object:
#   object id; correct text mention; distractor 1; distractor 2; distractor 3
input_files = [figure_metadata_file, table_metadata_file]
probabilities = [FIXED_PROBABILITY_FIGURE*2, FIXED_PROBABILITY_TABLE*2]
output_dict = {}
text_mentions_list = []

# Selecting tables and figures
for input_file, probability in zip(input_files, probabilities):
    # newline="" is what the csv module expects for files it reads.
    with open(input_file, "r", newline="", encoding="utf-8") as file:
        csv_reader = csv.reader(file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

        for row in csv_reader:
            # Blank lines are parsed as empty rows and have no columns to read.
            if not row:
                continue
            text_mention = row[-1]

            if len(text_mention.split()) <= FIXED_MAX_WORD_COUNT:
                # Every short-enough mention is a potential distractor, whether
                # or not its object is sampled as a question.
                text_mentions_list.append(text_mention)
                if random.random() < probability:
                    output_dict[row[0]] = text_mention

# Distractor pool: the collected text mentions, each distinct value once.
# (Wrong answers must be other text mentions, not the paper titles left over
# from the task 2 cell.)
distractor_pool = list(dict.fromkeys(text_mentions_list))
if output_dict and len(distractor_pool) < 4:
    # Checked before the output file is opened so a failure does not truncate it.
    raise ValueError(
        f"Need at least 4 distinct text mentions to build 3 distractors, found {len(distractor_pool)}."
    )

# Storing in csv file
with open(output_task3, "w", newline="", encoding="utf-8") as file:
    csv_writer = csv.writer(file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

    for key, correct_tm in output_dict.items():
        # Draw 4 distinct mentions and drop the correct one if it was drawn; the
        # first 3 survivors are the distractors. Always terminates.
        candidates = random.sample(distractor_pool, 4)
        random_selections = [mention for mention in candidates if mention != correct_tm][:3]
        csv_writer.writerow([key, correct_tm, *random_selections])

# Print message
print(f"csv file with {len(output_dict)} tables/figures was successfully created.")

csv file with 1746 tables/figures was successfully created.
