In [1]:
import os
import csv
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# File paths (relative to the notebook's working directory).
# qa_pair_file: csv file with the QA pairs that the checks and statistics below read.
# qa_pair_filtered_file: csv file that the random figure filter writes.
qa_pair_file = "qa_pairs.csv"
qa_pair_filtered_file = "qa_pairs_filtered.csv"

In [39]:
import random

# Probability with which each "Figure" row is dropped from the filtered file.
PROBABILITY = 0.4
# Fixed seed so that a re-run of this cell drops exactly the same rows.
SEED = 42
counter = 0  # Number of "Figure" rows that were dropped.

# Private generator: reproducible without touching the global `random` state.
rng = random.Random(SEED)

# newline='' on both files lets the csv module handle line endings itself, so
# line breaks embedded in quoted fields are copied unchanged.
with open(qa_pair_file, "r", newline='', encoding="utf-8") as metadata:
    spamreader = csv.reader(metadata, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

    with open(qa_pair_filtered_file, "w", newline='', encoding="utf-8") as output_file:
        csv_writer = csv.writer(output_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

        for row in spamreader:
            # Rows with fewer than two fields (e.g. blank lines, which csv.reader
            # yields as []) have no type column; they are copied through instead
            # of failing with an IndexError.
            is_figure = len(row) > 1 and row[1] == "Figure"

            # A random draw is only made for "Figure" rows; all other rows are kept.
            if is_figure and rng.random() <= PROBABILITY:
                counter += 1
                continue

            csv_writer.writerow(row)

print(f"{counter} figures have been randomly filtered.")

39843 figures have been randomly filtered.


In [3]:
# Expected number of fields per csv row; passed to check_row_length as `row_length`.
FIXED_ROW_LENGTH = 4

def check_row_length(file_path, row_length):
    """
    Checks if all rows in a csv file have the expected length.

    The file is parsed with ';' as delimiter and '|' as quote character. Every row
    whose number of fields differs from `row_length` is reported on stdout and
    collected. A summary line is printed at the end.

    Args:
        file_path (str): The path to the csv file.
        row_length (int): The expected number of columns in each row.

    Returns:
        set[tuple(str, str, str)]: The first three fields of every corrupted row.
            Rows with fewer than three fields (including blank lines) are padded
            with empty strings. Because a set is returned, corrupted rows that
            share the same first three fields are counted once.
    """
    corrupted_objects = set()

    with open(file_path, "r", encoding="utf-8") as metadata:
        spamreader = csv.reader(metadata, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in spamreader:
            if len(row) == row_length:
                continue

            # A corrupted row may be shorter than three fields (csv.reader yields []
            # for a blank line), so pad before slicing instead of indexing directly,
            # which would raise an IndexError on exactly the rows this check is for.
            identifier = tuple((list(row) + ["", "", ""])[:3])
            corrupted_objects.add(identifier)
            print(f"Unexpected row length of {len(row)} for: {identifier[0]}")

    print(f"Check for {file_path} complete. {len(corrupted_objects)} rows were corrupted.")
    return corrupted_objects

# Verify that every row of the QA-pair file has FIXED_ROW_LENGTH fields.
check_row_length(qa_pair_file, FIXED_ROW_LENGTH)

Check for qa_pairs.csv complete. 0 rows were corrupted.


set()

In [4]:
def check_question_ending(file_path, verbose=False):
    """
    Checks if the questions end with a question mark.

    The file is parsed with ';' as delimiter and '|' as quote character. The
    question is taken from the third field of each row. Rows whose question is
    empty, or which have no third field at all (including blank lines), are
    treated as corrupted as well. A summary line is printed at the end.

    Args:
        file_path (str): The path to the QA-pair csv file.
        verbose (bool, optional): Whether to print corrupted rows. Defaults to False.

    Returns:
        set[tuple(str, str)]: The first two fields of every row whose question does
            not end with '?'. Missing fields are replaced by empty strings. Because
            a set is returned, rows sharing the same first two fields are counted once.
    """
    corrupted_objects = set()

    with open(file_path, "r", encoding="utf-8") as metadata:
        spamreader = csv.reader(metadata, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in spamreader:
            # Short rows have no question field; use "" so they are flagged
            # instead of raising an IndexError.
            question = row[2] if len(row) > 2 else ""

            # endswith() is safe for the empty string, unlike question[-1].
            if question.endswith('?'):
                continue

            identifier = tuple((list(row) + ["", ""])[:2])
            corrupted_objects.add(identifier)
            if verbose:
                print(f"Missing question mark for: {question}")

    print(f"Check for {file_path} complete. {len(corrupted_objects)} rows were corrupted.")
    return corrupted_objects
                
# Collect the rows of the QA-pair file whose question does not end with '?'.
check_question_ending(qa_pair_file, verbose=False)

Check for qa_pairs.csv complete. 384 rows were corrupted.


{('2201.01641_TAB_4', 'Table'),
 ('2201.04363_TAB_5', 'Table_02'),
 ('2201.07601_TAB_6', 'Table_02'),
 ('2202.00137_TAB_1', 'Table_02'),
 ('2202.04528_TAB_7', 'Table_02'),
 ('2202.10007_TAB_6', 'Table_02'),
 ('2203.00488_TAB_1', 'Table_02'),
 ('2203.04602_TAB_4', 'Table_02'),
 ('2203.10358_TAB_1', 'Table_02'),
 ('2203.16280_TAB_3', 'Table'),
 ('2203.17053_TAB_4', 'Table_02'),
 ('2204.00497_TAB_2', 'Table_02'),
 ('2204.03762_TAB_7', 'Table_02'),
 ('2204.11164_TAB_3', 'Table_02'),
 ('2205.01759_TAB_8', 'Table_02'),
 ('2205.07050_TAB_2', 'Table_02'),
 ('2205.07139_TAB_2', 'Table_02'),
 ('2205.08569_TAB_4', 'Table_02'),
 ('2205.08584_TAB_3', 'Table_02'),
 ('2205.11019_TAB_1', 'Table_02'),
 ('2206.01674_TAB_1', 'Table_02'),
 ('2206.01674_TAB_4', 'Table_02'),
 ('2206.01932_FIG_3', 'Figure'),
 ('2206.15096_TAB_1', 'Table_02'),
 ('2207.01816_TAB_1', 'Table_02'),
 ('2207.04739_TAB_12', 'Table_02'),
 ('2207.05566_TAB_6', 'Table_02'),
 ('2207.06396_TAB_1', 'Table_02'),
 ('2207.07288_TAB_1', 'Tabl

In [None]:
def count_labels(file_path):
    """
    Counts how often each value occurs in the second column of a csv file.

    The file is read with ';' as delimiter and '|' as quote character.

    NOTE(review): read_csv is called without a `header` argument, so pandas
    presumably uses the first line as column names and does not count it —
    confirm that the file starts with a header row.

    Args:
        file_path (str): The path to the QA-pair csv file.

    Returns:
        pandas.Series: Occurrence count per distinct value of the second column.
    """
    frame = pd.read_csv(file_path, delimiter=";", quotechar="|")
    label_column = frame.iloc[:, 1]
    return label_column.value_counts()

# Show how often each value of the second column occurs in the QA-pair file.
count_labels(qa_pair_file)

In [33]:
def count_not_unique_questions(file_path):
    """
    Finds the questions (third csv column) that occur at least twice.

    Prints the number of such questions, the total number of rows they cover
    and the first ten entries of the occurrence counts.

    Args:
        file_path (str): The path to the QA-pair csv file.

    Returns:
        set: The distinct question texts that occur two or more times.
    """
    frame = pd.read_csv(file_path, delimiter=";", quotechar="|")
    occurrences = frame.iloc[:, 2].value_counts()

    # Keep only the questions that show up more than once.
    repeated = occurrences[occurrences >= 2]

    summary_lines = (
        f"Number of not unique questions: {len(repeated)}",
        f"Number of affected pairs: {repeated.sum()}",
        "\nExamples for not unique questions:",
    )
    for line in summary_lines:
        print(line)
    print(repeated[:10])

    return set(repeated.index)

# Report the questions that occur more than once in the QA-pair file.
count_not_unique_questions(qa_pair_file)

Number of not unique questions: 657
Number of affected pairs: 1613

Examples for not unique questions:
According to Table                                                                                                        221
During which phase does the UAV move along the positive direction of the y-axis in the world frame?                        16
According to the ablation study in Table                                                                                   14
What does Table                                                                                                             8
How should figures be included and referenced in a LaTeX document according to the provided text mentions?                  5
What metric in Table                                                                                                        4
How should figures be referenced in the text according to the provided instructions?                                        4
What is the ord

{'According to Table ',
 'According to Table \\ref{Conf}, how many more true negatives does the MLP algorithm have compared to the Manual Rules algorithm?',
 'According to Table \\ref{S1-vol-is3}, by what percentage does the LDA approximation underestimate the equilibrium volume compared to the experimental value?',
 'According to Table \\ref{T-3}, which activation function shows the best performance in terms of loss components at the end of training for solving the Biharmonic PDE using the proposed PINNs framework?',
 'According to Table \\ref{T-5}, which metric is used to compare the average detection time among TPH-YOLOv5, Dense-YOLOv4, and DenseSPH-YOLOv5?',
 "According to Table \\ref{Table-2}, which subgroups' orders are provided for the Sporadic simple groups?",
 'According to Table \\ref{Table4.2}, which method consistently supports the result of $\\alpha=0$ across different sample sizes, bandwidth parameters, and kernel functions?',
 'According to Table \\ref{Table::DataUsage},

In [None]:
def get_qa_pair_statistics(file_path, object_type):
    """
    Analyzes the QA pairs, prints word-count statistics and plots their distributions.

    Word counts are obtained by whitespace-splitting the string form of the
    question (third column) and of the answer (fourth column) of every row.
    Printed are min / median / mean / max word counts and three binned
    distributions (questions, answers with fine bins, answers with coarse bins).
    The same three distributions are shown as bar charts.

    NOTE(review): cells are cast with astype(str) before counting, so a missing
    cell is presumably counted as the single word "nan" — confirm. Entries with
    a word count of 0 fall outside all bins (lowest edge 0 is exclusive) and do
    not appear in the distributions.

    Args:
        file_path (str): The path to the QA-pair csv file.
        object_type (str): The type of the object which shall be analyzed.
            Must be of {'Table', 'Figure', 'Table_02', 'Overall'}. 'Overall'
            analyzes all rows without filtering by type.

    Raises:
        ValueError: If 'object_type' is not one of the expected values.
    """
    # Checking object_type
    if object_type not in ["Table", "Figure", "Table_02", "Overall"]:
        raise ValueError("Object_type must be of {'Table', 'Figure', 'Table_02', 'Overall'}.")

    # Defining column indexes
    object_type_index = 1
    question_index = 2
    answer_index = 3

    # Defining bins for grouping (right-inclusive intervals, see pd.cut calls below)
    bins_questions = [0, 5, 10, 15, 20, 25, 30, 35, 40, float('inf')]
    labels_questions = ["<=5", "6-10", "11-15", "16-20", "21-25", "26-30", "31-35", "36-40", ">40"]
    bins_answers = [0, 1, 2, 3, 4, 5, float('inf')]
    labels_answers = ["1", "2", "3", "4", "5", ">5"]
    bins_answers_02 = [0, 3, 6, 9, 12, 15, 18, 21, float('inf')]
    labels_answers_02 = ["<=3", "4-6", "7-9", "10-12", "13-15", "16-18", "19-21", ">21"]

    # Loading and filtering
    df = pd.read_csv(file_path, delimiter=";", quotechar="|")
    if object_type != "Overall":
        df = df[df.iloc[:, object_type_index] == object_type]

    # Getting word counts
    word_counts_question = df.iloc[:, question_index].astype(str).apply(lambda x: len(x.split()))
    word_counts_answer = df.iloc[:, answer_index].astype(str).apply(lambda x: len(x.split()))

    # Getting min, mean, median and max word lengths
    min_question_length = word_counts_question.min()
    min_answer_length = word_counts_answer.min()
    average_question_length = word_counts_question.mean()
    average_answer_length = word_counts_answer.mean()
    median_question_length = word_counts_question.median()
    median_answer_length = word_counts_answer.median()
    max_question_length = word_counts_question.max()
    max_answer_length = word_counts_answer.max()

    # Getting word distributions for questions
    word_count_distribution_question = pd.cut(word_counts_question, bins=bins_questions, labels=labels_questions, right=True)
    distribution_counts_question = word_count_distribution_question.value_counts().sort_index()
    distribution_counts_question.name = "Question word count"

    # Getting word distributions for answers (small range)
    word_count_distribution_answer = pd.cut(word_counts_answer, bins=bins_answers, labels=labels_answers, right=True)
    distribution_counts_answer = word_count_distribution_answer.value_counts().sort_index()
    distribution_counts_answer.name = "Answer word count"

    # Getting word distributions for answers (big range)
    word_count_distribution_answer_02 = pd.cut(word_counts_answer, bins=bins_answers_02, labels=labels_answers_02, right=True)
    distribution_counts_answer_02 = word_count_distribution_answer_02.value_counts().sort_index()
    distribution_counts_answer_02.name = "Answer word count"

    # Print min, median, average and max values
    print(f"{object_type} results:\n")
    print(f"Min question length: {min_question_length} words")
    print(f"Median question length: {median_question_length} words")
    print(f"Average question length: {round(average_question_length, 2)} words")
    print(f"Max question length: {max_question_length} words\n")
    print(f"Min answer length: {min_answer_length} words")
    print(f"Median answer length: {median_answer_length} words")
    print(f"Average answer length: {round(average_answer_length, 2)} words")
    print(f"Max answer length: {max_answer_length} words\n")

    # Print word count distributions
    print("Word count distribution for questions:")
    print(distribution_counts_question)
    print("\nWord count distributions for answers (small range):")
    print(distribution_counts_answer)
    print("\nWord count distributions for answers (big range):")
    print(distribution_counts_answer_02)

    # Plot distributions: questions top-left, answers (small range) top-right,
    # answers (big range) bottom-right.
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    distribution_counts_question.plot(kind="bar",
                                      ax=axes[0][0],
                                      title="Question Word Count Distribution",
                                      ylabel="Number of Questions",
                                      xlabel="Word Count Range")
    # The two answer charts use different bins, so their titles name the range
    # (matching the printed headings) to keep them distinguishable.
    distribution_counts_answer.plot(kind="bar",
                                    ax=axes[0][1],
                                    title="Answer Word Count Distribution (small range)",
                                    ylabel="Number of Answers",
                                    xlabel="Word Count Range",
                                    color="forestgreen")
    distribution_counts_answer_02.plot(kind="bar",
                                       ax=axes[1][1],
                                       title="Answer Word Count Distribution (big range)",
                                       ylabel="Number of Answers",
                                       xlabel="Word Count Range",
                                       color="forestgreen")
    # Only three of the four grid cells are used; hide the empty one so no
    # blank frame with ticks is rendered.
    axes[1][0].axis("off")
    plt.tight_layout()
    plt.show()

In [None]:
# Statistics over all rows ("Overall" skips the filter on the object type column).
get_qa_pair_statistics(qa_pair_file, "Overall")

In [None]:
# Statistics restricted to rows whose object type column equals "Figure".
get_qa_pair_statistics(qa_pair_file, "Figure")

In [None]:
# Statistics restricted to rows whose object type column equals "Table".
get_qa_pair_statistics(qa_pair_file, "Table")

In [None]:
# Statistics restricted to rows whose object type column equals "Table_02".
get_qa_pair_statistics(qa_pair_file, "Table_02")