In [2]:
import json

def load_json_file_to_dict(file_name):
    """Parse the JSON file at ``file_name`` and return the decoded object.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content (a dict for
        the dataset files loaded in this notebook).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # The context manager closes the handle as soon as parsing is done instead
    # of leaving it to the garbage collector. UTF-8 is requested explicitly so
    # the result does not depend on the platform's default encoding.
    with open(file_name, encoding="utf-8") as json_file:
        return json.load(json_file)
# Load the dev split; the path is relative to the current working directory.
data_dict = load_json_file_to_dict("data/covid-qa/covid-qa-dev.json")


# Keep handles on the article list, the first article, and that article's
# first paragraph so the inspection cells can refer to them by name.
data = data_dict["data"]
sample_article = data[0]
sample_paragraph = sample_article['paragraphs'][0]

In [3]:
"# of articles: ", len(data)

('# of articles: ', 21)

In [4]:
# Fields of each paragraph entry:
#   qas:         the question/answer records attached to the paragraph
#   context:     the article text, as a single string
#   document_id: integer id of the article
# The cell displays how many paragraphs the first article has.
"# of paragraphs of 1st article: ", len(sample_article['paragraphs'])

('# of paragraphs of 1st article: ', 1)

In [7]:
# Show which fields a paragraph entry carries, using the first paragraph of
# the first article.
sample_article['paragraphs'][0].keys()

dict_keys(['qas', 'context', 'document_id'])

In [76]:
# Fields of each QA record:
#   question:      the question sentence
#   answers:       list of answers; each has `text` (the answer string) and
#                  `answer_start` (position in the article where the answer begins)
#   is_impossible: bool, True when the question cannot be answered
# The cell displays the first QA record of the sample paragraph.
"sample for QA pair", sample_paragraph['qas'][0]

('sample for QA pair',
 {'question': 'When is this especially true?',
  'id': '3963',
  'answers': [{'text': 'when not all exacerbation events occurred during the viral infection but may also occur well after viral clearance (Kim et al., 2008; Stolz et al., 2019) in particular the late onset of a bacterial infection',
    'answer_start': 15414}],
  'is_impossible': False})

In [77]:
def divide(a, b, round_val=6):
    """Return ``a / b`` rounded to ``round_val`` decimal places.

    A zero divisor does not raise; the quotient is taken to be 0 in that case.
    """
    if b != 0:
        quotient = a / b
    else:
        # Avoid ZeroDivisionError: treat the ratio as 0 when there is nothing
        # to divide by.
        quotient = 0
    return round(quotient, round_val)

In [78]:
from collections import Counter

In [79]:
def count_stats(data_dict):
    """Tally articles, paragraphs, QA pairs and answers in a dataset dict.

    Walks ``data_dict['data']`` -> ``'paragraphs'`` -> ``'qas'`` and returns a
    ``Counter`` with the raw counts (``article_count``, ``paragraph_count``,
    ``qa_count``, ``answer_count``), an ``impossible_count`` entry that is only
    created when at least one QA pair has a truthy ``is_impossible``, and three
    ratios computed with ``divide``: ``avg_paragraph`` (paragraphs per
    article), ``avg_qa`` (QA pairs per paragraph) and ``avg_answer`` (answers
    per QA pair).
    """
    articles = data_dict['data']
    tally = Counter()
    tally["article_count"] += len(articles)

    for entry in articles:
        paragraphs = entry['paragraphs']
        tally["paragraph_count"] += len(paragraphs)
        for para in paragraphs:
            qa_pairs = para['qas']
            tally["qa_count"] += len(qa_pairs)
            for qa in qa_pairs:
                tally["answer_count"] += len(qa["answers"])
                if qa["is_impossible"]:
                    tally["impossible_count"] += 1

    # Each ratio is derived from the totals accumulated above.
    ratio_specs = (
        ("avg_paragraph", "paragraph_count", "article_count"),
        ("avg_qa", "qa_count", "paragraph_count"),
        ("avg_answer", "answer_count", "qa_count"),
    )
    for ratio_key, numerator_key, denominator_key in ratio_specs:
        tally[ratio_key] = divide(tally[numerator_key], tally[denominator_key])
    return tally

In [80]:
# Summary statistics for whichever split data_dict currently holds.
# NOTE(review): the recorded output of this cell (375 QA pairs) is identical to
# the test-split stats shown later, while the dev split reports 203 QA pairs,
# even though the visible cells bind data_dict to the dev file before this
# point. This looks like stale kernel state from out-of-order execution;
# re-run from a fresh kernel to confirm.
count_stats(data_dict)

Counter({'article_count': 21,
         'paragraph_count': 21,
         'qa_count': 375,
         'answer_count': 375,
         'avg_paragraph': 1.0,
         'avg_qa': 17.857143,
         'avg_answer': 1.0})

In [81]:
# Rebind data_dict to the train split and display its statistics.
# Any later cell that reads data_dict sees the train data from here on.
data_dict = load_json_file_to_dict("data/covid-qa/covid-qa-train.json")
"Train data stats: ", count_stats(data_dict)

('Train data stats: ',
 Counter({'article_count': 103,
          'paragraph_count': 103,
          'qa_count': 1417,
          'answer_count': 1417,
          'avg_paragraph': 1.0,
          'avg_qa': 13.757282,
          'avg_answer': 1.0}))

In [82]:
# Rebind data_dict to the dev split and display its statistics.
data_dict = load_json_file_to_dict("data/covid-qa/covid-qa-dev.json")
"Dev data stats: ", count_stats(data_dict)

('Dev data stats: ',
 Counter({'article_count': 21,
          'paragraph_count': 21,
          'qa_count': 203,
          'answer_count': 203,
          'avg_paragraph': 1.0,
          'avg_qa': 9.666667,
          'avg_answer': 1.0}))

In [83]:
# Rebind data_dict to the test split and display its statistics.
# data_dict is left pointing at the test data once this cell has run.
data_dict = load_json_file_to_dict("data/covid-qa/covid-qa-test.json")
"Test data stats: ", count_stats(data_dict)

('Test data stats: ',
 Counter({'article_count': 21,
          'paragraph_count': 21,
          'qa_count': 375,
          'answer_count': 375,
          'avg_paragraph': 1.0,
          'avg_qa': 17.857143,
          'avg_answer': 1.0}))