In [1]:
import json
import os
import time
import re
import logging
import random

from itertools import chain
from string import punctuation


import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import (AdamW, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup)
import pytorch_lightning as pl

from termcolor import colored
from sklearn.model_selection import train_test_split
from pathlib import Path

In [2]:
# Fix the global seed (42) through PyTorch Lightning so repeated runs are reproducible.
pl.seed_everything(42)

Global seed set to 42


42

In [7]:
# JSON is UTF-8 by specification and this file contains non-ASCII text
# (e.g. the en dash in "Li–Fraumeni"), so decode explicitly instead of
# relying on the platform's default encoding.
with Path("./datasets/QA/BioASQ/BioASQ-train-factoid-4b.json").open(encoding="utf-8") as json_file:
    data = json.load(json_file)

In [10]:
# List the top-level keys of the loaded JSON document.
data.keys()

dict_keys(['data', 'version'])

In [None]:
# Show the value stored under the top-level "version" key.
data["version"]

In [12]:
# List the keys of the first entry under "data".
data["data"][0].keys()

dict_keys(['paragraphs', 'title'])

In [14]:
# Show the title of the first "data" entry.
data["data"][0]["title"]

'BioASQ6b'

In [16]:
# Count the paragraph records held by the first "data" entry.
len(data["data"][0]["paragraphs"])

3266

In [17]:
# Keep a handle on the paragraph records of the first "data" entry.
# NOTE(review): judging by the first record, each item holds a "context"
# string plus a "qas" list, so the name `questions` is slightly loose.
questions = data["data"][0]["paragraphs"]

In [18]:
# Peek at the first paragraph record to see its structure.
questions[0]

{'qas': [{'id': '52bf208003868f1b06000019_002',
   'question': 'What is the inheritance pattern of Li–Fraumeni syndrome?',
   'answers': [{'text': 'autosomal dominant', 'answer_start': 213}]}],
 'context': 'Balanced t(11;15)(q23;q15) in a TP53+/+ breast cancer patient from a Li-Fraumeni syndrome family. Li-Fraumeni Syndrome (LFS) is characterized by early-onset carcinogenesis involving multiple tumor types and shows autosomal dominant inheritance. Approximately 70% of LFS cases are due to germline mutations in the TP53 gene on chromosome 17p13.1. Mutations have also been found in the CHEK2 gene on chromosome 22q11, and others have been mapped to chromosome 11q23. While characterizing an LFS family with a documented defect in TP53, we found one family member who developed bilateral breast cancer at age 37 yet was homozygous for wild-type TP53. Her mother also developed early-onset primary bilateral breast cancer, and a sister had unilateral breast cancer and a soft tissue sarcoma. Cytog

In [19]:
def extract_questions_and_answer(factoid_path: Path):
    """Flatten a SQuAD-style BioASQ factoid JSON file into one row per answer.

    The file is expected to follow the layout inspected in this notebook::

        {"data": [{"paragraphs": [{"context": str,
                                   "qas": [{"question": str,
                                            "answers": [{"text": str,
                                                         "answer_start": int}]}]}]}]}

    Every entry under ``"data"`` is read (not only the first one), so files
    that split their paragraphs over several entries are not silently
    truncated. For files with a single entry the result is unchanged.

    Args:
        factoid_path: Path of the JSON file to parse.

    Returns:
        A ``pd.DataFrame`` with the columns ``question``, ``context``,
        ``answer``, ``answer_start`` and ``answer_end`` — one row per
        (context, question, answer) combination, in file order.
        ``answer_end`` is ``answer_start + len(answer)``. When the file holds
        no answers the frame is empty but still carries these columns.

    Raises:
        KeyError: If an expected key is missing from the JSON structure.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with factoid_path.open(encoding="utf-8") as json_file:
        data = json.load(json_file)

    columns = ["question", "context", "answer", "answer_start", "answer_end"]
    data_rows = []

    for article in data["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question_text = qa["question"]
                for answer in qa["answers"]:
                    answer_text = answer["text"]
                    answer_start = answer["answer_start"]
                    data_rows.append({
                        "question": question_text,
                        "context": context,
                        "answer": answer_text,
                        "answer_start": answer_start,
                        # End offset is exclusive: context[start:end] == answer.
                        "answer_end": answer_start + len(answer_text),
                    })

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(data_rows, columns=columns)

In [21]:
# Sanity-check the parser on a single file by previewing its first rows.
extract_questions_and_answer(Path("./datasets/QA/BioASQ/BioASQ-train-factoid-4b.json")).head()

Unnamed: 0,question,context,answer,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [25]:
# Collect every BioASQ factoid training file, ordered by path.
# sorted() accepts the glob generator directly and always returns a list.
factoid_paths = sorted(
    Path("./datasets/QA/BioASQ").glob("BioASQ-train-factoid-*")
)
factoid_paths

[PosixPath('datasets/QA/BioASQ/BioASQ-train-factoid-4b.json'),
 PosixPath('datasets/QA/BioASQ/BioASQ-train-factoid-5b.json'),
 PosixPath('datasets/QA/BioASQ/BioASQ-train-factoid-6b.json'),
 PosixPath('datasets/QA/BioASQ/BioASQ-train-factoid-7b.json')]

In [27]:
# Parse every factoid file into its own frame.
dfs = [extract_questions_and_answer(factoid_path) for factoid_path in factoid_paths]

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every file's rows keep their own 0-based labels, so labels repeat across
# files and label-based lookups such as df.loc[0] return several rows.
df = pd.concat(dfs, ignore_index=True)

In [28]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,question,context,answer,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [29]:
# (rows, columns) of the combined frame.
df.shape

(17219, 5)

In [30]:
# Number of distinct question strings across all files.
len(df["question"].unique())

557

In [31]:
# Number of distinct context passages across all files.
len(df["context"].unique())

6161

3431677