In [1]:
import pandas as pd
import os
import random
import json

# Formatting BioASQ data to be used in BioBERT

## Train

In [34]:
# Load the BioASQ 11b training set; each row of the 'questions' column holds
# one question record (a dict).
df = pd.read_json('data/train_data/training11b.json')

# Keep only the yes/no questions. A single boolean mask replaces dropping rows
# one by one while iterating the index, which re-scanned the frame on every
# drop and mutated it in place.
is_yesno = [question['type'] == 'yesno' for question in df['questions']]
df = df.loc[is_yesno].reset_index(drop=True)

# Persist the filtered subset; it is reused by the formatting step.
df.to_json('data/train_data/training11b-yesno.json', indent=4)
df

Unnamed: 0,questions
0,"{'body': 'Is the protein Papilin secreted?', '..."
1,"{'body': 'Are long non coding RNAs spliced?', ..."
2,"{'body': 'Is RANKL secreted from the cells?', ..."
3,{'body': 'Does metformin interfere thyroxine a...
4,{'body': 'Has Denosumab (Prolia) been approved...
...,...
1266,{'body': 'Is ASF1 phopshorylated by the Tousl...
1267,{'body': 'Does sphingosine-1 phosphoate suppre...
1268,{'body': 'Can whole genome sequencing be used ...
1269,{'body': 'Can METTL3 methylate long noncoding ...


In [26]:
# Inspect the BioASQ 7b yes/no training file shipped under biobert/datasets:
# print 1 once for every paragraph that carries more than one 'qas' entry.
reference = pd.read_json('biobert/datasets/QA/BioASQ/BioASQ-train-yesno-7b.json')
for paragraph in reference['data'][0]['paragraphs']:
    qas_count = len(paragraph['qas'])
    if qas_count > 1:
        print(1)

In [75]:
# Formatting
# One paragraph entry is produced per (question, snippet) pair. The entry id is
# the question id followed by a 1-based, zero-padded snippet counter.
# NOTE(review): 'is_impossible' is stored as the strings 'false'/'true', not
# as booleans — confirm the consumer expects strings.
paragraphs = [
    {
        'qas': [
            {
                'id': f"{question['id']}_{position:03d}",
                'question': question['body'],
                'is_impossible': 'true' if question['exact_answer'] != 'yes' else 'false',
                'answers': question['exact_answer']
            }
        ],
        'context': snippet['text']
    }
    for question in df['questions']
    for position, snippet in enumerate(question['snippets'], start=1)
]

# NOTE(review): the 7b file inspected in this notebook is indexed as
# ['data'][0]['paragraphs'] (a list under 'data'), whereas 'data' here is a
# single dict — confirm the downstream reader accepts this shape.
result = {
    'data': {
        'paragraphs': paragraphs,
        'title': 'BioASQ11b'
    },
    'version': 'BioASQ11b'
}

result

{'data': {'paragraphs': [{'qas': [{'id': '54e25eaaae9738404b000017_001',
      'question': 'Is the protein Papilin secreted?',
      'is_impossible': 'false',
      'answers': 'yes'}],
    'context': 'Using expression analysis, we identify three genes that are transcriptionally regulated by HLH-2: the protocadherin cdh-3, and two genes encoding secreted extracellular matrix proteins, mig-6/papilin and him-4/hemicentin. '},
   {'qas': [{'id': '54e25eaaae9738404b000017_002',
      'question': 'Is the protein Papilin secreted?',
      'is_impossible': 'false',
      'answers': 'yes'}],
    'context': 'We found that mig-6 encodes long (MIG-6L) and short (MIG-6S) isoforms of the extracellular matrix protein papilin, each required for distinct aspects of DTC migration. Both MIG-6 isoforms have a predicted N-terminal papilin cassette'},
   {'qas': [{'id': '54e25eaaae9738404b000017_003',
      'question': 'Is the protein Papilin secreted?',
      'is_impossible': 'false',
      'answers': 'yes

In [82]:
# Serialize the `result` dictionary with a 2-space indent and write it out.
serialized = json.dumps(result, indent=2)
with open('data/train_data/BioASQ-train-yesno-11b.json', 'w') as fh:
    fh.write(serialized)

## Test

In [2]:
# Load the merged yes/no test questions; the columns read below are
# 'id', 'body', 'exact_answer' and 'snippets'.
df = pd.read_json('data/test_data/merged_yesno_test_7_11.json')

result = {
    'data': {
        'paragraphs': [],
        'title': 'BioASQ11b'
    },
    'version': 'BioASQ11b'
}

# Formatting
# One paragraph entry per (question, snippet) pair; ids are the question id
# plus a 1-based, zero-padded snippet counter.
# NOTE(review): 'is_impossible' is stored as the strings 'false'/'true', not
# as booleans — confirm the consumer expects strings.
collected = result['data']['paragraphs']
question_columns = zip(df['id'], df['body'], df['exact_answer'], df['snippets'])
for question_id, body, answer, snippets in question_columns:
    for position, snippet in enumerate(snippets, start=1):
        qa_entry = {
            'id': f"{question_id}_{position:03d}",
            'question': body,
            'is_impossible': 'true' if answer != 'yes' else 'false',
            'answers': answer
        }
        collected.append({'qas': [qa_entry], 'context': snippet['text']})

result

{'data': {'paragraphs': [{'qas': [{'id': '61f80d5d882a024a1000003c_001',
      'question': 'Do only changes in coding regions of MEF2C cause developmental disorders?',
      'is_impossible': 'true',
      'answers': 'no'}],
    'context': 'Non-coding region variants upstream of MEF2C cause severe developmental disorder through three distinct loss-of-function mechanisms.'},
   {'qas': [{'id': '61f80d5d882a024a1000003c_002',
      'question': 'Do only changes in coding regions of MEF2C cause developmental disorders?',
      'is_impossible': 'true',
      'answers': 'no'}],
    'context': "Clinical genetic testing of protein-coding regions identifies a likely causative variant in only around half of developmental disorder (DD) cases. The contribution of regulatory variation in non-coding regions to rare disease, including DD, remains very poorly understood. We screened 9,858 probands from the Deciphering Developmental Disorders (DDD) study for de novo mutations in the 5' untranslated regi

In [3]:
# Serialize the `result` dictionary with a 2-space indent and write it out.
serialized = json.dumps(result, indent=2)
with open('data/test_data/BioASQ-test-yesno-11b.json', 'w') as fh:
    fh.write(serialized)

# Convert QA data to document data (for RAG)

In [9]:
path = "data/Task{}BGoldenEnriched/"
SEED = 42  # fixed seed so the shuffled document order is reproducible across runs

# 7B to 11B Version
for i in range(7, 12):
    version_dir = path.format(i)

    # Combine all data
    # Sort the listing (os.listdir order is arbitrary) and keep only .json
    # entries so stray files or folders such as .ipynb_checkpoints are not
    # handed to read_json.
    json_files = sorted(
        name for name in os.listdir(version_dir)
        if name.lower().endswith('.json')
    )
    frames = [pd.read_json(os.path.join(version_dir, name)) for name in json_files]
    if not frames:
        print(f"No JSON files found in {version_dir}; skipping {i}B version.")
        continue

    # Concatenate once instead of growing a DataFrame inside the loop.
    df = pd.concat(frames, ignore_index=True)

    # Extract documents
    # One document per snippet: the stripped snippet text plus the value of
    # the snippet's 'document' field, stored under 'url'.
    data = [
        {
            'text': snippet['text'].strip(),
            'url': snippet['document'],
        }
        for body in df['questions']
        for snippet in body['snippets']
    ]
    if not data:
        # max() and the average below are undefined for an empty list.
        print(f"No snippets found for {i}B version; skipping.")
        continue

    # A dedicated seeded generator keeps the shuffle deterministic without
    # touching the global random state.
    random.Random(SEED).shuffle(data)
    length_text = [len(d['text']) for d in data]
    print('-'*50)
    print(f"Max document length in {i}B version: {max(length_text)}")
    print(f"Average document length in {i}B version: {sum(length_text) / len(length_text)}")

    # Write documents
    # NOTE(review): each line is the Python repr of the dict (str(d)), not
    # JSON — kept as-is so existing readers of these files keep working.
    with open(f'data/documents-{i}B.txt', 'w', encoding='utf-8') as f:
        for d in data:
            f.write(str(d) + '\n')

    print(f"Success on extracting {i}B JSON data!")


--------------------------------------------------
Max document length in 7B version: 2162
Average document length in 7B version: 213.10723789249602
Success on extracting 7B JSON data!
--------------------------------------------------
Max document length in 8B version: 1941
Average document length in 8B version: 202.48593228421555
Success on extracting 8B JSON data!
--------------------------------------------------
Max document length in 9B version: 2425
Average document length in 9B version: 201.36151960784315
Success on extracting 9B JSON data!
--------------------------------------------------
Max document length in 10B version: 2423
Average document length in 10B version: 194.6372942593336
Success on extracting 10B JSON data!
--------------------------------------------------
Max document length in 11B version: 1066
Average document length in 11B version: 195.11008353221956
Success on extracting 11B JSON data!
