In [1]:
import pandas as pd


In [2]:
import numpy as np

import matplotlib.pyplot as plt
import re

import json

In [3]:
import random 
def split_data(lst, train_ratio=0.75):
    random.seed(42)
    random.shuffle(lst)
    end_train = int(len(lst)*0.75)+1
    train_data = lst[:end_train]
    dev_data = lst[end_train:]
    return train_data, dev_data

def normalize_text(txt):
    txt = re.sub(r"\s+", " ",txt).strip() #remove whitespaces
    txt = re.sub(r"<.*?>", " ", txt).strip() #remove html tags
    #remove unicode
    txt = txt.encode("ascii", "ignore")
    txt = txt.decode()
    return txt

# 1. Mr Tydi Dataset

In [None]:
import csv

train_json = []
dev_json = []
test_json = []
with open('./Dataset/Questions/mrtydi-v1.1-indonesian/train.jsonl', 'r', encoding='utf-8', errors='ignore') as train:
    train_json = list(train)

with open('./Dataset/Questions/mrtydi-v1.1-indonesian/dev.jsonl', 'r', encoding='utf-8', errors='ignore') as dev:
    dev_json = list(dev)

with open('./Dataset/Questions/mrtydi-v1.1-indonesian/test.jsonl', 'r', encoding='utf-8', errors='ignore') as dev:
    test_json = list(dev)

In [None]:
train_new_json = []
dev_new_json = []

for json_str in dev_json:
    row = dict()
    result = json.loads(json_str)
    row['question'] = result['query']
    row['answers'] = []
    row['positive_ctxs'] = result['positive_passages']
    for i in range(len(row['positive_ctxs'])):
        row['positive_ctxs'][i]['text'] = corpus_map[row['positive_ctxs'][i]['docid']]['text']
        row['positive_ctxs'][i]['title'] = corpus_map[row['positive_ctxs'][i]['docid']]['title']
    row['negative_ctxs'] = result['negative_passages'] if 'negative_passages' in result.keys() else []
    row['hard_negative_ctxs'] = []
    dev_new_json.append(row)


for json_str in train_json:
    row = dict()
    result = json.loads(json_str)
    row['question'] = result['query']
    row['answers'] = []
    row['positive_ctxs'] = result['positive_passages']
    row['negative_ctxs'] = result['negative_passages'] if 'negative_passages' in result.keys() else []
    row['hard_negative_ctxs'] = []
    train_new_json.append(row)


In [None]:
with open('./Dataset/Questions/train_tydi.json', 'w') as outfile:
    json.dump(train_new_json, outfile)

with open('./Dataset/Questions/dev_tydi.json', 'w') as outfile:
    json.dump(dev_new_json, outfile)


# 2. MFQA Dataset

In [3]:
with open('./Dataset/Questions/train-mrqa.jsonl', 'r') as f:
    train_jsons_list = list(f)
train_json = []
for json_str in train_jsons_list:
    train_json.append(json.loads(json_str))

with open('./Dataset/Questions/dev-mrqa.jsonl', 'r') as f:
    dev_jsons_list = list(f)
dev_json = []
for json_str in dev_jsons_list:
    dev_json.append(json.loads(json_str))

In [4]:
from tqdm import tqdm




def parse_json(json_lst):
    ret_json = []
    for instance in tqdm(json_lst):
        for pair in instance['qa_pairs']:
            row_dict = {}
            q_text = normalize_text(pair['question'])
            doc_text = normalize_text(pair['answer'])
            row_dict['question'] = q_text
            row_dict['positive_ctxs'] = [{
                'title': '',
                'text': doc_text
            }]
            row_dict['answers'] = []
            row_dict['negative_ctxs'] = []
            row_dict['hard_negative_ctxs'] = []
            ret_json.append(row_dict)
    return ret_json

In [5]:
train_json_lst = parse_json(train_json)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7829/7829 [00:00<00:00, 8100.23it/s]


In [6]:
dev_json_lst = parse_json(dev_json)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 81/81 [00:00<00:00, 3254.39it/s]


In [7]:
len(train_json)

7829

In [8]:
with open('./Dataset/Questions/train-mfaq.json', 'w') as f:
    json.dump(train_json_lst, f)
    
with open('./Dataset/Questions/dev-mfaq.json', 'w') as f:
    json.dump(dev_json_lst, f)
    

# 3. Medical QA Dataset

In [49]:
from transformers import MarianTokenizer, MarianMTModel
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-id")

model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-id")

In [50]:


def translate_to_indo(sentence):
    sentence = normalize_text(sentence)
    model_inputs = tokenizer(sentence, return_tensors="pt", max_length=512, truncation=True, padding=True)


    generated_tokens = model.generate(
        **model_inputs
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]


In [54]:
print(translate_to_indo("My name is Wolfgang and I live in Berlin"))

Namaku Wolfgang dan aku tinggal di Berlin.


## 1. LiveQA-MedicalTask

In [46]:
import json
import xml.etree.ElementTree as ET
import glob
from tqdm import tqdm

def parse_live_qa(filenames, mode):
    live_qa = []
    for filename in filenames:
        tree = ET.parse(filename)
        root = tree.getroot()
        for question in tqdm(root.findall('.//NLM-QUESTION')):
            title = "" if question.find('.//SUBJECT') is None else question.find('.//SUBJECT').text
            text = question.find('.//MESSAGE').text
            if title != "":
                full_text = f"{title}. {text}"
            else:
                full_text = text
            full_text = translate_to_indo(full_text)
            positive_ctxs = []
            for answer in question.findall('.//ANSWER'):
                if answer is None or answer.text is None or answer.text == '':
                    continue
                positive_ctxs.append({
                    'id': '',
                    'title': '',
                    'text' : translate_to_indo(answer.text)})
            if len(positive_ctxs)==0:
                continue
            row_dict = {
                'question': full_text,
                'positive_ctxs': positive_ctxs,
                'negative_ctxs': [],
                'answers': [],
                'hard_negative_ctxs': []
            }
            live_qa.append(row_dict)
    with open(f'./Dataset/Questions/{mode}_livemedqa.json', 'w') as f:
        json.dump(live_qa, f)

In [47]:
train_files = glob.glob('./Dataset/Questions/LiveQA_MedicalTask_TREC2017/TrainingDatasets/*.xml')
parse_live_qa(train_files, 'train')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 246/246 [16:38<00:00,  4.06s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [27:29<00:00,  8.25s/it]


In [48]:
dev_files = ['./Dataset/Questions/LiveQA_MedicalTask_TREC2017/TestDataset/TREC-2017-LiveQA-Medical-Test.xml']
parse_live_qa(train_files, 'dev')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 246/246 [18:38<00:00,  4.54s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [28:10<00:00,  8.45s/it]


## 2. MedQUAD

In [54]:
def parse_medquad():
    all_data = []
    for filename in tqdm(glob.glob('./Dataset/Questions/MedQuAD/*/*.xml')):
        tree = ET.parse(filename)
        root = tree.getroot()
        for question in root.findall('.//QAPair'):
            q_text = question.find('Question').text
            answer = question.find('Answer')
            if answer is None:
                continue
            answer = answer.text
            if answer == '' or answer is None:
                continue
            q_text = translate_to_indo(q_text)
            answer = translate_to_indo(answer)
            row_dict = {
                'question': q_text,
                'positive_ctxs': [{
                    'id': '',
                    'title': '',
                    'text': answer
                }],
                'negative_ctxs': [],
                'answers': [],
                'hard_negative_ctxs': []
            }
            all_data.append(row_dict)
    train_data, dev_data = split_data(all_data)
    with open(f'./Dataset/Questions/train_medquad.json', 'w') as f:
        json.dump(train_data, f)
    with open(f'./Dataset/Questions/dev_medquad.json', 'w') as f:
        json.dump(dev_data, f)

In [55]:
parse_medquad()

  7%|█████████▎                                                                                                                           | 787/11274 [1:12:35<16:07:16,  5.53s/it]


KeyboardInterrupt: 

In [57]:
test_lst = []
print(test_lst[:3])

[]


## 3. MEDIQA 2019

In [68]:
def parse_mediqa(filenames, mode):
    live_qa = []
    for filename in filenames:
        tree = ET.parse(filename)
        root = tree.getroot()
        for question in tqdm(root.findall('.//Question')):
            q_text = question.find('./QuestionText').text
            q_text = translate_to_indo(q_text)
            positive_ctxs = []
            negative_ctxs = []
            for answer in question.findall('.//Answer'):
                if answer is None:
                    continue
                is_pos = int(answer.get("ReferenceScore")) >= 3
                answer_text = answer.find("AnswerText").text
                if answer_text is None or answer_text == '':
                    continue
                if is_pos:
                    positive_ctxs.append({
                        'id': '',
                        'title': '',
                        'text' : translate_to_indo(answer_text)})
                else:
                    negative_ctxs.append({
                        'id': '',
                        'title': '',
                        'text' : translate_to_indo(answer_text)})
            if len(positive_ctxs)==0:
                continue
            row_dict = {
                'question': q_text,
                'positive_ctxs': positive_ctxs,
                'negative_ctxs': negative_ctxs,
                'answers': [],
                'hard_negative_ctxs': []
            }
            live_qa.append(row_dict)
    with open(f'./Dataset/Questions/{mode}_mediqa.json', 'w') as f:
        json.dump(live_qa, f)

In [69]:
train_files = ['./Dataset/Questions/MEDIQA2019/MEDIQA_Task3_QA/MEDIQA2019-Task3-QA-TrainingSet1-LiveQAMed.xml', './Dataset/Questions/MEDIQA2019/MEDIQA_Task3_QA/MEDIQA2019-Task3-QA-TrainingSet2-Alexa.xml']
parse_mediqa(train_files, 'train')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 104/104 [51:25<00:00, 29.66s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 104/104 [50:23<00:00, 29.07s/it]


In [70]:
dev_files = ['./Dataset/Questions/MEDIQA2019/MEDIQA_Task3_QA/MEDIQA2019-Task3-QA-ValidationSet.xml']
parse_mediqa(dev_files, 'dev')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [14:22<00:00, 34.49s/it]


## 4. MEDInfo 2019

In [86]:
def parse_medinfo():
    all_data = []
    filename = './Dataset/Questions/Medication_QA_MedInfo2019/MedInfo2019-QA-Medications.xlsx'
    df = pd.read_excel(filename)
    for index, row  in tqdm(df.iterrows()):
        if row['Question']==None or row['Answer']==None or row['Question'] == '' or row['Answer']==''or row['Question']!=row['Question'] or row['Answer']!=row['Answer']:
            continue
        q_text = translate_to_indo(row['Question'])
        pos_ctx = translate_to_indo(row['Answer'])
        row_dict = {
            'question': q_text,
            'positive_ctxs': [{
                'id': '',
                'title': '',
                'text': pos_ctx
            }],
            'negative_ctxs': [],
            'answers': [],
            'hard_negative_ctxs': []
        }
        all_data.append(row_dict)
    train_data, dev_data = split_data(all_data)
    with open(f'./Dataset/Questions/train_medinfo.json', 'w') as f:
        json.dump(train_data, f)
    with open(f'./Dataset/Questions/dev_medinfo.json', 'w') as f:
        json.dump(dev_data, f)

In [87]:
parse_medinfo()

690it [27:39,  2.40s/it]


## 5. TTHealth (All consumer health datasets combined)

In [6]:
import random

random.seed(42)

In [7]:
#train dataset
with open('./Dataset/Questions/train_medquad.json', 'r') as f:
    all_train = json.load(f)

other_trains = ['./Dataset/Questions/train_livemedqa.json', './Dataset/Questions/train_medinfo.json', './Dataset/Questions/train_mediqa.json']

for other_train in other_trains:
    with open(other_train, 'r') as f:
        other_train_json = json.load(f)
        all_train = all_train + other_train_json
random.shuffle(all_train)
print(len(all_train))
with open('./Dataset/Questions/train_tthealth.json', 'w') as f:
    json.dump(all_train, f)

13443


In [8]:
#development dataset
with open('./Dataset/Questions/dev_medquad.json', 'r') as f:
    all_dev = json.load(f)

other_devs = ['./Dataset/Questions/dev_livemedqa.json', './Dataset/Questions/dev_medinfo.json', './Dataset/Questions/dev_mediqa.json']

for other_dev in other_devs:
    with open(other_dev, 'r') as f:
        other_dev_json = json.load(f)
        all_dev = all_dev + other_dev_json
random.shuffle(all_dev)
print(len(all_dev))
with open('./Dataset/Questions/dev_tthealth.json', 'w') as f:
    json.dump(all_dev, f)

4744


# 4. Syifa-QA

In [8]:
import json
import xml.etree.ElementTree as ET
import glob
from tqdm import tqdm

filenames = glob.glob('./corpus-syifa/*.xml')
out_json = []
for filename in filenames:
    print(filename)
    tree = ET.parse(filename)
    root = tree.getroot()
    for doc in tqdm(root.findall('.//DOK')):
        question = doc.find('./PERTANYAAN')
        if question is None:
            continue
        question_title = question.find('./JUDUL')
        question_text = ""
        if question_title is not None:
            question_text = question_title.text
        if question_text is not None and len(question_text) > 0 and question.find('./ISI') is not None:
            question_text = f"{question_text}. {question.find('./ISI').text}"
        elif question.find('./ISI') is not None:
            question_text = question.find('./ISI').text
        if question_text is None:
            continue
        question_text = question_text.strip()
        question_text = normalize_text(question_text)
        if len(question_text)==0:
            continue
        ctx = doc.find('.//JAWABAN')
        if ctx is None or ctx!=ctx:
            continue
        ctx = ctx.find('./ISI').text
        if ctx is None or ctx!=ctx or ctx=='':
            continue
        ctx = ctx.strip()
        ctx = normalize_text(ctx)
        row = {
            'question': question_text,
            'answers': [],
            'positive_ctxs': [{
                'id': '',
                'title': '',
                'text': ctx
            }],
            'negative_ctxs': [],
            'hard_negative_ctxs': []
        }
        out_json.append(row)
train_data, dev_data = split_data(out_json)
with open(f'./Dataset/Questions/train_other-syifa.json', 'w') as f:
    json.dump(train_data, f)
with open(f'./Dataset/Questions/dev_other-syifa.json', 'w') as f:
    json.dump(dev_data, f)

./corpus-syifa/tanyadok_normalized.xml


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15279/15279 [00:00<00:00, 17551.77it/s]


./corpus-syifa/alodokter_normalized.xml


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 316914/316914 [00:30<00:00, 10396.43it/s]


./corpus-syifa/detikhealth_normalized.xml


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 216/216 [00:00<00:00, 3922.71it/s]


./corpus-syifa/doktersehat_normalized.xml


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3158/3158 [00:00<00:00, 20976.30it/s]


./corpus-syifa/klikdokter_normalized.xml


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1958/1958 [00:00<00:00, 10486.28it/s]


# 5. IndoSum

In [25]:
import json
import xml.etree.ElementTree as ET
import glob
from tqdm import tqdm
from sacremoses import MosesDetokenizer
md = MosesDetokenizer(lang='id')

def process_indosum(mode='train'):
    jsons = []
    jsons_processed = []
    filenames = glob.glob(f'./Dataset/Questions/indosum/{mode}*.jsonl')
    for filename in tqdm(filenames):
        with open(filename, 'r') as f:
            json_lst = list(f) 
        for json_str in json_lst:
            jsons.append(json.loads(json_str))

    for a_json in tqdm(jsons):
        summary = ""
        for sentence in a_json['summary']:
            if len(summary)==0:
                summary = md.detokenize(sentence)
            else:
                summary = f"{summary} {md.detokenize(sentence)}"
        ctx = ""
        for paragraph in a_json['paragraphs']:
            for sentence in paragraph:
                if len(ctx)==0:
                    ctx = md.detokenize(sentence)
                else:
                    ctx = f"{ctx} {md.detokenize(sentence)}"


        row_json = {
            'question': normalize_text(summary),
            'answers': [],
            'positive_ctxs': [{
                'id': '',
                'title': '',
                'text': normalize_text(ctx)
            }],
            'negative_ctxs': [],
            'hard_negative_ctxs': []
        }
        jsons_processed.append(row_json)
    with open(f'./Dataset/Questions/{mode}_indosum.json', 'w') as f:
        json.dump(jsons_processed, f)

In [26]:
process_indosum('train')
process_indosum('dev')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:08<00:00,  1.79s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71353/71353 [02:49<00:00, 421.36it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 17.60it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

# 6. TTMeqSum

In [6]:
import pandas

In [7]:
df=pd.read_excel('./Dataset/Questions/MeQSum_ACL2019_BenAbacha_Demner-Fushman.xlsx')

In [91]:
import re

all_data = []
for idx, row in tqdm(df.iterrows()):
    context = row['CHQ']
    context = context.lower()
    if 'subject:' in context and 'message:' in context:
        ctx_title = re.search('subject: (.+)', context)
        if ctx_title is None:
            ctx_title = re.search('subject:\n(.+)', context)
        ctx_text = re.search('message: (.+)', context)
        if ctx_text is None:
            ctx_text = re.search('message:\n(.+)', context)
        ctx_title = ctx_title.group(1)
        ctx_text = ctx_text.group(1)

    elif len(context.split('\n'))==2:
        splitted_ctx = context.split('\n')
        ctx_title = splitted_ctx[0]
        ctx_text =  splitted_ctx[1]
    else:
        ctx_title = ''
        ctx_text = context
    if ctx_title!='':
        ctx_title = translate_to_indo(ctx_title)
    ctx_text = translate_to_indo(ctx_text)
    question = translate_to_indo(row['Summary'].lower())
    all_data.append({
        'question': question,
        'answers': [],
        'positive_ctxs': [{
            'id': '',
            'title': ctx_title,
            'text': ctx_text
        }],
        'negative_ctxs': [],
        'hard_negative_ctxs': []
    })
train_data, dev_data = split_data(all_data)
with open(f'./Dataset/Questions/train_ttmeqsum.json', 'w') as f:
    json.dump(train_data, f)
with open(f'./Dataset/Questions/dev_ttmeqsum.json', 'w') as f:
    json.dump(dev_data, f)





0it [00:00, ?it/s][A[A[A[A



1it [00:01,  1.78s/it][A[A[A[A



2it [00:05,  2.37s/it][A[A[A[A



3it [00:06,  2.09s/it][A[A[A[A



4it [00:08,  1.88s/it][A[A[A[A



5it [00:11,  2.24s/it][A[A[A[A



6it [00:13,  2.27s/it][A[A[A[A



7it [00:15,  2.08s/it][A[A[A[A



8it [00:19,  2.76s/it][A[A[A[A



9it [00:22,  2.64s/it][A[A[A[A



10it [00:24,  2.47s/it][A[A[A[A



11it [00:26,  2.27s/it][A[A[A[A



12it [00:28,  2.26s/it][A[A[A[A



13it [00:29,  2.00s/it][A[A[A[A



14it [00:30,  1.78s/it][A[A[A[A



15it [00:34,  2.45s/it][A[A[A[A



16it [00:37,  2.36s/it][A[A[A[A



17it [00:41,  2.98s/it][A[A[A[A



18it [00:44,  2.86s/it][A[A[A[A



19it [00:45,  2.45s/it][A[A[A[A



20it [00:47,  2.17s/it][A[A[A[A



21it [00:49,  2.09s/it][A[A[A[A



22it [00:51,  2.15s/it][A[A[A[A



23it [00:51,  1.70s/it][A[A[A[A



24it [00:54,  1.85s/it][A[A[A[A



25it [00:56,  1.89s/it][A[A[A[A

411it [17:11,  3.78s/it][A[A[A[A



412it [17:13,  3.24s/it][A[A[A[A



413it [17:16,  3.27s/it][A[A[A[A



414it [17:20,  3.35s/it][A[A[A[A



415it [17:22,  2.96s/it][A[A[A[A



416it [17:25,  3.10s/it][A[A[A[A



417it [17:29,  3.26s/it][A[A[A[A



418it [17:33,  3.64s/it][A[A[A[A



419it [17:35,  3.14s/it][A[A[A[A



420it [17:39,  3.19s/it][A[A[A[A



421it [17:40,  2.53s/it][A[A[A[A



422it [17:41,  2.22s/it][A[A[A[A



423it [17:45,  2.74s/it][A[A[A[A



424it [17:52,  4.11s/it][A[A[A[A



425it [17:55,  3.54s/it][A[A[A[A



426it [17:56,  2.95s/it][A[A[A[A



427it [17:59,  2.80s/it][A[A[A[A



428it [18:01,  2.68s/it][A[A[A[A



429it [18:03,  2.36s/it][A[A[A[A



430it [18:06,  2.59s/it][A[A[A[A



431it [18:08,  2.39s/it][A[A[A[A



432it [18:10,  2.24s/it][A[A[A[A



433it [18:13,  2.60s/it][A[A[A[A



434it [18:14,  2.20s/it][A[A[A[A



435it [18:16,  2.11s/it][A[A[A[A





819it [33:58,  2.17s/it][A[A[A[A



820it [34:00,  2.04s/it][A[A[A[A



821it [34:05,  3.03s/it][A[A[A[A



822it [34:06,  2.33s/it][A[A[A[A



823it [34:09,  2.51s/it][A[A[A[A



824it [34:11,  2.43s/it][A[A[A[A



825it [34:14,  2.44s/it][A[A[A[A



826it [34:15,  2.13s/it][A[A[A[A



827it [34:19,  2.61s/it][A[A[A[A



828it [34:19,  2.01s/it][A[A[A[A



829it [34:21,  2.04s/it][A[A[A[A



830it [34:26,  2.87s/it][A[A[A[A



831it [34:27,  2.34s/it][A[A[A[A



832it [34:29,  2.08s/it][A[A[A[A



833it [34:31,  2.11s/it][A[A[A[A



834it [34:36,  2.91s/it][A[A[A[A



835it [34:37,  2.52s/it][A[A[A[A



836it [34:38,  1.99s/it][A[A[A[A



837it [34:42,  2.42s/it][A[A[A[A



838it [34:44,  2.48s/it][A[A[A[A



839it [34:49,  3.13s/it][A[A[A[A



840it [34:50,  2.62s/it][A[A[A[A



841it [34:51,  2.06s/it][A[A[A[A



842it [34:52,  1.86s/it][A[A[A[A



843it [34:58,  2.98s/it][A[A[A[A





# 7. ICT Syifa

In [6]:
import json
import csv
questions_lst = []
with open('./Dataset/Questions/question-syifa-test.json', 'r') as f:
    syifa_test = json.load(f)    

In [7]:
in_test = set()

for row in syifa_test:
    for ctx in row['ctxs']:
        in_test.add(str(ctx['id']))
    for ctx in row['neg_ctxs']:
        in_test.add(str(ctx['id']))

with open('./Dataset/Corpus/corpus-syifa-normalized.tsv', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        if str(row[0]) not in in_test:
            questions_lst.append(row[1])

In [23]:
import random 

def split_sentences(txt):
    end_symbols = ['?', '!']
    for symbol in end_symbols:
        txt = txt.split(symbol)
        txt = ' '.join(txt).strip()
    txt = txt.split('.')
    return txt

def generate_ict_type_files(questions):
    random.seed(42)
    json_lst = []
    for question in questions:
        splitted_question = split_sentences(question)
        splitted_question = [question for question in splitted_question if len(question) > 0]
        if len(splitted_question) == 0:
            continue
        question_idx = random.randint(0, len(splitted_question)-1)
        question = splitted_question[question_idx]
        positive_ctxs = [splitted_question[i] for i in range(len(splitted_question)) if i != question_idx]
        positive_ctxs_formatted = [{'id': '', 'title': '', 'text': pos_ctx} for pos_ctx in positive_ctxs]
        
        json_lst.append({
            'question': question,
            'positive_ctxs': positive_ctxs_formatted,
            'negative_ctxs': [],
            'answers': [],
            'hard_negative_ctxs': []
        })
    train_json, dev_json = split_data(json_lst)
    print(len(train_json))
    print(len(dev_json))
    with open('./Dataset/Questions/train_ict.json', 'w') as f:
        json.dump(train_json, f)
    with open('./Dataset/Questions/dev_ict.json', 'w') as f:
        json.dump(dev_json, f)
    

In [24]:
generate_ict_type_files(questions_lst)

60239
20079


# 8. IDWiki

In [4]:
import json
import glob
import re 
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from collections import defaultdict
import random 

in_degree_map=defaultdict(int)
wiki_map = []
mention_sign = '|||||||'

def normalize_text(txt):
    txt = re.sub(r"\s+", " ",txt).strip() #remove whitespaces
    txt = re.sub(r"<.*?>", " ", txt).strip() #remove html tags
    #remove unicode
    txt = txt.encode("ascii", "ignore")
    txt = txt.decode()
    return txt

def split_data(lst, train_ratio=0.75):
    random.seed(42)
    random.shuffle(lst)
    end_train = int(len(lst)*0.75)+1
    train_data = lst[:end_train]
    dev_data = lst[end_train:]
    return train_data, dev_data
def get_soup(url, verify=True):
    page = requests.get(url, verify=verify)
    soup = BeautifulSoup(page.content)
    return soup

def get_entities_mentions(text):
    suffixes=re.findall(r'<a href=[\'"]?([^\'" >]+).*?>(.+?)(?=</a>)', text)
    entities = []
    mentions = []
    for suffix in suffixes:
        entity = suffix[0].replace('\"', '').replace("\'", "")
        entity = requests.utils.unquote(entity)
        mention = suffix[1].lower()
        #entity = get_soup(f'https://id.wikipedia.org/wiki/{suffix}').find('h1', {'id': 'firstHeading'}).text.strip()
        assert entity is not None and entity!='' and entity==entity
        entities.append(entity.lower())
        mentions.append(mention)
    return entities, mentions



def mask_text(text):
    return re.sub(r'<a href=.+?>.+?</a>', mention_sign, text)

def invalid_text(text):
    res = re.sub(r'[^a-zA-Z]', '', text)
    return len(res) < 3

def split_to_passages(text):
    tokens = word_tokenize(text)
    passages = []
    for i in range(0, len(tokens), 100):
        passages.append(tokens[i:i+100])
    return passages

def is_dual_link(q, p):
    return q['title'] in p['entities'] and p['title'] in q['entities']

def is_comention(q, p, k=3):
    for entity in q['entities']:
        if in_degree_map[entity] <  k and entity in p['entities'] and q['title'] in p['entities']:
            return True
    return False

for filename in tqdm(glob.glob('./text/*/*.jsonl')):
    json_lst = []
    with open(filename, 'r') as f:
        json_lst = list(f)
    for json_row in json_lst:
        row_dict = dict()
        json_row = json.loads(json_row)
        if invalid_text(json_row['title']) or invalid_text(json_row['text']):
            continue
        row_dict = dict()
        docid = json_row['id']
        url = json_row['url']
        title = json_row['title'].lower()
        text = json_row['text'].replace('&lt;', '<').replace('&gt;', '>')
        entities, mentions = get_entities_mentions(text)
        for entity in entities:
            in_degree_map[entity] += 1
        row_dict['id'] = docid
        row_dict['url'] = url
        row_dict['title'] = title
        row_dict['text'] = text
        row_dict['entities'] = entities
        wiki_map.append(row_dict)
question_passage_pairs = []

for i in tqdm(range(len(wiki_map)-1)):
    question = normalize_text(f"{wiki_map[i]['text']}. {wiki_map[i]['title']}") 
    positive_ctxs = []
    for j in range(i+1, len(wiki_map)):
        if is_dual_link(wiki_map[i], wiki_map[j]) or is_comention(wiki_map[i], wiki_map[j]):
            positive_ctxs.append({
                'id': wiki_map[j]['id'],
                'title': normalize_text(wiki_map[j]['title']),
                'text': normalize_text(wiki_map[j]['text'])
            })
    if len(positive_ctxs) == 0:
        continue
    row_dict ={
        'question': question,
        'positive_ctxs': positive_ctxs[:5],
        'negative_ctxs': [],
        'hard_negative_ctxs': [],
        'answers': []
    }
    question_passage_pairs.append(row_dict)

train_wiki, dev_wiki = split_data(question_passage_pairs)
with open('train_wiki.json', 'w') as f:
    json.dump(train_wiki, f)
with open('dev_wiki.json', 'w') as f:
    json.dump(dev_wiki, f)
print(len(train_wiki))
print(len(dev_wiki))

for i in tqdm(range(len(wiki_map)-1, 0,-1)):
    question = normalize_text(f"{wiki_map[i]['text']}. {wiki_map[i]['title']}") 
    positive_ctxs = []
    for j in range(i-1, -1,-1):
        if is_dual_link(wiki_map[i], wiki_map[j]) or is_comention(wiki_map[i], wiki_map[j]):
            positive_ctxs.append({
                'id': wiki_map[j]['id'],
                'title': normalize_text(wiki_map[j]['title']),
                'text': normalize_text(wiki_map[j]['text'])
            })
    if len(positive_ctxs) == 0:
        continue
    row_dict ={
        'question': question,
        'positive_ctxs': positive_ctxs,
        'negative_ctxs': [],
        'hard_negative_ctxs': [],
        'answers': []
    }
    question_passage_pairs.append(row_dict)

train_wiki, dev_wiki = split_data(question_passage_pairs)
with open('train_wiki.json', 'w') as f:
    json.dump(train_wiki, f)
with open('dev_wiki.json', 'w') as f:
    json.dump(dev_wiki, f)
print(len(train_wiki))
print(len(dev_wiki))


HBox(children=(FloatProgress(value=0.0, max=1092.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=589066.0), HTML(value='')))

KeyboardInterrupt: 

# 9. Corpus Data

In [None]:
import pandas as pd
import numpy as np
import csv
import glob
from tqdm import tqdm

In [None]:
import csv
import xml.etree.ElementTree as ET

corpus = []
file_name = './Dataset/Corpus/corpus_full_repaired.xml'
with open('./Dataset/Corpus/corpus-syifa.tsv', 'w', encoding='utf-8') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    mytree = ET.parse(file_name)
    myroot = mytree.getroot()

    for item in myroot.findall('DOK'):

        id = item.find('ID').text
        title = item.find('./KELUHAN/JUDUL').text if item.find('./KELUHAN/JUDUL') is not None else ""
        text = item.find('./KELUHAN/ISI').text if item.find('./KELUHAN/ISI') is not None else ""
        row = [id, text, title]
        tsv_writer.writerow(row)


# 10. Test Data

In [None]:
query_map=[]


file_name = './Dataset/Questions/question_queries_normalized.xml'

mytree = ET.parse(file_name)
myroot = mytree.getroot()

for item in myroot.findall('DOK'):
    id_int = item.find('NO').text
    title = item.find('JUDUL').text if item.find('JUDUL').text is not None else ""
    text = item.find('ISI').text if item.find('ISI').text is not None else ""
    query_row = {
        'id': id_int,
        'title': title,
        'text': text
    }
    query_map.append(query_row)

In [None]:
full_test = []

#full query
for qid in valid_ids:
    query_dict = dict()
    chosen_query = query_map[qid]
    query_dict['question'] = {
        'id': chosen_query['id'],
        'title':  chosen_query['title'],
        'text': chosen_query['text']
    }
    query_dict['ctxs'] = []
    query_dict['neg_ctxs'] = []
    df = annotated_dfs[ids2pos[qid]]
    pos_df = df[(df['Relevansi'] > 0) & (df['Relevansi'] <= 2)]
    neg_df = df[df['Relevansi']==0]
    for _, row in pos_df.iterrows():
        ctx_dict = {
            'id': row['ID'],
            'title': row['Title'],
            'text': row['Detail']
        }
        query_dict['ctxs'].append(ctx_dict)
    for _, row in neg_df.iterrows():
        ctx_dict = {
            'id': row['ID'],
            'title': row['Title'],
            'text': row['Detail']
        }
        query_dict['neg_ctxs'].append(ctx_dict)
    full_test.append(query_dict)

In [None]:
assert len(full_test)==45

In [None]:
with open('./Dataset/Questions/question-syifa-test.json', 'w') as outfile:
    json.dump(full_test, outfile)

# 11. Test Data Revised

In [4]:
from tqdm import tqdm

In [5]:
annotated = pd.read_excel('unknown_results_annotation_ver1.xlsx')
with open('./Dataset/Questions/question-syifa-test.json', 'r') as f:
    test_data = json.load(f)
annotated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          150 non-null    float64
 1   question_id         150 non-null    int64  
 2   question_text       140 non-null    object 
 3   question_title      150 non-null    object 
 4   passage_id          150 non-null    object 
 5   passage_text        144 non-null    object 
 6   passage_title       150 non-null    object 
 7   relevance_krisna    150 non-null    float64
 8   relevance_mahendra  0 non-null      float64
 9   relevance_alfan     150 non-null    float64
 10  final_relevance     150 non-null    int64  
dtypes: float64(4), int64(2), object(5)
memory usage: 13.0+ KB


In [6]:
annotated.head()

Unnamed: 0.1,Unnamed: 0,question_id,question_text,question_title,passage_id,passage_text,passage_title,relevance_krisna,relevance_mahendra,relevance_alfan,final_relevance
0,0.0,3,sore dok . . maaf dok saya sudah 4 hari mengal...,benjolan di sekitar kelamin,AD-23768,sore dok . sudah 2 minggu saya ada benjolan ke...,benjolon di kepala belakang,0.0,,0.0,0
1,1.0,3,sore dok . . maaf dok saya sudah 4 hari mengal...,benjolan di sekitar kelamin,TD-3617,"tanya dok , saya laki-laki umur 22 tahun . sud...",di paha saya ada seperti benjolan namun di dal...,1.0,,1.0,1
2,2.0,3,sore dok . . maaf dok saya sudah 4 hari mengal...,benjolan di sekitar kelamin,KD-14989,"saya akhyani , saya mau tanya , saya ada benjo...",benjolan di selangkangan,1.0,,1.0,1
3,3.0,3,sore dok . . maaf dok saya sudah 4 hari mengal...,benjolan di sekitar kelamin,DH-868,"dok , sejak smp saya menemukan benjolan di seb...",apa penyebab benjolan di seputar kemaluan ?,2.0,,2.0,2
4,4.0,3,sore dok . . maaf dok saya sudah 4 hari mengal...,benjolan di sekitar kelamin,AD-16770,"pagi dok . saya mau tanya , saya memiliki benj...",benjolan di leher .,0.0,,0.0,0


In [7]:
#gather all gold ids
pos_ids = dict()
neg_ids = dict()

for row in tqdm(test_data):
    qid = int(row['question']['id'])
    pos_ids[qid] = []
    neg_ids[qid] = []
    for pos_ctx in row['ctxs']:
        pos_ids[qid].append(pos_ctx['id'])
    for neg_ctx in row['neg_ctxs']:
        neg_ids[qid].append(neg_ctx['id'])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 14347.68it/s]


In [8]:
pos_ids

{1: ['KD-10780',
  'AD-11170',
  'KD-34954',
  'KD-42528',
  'AD-3644',
  'AD-20187',
  'KD-9647',
  'AD-15985',
  'AD-8111',
  'AD-19236',
  'KD-30953',
  'KD-1190',
  'AD-11811',
  'AD-23059',
  'AD-1505',
  'AD-8996',
  'AD-13391',
  'AD-23609',
  'AD-20888',
  'AD-14282',
  'AD-19627',
  'KD-40213',
  'TD-7481',
  'KD-33961',
  'AD-8178',
  'AD-10434',
  'AD-1225',
  'AD-6755',
  'AD-5483',
  'AD-15205',
  'AD-2483',
  'KD-36449',
  'KD-38063',
  'KD-31262',
  'KD-7197',
  'KD-13846',
  'KD-29989',
  'KD-34810',
  'TD-2104',
  'KD-28831',
  'KD-818',
  'KD-12573',
  'KD-32421',
  'KD-130',
  'KD-16041',
  'KD-18305',
  'TD-11575',
  'AD-22794',
  'AD-6224',
  'AD-20846',
  'TD-4539',
  'AD-5304',
  'KD-26899',
  'KD-6929',
  'AD-12475',
  'KD-7384',
  'AD-11852',
  'AD-8328',
  'AD-9875',
  'KD-26435',
  'AD-10103'],
 2: ['KD-30522',
  'KD-34185',
  'TD-1266',
  'DS-45',
  'KD-38291',
  'KD-35019',
  'KD-27476',
  'TD-2937',
  'AD-22778',
  'KD-7412',
  'TD-3834',
  'KD-9737',
  'K

In [9]:
for idx, row in tqdm(enumerate(test_data)):
    qid = int(row['question']['id'])
    if qid not in annotated['question_id'].unique():
        continue
    pos_ctxs = row['ctxs']
    neg_ctxs = row['neg_ctxs']
    new_pos_ctxs = []
    new_neg_ctxs = []
    selected = annotated[annotated['question_id']==qid]
    for pidx, pair in selected.iterrows():
        new_instance = {
                'id': pair['passage_id'],
                'title': pair['passage_title'],
                'text': pair['passage_text']
        }
        if pair['final_relevance'] <= 1:
            new_neg_ctxs.append(new_instance)
        else:
            new_pos_ctxs.append(new_instance)
    test_data[idx]['ctxs'] = pos_ctxs + new_pos_ctxs
    test_data[idx]['neg_ctxs'] = neg_ctxs + new_neg_ctxs


45it [00:00, 2006.91it/s]


In [10]:
with open('./Dataset/Questions/question-syifa-test-v2.json', 'w') as f:
    json.dump(test_data, f, indent = 4)