In [1]:
%load_ext autoreload
%autoreload 2
import json
from pathlib import Path
import random
from itertools import islice, product
from collections import Counter, defaultdict
from typing import List, Dict
import re
from pprint import pprint

from tqdm.notebook import tqdm
#import itables.interactive
import pandas as pd
# Set interactive pandas
#itables.options.mode = "notebook"

from bela.utils.analysis_utils import Sample, Entity


def yield_jsonl_lines(path):
    """Lazily yield one parsed JSON object per non-blank line of a JSONL file.

    Args:
        path: Location of the JSONL file (anything accepted by ``open``).

    Yields:
        The object decoded by ``json.loads`` for each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8: the data is multilingual, so the platform default encoding must not be relied on.
    with open(path, "rt", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line at the end of the file) carry no record.
            if not line.strip():
                continue
            yield json.loads(line)


def inspect_json(json_dict, indent=0, max_elements=3):
    """Print the first `max_elements` keys of a dict, recursing into nested dicts.

    Args:
        json_dict: Dict to inspect.
        indent: Current nesting depth; each level is indented by two spaces.
        max_elements: Number of keys printed per dict before "..." is printed
            and the rest of that dict is skipped. The same limit applies at every nesting level.
    """
    prefix = "  " * indent
    for i, (key, value) in enumerate(json_dict.items()):
        if isinstance(value, dict):
            print(f"{prefix}{key}:")
            # Forward max_elements so that nested dicts honour the caller's limit instead of the default.
            inspect_json(value, indent=indent + 1, max_elements=max_elements)
        else:
            print(f"{prefix}{key}: {value}")
        if i + 1 == max_elements:
            print(prefix + "...")
            break


def read_jsonl_file(path, n_lines=None):
    """Load a JSONL file into a list, showing a progress bar while reading.

    Only the first `n_lines` records are read; all of them when `n_lines` is None.
    """
    records = yield_jsonl_lines(path)
    limited_records = islice(records, n_lines)
    progress = tqdm(limited_records, total=n_lines)
    return list(progress)


In [58]:

def DEPRECATED_infer_original_text(texts):
    """Guess the un-annotated document from several annotated copies of it (deprecated voting approach).

    Each element of `texts` is the same document with a different mention wrapped in
    " [START] " / " [END] " markers. The whitespace that originally surrounded the mention is
    unknown, so every combination of candidate separators is generated for every text and the
    most frequent reconstruction is returned. A warning is printed when the vote is not decisive.
    """
    mention_pattern = re.compile(r"(.*) \[START\] (.*) \[END\] (.*)", re.DOTALL)  # DOTALL: "." also matches newlines
    # Candidate character sequences that may have been stripped around the mention
    # (\u3000 is frequent in Chinese texts).
    possible_separators = ["", " ", "\n", "\u3000", " \n", "\n\n", "\u3000\u3000", " \u3000"]
    candidate_counts = Counter()
    for annotated_text in texts:
        left_context, mention, right_context = mention_pattern.match(annotated_text).groups()
        # Any separator may appear before the mention and, independently, after it.
        for sep_before in possible_separators:
            for sep_after in possible_separators:
                candidate_counts[left_context + sep_before + mention + sep_after + right_context] += 1
    ranking = candidate_counts.most_common()
    several_texts = len(texts) > 1
    if several_texts and ranking[0][1] == ranking[1][1]:
        print(f"Could not infer original text, the most common text is not unique: {ranking[:2]}")
    if several_texts and ranking[0][1] <= 1:
        print(f"Could not infer original text, the most common text occurs only once: {ranking[0][0]}")
    # The reconstruction with the most votes is assumed to be the original text.
    return ranking[0][0]

def merge_contexts(longuest_left_context, longuest_right_context):
    """Concatenate two overlapping contexts, keeping the overlapping part only once.

    The overlap is the longest suffix of `longuest_left_context` that is also a prefix of
    `longuest_right_context`, e.g. "abcdef" and "cdefgh" overlap on "cdef" and merge into "abcdefgh".

    Args:
        longuest_left_context: Text whose end overlaps the start of the right context.
        longuest_right_context: Text whose start overlaps the end of the left context.

    Returns:
        The left context followed by the non-overlapping remainder of the right context.

    Raises:
        AssertionError: If the overlap is 3 characters or shorter. This happens for instance when
            two mentions are adjacent ("... en [START] Charleston [END] SC ..." and
            "... en Charleston [START] SC [END] ..."), in which case the contexts share nothing.
    """
    # An overlap can never be longer than the shorter of the two strings.
    max_overlap = min(len(longuest_left_context), len(longuest_right_context))
    overlapping_length = 0
    # Try the longest candidate first so the search can stop at the first hit.
    for i in range(max_overlap, 0, -1):
        if longuest_left_context[-i:] == longuest_right_context[:i]:
            overlapping_length = i
            break
    # Raised explicitly (not via `assert`) so the check is not stripped when Python runs with -O.
    if overlapping_length <= 3:
        raise AssertionError(f"Overlap between contexts is too short to merge safely: {overlapping_length=}")
    return longuest_left_context + longuest_right_context[overlapping_length:]


def test_merge_contexts():
    """Check merge_contexts on partial, repeated and complete overlaps."""
    cases = [
        # (left context, right context, expected merge)
        ("abcdef", "cdefgh", "abcdefgh"),
        ("abcdef", "cdefghijkl", "abcdefghijkl"),
        ("abcdef", "cdef", "abcdef"),
        ("zabcabc", "abcabcde", "zabcabcde"),
        ("abcdef", "abcdef", "abcdef"),
    ]
    for left_context, right_context, expected in cases:
        assert merge_contexts(left_context, right_context) == expected
test_merge_contexts()


def infer_original_text(texts):
    """Recover the un-annotated document from several annotated copies of it.

    Each element of `texts` is the same document with a different mention wrapped in
    " [START] " / " [END] " markers. Inserting the markers removed some characters around the
    mention (spaces, newlines, ideographic spaces, ...), e.g. "abc <newline> mention    def"
    became "abc [START] mention [END] def". The text before a late mention and the text after
    an early mention are untouched, so the longest left context and the longest right context
    are merged on their overlap. With a single text, or when the merge fails, the markers of
    the first text are simply replaced by single spaces.
    """
    start_marker = " [START] "
    end_marker = " [END] "

    def strip_markers(text):
        # DOTALL so that "." also matches the newlines contained in the documents.
        match = re.match(r"(.*) \[START\] (.*) \[END\] (.*)", text, re.DOTALL)
        left_context, mention, right_context = match.groups()
        return left_context + " " + mention + " " + right_context

    if len(texts) == 1:
        return strip_markers(texts[0])

    left_contexts = (text.split(start_marker)[0] for text in texts)
    right_contexts = (text.split(end_marker)[-1] for text in texts)
    longuest_left_context = max(left_contexts, key=len)
    longuest_right_context = max(right_contexts, key=len)
    try:
        merged_text = merge_contexts(longuest_left_context, longuest_right_context)
    except AssertionError:
        # Fallback: the contexts do not overlap enough to be merged.
        merged_text = strip_markers(texts[0])
    return merged_text


def test_infer_original_text():
    """Check infer_original_text on a Chinese, a Vietnamese and a Spanish document.

    The Chinese and Spanish fixtures are assembled from shared segments so that each annotated
    text differs from the expected original only by its " [START] " / " [END] " markers.
    """
    def annotate(before, mention, after):
        return f"{before} [START] {mention} [END] {after}"

    def check(texts, original_text):
        infered_text = infer_original_text(texts)
        assert original_text == infered_text, f"original_text != infered_text:\n{original_text}\n{infered_text}"

    # Chinese news document with three annotated mentions.
    cmn_head = '<DOC id="CMN_NW_001149_20150314_F0000005H"> <SOURCE>http://sports.hangzhou.com.cn/zxbd/content/2015-03/14/content_5689329.htm</SOURCE> <DATE_TIME>2015-03-14T00:00:00</DATE_TIME> <HEADLINE> “'
    cmn_mention_1 = '刀锋战士'
    cmn_between_1_and_2 = '”阻止枪杀案件重审 申诉被驳回 </HEADLINE> <TEXT> <P>    在13日的法院审理中，残奥会明星'
    cmn_mention_2 = '皮斯托瑞斯'
    cmn_between_2_and_3 = '的法律团队企图阻止其枪杀女友案件重新在最高法院审理的申诉被驳回。 </P> <P> \u3000\u30002014年11月4日，政府检察官向法庭提交了上诉文件，文件对'
    cmn_mention_3 = '皮斯托瑞斯'
    cmn_tail = (
        '杀死女友案的定罪和判刑均提出了上诉。之后，法庭批准了此项上诉。 </P>'
        ' <P> \u3000\u3000“刀锋战士”的辩护律师企图阻止法院重新开庭审理此案的申诉被驳回后，律师表示控辩双方都有权利陈述自己的观点。 </P>'
        ' <P> \u3000\u3000现年27岁的皮斯托瑞斯在2013年2月14日“情人节”当天，透过浴室的门射杀了女友斯滕坎普。南非北豪登省最高法院2014年9月12日做出决定，判定皮斯托瑞斯谋杀罪名不成立，并裁定他犯有过失杀人罪。 </P>'
        ' <P> \u3000\u3000北豪登省最高法院庭审法官玛希帕2014年10月21日宣布，皮斯托瑞斯因过失导致女友斯滕坎普死亡，将在监狱服刑5年。皮斯托瑞斯表示将不会对此判决提出上诉，他将有资格在服刑10个月后，获得在家禁闭服刑的权利。 </P>'
        ' <P> \u3000\u3000现在法官玛希帕批准了检察官的上诉申请，而驳回了辩护律师的申诉，这意味着这场旷日持久的官司仍将继续。 </P> </TEXT> </DOC>'
    )
    check(
        texts=[
            annotate(cmn_head, cmn_mention_1, cmn_between_1_and_2 + cmn_mention_2 + cmn_between_2_and_3 + cmn_mention_3 + cmn_tail),
            annotate(cmn_head + cmn_mention_1 + cmn_between_1_and_2, cmn_mention_2, cmn_between_2_and_3 + cmn_mention_3 + cmn_tail),
            annotate(cmn_head + cmn_mention_1 + cmn_between_1_and_2 + cmn_mention_2 + cmn_between_2_and_3, cmn_mention_3, cmn_tail),
        ],
        original_text=cmn_head + cmn_mention_1 + cmn_between_1_and_2 + cmn_mention_2 + cmn_between_2_and_3 + cmn_mention_3 + cmn_tail,
    )

    # Vietnamese tweet: the newlines and spaces before the second sentence were stripped in the first annotation.
    check(
        texts=[
            'Công an Hà Nội khủng bố tinh thần anh Trịnh Bá Phương: [START] CÔNG [END] AN HÀ NỘI KHỦNG BỐ TINH THẦN ANH TRỊNH B... https://t.co/bEocspCx5k',
            'Công an [START] Hà Nội [END] khủng bố tinh thần anh Trịnh Bá Phương: \n\n        CÔNG AN HÀ NỘI KHỦNG BỐ TINH THẦN ANH TRỊNH B... https://t.co/bEocspCx5k',
        ],
        original_text="Công an Hà Nội khủng bố tinh thần anh Trịnh Bá Phương: \n\n        CÔNG AN HÀ NỘI KHỦNG BỐ TINH THẦN ANH TRỊNH B... https://t.co/bEocspCx5k",
    )

    # Spanish forum thread: two adjacent mentions, so the contexts do not overlap and the fallback is used.
    spa_head = '<doc id="SPA_DF_000404_20150618_F001000AU"> <headline> Un loco blanquito tirotea una iglesia en'
    spa_tail = '</headline> <post id="p1" author="chikilinda52" datetime="2015-06-18T21:25:00"> http://noticias.univision.com/article/2372330/2015-06-17/estados-unidos/noticias/tiroteo-multiple-en... </post> <post id="p2" author="carmensantos" datetime="2015-06-28T00:18:00"> la pena de muerte para este desgraci*ado </post> <post id="p3" author="arcoiris02" datetime="2015-06-29T01:09:00"> horrible </post> <post id="p4" author="andreitu" datetime="2015-07-07T23:39:00"> pobre gentes </post> <post id="p5" author="amazona71" datetime="2015-07-11T01:28:00"> debio morir antes </post> </doc>'
    check(
        texts=[
            spa_head + ' [START] Charleston [END] SC ' + spa_tail,
            spa_head + ' Charleston [START] SC [END] ' + spa_tail,
        ],
        original_text=spa_head + ' Charleston SC ' + spa_tail,
    )

    # TODO: This doesn't pass yet because [END] of first mention is [START] of second mention and no overlap
    #texts = [
    #    '#OromoRevolution " [START] Godina Arsii Lixaa [END] Aanaa shaallaatti waraana ummata Oromoofi waraana Agaazii gidduutti... https://t.co/LAxEOiaiEU',
    #    '#OromoRevolution "Godina Arsii Lixaa [START] Aanaa shaallaatti [END] waraana ummata Oromoofi waraana Agaazii gidduutti... https://t.co/LAxEOiaiEU',
    #]
    #original_text = '#OromoRevolution "Godina Arsii Lixaa Aanaa shaallaatti waraana ummata Oromoofi waraana Agaazii gidduutti... https://t.co/LAxEOiaiEU'
    #infered_text = infer_original_text(texts)
    #assert original_text == infered_text, f"original_text != infered_text:\n{original_text}\n{infered_text}"
test_infer_original_text()


def fix_offset(entity, mention, lookbehind=5):
    """Move `entity.offset` onto the nearest occurrence of `mention` at or after the current offset.

    The search starts `lookbehind` characters before the current offset so that an offset that
    is slightly too large is also corrected.

    Args:
        entity: Object exposing `text`, a writable `offset` and `mention`; its current mention
            must differ from `mention`.
        mention: Expected surface form of the entity.
        lookbehind: Number of characters before `entity.offset` where the search starts.

    Returns:
        The same `entity`, with its offset updated in place.

    Raises:
        ValueError: If `mention` does not occur in the text from the search start onwards.
        AssertionError: If the entity already matched, or still does not match after the fix.
    """
    assert entity.mention != mention
    # Clamp at 0: with an offset smaller than `lookbehind` the start would be negative, and a
    # negative slice start counts from the END of the text, so the mention would never be found.
    search_start = max(0, entity.offset - lookbehind)
    entity.offset = entity.text.index(mention, search_start)
    assert entity.mention == mention
    return entity


def sample_to_json(sample: Sample) -> Dict:
    """Convert a sample to a JSON-serializable dict (one record of the output jsonl files).

    The result is a dict, not a string: callers serialize it themselves with `json.dumps`.

    Returns:
        A dict with keys "data_example_id", "original_text" and "gt_entities". Each ground truth
        entity is encoded as [0, 0, entity_id, "wiki", offset, length].
    """
    gt_entities = [
        # The first two fields are always written as 0 and the fourth is always "wiki".
        [0, 0, entity.entity_id, "wiki", entity.offset, entity.length]
        for entity in sample.ground_truth_entities
    ]
    return {"data_example_id": sample.sample_id, "original_text": sample.text, "gt_entities": gt_entities}

    
def convert_tackbp_json_mentions_to_sample(json_mentions: List[Dict]) -> Sample:
    """Build one Sample from all the TACKBP mentions of a single document.

    `json_mentions` is sorted in place by start offset. The document text is inferred from the
    annotated inputs, and one ground truth Entity is created per mention.
    """
    # All mentions must belong to the same document.
    document_ids = {json_mention["meta"]["document_id"] for json_mention in json_mentions}
    assert len(document_ids) == 1
    json_mentions.sort(key=lambda json_mention: json_mention["meta"]["start_offset"])
    annotated_texts = [json_mention["input"] for json_mention in json_mentions]
    text = infer_original_text(annotated_texts)
    ground_truth_entities = []
    for json_mention in json_mentions:
        meta = json_mention["meta"]
        offset = len(meta["left_context"])
        expected_mention = meta["mention"]
        # Skip the leading whitespace that sits between the left context and the mention in the text.
        remaining_text = text[offset:]
        offset += len(remaining_text) - len(remaining_text.lstrip(" \n\u3000"))
        entity = Entity(text=text, offset=offset, length=len(expected_mention), entity_id=json_mention["entity_id"])
        assert entity.entity_id is not None
        # Sometimes the original text contains extra spaces / newlines that were removed in the
        # metadata, e.g. meta mention 'Must u h a m m a d u Bow u h a r i' vs
        # 'Must u h a m m a d u \nBow u h a r ' in the text: the offset then has to be fixed.
        if entity.mention != expected_mention:
            fix_offset(entity, expected_mention)
        ground_truth_entities.append(entity)
    sample_id = json_mentions[0]["meta"]["document_id"]
    return Sample(text=text, sample_id=sample_id, ground_truth_entities=ground_truth_entities)


def convert_tackbp_json_mentions_to_samples(json_mentions: List[Dict]) -> List[Sample]:
    """Group TACKBP mentions by document id and convert each group to a Sample."""
    mentions_by_document = defaultdict(list)
    for json_mention in json_mentions:
        document_id = json_mention["meta"]["document_id"]
        mentions_by_document[document_id].append(json_mention)
    samples = []
    for document_mentions in tqdm(mentions_by_document.values()):
        samples.append(convert_tackbp_json_mentions_to_sample(document_mentions))
    return samples


def convert_lorelei_json_mentions_to_sample(json_mentions: List[Dict]) -> Sample:
    """Build one Sample from all the Lorelei mentions of a single document.

    `json_mentions` is sorted in place by start offset. The document text is inferred from the
    annotated inputs, and one ground truth Entity is created per mention.

    Args:
        json_mentions: Mentions of one document; each needs "input", "entity_id" and a "meta"
            dict with "doc_id", "start_char", "left_context" and "mention".

    Returns:
        A Sample whose id is the document id and whose entities follow the order of the mentions.

    Raises:
        AssertionError: If the mentions come from several documents or an entity id is None.
        ValueError: If a "start_char" is not an integer, or an offset cannot be fixed.
    """
    # Check that all mentions are from the same document
    assert len(set(json_mention["meta"]["doc_id"] for json_mention in json_mentions)) == 1
    # Order mentions by start offset. "start_char" is stored as a string (e.g. '0'), so it is
    # converted to int: sorting the raw strings would be lexicographic ('10' before '9').
    json_mentions.sort(key=lambda json_mention: int(json_mention["meta"]["start_char"]))
    ground_truth_entities = []
    text = infer_original_text([json_mention["input"] for json_mention in json_mentions])
    for json_mention in json_mentions:
        expected_mention = json_mention["meta"]["mention"]
        offset = len(json_mention["meta"]["left_context"])
        length = len(expected_mention)
        # Skip the leading whitespace that sits between the left context and the mention in the text.
        additional_spaces = len(text[offset:]) - len(text[offset:].lstrip(" \n\u3000"))
        offset += additional_spaces
        entity = Entity(text=text, offset=offset, length=length, entity_id=json_mention["entity_id"])
        assert entity.entity_id is not None
        # Sometimes there are extra spaces / newlines in the original text mention that were
        # removed in the metadata, so the offset can still be off.
        if entity.mention != expected_mention:
            fix_offset(entity, expected_mention)  # HACK: Sometimes the offset is still not good
        ground_truth_entities.append(entity)
    return Sample(text=text, sample_id=json_mentions[0]["meta"]["doc_id"], ground_truth_entities=ground_truth_entities)


def convert_lorelei_json_mentions_to_samples(json_mentions: List[Dict]) -> List[Sample]:
    """Group Lorelei mentions by document id and convert each group to a Sample."""
    mentions_by_document = defaultdict(list)
    for json_mention in json_mentions:
        doc_id = json_mention["meta"]["doc_id"]
        mentions_by_document[doc_id].append(json_mention)
    samples = []
    for document_mentions in tqdm(mentions_by_document.values()):
        samples.append(convert_lorelei_json_mentions_to_sample(document_mentions))
    return samples


# Convert the TACKBP jsonl from Nicola De Cao's backup to bela format

In [59]:

# Convert every TACKBP2015 split: one output file per language plus one shuffled file with all languages.
retrieved_data_folder = Path("/fsx/louismartin/bela/retrieved_from_aws_backup/")
languages = ["ENG", "SPA", "CMN"]
for phase in ["train", "dev"]:
    kilt_format_path = retrieved_data_folder / f"ndecao/TACKBP2015/{phase}.jsonl"
    json_mentions = read_jsonl_file(kilt_format_path)
    for json_mention in json_mentions:
        # json_sample["output"] looks something like [{'KB_ID': 'm.0d06m5', 'answer': ['Q6294']}]
        # Convert it to -> ['Q6294']
        entities = [entity_id for entity in json_mention["output"] for entity_id in entity["answer"]]
        if len(entities) == 0:
            # No answer at all: mark the mention so that it is filtered out below instead of
            # crashing on entities[0].
            json_mention["entity_id"] = None
            continue
        if len(entities) > 1:
            print(f"More than one entity for {json_mention['meta']['document_id']}: {entities}. Taking only the first one.")
        json_mention["entity_id"] = entities[0]
    json_mentions = [json_mention for json_mention in json_mentions if json_mention["entity_id"] is not None]

    all_samples = []
    for language in languages:
        # Document ids are prefixed with the language code, e.g. ENG_NW_..., SPA_DF_..., CMN_NW_...
        language_mentions = [json_mention for json_mention in json_mentions if json_mention["meta"]["document_id"].startswith(language)]
        print(f"{language}: {len(language_mentions)} mentions")
        samples = convert_tackbp_json_mentions_to_samples(language_mentions)
        all_samples.extend(samples)
        print(f"{language}: {len(samples)} samples")
        output_path = retrieved_data_folder / f"ndecao/TACKBP2015/{phase}_bela_format_{language.lower()}.jsonl"
        with open(output_path, "w") as f:
            for sample in samples:
                f.write(json.dumps(sample_to_json(sample)) + "\n")
    output_path = retrieved_data_folder / f"ndecao/TACKBP2015/{phase}_bela_format_all_languages.jsonl"
    # Dedicated seeded generator: the shuffled file is identical across re-runs and the global
    # random state is left untouched.
    random.Random(0).shuffle(all_samples)
    with open(output_path, "w") as f:
        for sample in all_samples:
            f.write(json.dumps(sample_to_json(sample)) + "\n")

A Jupyter Widget

More than one entity for ENG_NW_001004_20141029_F00000003: ['Q612', 'Q613']. Taking only the first one.
More than one entity for SPA_NW_001078_20150105_F0000002W: ['Q693039', 'Q813']. Taking only the first one.
More than one entity for SPA_NW_001078_20150105_F0000002W: ['Q693039', 'Q813']. Taking only the first one.
More than one entity for SPA_NW_001078_20150105_F0000002W: ['Q693039', 'Q813']. Taking only the first one.
More than one entity for SPA_NW_001066_20150105_F0000002H: ['Q693039', 'Q813']. Taking only the first one.
More than one entity for ENG_DF_001220_20150404_F0000007K: ['Q612', 'Q613']. Taking only the first one.
More than one entity for CMN_DF_000181_20140830_F000000DG: ['Q693039', 'Q813']. Taking only the first one.
More than one entity for CMN_DF_000191_20150401_F000000DX: ['Q693039', 'Q813']. Taking only the first one.
ENG: 9027 mentions


A Jupyter Widget

ValueError: substring not found

# Lorelei
Lorelei samples look like this:
```
{'id': 'Men-VIE_SN_000370_20160324_G0T100LHC-7', 'input': ' [START] Bỉ [END] truy tìm nghi phạm mới trong vụ đánh bom ga tàu điện ngầm Brussels. - Mua bán thiết bị tàu biển https://t.co/Rf0J4cQKgu qua @sharethis', 'output': [{'answer': ['Q31']}], 'meta': {'left_context': '', 'mention': 'Bỉ', 'right_context': 'truy tìm nghi phạm mới trong vụ đánh bom ga tàu điện ngầm Brussels. - Mua bán thiết bị tàu biển https://t.co/Rf0J4cQKgu qua @sharethis', 'doc_id': 'VIE_SN_000370_20160324_G0T100LHC', 'entity_id': 'Ent-VIE_SN_000370_20160324_G0T100LHC-7', 'entity_type': 'GPE', 'mention_status': 'representative', 'start_char': '0', 'end_char': '1', 'original_mention_text': '__\n', 'system_run_id': 'LDC', 'mention_text_2': '__', 'extents': 'VIE_SN_000370_20160324_G0T100LHC:0-1', 'kb_id': '2802361', 'mention_type': 'NAM', 'confidence': '1.0\n'}, 'candidates': ['Q31', 'Q166776', 'Q697625', 'Q17435', 'Q815524', 'Q216022', 'Q20997', 'Q205317', 'Q381124', 'Q21000', 'Q1095', 'Q234', 'Q658870', 'Q15873988', 'Q792312', 'Q3992', 'Q792419', 'Q134121', 'Q240']}
```

In [62]:
# Convert every raw Lorelei jsonl file of the folder into a "<name>_bela_format.jsonl" file next to it.
lorelei_dir = Path("/fsx/louismartin/bela/retrieved_from_aws_backup/ndecao/lorelei/")
for path in lorelei_dir.glob("*.jsonl"):
    already_processed = "bela_format" in path.name
    if already_processed:
        continue
    print(path)
    json_mentions = read_jsonl_file(path)
    for json_mention in json_mentions:
        # json_mention["output"] looks something like [{'KB_ID': 'm.0d06m5', 'answer': ['Q6294']}]
        # and is flattened to ['Q6294']; it may be missing altogether.
        entities = [entity_id for entity in json_mention.get("output", []) for entity_id in entity["answer"]]
        # Only the first answer is kept; mentions without any answer are marked with None.
        json_mention["entity_id"] = entities[0] if len(entities) > 0 else None
    # Drop the mentions that have no entity id.
    json_mentions = [json_mention for json_mention in json_mentions if json_mention["entity_id"] is not None]
    samples = convert_lorelei_json_mentions_to_samples(json_mentions)
    output_path = lorelei_dir / f"{path.stem}_bela_format.jsonl"
    with open(output_path, "w") as f:
        for sample in samples:
            json_line = json.dumps(sample_to_json(sample))
            f.write(json_line + "\n")


/fsx/louismartin/bela/retrieved_from_aws_backup/ndecao/lorelei/lorelei_vietnamese.jsonl


A Jupyter Widget

A Jupyter Widget

/fsx/louismartin/bela/retrieved_from_aws_backup/ndecao/lorelei/lorelei_oromo.jsonl


A Jupyter Widget

A Jupyter Widget

/fsx/louismartin/bela/retrieved_from_aws_backup/ndecao/lorelei/lorelei_tigrinya.jsonl


A Jupyter Widget

A Jupyter Widget

/fsx/louismartin/bela/retrieved_from_aws_backup/ndecao/lorelei/lorelei_ukrainian.jsonl


A Jupyter Widget

A Jupyter Widget

In [10]:
import re

# Matches a "[START] mention [END]" annotation together with the whitespace around the tags;
# group 1 captures the mention itself.
pattern = re.compile(r'\s*\[START\]\s*(.*?)\s*\[END\]\s*')

# Two annotations of the same tweet, with adjacent mentions.
annotated_texts = [
    '#OromoRevolution " [START] Godina Arsii Lixaa [END] Aanaa shaallaatti waraana ummata Oromoofi waraana Agaazii gidduutti... https://t.co/LAxEOiaiEU',
    '#OromoRevolution "Godina Arsii Lixaa [START] Aanaa shaallaatti [END] waraana ummata Oromoofi waraana Agaazii gidduutti... https://t.co/LAxEOiaiEU'
]

# Unique mentions seen across all the annotated texts.
entities = set()

# Swap each tagged mention for a "__mention__" placeholder and show the result.
for annotated_text in annotated_texts:
    text = pattern.sub(r' __\1__ ', annotated_text)
    entities |= set(pattern.findall(annotated_text))
    print(text)

# Only the LAST placeholder text is used from here on: it is cut on the placeholder delimiter
# and the non-empty pieces are joined back with single spaces.
segments = [segment.strip() for segment in text.split('__') if segment.strip()]
print(segments)
original_text = ' '.join(segments)
print(original_text)
# Replace any placeholder that would be left by its entity.
for entity in entities:
    placeholder = '__{}__'.format(entity)
    original_text = original_text.replace(placeholder, entity)

print(original_text)


#OromoRevolution " __Godina Arsii Lixaa__ Aanaa shaallaatti waraana ummata Oromoofi waraana Agaazii gidduutti... https://t.co/LAxEOiaiEU
#OromoRevolution "Godina Arsii Lixaa __Aanaa shaallaatti__ waraana ummata Oromoofi waraana Agaazii gidduutti... https://t.co/LAxEOiaiEU
['#OromoRevolution "Godina Arsii Lixaa', 'Aanaa shaallaatti', 'waraana ummata Oromoofi waraana Agaazii gidduutti... https://t.co/LAxEOiaiEU']
#OromoRevolution "Godina Arsii Lixaa Aanaa shaallaatti waraana ummata Oromoofi waraana Agaazii gidduutti... https://t.co/LAxEOiaiEU
#OromoRevolution "Godina Arsii Lixaa Aanaa shaallaatti waraana ummata Oromoofi waraana Agaazii gidduutti... https://t.co/LAxEOiaiEU


# Example of files in BELA/Matcha format

In [None]:
# Preview the first 100000 records of the AIDA pretraining file.
# gt_entities: list of entities in the format [offset (words), length (words), wiki_data_id, kb_type?]
# entities_raw: list of entities in the format [offset (chars), length (chars), wiki_data_id, kb_type?]
# Seems that kb_type is always "wiki": set([entity[-1] for sample in df_aida["entities_raw"].tolist() for entity in sample])
aida_format_path = "/fsx/kassner/data_BELA/wikipedia/aida_pretrain.jsonl"
aida_records = read_jsonl_file(aida_format_path, n_lines=100000)
df_aida = pd.DataFrame(aida_records)
df_aida

In [66]:
# Preview a Matcha-format file; only the raw text and the character offsets matter here.
path = "/fsx/movb/data/matcha/mel/test.txt"
matcha_records = read_jsonl_file(path)
df = pd.DataFrame(matcha_records)
df

0it [00:00, ?it/s]

Unnamed: 0,data_example_id,original_text,gt_entities
0,14932393_1,Adobe Creek rises on the west flank of Sonoma...,"[[0, 0, Q7562183, wiki, 40, 15], [0, 0, Q71716..."
1,1010039_0,"Esther Gorostiza Garai (Atxondo, Bizkaia, 19...","[[0, 0, Q1242393, wiki, 26, 7], [0, 0, Q93366,..."
2,458956_3,<section Articles connexes > industrie automo...,"[[0, 0, Q65445, wiki, 52, 14], [0, 0, Q184937,..."
3,10739205_2,"From 1956 to 1963, Habib worked as an assista...","[[0, 0, Q1044, wiki, 237, 12], [0, 0, Q502276,..."
4,116287_1,South Sanford is located at lat:43.4019444444...,"[[0, 0, Q637413, wiki, 119, 27]]"
...,...,...,...
99995,11782414_1,Bei den Olympischen Jugendspielen 2010 in Sin...,"[[0, 0, Q613716, wiki, 9, 30], [0, 0, Q815514,..."
99996,16744_0,minidesno200pxKraljevska palata Aranjuez Conc...,"[[0, 0, Q29, wiki, 96, 8], [0, 0, Q151084, wik..."
99997,7646568_0,Werner Protzel (* 5. Oktober 1973 in Rosenhei...,"[[0, 0, Q2930, wiki, 19, 10], [0, 0, Q2477, wi..."
99998,38438_0,Марек Михал Грехута ( 10 Арванхоёрдугаар сар...,"[[0, 0, Q36, wiki, 78, 5]]"


# Old method

The old method relied on pandas DataFrames and was broken / not robust; the code below is kept for reference only.

In [None]:
def unnest_dict_column(df, column_name):
    """Replace a column of dicts by one column per key, named "<column_name>.<key>".

    Returns a new DataFrame: the other columns first, then the unnested columns.
    """
    # The prefix avoids name collisions with the columns that already exist.
    unnested_df = df[column_name].apply(pd.Series).add_prefix(f"{column_name}.")
    remaining_df = df.drop(columns=[column_name])
    return pd.concat([remaining_df, unnested_df], axis=1)


def extract_entity_in_aida_format(row):
    """Build the entity [offset (chars), length (chars), wiki_data_id, kb_type] for a row.

    The offset is the length of the original left context and the length is the length of the
    original mention.
    """
    char_offset = len(row["meta.left_context_original"])
    char_length = len(row["meta.mention_original"])
    wiki_data_id = row["output.answer"]
    kb_type = "wiki"  # TODO: Infered that the kb type is wiki but not sure
    return [char_offset, char_length, wiki_data_id, kb_type]


def concatenate_contexts_and_mention(left_context, mention, right_context):
    """Join left context, mention and right context, separated by single spaces.

    A separator is left out when the neighbouring context already ends (left) or starts (right)
    with a space or one of the listed punctuation characters, since a space would be one too many.
    """
    left_punctuation_chars = tuple(" ,!?;:()[]{}'’‘“\"")
    right_punctuation_chars = tuple(" ,.!?;:()[]{}'’‘“\"")
    left_separator = "" if left_context.endswith(left_punctuation_chars) else " "
    right_separator = "" if right_context.startswith(right_punctuation_chars) else " "
    return left_context + left_separator + mention + right_separator + right_context


def fix_offsets(text, entity, mention):
    """Fix offsets in the entity list that might have been corrupted by the concatenation

    Args:
        text: Text in which the entity is located.
        entity: Entity in the format [offset (chars), length (chars), wiki_data_id, kb_type];
            it is modified in place when its offset does not point at `mention`.
        mention: Expected surface form of the entity.

    Returns:
        The same `entity` list, with offset and length corrected if needed.

    Raises:
        ValueError: If `mention` cannot be found in `text` from the lookup position onwards.
        AssertionError: If the mention is found 10 or more characters away from the original offset.
    """
    offset, length, _, _ = entity
    recovered_mention = text[offset:offset + length]
    if mention != recovered_mention:
        # Start just early enough to find a mention overlapping the current offset, so that we
        # can't match another mention that would be right before the mention. Clamp at 0: a
        # negative start would be interpreted relative to the END of the text.
        lookup_offset = max(0, offset - (len(mention) - 1))
        # str.find returns an absolute index, or -1 when the mention is absent. The -1 must be
        # checked before any arithmetic, otherwise it silently turns into a wrong offset.
        infered_offset = text.find(mention, lookup_offset)
        if infered_offset == -1:
            raise ValueError(f"Could not find mention {mention} in text {text}")
        # Fix the offset
        assert abs(infered_offset - offset) < 10, f"Offset is too far from the infered offset: {offset=}, {infered_offset=}, {mention=}, {text[offset:offset + 50]=}"
        entity[0] = infered_offset
        entity[1] = len(mention)
    return entity


# Quick unit tests
assert fix_offsets("hello world", [7, 5, "Q123", "wiki"], "world") == [6, 5, "Q123", "wiki"]
assert fix_offsets("hello world world", [11, 5, "Q123", "wiki"], "world") == [12, 5, "Q123", "wiki"]
# Offset smaller than the mention length: the lookup must not wrap around to the end of the text
assert fix_offsets("world hello", [1, 5, "Q123", "wiki"], "world") == [0, 5, "Q123", "wiki"]


def is_correct_entity_offset(text, entity, mention):
    """Tell whether the span described by `entity` covers exactly `mention` in `text`.

    `entity` is a 4-item sequence; its first two items are the character offset and
    the character length of the span, the last two are ignored here.
    """
    start, span_length, _, _ = entity
    end = start + span_length
    return text[start:end] == mention


def infer_original_text(texts_with_annotated_mentions):
    """Infer the original text from the texts with annotated mentions.
    Each annotation might have introduced additional spaces but we don't know when, so we combine the longest left and right context to infer the original text.
    Use with: `df_kilt.groupby("meta.document_id").agg({"original_text": infer_original_text}).reset_index()`

    NOT USED YET.

    Args:
        texts_with_annotated_mentions: collection of strings of the same document, each one marking a
            single mention as `<left context> [START] <mention> [END] <right context>`.

    Returns:
        str: for a single text, the contexts and mention joined by `concatenate_contexts_and_mention`;
        for several texts, the longest left context followed by the longest right context, with the
        part of the right context that is already at the end of the left context written only once.
    """
    if len(texts_with_annotated_mentions) == 1:
        [text] = texts_with_annotated_mentions
        left_context = text.split(" [START]")[0]
        right_context = text.split("[END] ")[1]
        mention = text.split("[START] ")[1].split(" [END]")[0]
        return concatenate_contexts_and_mention(left_context, mention, right_context)
    left_contexts = [text.split(" [START]")[0] for text in texts_with_annotated_mentions]
    right_contexts = [text.split("[END] ")[1] for text in texts_with_annotated_mentions]
    longest_left_context = max(left_contexts, key=len)
    longest_right_context = max(right_contexts, key=len)
    # Some of the longest left context's end is often some of the start of the right context
    # Find the longest proper prefix of the right context that is also a suffix of the left context
    overlap_length = 0
    longest_candidate = min(len(longest_left_context), len(longest_right_context) - 1)
    for candidate_length in range(longest_candidate, 0, -1):
        if longest_left_context.endswith(longest_right_context[:candidate_length]):
            overlap_length = candidate_length
            break
    # Merge the two contexts, dropping only the overlapping PREFIX of the right context.
    # (str.replace would also delete every later occurrence of the same substring.)
    return longest_left_context + longest_right_context[overlap_length:]



def convert_kilt_to_bela_format(kilt_format_path, n_lines=10000):
    """Convert a KILT-format jsonl file (one mention per line) to a BELA-format jsonl file (one document per line).

    Example usage: `convert_kilt_to_bela_format(Path("/fsx/louismartin/bela/retrieved_from_aws_backup/ndecao/TACKBP2015/train.jsonl"))`

    The output is written next to the input as `<input stem>_bela_format.jsonl`, with the
    columns `document_id`, `original_text` and `gt_entities`.

    Args:
        kilt_format_path: path (str or Path) of the KILT-format jsonl file.
        n_lines: maximum number of input lines to read (forwarded to `read_jsonl_file`);
            defaults to the 10000 that used to be hard-coded. Pass None to read the whole file.
    """
    kilt_format_path = Path(kilt_format_path)  # Accept plain strings as well: .parent / .stem are used below
    df_kilt = pd.DataFrame(read_jsonl_file(kilt_format_path, n_lines=n_lines))
    # NOTE(review): unnest_dict_column is defined outside this block; presumably it expands the dict column into "<column>.<key>" columns (the code below reads "meta.document_id", "output.answer", ...)
    df_kilt = unnest_dict_column(df_kilt, "meta")
    assert df_kilt["output"].apply(len).unique().tolist() == [1]
    df_kilt["output"] = df_kilt["output"].apply(lambda x: x[0])  # Only one output per sample
    df_kilt = unnest_dict_column(df_kilt, "output")
    # assert df_kilt["output.answer"].apply(len).unique().tolist() == [1]
    df_kilt["output.answer"] = df_kilt["output.answer"].apply(lambda x: x[0])  # Take only the first option, e.g. sample TEDL15_TRAINING_06967 has [Q612, Q613] for mention Dubai

    df_kilt["entities_raw"] = df_kilt.apply(extract_entity_in_aida_format, axis=1).tolist()
    # Rebuild one text per mention, then aggregate per document: entities and mentions as lists, text as a single string
    df_kilt["text_raw"] = df_kilt.apply(lambda row: concatenate_contexts_and_mention(row["meta.left_context_original"], row["meta.mention_original"], row["meta.right_context_original"]), axis=1)
    df_bela_format = df_kilt.groupby("meta.document_id").agg({
        "text_raw": "first",  # TODO: The concatenation sometimes produces some different texts (adding extra spaces between mention and context or removing them).
        "entities_raw": list,
        "meta.mention_original": list,
    }).reset_index()
    # Fix offsets
    # The rows yielded by iterrows() are not reliably written back to the dataframe,
    # so the fixed entities are collected here and assigned to the column afterwards.
    fixed_entities_per_document = []
    for _, row in df_bela_format.iterrows():
        fixed_entities = []
        for entity, mention in zip(row["entities_raw"], row["meta.mention_original"]):
            if not is_correct_entity_offset(row["text_raw"], entity, mention):
                try:
                    entity = fix_offsets(row["text_raw"], entity, mention)
                except (AssertionError, ValueError) as e:
                    # Mention not found, or found too far from its recorded offset: drop it from the document
                    print(f"Skipping mention in {row['meta.document_id']=}: {e}")
                    continue
                assert is_correct_entity_offset(row["text_raw"], entity, mention), f"{mention=}, {entity=}"
            fixed_entities.append(entity)
        fixed_entities_per_document.append(fixed_entities)
    df_bela_format["entities_raw"] = pd.Series(fixed_entities_per_document, index=df_bela_format.index, dtype=object)
    df_bela_format["entities_raw"] = df_bela_format["entities_raw"].apply(lambda entities: [[0, 0] + entity for entity in entities])  # Add dummy 0, 0 for backward compatibility
    # HACK: reorder entities in the format that is used in EvaluateMEL.ipynb notebook. Format in EvaluateMEL.ipynb: _,_,ent_id,_,offset,length.
    # TODO: We should uniformize.
    df_bela_format["entities_raw"] = df_bela_format["entities_raw"].apply(lambda entities: [(dummy_1, dummy_2, entity, source, offset, length) for dummy_1, dummy_2, offset, length, entity, source in entities])
    # document_id	original_text	gt_entities
    df_bela_format = df_bela_format.rename(columns={"meta.document_id": "document_id", "text_raw": "original_text", "entities_raw": "gt_entities"})
    df_bela_format = df_bela_format[["document_id", "original_text", "gt_entities"]]
    new_path = kilt_format_path.parent / f"{kilt_format_path.stem}_bela_format.jsonl"
    # Write to jsonl
    with open(new_path, "w", encoding="utf8") as f:
        for _, row in df_bela_format.iterrows():
            f.write(row.to_json() + "\n")
    print(f"Saved {new_path}")



In [None]:

# Exploration cell: same statements as the first half of convert_kilt_to_bela_format (load, unnest, aggregate per document),
# run at top level so that `df_kilt` and `df_bela_format` stay available to the cells below.
# Only the first 10000 lines of the file are read.
retrieved_data_folder = Path("/fsx/louismartin/bela/retrieved_from_aws_backup/")
kilt_format_path = retrieved_data_folder / "ndecao/TACKBP2015/train.jsonl"
#convert_kilt_to_bela_format(kilt_format_path)
df_kilt = pd.DataFrame(read_jsonl_file(kilt_format_path, n_lines=10000))
# NOTE(review): unnest_dict_column is defined in another cell; presumably it expands the dict column into "meta.<key>" columns (read below) - confirm
df_kilt = unnest_dict_column(df_kilt, "meta")
# Every sample must have exactly one output
assert df_kilt["output"].apply(len).unique().tolist() == [1]
df_kilt["output"] = df_kilt["output"].apply(lambda x: x[0])  # Only one output per sample
df_kilt = unnest_dict_column(df_kilt, "output")
# assert df_kilt["output.answer"].apply(len).unique().tolist() == [1]
df_kilt["output.answer"] = df_kilt["output.answer"].apply(lambda x: x[0])  # Take only the first option, e.g. sample TEDL15_TRAINING_06967 has [Q612, Q613] for mention Dubai

# One entity per row (one row = one annotated mention)
df_kilt["entities_raw"] = df_kilt.apply(extract_entity_in_aida_format, axis=1).tolist()
# Aggregate "entities_raw" as list and "meta.input_original" as a single string (should be unique)
# The text is rebuilt for every row from its left context, mention and right context
df_kilt["text_raw"] = df_kilt.apply(lambda row: concatenate_contexts_and_mention(row["meta.left_context_original"], row["meta.mention_original"], row["meta.right_context_original"]), axis=1)
# One row per document: text of the first mention row, entities and mentions gathered as lists
df_bela_format = df_kilt.groupby("meta.document_id").agg({
    "text_raw": "first",  # TODO: The concatenation sometimes produces some different texts (adding extra spaces between mention and context or removing them).
    "entities_raw": list,
    "meta.mention_original": list,
}).reset_index()

# Draft

In [None]:
from bela.utils.analysis_utils import Entity, Sample


def coerce_to_sample(jsonl_sample):
    """Return `jsonl_sample` as a Sample, converting it from a BELA-format record if needed.

    - A Sample instance is returned as is.
    - A record with the keys "document_id", "original_text" and "gt_entities" is converted:
      each ground-truth entity is a 6-item sequence of which the 3rd item (entity id),
      the 5th (offset) and the 6th (length) are used to build an Entity.
    - Anything else raises a ValueError.
    """
    if isinstance(jsonl_sample, Sample):
        return jsonl_sample
    # BELA format
    bela_keys = ["document_id", "original_text", "gt_entities"]
    if not all(key in jsonl_sample for key in bela_keys):
        raise ValueError(f"Unknown format for {jsonl_sample=}")
    ground_truth_entities = []
    for _, _, entity_id, _, offset, length in jsonl_sample['gt_entities']:
        ground_truth_entities.append(
            Entity(entity_id=entity_id, offset=offset, length=length, text=jsonl_sample['original_text'])
        )
    return Sample(text=jsonl_sample['original_text'], ground_truth_entities=ground_truth_entities)


def coerce_to_entity(jsonl_entity):
    """Return `jsonl_entity` as an Entity, converting it from a KILT-format record if needed.

    - An Entity instance is returned as is.
    - A record with the keys "id", "input", "output" and "meta" is converted using
      `output.answer` as entity id, `meta.offset` / `meta.length` as span and `input` as text.
    - Anything else raises a ValueError.
    """
    if isinstance(jsonl_entity, Entity):
        return jsonl_entity
    # KILT format
    kilt_keys = ["id", "input", "output", "meta"]
    if not all(key in jsonl_entity for key in kilt_keys):
        raise ValueError(f"Unknown format for {jsonl_entity=}")
    # NOTE(review): other cells of this notebook treat "output" as a one-item list of dicts
    # (`x[0]["answer"]`), whereas it is indexed as a dict here - confirm which shape is expected.
    return Entity(
        entity_id=jsonl_entity["output"]["answer"],
        offset=jsonl_entity["meta"]["offset"],
        length=jsonl_entity["meta"]["length"],
        text=jsonl_entity["input"],
    )

In [None]:
from pprint import pprint
#kilt_format_path = Path.home() / "dev/BELA/data/KILT_format/TACKBP2015_training.jsonl"
# Pretty-print the samples at positions 10 to 99 of the KILT file whose document id starts with "EN"
retrieved_data_folder = Path("/fsx/louismartin/bela/retrieved_from_aws_backup/")
kilt_format_path = retrieved_data_folder / "ndecao/TACKBP2015/train.jsonl"
for line in islice(yield_jsonl_lines(kilt_format_path), 10, 100):
    if line["meta"]["document_id"].startswith("EN"):
        print("*" * 100)  # Visual separator between samples
        pprint(line, width=300)

In [None]:
!head /fsx/louismartin/bela/retrieved_from_aws_backup/ndecao/TACKBP2015/train_bela_format.jsonl

In [None]:
# Display the value of `merged`.
# NOTE(review): `merged` is only assigned as a local variable inside infer_original_text in the visible cells;
# this cell presumably relies on leftover kernel state and would fail on a fresh kernel - confirm or remove.
merged

In [None]:
#df = df[df["meta.document_id"].str.startswith("ENG")]  # Take only english
#df = df[~df["meta.document_id"].str.startswith("CMN")]  # Remove Chinese

In [None]:
# Scratch check: length of "hello world " (11 characters plus the trailing space)
len("hello world ")

In [None]:
# Show every row of `df_kilt` belonging to one document, as a list of dicts
mask = df_kilt["meta.document_id"].eq("CMN_NW_001145_20150413_F0000005B")
df_kilt.loc[mask].to_dict(orient="records")

In [None]:
# List up to 3 document ids whose "n_text_raw" value is greater than 1
# NOTE(review): neither `df` nor its "n_text_raw" column is created in the visible cells - presumably leftover kernel state, confirm
print("Documents with texts not joined correctly")
mask = df["n_text_raw"].gt(1)
print(df.loc[mask, "meta.document_id"].tolist()[:3])

In [None]:
# Print the first two items of "text_raw" for one document, separated by a dashed line.
# Only the last `document_id` assignment takes effect: the earlier ones are overwritten before being used.
# NOTE(review): `df` is not created in the visible cells; "text_raw" presumably holds the different
# reconstructed texts of the document (it is converted to a list and indexed) - confirm.
document_id = "ENG_NW_001006_20150301_F00000005"  # Bosco café
document_id = "CMN_DF_000178_20150318_F000000CO"
document_id = "CMN_DF_000178_20150318_F000000CO"
mask = df["meta.document_id"] == document_id
# First row of the selected document -> dict -> first item of its "text_raw"
print(list(df[mask].head(1).to_dict(orient="records")[0]["text_raw"])[0])
print("------------")
# Same row, second item of its "text_raw"
print(list(df[mask].head(1).to_dict(orient="records")[0]["text_raw"])[1])

In [None]:
# Keep the rows whose first output has an answer different from [None]
# NOTE(review): this indexes "output" as a list of dicts, i.e. the shape it has right after loading;
# the preparation cell above replaces "output" by its first item - presumably this cell must run before that step, confirm
mask = df_kilt["output"].apply(lambda outputs: outputs[0]["answer"] != [None])
df_kilt.loc[mask]