In [1]:
import json
from pathlib import Path
from itertools import islice

from tqdm.notebook import tqdm
#import itables.interactive
import pandas as pd
# Set interactive pandas
#itables.options.mode = "notebook"


def yield_jsonl_lines(path):
    """Lazily yield one parsed JSON value per non-blank line of a JSON Lines file.

    Args:
        path: path of the file to read.

    Yields:
        The object decoded by `json.loads` for each line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default:
    # the files written by this notebook use utf8 and contain non-ASCII text.
    with open(path, "rt", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a stray empty line at the end of the file)
            if not line.strip():
                continue
            yield json.loads(line)


def inspect_json(json_dict, indent=0, max_elements=3):
    """Print at most `max_elements` keys of a dict, recursively.

    Args:
        json_dict: the dict to print.
        indent: current nesting depth; each level is indented by two spaces.
        max_elements: maximum number of keys printed at every nesting level
            (a value that is never reached, e.g. None, prints everything).

    A "..." line is printed at a level only when some of its keys were left out.
    """
    prefix = "  " * indent
    n_items = len(json_dict)
    for i, (key, value) in enumerate(json_dict.items()):
        if isinstance(value, dict):
            print(prefix + f"{key}:")
            # Propagate the limit so that nested dicts are truncated the same way
            inspect_json(value, indent=indent + 1, max_elements=max_elements)
        else:
            print(prefix + f"{key}: {value}")
        if i + 1 == max_elements:
            # Only signal truncation when something was actually dropped
            if n_items > max_elements:
                print(prefix + "...")
            break


def read_jsonl_file(path, n_lines=None):
    """Load a jsonl file into a pandas DataFrame, showing a progress bar.

    Only the first `n_lines` lines are read; `None` reads the whole file.
    """
    records = yield_jsonl_lines(path)
    first_records = islice(records, n_lines)
    records_with_progress = tqdm(first_records, total=n_lines)
    return pd.DataFrame(records_with_progress)


In [None]:
# Load (at most) the first 100k samples of the AIDA-format pretraining file.
aida_format_path = "/fsx/kassner/data_BELA/wikipedia/aida_pretrain.jsonl"
# Entity columns (a trailing "?" marks a field whose meaning is a guess):
#   gt_entities:  list of entities in the format [offset (words), length (words), wiki_data_id, kb_type?]
#   entities_raw: list of entities in the format [offset (chars), length (chars), wiki_data_id, kb_type?]
# kb_type seems to always be "wiki"; this was checked with:
#   set([entity[-1] for sample in df_aida["entities_raw"].tolist() for entity in sample])
df_aida = read_jsonl_file(aida_format_path, n_lines=100000)
df_aida

# Example of files in BELA/Matcha format

In [7]:
# Only the raw text and the character offsets matter to us here.
# Note: the file has a .txt extension but is parsed as JSON Lines.
path = "/fsx/movb/data/matcha/mel/test.txt"
df = read_jsonl_file(path)
df

0it [00:00, ?it/s]

Unnamed: 0,data_example_id,original_text,gt_entities
0,14932393_1,Adobe Creek rises on the west flank of Sonoma...,"[[0, 0, Q7562183, wiki, 40, 15], [0, 0, Q71716..."
1,1010039_0,"Esther Gorostiza Garai (Atxondo, Bizkaia, 19...","[[0, 0, Q1242393, wiki, 26, 7], [0, 0, Q93366,..."
2,458956_3,<section Articles connexes > industrie automo...,"[[0, 0, Q65445, wiki, 52, 14], [0, 0, Q184937,..."
3,10739205_2,"From 1956 to 1963, Habib worked as an assista...","[[0, 0, Q1044, wiki, 237, 12], [0, 0, Q502276,..."
4,116287_1,South Sanford is located at lat:43.4019444444...,"[[0, 0, Q637413, wiki, 119, 27]]"
...,...,...,...
99995,11782414_1,Bei den Olympischen Jugendspielen 2010 in Sin...,"[[0, 0, Q613716, wiki, 9, 30], [0, 0, Q815514,..."
99996,16744_0,minidesno200pxKraljevska palata Aranjuez Conc...,"[[0, 0, Q29, wiki, 96, 8], [0, 0, Q151084, wik..."
99997,7646568_0,Werner Protzel (* 5. Oktober 1973 in Rosenhei...,"[[0, 0, Q2930, wiki, 19, 10], [0, 0, Q2477, wi..."
99998,38438_0,Марек Михал Грехута ( 10 Арванхоёрдугаар сар...,"[[0, 0, Q36, wiki, 78, 5]]"


In [64]:
def unnest_dict_column(df, column_name):
    """Replace a column of dicts by one column per key, named `<column_name>.<key>`.

    The input frame is left untouched; a new frame is returned with the
    expanded columns appended after the remaining original columns.
    """
    # The prefix avoids name collisions with the columns that are kept
    expanded = df[column_name].apply(pd.Series).add_prefix(f"{column_name}.")
    remaining = df.drop(columns=[column_name])
    return pd.concat([remaining, expanded], axis=1)


def extract_entity_in_aida_format(row):
    """Build the entity of a row as [offset (chars), length (chars), wiki_data_id, kb_type?].

    The offset is the length of the original left context and the length is
    the length of the original mention.
    """
    char_offset = len(row["meta.left_context_original"])
    char_length = len(row["meta.mention_original"])
    wiki_data_id = row["output.answer"]
    kb_type = "wiki"  # TODO: Inferred that the kb type is wiki but not sure
    return [char_offset, char_length, wiki_data_id, kb_type]


def concatenate_contexts_and_mention(left_context, mention, right_context):
    """Join a left context, a mention and a right context into a single text.

    A single space is inserted on each side of the mention, except when:
    - the neighbouring context is empty (otherwise the text would start or end
      with a spurious space), or
    - the left context already ends / the right context already starts with a
      space or one of the punctuation characters listed below.

    Returns:
        The concatenated text.
    """
    left_punctuation_chars = tuple(" ,!?;:()[]{}'’‘“\"")
    right_punctuation_chars = tuple(" ,.!?;:()[]{}'’‘“\"")
    # If the right/left context starts/ends with a punctuation, then an extra space would be added, so we have to remove it
    needs_left_separator = bool(left_context) and not left_context.endswith(left_punctuation_chars)
    needs_right_separator = bool(right_context) and not right_context.startswith(right_punctuation_chars)
    left_separator = " " if needs_left_separator else ""
    right_separator = " " if needs_right_separator else ""
    return "".join([left_context, left_separator, mention, right_separator, right_context])


def fix_offsets(text, entity, mention, max_shift=10):
    """Fix offsets in the entity list that might have been corrupted by the concatenation

    Args:
        text: the text in which the mention should be located.
        entity: [offset (chars), length (chars), wiki_data_id, kb_type]; it is updated in place.
        mention: the surface form expected at `text[offset:offset + length]`.
        max_shift: the corrected offset must be strictly less than this many
            characters away from the original one.

    Returns:
        The same `entity` list, with its offset and length corrected if needed.

    Raises:
        ValueError: if the mention cannot be found at or after the lookup position.
        AssertionError: if the mention is only found `max_shift` or more characters away.
    """
    offset, length, _, _ = entity
    if text[offset:offset + length] == mention:
        return entity
    # Start the search slightly before the offset, but by less than one mention length,
    # so that we can't match another mention that would be right before the mention.
    # Clamp to 0: a negative start would be interpreted as counting from the end of the text.
    lookup_offset = max(0, offset - (len(mention) - 1))
    infered_offset = text.find(mention, lookup_offset)
    if infered_offset == -1:
        raise ValueError(f"Could not find mention {mention} in text {text}")
    if abs(infered_offset - offset) >= max_shift:
        # Raised explicitly (rather than with `assert`) so that the check survives `python -O`
        raise AssertionError(f"Offset is too far from the infered offset: {offset=}, {infered_offset=}, {mention=}, {text[offset:offset + 50]=}")
    # Fix the offset
    entity[0] = infered_offset
    entity[1] = len(mention)
    return entity


# Quick unit tests
assert fix_offsets("hello world", [7, 5, "Q123", "wiki"], "world") == [6, 5, "Q123", "wiki"]
assert fix_offsets("hello world world", [11, 5, "Q123", "wiki"], "world") == [12, 5, "Q123", "wiki"]
# Mention close to the start of the text: the lookup position must not become negative
assert fix_offsets("a world", [1, 5, "Q123", "wiki"], "world") == [2, 5, "Q123", "wiki"]


def is_correct_entity_offset(text, entity, mention):
    """Tell whether the span described by `entity` in `text` is exactly `mention`.

    `entity` has the format [offset (chars), length (chars), wiki_data_id, kb_type].
    """
    (start, n_chars, _wiki_data_id, _kb_type) = entity
    end = start + n_chars
    return text[start:end] == mention


def infer_original_text(texts_with_annotated_mentions):
    """Infer the original text from the texts with annotated mentions (NOT USED YET).

    Each annotation might have introduced additional spaces but we don't know when, so we combine the longest left and right context to infer the original text.
    Use with: `df_kilt.groupby("meta.document_id").agg({"original_text": infer_original_text}).reset_index()`

    Args:
        texts_with_annotated_mentions: texts of one document, each marking one
            mention as `... [START] mention [END] ...`.

    Returns:
        The inferred text, without the [START]/[END] markers.
    """
    texts = list(texts_with_annotated_mentions)
    if len(texts) == 1:
        [text] = texts
        left_context = text.split(" [START]")[0]
        right_context = text.split("[END] ")[1]
        mention = text.split("[START] ")[1].split(" [END]")[0]
        return concatenate_contexts_and_mention(left_context, mention, right_context)
    longest_left_context = max((text.split(" [START]")[0] for text in texts), key=len)
    longest_right_context = max((text.split("[END] ")[1] for text in texts), key=len)
    # Some of the longest left context's end is often some of the start of the right context
    # Find the longest prefix of the right context that is also a suffix of the left context
    # (the +1 lets the whole right context be tried as well)
    overlap_length = 0
    for i in range(len(longest_right_context) + 1):
        if longest_left_context.endswith(longest_right_context[:i]):
            overlap_length = i
    # Merge the two contexts, dropping only the leading overlap of the right context
    # (a str.replace would also delete later occurrences of the same substring)
    return longest_left_context + longest_right_context[overlap_length:]



def convert_kilt_to_bela_format(kilt_format_path, n_lines=10000):
    """Convert a KILT-format jsonl file to the BELA format and write it next to the input file.

    Example usage: `convert_kilt_to_bela_format(Path("/fsx/louismartin/bela/retrieved_from_aws_backup/ndecao/TACKBP2015/train.jsonl"))`

    Args:
        kilt_format_path: path (str or Path) of the KILT jsonl file; every line
            must hold exactly one output.
        n_lines: maximum number of input lines to read; None reads the whole file.

    The result is written to `<stem>_bela_format.jsonl` in the folder of the input
    file, one document per line, with the keys `data_example_id`, `original_text`
    and `gt_entities`. Mentions whose offset cannot be repaired are reported and left out.
    """
    kilt_format_path = Path(kilt_format_path)
    df_kilt = read_jsonl_file(kilt_format_path, n_lines=n_lines)
    df_kilt = unnest_dict_column(df_kilt, "meta")
    assert df_kilt["output"].apply(len).unique().tolist() == [1]
    df_kilt["output"] = df_kilt["output"].apply(lambda x: x[0])  # Only one output per sample
    df_kilt = unnest_dict_column(df_kilt, "output")
    # assert df_kilt["output.answer"].apply(len).unique().tolist() == [1]
    df_kilt["output.answer"] = df_kilt["output.answer"].apply(lambda x: x[0])  # Take only the first option, e.g. sample TEDL15_TRAINING_06967 has [Q612, Q613] for mention Dubai

    df_kilt["entities_raw"] = df_kilt.apply(extract_entity_in_aida_format, axis=1).tolist()
    # Rebuild the text of every sample from its contexts and mention
    df_kilt["text_raw"] = df_kilt.apply(lambda row: concatenate_contexts_and_mention(row["meta.left_context_original"], row["meta.mention_original"], row["meta.right_context_original"]), axis=1)
    # One row per document: keep the first rebuilt text, aggregate entities and mentions as lists
    df_bela_format = df_kilt.groupby("meta.document_id").agg({
        "text_raw": "first",  # TODO: The concatenation sometimes produces some different texts (adding extra spaces between mention and context or removing them).
        "entities_raw": list,
        "meta.mention_original": list,
    }).reset_index()
    # Fix offsets
    # The rows yielded by iterrows() are copies, so the repaired lists are collected
    # here and written back as a whole column after the loop.
    fixed_entities_per_document = []
    for _, row in df_bela_format.iterrows():
        fixed_entities = []
        for entity, mention in zip(row["entities_raw"], row["meta.mention_original"]):
            if not is_correct_entity_offset(row["text_raw"], entity, mention):
                try:
                    entity = fix_offsets(row["text_raw"], entity, mention)
                except (AssertionError, ValueError) as e:
                    print(f"Skipping mention in {row['meta.document_id']=}: {e}")
                    continue
                assert is_correct_entity_offset(row["text_raw"], entity, mention), f"{mention=}, {entity=}"
            fixed_entities.append(entity)
        fixed_entities_per_document.append(fixed_entities)
    df_bela_format["entities_raw"] = pd.Series(fixed_entities_per_document, index=df_bela_format.index)
    # NOTE(review): the Matcha sample displayed earlier in this notebook shows entities as
    # [0, 0, wiki_data_id, kb_type, offset, length] whereas this produces
    # [0, 0, offset, length, wiki_data_id, kb_type] - confirm which order the consumer expects.
    df_bela_format["entities_raw"] = df_bela_format["entities_raw"].apply(lambda entities: [[0, 0] + entity for entity in entities])  # Add dummy 0, 0 for backward compatibility
    # data_example_id	original_text	gt_entities
    df_bela_format = df_bela_format.rename(columns={"meta.document_id": "data_example_id", "text_raw": "original_text", "entities_raw": "gt_entities"})
    df_bela_format = df_bela_format[["data_example_id", "original_text", "gt_entities"]]
    new_path = kilt_format_path.parent / f"{kilt_format_path.stem}_bela_format.jsonl"
    # Write to jsonl
    with open(new_path, "w", encoding="utf8") as f:
        for _, row in df_bela_format.iterrows():
            f.write(row.to_json() + "\n")
    print(f"Saved {new_path}")



# Convert the TACKBP2015 training set retrieved from the AWS backup.
# (A copy was previously read from ~/dev/BELA/data/KILT_format/TACKBP2015_training.jsonl.)
retrieved_data_folder = Path("/fsx/louismartin/bela/retrieved_from_aws_backup/")
kilt_format_path = retrieved_data_folder / "ndecao/TACKBP2015/train.jsonl"
convert_kilt_to_bela_format(kilt_format_path)

  0%|          | 0/10000 [00:00<?, ?it/s]

Skipping mention in row['meta.document_id']='CMN_NW_001145_20150413_F0000005B': Offset is too far from the infered offset: offset=287, infered_offset=441, mention='美', text[offset:offset + 50]='古两国领导人首次正式会晤。会晤持续一个多小时，奥巴马赞赏卡斯特罗“开明坦诚、彬彬有礼”。 </P> '
Skipping mention in row['meta.document_id']='CMN_NW_001145_20150413_F0000005B': Offset is too far from the infered offset: offset=1197, infered_offset=1240, mention='美', text[offset:offset + 50]='古有望恢复外交关系 </P> <P> \u3000\u300011日，奥巴马与劳尔·卡斯特罗闭门会晤打破了美国和古巴半个'
Skipping mention in row['meta.document_id']='CMN_NW_001145_20150413_F0000005B': Offset is too far from the infered offset: offset=1762, infered_offset=1782, mention='美', text[offset:offset + 50]='古之间其他分歧。 </P> <P> \u3000\u3000美国会否解除经济封锁？ </P> <P> \u3000\u3000奥巴马任期内恐'
Skipping mention in row['meta.document_id']='CMN_NW_001145_20150413_F0000005B': Offset is too far from the infered offset: offset=2165, infered_offset=2252, mention='美', text[offset:offset + 50]='古关系正常化对彼此都有利，奥巴马要为自己留下外交遗产，会去推动这

In [63]:
!head /fsx/louismartin/bela/retrieved_from_aws_backup/ndecao/TACKBP2015/train_bela_format.jsonl

{"data_example_id":"CMN_DF_000020_20150228_F000000CW","original_text":"<doc id=\"CMN_DF_000020_20150228_F000000CW\"> <headline> \u8c01\u67aa\u6740\u4e86\u6d85\u59c6\u4f50\u592b\uff1f <\/headline> <post id=\"p1\" author=\"box321\" datetime=\"2015-02-28T20:05:00\"> (Article) <\/post> <post id=\"p2\" author=\"\u7c73\u9505\u5927\u997c\" datetime=\"2015-02-28T20:47:00\"> \u5148\u522b\u6307\u8d23\u8c01\u6697\u6740\u7684\uff0c\u8981\u5148\u770b\u770b\u53f0\u4e0a\u535a\u5f08\u8005\u8c01\u4e3b\u52a8\u4e86\uff0c\u8c01\u88ab\u52a8\u4e86\uff1f\u6700\u540e\u518d\u7814\u7a76\u4e00\u4e0b\u897f\u65b9\u7f8e\u72d7\u8206\u8bba\u8d70\u5411\uff0c\u57fa\u672c\u5c31\u80fd\u65ad\u5b9a\u8c01\u662f\u80cc\u540e\u7b56\u5212\u8005\u4e86\u3002\u5f53\u7136\u6211\u4e0d\u662f\u60f3\u8bf4\u660e\u6b64\u4eba\u8be5\u6740\u8fd8\u662f\u4e0d\u8be5\u6740\uff0c\u4f46\u5bf9\u4e8e\u6211\u4e2a\u4eba\uff0c\u6211\u89c9\u5f97\u7f8e\u72d7\u8be5\u6740\uff01&lt;img src=\"http:\/\/image.club.china.com\/data\/templets\/default\/images\/p

In [41]:
merged

'Hello my name is Jack and I am a student.'

In [None]:
#df = df[df["meta.document_id"].str.startswith("ENG")]  # Take only english
#df = df[~df["meta.document_id"].str.startswith("CMN")]  # Remove Chinese

In [32]:
len("hello world ")

12

In [62]:
# Inspect the raw KILT records of the document whose mentions were reported as skipped during the conversion.
# NOTE(review): in the visible cells `df_kilt` is only assigned inside `convert_kilt_to_bela_format`;
# presumably it leaked from an earlier interactive session - confirm this cell runs on a fresh kernel.
mask = df_kilt["meta.document_id"] == "CMN_NW_001145_20150413_F0000005B"
df_kilt[mask].to_dict(orient="records")

[{'id': 'TEDL15_TRAINING_03700',
  'input': '<DOC id="CMN_NW_001145_20150413_F0000005B"> <SOURCE>http://news.xinhuanet.com/world/2015-04/13/c_127680872.htm</SOURCE> <DATE_TIME>2015-04-13T00:00:00</DATE_TIME> <HEADLINE> 美古首脑半个世纪来首次会晤 美国古巴何时能够复交? </HEADLINE> <TEXT> <P> [START] 美国 [END] 总统奥巴马和古巴领导人劳尔·卡斯特罗11日在巴拿马首都巴拿马城出席第七届美洲国家首脑会议期间举行会晤。这是半个世纪以来，美古两国领导人首次正式会晤。会晤持续一个多小时，奥巴马赞赏卡斯特罗“开明坦诚、彬彬有礼”。 </P> <P> \u3000\u3000奥巴马愿“翻篇” </P> <P> \u3000\u3000奥巴马和劳尔当天先是在媒体记者面前互相问候。 </P> <P> \u3000\u3000两人面对媒体记者坐下后，奥巴马首先开口。他说：“显而易见，这是一场历史性会晤。”他表示，美国过去50年对古巴实施的外交政策未能奏效，“在通向未来的道路上，现在是时候做一些新尝试了。” </P> <P> \u3000\u3000奥巴马说：“我们达成的共识是，我们可以在尊重和礼貌的基调下存在分歧。随着时间推移，我们有可能‘翻篇’，发展两国新型关系。”奥巴马同时表示，要实现美古两国关系正常化，首先要做的就是重开大使馆。 </P> <P> \u3000\u30001959年古巴革命胜利后，美国于1961年与古巴断绝外交关系，随后关闭使馆。次年，美国宣布对古巴实施经济、金融封锁和贸易禁运。奥巴马和劳尔去年12月17日分别发表讲话，宣布启动两国关系正常化进程。 </P> <P> \u3000\u3000闭门会晤毫不紧张 </P> <P> \u3000\u3000奥巴马讲完后，他和劳尔同时起身，面对媒体记者握手致意。 </P> <P> \u3000\u3000劳尔随后说，他同意奥巴马说的一切。但他同时强调，两国政府仍存在分歧。不过，双方都同意尊重对方观点。劳尔用西班牙语说：“我们愿意谈任何事情，但一切都需要耐心，而且要非常耐心

In [51]:
# List the first documents that have more than one distinct reconstructed text.
# NOTE(review): this needs a `df` with "n_text_raw" and "meta.document_id" columns, which no
# visible cell builds - presumably leftover state from an earlier session; confirm before re-running.
print("Documents with texts not joined correctly")
mask = (df["n_text_raw"] > 1)
print(df[mask]["meta.document_id"].tolist()[:3])

Documents with texts not joined correctly
['CMN_DF_000020_20150301_F000000CN', 'CMN_DF_000178_20150318_F000000CO', 'CMN_DF_000181_20140726_F000000BW']


In [57]:
# Print the first two reconstructed texts of one document to compare them.
# Another document worth inspecting: "ENG_NW_001006_20150301_F00000005" (Bosco café)
# NOTE(review): relies on a `df` with "meta.document_id" and "text_raw" columns that no
# visible cell builds - confirm this cell runs on a fresh kernel.
document_id = "CMN_DF_000178_20150318_F000000CO"
mask = df["meta.document_id"] == document_id
text_raw_variants = list(df[mask].head(1).to_dict(orient="records")[0]["text_raw"])
print(text_raw_variants[0])
print("------------")
print(text_raw_variants[1])

<doc id="CMN_DF_000178_20150318_F000000CO"> <headline> [原创]内塔尼亚胡获胜说明了什么 </headline> <post id="p1" author="wq1129" datetime="2015-03-18T09:13:00"> 以色列大选，现任总理再次获胜，这事虽然与我没什么关系，但是恶心的是前几天国内媒体为什么总是抵毁内塔尼亚胡呢，一直都在说他没什么机会，这个属于干涉别国的内政吗，那些所谓专家难道都是猪脑袋吗 </post> <post id="p2" author="联署 " datetime="2015-03-18T09:26:00"> 香烟爱上火柴就注定要被伤害 </post> <post id="p3" author="那老头" datetime="2015-03-18T09:50:00"> 人家可是靠手持的一张张选票获胜的；而天朝是屁民莫名其妙地无数次“被代表”还要表现的感激零涕的样子----跟真的一样！ </post> <post id="p4" author="桑丘" datetime="2015-03-18T10:00:00"> 以色列周围全是虎视眈眈的阿拉伯敌对国，国内强硬派获胜的机率远大于温和派。 </post> <post id="p5" author="9命猫妖" datetime="2015-03-18T10:20:00"> 这说明内塔尼亚胡的主张得到了大多数以色列人的赞同。特别是到美国会演讲的那些主张。 </post> <post id="p6" author="我开推土机" datetime="2015-03-18T10:56:00"> 我们媒体评论员思想觉悟高，体现在处处“讲政治”。 </post> </doc>
------------
<doc id="CMN_DF_000178_20150318_F000000CO"> <headline> [原创]内塔尼亚胡获胜说明了什么 </headline> <post id="p1" author="wq1129" datetime="2015-03-18T09:13:00"> 以色列大选，现任总理再次获胜，这事虽然与我没什么关系，但是恶心的是前几天国内媒体为什么总是抵毁内塔尼亚胡呢，一直都在说他没什么机会，这个属于干涉别

In [10]:
# Keep only the samples whose first output has an answer other than [None].
# NOTE(review): expects `df_kilt["output"]` to still be the raw list of output dicts, i.e. a frame
# taken before the "output" column is unnested; in the visible cells `df_kilt` only exists inside
# `convert_kilt_to_bela_format` - confirm where this frame comes from.
mask = df_kilt["output"].apply(lambda x: x[0]["answer"] != [None])
df_kilt[mask]

Unnamed: 0,id,input,output,left_context,mention,right_context,document_id,start_offset,end_offset,system_run_ID,...,entity_type,mention_type,confidence,web_search,wiki_text,Unknown,input_original,left_context_original,mention_original,right_context_original
1223,TEDL15_TRAINING_01226,"<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...","[{'KB_ID': 'm.0f1lfn', 'answer': ['Q201377']}]","<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...",刀锋战士,”阻止枪杀案件重审 申诉被驳回 </HEADLINE> <TEXT> <P> 在13日...,CMN_NW_001149_20150314_F0000005H,190,194,LDC,...,PER,NAM,1.0,N,N,N\n,"<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...","<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...",刀锋战士,”阻止枪杀案件重审 申诉被驳回 </HEADLINE> <TEXT> <P> 在13日...
1224,TEDL15_TRAINING_01227,"<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...","[{'KB_ID': 'm.0f1lfn', 'answer': ['Q201377']}]","<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...",皮斯托瑞斯,的法律团队企图阻止其枪杀女友案件重新在最高法院审理的申诉被驳回。 </P> <P> 20...,CMN_NW_001149_20150314_F0000005H,252,257,LDC,...,PER,NAM,1.0,N,N,N\n,"<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...","<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...",皮斯托瑞斯,的法律团队企图阻止其枪杀女友案件重新在最高法院审理的申诉被驳回。 </P> <P> 20...
1225,TEDL15_TRAINING_01228,"<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...","[{'KB_ID': 'm.0f1lfn', 'answer': ['Q201377']}]","<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...",皮斯托瑞斯,杀死女友案的定罪和判刑均提出了上诉。之后，法庭批准了此项上诉。 </P> <P> “刀锋...,CMN_NW_001149_20150314_F0000005H,331,336,LDC,...,PER,NAM,1.0,N,N,N\n,"<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...","<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...",皮斯托瑞斯,杀死女友案的定罪和判刑均提出了上诉。之后，法庭批准了此项上诉。 </P> <P> “刀锋...
1226,TEDL15_TRAINING_01229,"<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...","[{'KB_ID': 'm.0f1lfn', 'answer': ['Q201377']}]","<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...",刀锋战士,”的辩护律师企图阻止法院重新开庭审理此案的申诉被驳回后，律师表示控辩双方都有权利陈述自己的观...,CMN_NW_001149_20150314_F0000005H,380,384,LDC,...,PER,NAM,1.0,N,N,N\n,"<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...","<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...",刀锋战士,”的辩护律师企图阻止法院重新开庭审理此案的申诉被驳回后，律师表示控辩双方都有权利陈述自己的观...
1227,TEDL15_TRAINING_01230,"<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...","[{'KB_ID': 'm.0f1lfn', 'answer': ['Q201377']}]","<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...",皮斯托瑞斯,在2013年2月14日“情人节”当天，透过浴室的门射杀了女友斯滕坎普。南非北豪登省最高法院2...,CMN_NW_001149_20150314_F0000005H,450,455,LDC,...,PER,NAM,1.0,N,N,N\n,"<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...","<DOC id=""CMN_NW_001149_20150314_F0000005H""> <S...",皮斯托瑞斯,在2013年2月14日“情人节”当天，透过浴室的门射杀了女友斯滕坎普。南非北豪登省最高法院2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,TEDL15_TRAINING_02003,"<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...","[{'KB_ID': 'm.05fz6q', 'answer': ['Q363846']}]","<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...",涅姆佐夫,的过程中保持了积极接触。 </P> <P> 目前，俄安全部门正在核实此次抓获的2名嫌疑人...,CMN_NW_001146_20150311_F0000005D,578,582,LDC,...,PER,NAM,1.0,N,N,N\n,"<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...","<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...",涅姆佐夫,的过程中保持了积极接触。 </P> <P> 目前，俄安全部门正在核实此次抓获的2名嫌疑人...
1996,TEDL15_TRAINING_02004,"<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...","[{'KB_ID': 'm.05fz6q', 'answer': ['Q363846']}]","<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...",涅姆佐夫,被害一案，并将根据调查结果决定是否向法院申请对他们实施逮捕。 </P> <P> 莫斯科巴...,CMN_NW_001146_20150311_F0000005D,635,639,LDC,...,PER,NAM,1.0,N,N,N\n,"<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...","<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...",涅姆佐夫,被害一案，并将根据调查结果决定是否向法院申请对他们实施逮捕。 </P> <P> 莫斯科巴...
1997,TEDL15_TRAINING_02005,"<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...","[{'KB_ID': 'm.05fz6q', 'answer': ['Q363846']}]","<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...",涅姆佐夫,的5名犯罪嫌疑人。俄罗斯安全部门表示，根据警方掌握的初步鉴定结果，有充足的证据证明射杀涅姆佐...,CMN_NW_001146_20150311_F0000005D,700,704,LDC,...,PER,NAM,1.0,N,N,N\n,"<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...","<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...",涅姆佐夫,的5名犯罪嫌疑人。俄罗斯安全部门表示，根据警方掌握的初步鉴定结果，有充足的证据证明射杀涅姆佐...
1998,TEDL15_TRAINING_02006,"<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...","[{'KB_ID': 'm.05fz6q', 'answer': ['Q363846']}]","<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...",涅姆佐夫,的是绍尔·达达耶夫。巴斯曼区法院临时院长穆什尼科娃说，5名犯罪嫌疑人中只有绍尔·达达耶夫承认...,CMN_NW_001146_20150311_F0000005D,747,751,LDC,...,PER,NAM,1.0,N,N,N\n,"<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...","<DOC id=""CMN_NW_001146_20150311_F0000005D""> <S...",涅姆佐夫,的是绍尔·达达耶夫。巴斯曼区法院临时院长穆什尼科娃说，5名犯罪嫌疑人中只有绍尔·达达耶夫承认...
