In [1]:
import os
import spacy
import pickle
import random
from cassis import load_typesystem, load_cas_from_xmi
import pandas as pd
import numpy as np

In [2]:
def get_paths(test_set_metadata_path, rounds_path="../data/annotations_and_sources"):
  """Build the annotation, type-system and source paths of every test-set excerpt.

  Args:
    test_set_metadata_path: Path to a pickle file containing a mapping from
      excerpt file name (without the ``.txt`` suffix, which is appended here)
      to the annotator whose annotation folder is used for that excerpt.
    rounds_path: Directory holding the ``annotation`` and ``source`` folders.

  Returns:
    A dict keyed by file name. Each value is a dict with the string paths
    ``"annotations"`` (the annotator's ``.xmi`` file), ``"typesystem"``
    (the ``TypeSystem.xml`` next to it) and ``"source"`` (the ``.txt`` file).
  """
  # NOTE: unpickling can run arbitrary code; only point this at a trusted file.
  with open(test_set_metadata_path, "rb") as metadata_file:
    annotator_by_file = pickle.load(metadata_file)

  return {
    fname: {
      "annotations": f"{rounds_path}/annotation/{fname}.txt/{annotator}/{annotator}.xmi",
      "typesystem": f"{rounds_path}/annotation/{fname}.txt/{annotator}/TypeSystem.xml",
      "source": f"{rounds_path}/source/{fname}.txt",
    }
    for fname, annotator in annotator_by_file.items()
  }

In [3]:
def get_cas(paths):
  """Load the CAS holding one excerpt's annotations.

  Args:
    paths: Mapping with a ``"typesystem"`` entry (path to the type-system XML)
      and an ``"annotations"`` entry (path to the XMI annotation file).

  Returns:
    The object produced by ``load_cas_from_xmi`` for the annotation file,
    parsed against the type system read from the XML file.
  """
  # The type system has to be read first: it is needed to parse the XMI file.
  with open(paths["typesystem"], "rb") as typesystem_file:
    loaded_typesystem = load_typesystem(typesystem_file)

  with open(paths["annotations"], "rb") as xmi_file:
    return load_cas_from_xmi(xmi_file, typesystem=loaded_typesystem)

In [4]:
def get_annotated_units(paths_data):
  """Collect, for every excerpt, each annotated number and the units related to it.

  Args:
    paths_data: Mapping from file name to the value that ``get_cas`` needs to
      load that excerpt's annotations.

  Returns:
    A dict keyed by file name. Each value maps a number, described as
    ``(covered text, pd.Interval(begin, end, closed="left"))``, to the list of
    units (described the same way) linked to it through a ``custom.Relation``
    whose governor is labelled "Number" and whose dependent is labelled "Unit".
    A number without any such relation maps to an empty list.
  """
  def describe(annotation):
    # An annotation is identified by its text and its half-open character span.
    covered_text = annotation.get_covered_text()
    char_span = pd.Interval(annotation.begin, annotation.end, closed="left")
    return covered_text, char_span

  units_by_file = {}
  for fname, file_paths in paths_data.items():
    cas = get_cas(file_paths)

    number_spans = [span for span in cas.select("custom.Span") if span.label == "Number"]
    number_unit_pairs = [
      (relation.Governor, relation.Dependent)
      for relation in cas.select("custom.Relation")
      if relation.Governor.label == "Number" and relation.Dependent.label == "Unit"
    ]

    # Register every number first so that numbers without a unit are kept.
    units_by_number = {describe(number): [] for number in number_spans}

    for number, unit in number_unit_pairs:
      number_key = describe(number)
      unit_entry = describe(unit)
      units_by_number[number_key].append(unit_entry)

    units_by_file[fname] = units_by_number

  return units_by_file

In [5]:
# Pickle file read by get_paths: maps each test-set excerpt to its annotator.
TEST_SET_METADATA_PATH = "../data/test_set_metadata.pkl"
# Resolve the annotation, type-system and source file paths of every excerpt.
paths = get_paths(TEST_SET_METADATA_PATH)

In [6]:
# Inspect the resolved file paths of each test-set excerpt.
paths

{'a2_6051': {'annotations': '../data/annotations_and_sources/annotation/a2_6051.txt/annotator2/annotator2.xmi',
  'typesystem': '../data/annotations_and_sources/annotation/a2_6051.txt/annotator2/TypeSystem.xml',
  'source': '../data/annotations_and_sources/source/a2_6051.txt'},
 'a2_186485': {'annotations': '../data/annotations_and_sources/annotation/a2_186485.txt/annotator2/annotator2.xmi',
  'typesystem': '../data/annotations_and_sources/annotation/a2_186485.txt/annotator2/TypeSystem.xml',
  'source': '../data/annotations_and_sources/source/a2_186485.txt'},
 'a2_26483': {'annotations': '../data/annotations_and_sources/annotation/a2_26483.txt/annotator2/annotator2.xmi',
  'typesystem': '../data/annotations_and_sources/annotation/a2_26483.txt/annotator2/TypeSystem.xml',
  'source': '../data/annotations_and_sources/source/a2_26483.txt'},
 'a3_223620': {'annotations': '../data/annotations_and_sources/annotation/a3_223620.txt/annotator3/annotator3.xmi',
  'typesystem': '../data/annotation

In [7]:
# Number of excerpts in the test set (one dictionary entry per excerpt)
len(paths)

156

In [8]:
# Load each excerpt's annotations and gather every number with its related units.
annotated_units = get_annotated_units(paths)

In [9]:
# Inspect the numbers and units collected for each excerpt.
annotated_units

{'a2_6051': {('1.1 million',
   Interval(270, 281, closed='left')): [('people',
    Interval(282, 288, closed='left'))],
  ('378,000',
   Interval(299, 306, closed='left')): [('children',
    Interval(311, 319, closed='left'))],
  ('307,000',
   Interval(324, 331, closed='left')): [('women',
    Interval(336, 341, closed='left'))]},
 'a2_186485': {('84%',
   Interval(104, 107, closed='left')): [('women',
    Interval(111, 116, closed='left'))],
  ('5%', Interval(170, 172, closed='left')): [],
  ('35%',
   Interval(230, 233, closed='left')): [('women',
    Interval(237, 242, closed='left'))],
  ('50%',
   Interval(282, 285, closed='left')): [('men',
    Interval(289, 292, closed='left'))]},
 'a2_26483': {('60%', Interval(88, 91, closed='left')): [],
  ('70%', Interval(121, 124, closed='left')): []},
 'a3_223620': {('10 percent',
   Interval(79, 89, closed='left')): [('livestock production',
    Interval(30, 50, closed='left'))],
  ('60 percent',
   Interval(121, 131, closed='left')): [(