In [40]:
import csv
import glob
import os
from collections import defaultdict

from indra.statements import Event
from indra_world.sources import eidos

In [41]:
# Directories whose *.jsonld files are processed with the Eidos reader.
parts = ['amyoutput', 'winoutput1', 'winoutput2']
# Maps each directory name to the list of statements extracted from its files.
stmts = {}
for part in parts:
    # glob returns matches in arbitrary order; sort them so the order of the
    # collected statements is reproducible from run to run.
    fnames = sorted(glob.glob(os.path.join(part, '*.jsonld')))
    stmts[part] = []
    for fname in fnames:
        ep = eidos.process_json_file(fname, extract_filter=['influence'])
        ep.extract_all_events()
        # NOTE(review): fix_provenance is defined in a later cell of this
        # notebook; on a fresh kernel that cell has to be run before this one.
        # fix_provenance strips the directory and extension itself, so the
        # path is passed as is instead of slicing off '.jsonld' by length.
        fix_provenance(fname, ep.statements)
        stmts[part] += ep.statements

In [24]:
def fix_provenance(fname, stmts):
    """Tag every evidence of the given statements with a DART document ID.

    The document ID is the base name of `fname` up to (not including) its
    first dot. It is stored under the 'DART' key of each evidence's
    `text_refs`, overwriting any existing value. The statements are
    modified in place and nothing is returned.

    Parameters
    ----------
    fname : str
        Path or file name from which the document ID is derived.
    stmts : iterable
        Objects with an `evidence` list whose items have a `text_refs` dict.
    """
    base_name = os.path.basename(fname)
    doc_id = base_name.split('.')[0]
    all_evidence = (ev for stmt in stmts for ev in stmt.evidence)
    for ev in all_evidence:
        ev.text_refs['DART'] = doc_id


def get_indexed_event(event):
    """Return a positional key, the top WM grounding and the text of an event.

    Only the first evidence of the event and the first entry of its
    'provenance' annotation are consulted.

    Parameters
    ----------
    event :
        Object with `concept.db_refs` (dict) and a non-empty `evidence` list.

    Returns
    -------
    tuple
        `(key, grounding, text)` where `key` is
        `(DART document ID, start, end)`, `grounding` is the first entry of
        the 'WM' db_refs value (None if that value is missing or empty) and
        `text` is the 'TEXT' db_refs value.
    """
    db_refs = event.concept.db_refs
    wm_groundings = db_refs.get('WM')
    top_grounding = wm_groundings[0] if wm_groundings else None

    first_evidence = event.evidence[0]
    char_positions = \
        first_evidence.annotations['provenance'][0]['documentCharPositions']
    # The positions are either a list (the first item is used) or a single
    # mapping with 'start' and 'end'.
    span = char_positions[0] if isinstance(char_positions, list) \
        else char_positions

    index = (first_evidence.text_refs['DART'], span['start'], span['end'])
    return index, top_grounding, db_refs['TEXT']


def get_indexed_events(stmts):
    """Index the Event statements among `stmts` by their positional key.

    Statements that are not `Event` instances are ignored. Each remaining
    one is passed to `get_indexed_event`; results with a falsy key are
    skipped. If several events share a key, the last one wins.

    Returns
    -------
    dict
        Maps each key to a `(grounding, text)` tuple.
    """
    events_by_key = {}
    for event in (s for s in stmts if isinstance(s, Event)):
        index, grounding, text = get_indexed_event(event)
        if index:
            events_by_key[index] = (grounding, text)
    return events_by_key

In [25]:
# For every part, index its Event statements by (document ID, start, end).
indexed_events = {
    part: get_indexed_events(stmts[part])
    for part in parts
}

In [26]:
# Display the indexed events of all parts (the whole nested dict is shown).
indexed_events

{'amyoutput': {('0a6200447248b0bfb4a67d0fb5e84cbd',
   344,
   355): ([('wm/concept/environment/climate', 1.0),
    None,
    ('wm/process/communication/informing', 0.876518726348877),
    None], 'climate news'),
  ('0a6200447248b0bfb4a67d0fb5e84cbd',
   433,
   459): ([('wm/process/communication/informing', 0.8533480167388916),
    ('wm/property/stability', 1.0509308576583862),
    None,
    None], 'You may not have noticed it'),
  ('0a6200447248b0bfb4a67d0fb5e84cbd',
   522,
   571): ([('wm/concept/environment/climate', 1.0),
    None,
    ('wm/process/communication/meeting', 1.7538217306137085),
    None], 'collapse of the COP25 climate conference in Madrid'),
  ('0a6200447248b0bfb4a67d0fb5e84cbd',
   566,
   571): ([('wm/concept/economy/tourism', 0.63947594165802),
    None,
    None,
    None], 'Madrid'),
  ('0a6200447248b0bfb4a67d0fb5e84cbd',
   901,
   934): ([('wm/property/preparedness', 0.8710763454437256),
    None,
    None,
    None], 'both the most optimistic scenarios'),


In [32]:
# Regroup the per-part indexes by event key so the groundings that each part
# produced for the same text span sit side by side:
#   {key: {<part>: grounding, ..., 'text': event text}}
# defaultdict comes from the collections module and has to be imported in
# the notebook's import cell for this cell to run on a fresh kernel.
groundings_by_indexed_event = defaultdict(dict)
for part, indexed_evs in indexed_events.items():
    for index, (grounding, text) in indexed_evs.items():
        entry = groundings_by_indexed_event[index]
        entry[part] = grounding
        # The text is stored once per key; a later part overwrites the value
        # written by an earlier part.
        entry['text'] = text

In [34]:
# Display groundings_by_indexed_event converted to a plain dict.
dict(groundings_by_indexed_event)

{('0a6200447248b0bfb4a67d0fb5e84cbd',
  344,
  355): {'amyoutput': [('wm/concept/environment/climate', 1.0),
   None,
   ('wm/process/communication/informing', 0.876518726348877),
   None], 'text': 'climate news', 'winoutput1': [('wm/concept/environment/climate',
    1.0),
   None,
   ('wm/process/communication/informing', 0.44785910844802856),
   None], 'winoutput2': [('wm/concept/environment/climate', 1.0),
   None,
   ('wm/process/publication', 0.42294541001319885),
   None]},
 ('0a6200447248b0bfb4a67d0fb5e84cbd',
  433,
  459): {'amyoutput': [('wm/process/communication/informing',
    0.8533480167388916),
   ('wm/property/stability', 1.0509308576583862),
   None,
   None], 'text': 'You may not have noticed it', 'winoutput1': [('wm/concept/frequency',
    0.5604546070098877),
   None,
   ('wm/process/stay_or_remain', 0.5931714773178101),
   None], 'winoutput2': [('wm/concept/time/timely', 0.5184054374694824),
   None,
   ('wm/process/communication/informing', 0.5429894328117371),
  

In [53]:
# Build the comparison table: a header row, then one row per event key with
# the event text, the stringified grounding of each part ('None' if the part
# has no entry for that key) and finally the key itself.
rows = [['text', *parts, 'key']]
for key, entry in groundings_by_indexed_event.items():
    part_cells = [str(entry.get(part)) for part in parts]
    rows.append([entry['text'], *part_cells, key])

In [54]:
# Check the header row of the comparison table.
rows[0]

['text', 'amyoutput', 'winoutput1', 'winoutput2', 'key']

In [55]:
# Write the comparison table to disk.
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write (Windows) produce '\r\r\n' and
# spurious blank rows. The encoding is fixed so that non-ASCII event text is
# written the same way regardless of the machine's locale.
with open('grounding_comparison.csv', 'w', newline='', encoding='utf-8') as fh:
    wr = csv.writer(fh)
    wr.writerows(rows)

In [48]:
# Peek at the first data row.
# NOTE(review): the saved output below has no trailing key column although
# rows are built with one; it appears to come from an earlier run of the
# table-building cell. Re-run to refresh.
rows[1]

['climate news',
 "[('wm/concept/environment/climate', 1.0), None, ('wm/process/communication/informing', 0.876518726348877), None]",
 "[('wm/concept/environment/climate', 1.0), None, ('wm/process/communication/informing', 0.44785910844802856), None]",
 "[('wm/concept/environment/climate', 1.0), None, ('wm/process/publication', 0.42294541001319885), None]"]

In [49]:
from indra_world.service.controller import preparation_pipeline

In [50]:
# Run the preparation pipeline separately on the statements of each part.
# NOTE(review): confirm whether the pipeline modifies the input statements
# in place; if so, earlier cells that read stmts are affected on re-run.
assembled_stmts = {}
for part in parts:
    assembled_stmts[part] = preparation_pipeline.run(stmts[part])

INFO: [2022-03-09 23:13:16] indra.pipeline.pipeline - Running the pipeline
INFO: [2022-03-09 23:13:16] indra.pipeline.pipeline - Calling filter_by_type
INFO: [2022-03-09 23:13:16] indra.tools.assemble_corpus - Filtering 4668 statements for type Influence...
INFO: [2022-03-09 23:13:16] indra.tools.assemble_corpus - 546 statements after filter...
INFO: [2022-03-09 23:13:16] indra.pipeline.pipeline - Calling validate_grounding_format
INFO: [2022-03-09 23:13:16] indra.pipeline.pipeline - Calling remove_namespaces
INFO: [2022-03-09 23:13:16] indra_world.assembly.operations - Removing unnecessary namespaces
INFO: [2022-03-09 23:13:16] indra_world.assembly.operations - Finished removing unnecessary namespaces
INFO: [2022-03-09 23:13:16] indra.pipeline.pipeline - Calling compositional_grounding_filter
INFO: [2022-03-09 23:13:16] indra.pipeline.pipeline - Calling validate_grounding_format
INFO: [2022-03-09 23:13:16] indra.pipeline.pipeline - Calling filter_out_long_words
INFO: [2022-03-09 23:13

In [51]:
# Report how many statements each part has after the pipeline.
for part in parts:
    n_assembled = len(assembled_stmts[part])
    print(f'{part} {n_assembled}')

amyoutput 383
winoutput1 265
winoutput2 281


In [52]:
# Show the steps configured on the preparation pipeline.
preparation_pipeline.steps

[{'function': 'filter_by_type', 'kwargs': {'stmt_type': 'Influence'}},
 {'function': 'validate_grounding_format'},
 {'function': 'remove_namespaces', 'args': [['WHO', 'MITRE12', 'UN']]},
 {'function': 'compositional_grounding_filter',
  'kwargs': {'score_threshold': 0.6,
   'groundings_to_exclude': ['wm',
    'wm/concept',
    'wm/entity',
    'wm/time',
    'wm/process',
    'wm/property'],
   'remove_self_loops': True}},
 {'function': 'validate_grounding_format'},
 {'function': 'filter_out_long_words', 'args': [10]},
 {'function': 'filter_context_date',
  'kwargs': {'from_date': {'function': 'datetime', 'args': [1900, 1, 1]}}},
 {'function': 'set_positive_polarities'},
 {'function': 'sort_compositional_groundings'}]