In [None]:
from trec_car.read_data import *

def get_lead_paras(page):
    '''
    Given a page object, return its lead paragraphs, i.e. the paragraphs that
    sit directly in the page skeleton instead of inside a section.

    Only ``Para`` elements are selected (the same positive check that
    ``get_paras`` uses). Every other skeleton element -- sections, images,
    lists, or any further element type -- is skipped, so an element without a
    ``paragraph`` attribute can never raise ``AttributeError`` here.

    :arg page: the page object
    :return: a list of paragraph objects, in skeleton order
    '''
    return [element.paragraph for element in page.skeleton
            if isinstance(element, Para)]

def get_top_level_sections(page):
    '''
    Collect the sections that sit directly in the skeleton of a page.

    Sub-sections are not unfolded: they stay nested inside the returned
    section objects.

    :arg page: the page object
    :return: a list of section objects, in skeleton order
    '''
    top_level = []
    for node in page.skeleton:
        if isinstance(node, Section):
            top_level.append(node)
    return top_level

def get_paras(section):
    '''
    Flatten a section into the list of paragraphs it contains.

    Be careful: sub-sections are descended into recursively, so their
    paragraphs end up in the same flat list as the section's own paragraphs.
    Children that are neither paragraphs nor sections are ignored.

    :arg section: the section object
    :return: a list of paragraph objects, in document order
    '''
    collected = []
    for child in section.children:
        if isinstance(child, Para):
            collected.append(child.paragraph)
            continue
        if isinstance(child, Section):
            # Splice the nested section's paragraphs in at this position.
            collected.extend(get_paras(child))
    return collected
        
def get_entities(paragraph):
    '''
    Return the ids of the entities mentioned (linked) in a paragraph.

    :arg paragraph: the paragraph object
    :return: a list of entity ids -- the ``pageid`` of every ``ParaLink``
             among the paragraph bodies, in order of appearance
    '''
    linked_ids = []
    for body in paragraph.bodies:
        if isinstance(body, ParaLink):
            linked_ids.append(body.pageid)
    return linked_ids
    
class EntityAspect():

    """
    An Entity-Aspect object: one link from a paragraph to a section of another page.

    Attributes:
        pageStarter    The page-id, where the entity-aspect link is found
        sectionStarter The section-id, where the entity-aspect link is found
        paraStarter    The paragraph, where the entity-aspect link is found
        anchor         The anchor text of the entity-aspect link
        pageTarget     The linked page-id
        sectionTarget  The linked section (name, not id! Entity-Aspect Link ids are currently missing)
    """

    def __init__(self, pageStarter, sectionStarter,paraStarter,anchor,pageTarget, sectionTarget):
        # Where the link was found.
        self.pageStarter = pageStarter
        self.sectionStarter = sectionStarter
        self.paraStarter = paraStarter
        # The link itself and what it points to.
        self.anchor = anchor
        self.pageTarget = pageTarget
        self.sectionTarget = sectionTarget

    def __repr__(self):
        # Field-wise representation so that printing a list of entity-aspects
        # shows their content instead of bare object addresses.
        return ("EntityAspect(pageStarter={!r}, sectionStarter={!r}, "
                "paraStarter={!r}, anchor={!r}, pageTarget={!r}, "
                "sectionTarget={!r})").format(
                    self.pageStarter, self.sectionStarter, self.paraStarter,
                    self.anchor, self.pageTarget, self.sectionTarget)


def get_ent_asp(page):
    '''
    Given a page object, return the entity-aspect links found in its sections.

    Every paragraph reachable from a top-level section (paragraphs of nested
    sub-sections included) is scanned for links that name a target section
    and point to a different page. Only top-level sections are visited, so
    lead paragraphs are not scanned.

    :arg page: the page object
    :return: a list of entity-aspect objects; ``sectionStarter`` is the
             heading id of the enclosing top-level section, even when the
             paragraph itself sits in a sub-section
    '''
    page_id = page.page_id

    ent_asps = []
    for section in get_top_level_sections(page):
        for para in get_paras(section):
            ent_asps.extend(
                EntityAspect(pageStarter=page_id,
                             sectionStarter=section.headingId,
                             paraStarter=para,
                             anchor=elem.anchor_text,
                             pageTarget=elem.pageid,
                             sectionTarget=elem.link_section)
                for elem in para.bodies
                # Keep only links that carry a target section and that leave
                # the current page (self-links are not entity-aspect links).
                if isinstance(elem, ParaLink)
                and elem.link_section is not None
                and page_id != elem.pageid
            )
    return ent_asps

In [None]:
# Demo: walk through the helper functions on the first page of the collection.
with open("processedAllButBenchmark.cbor", 'rb') as f:
    for p in iter_pages(f):
        # get the lead paras
        lead_paras = get_lead_paras(p)
        print (lead_paras)
        print ()

        # get their content
        lead_para_content = [x.get_text() for x in lead_paras]
        print (lead_para_content)
        print ()

        # get their entities
        lead_para_entities = [get_entities(x) for x in lead_paras]
        print (lead_para_entities)
        print ()

        # get the top level sections - each collected object is a section
        top_level_sections = get_top_level_sections(p)
        print (top_level_sections)
        print ()

        # for the first retrieved section, get its paras
        # for each para you can now get the text and the entities, using the functions used above
        # A page may have no sections at all, so guard the [0] lookup instead
        # of failing with an IndexError.
        section_paras = get_paras(top_level_sections[0]) if top_level_sections else []
        print (section_paras)
        print ()

        # Only the first page is inspected.
        break

# Demo: print the first entity-aspect link of the first page that has one.
with open("unprocessedAllButBenchmark.cbor", 'rb') as f:
    for p in iter_pages(f):
        ent_asps = get_ent_asp(p)
        # Skip pages until one yields at least one entity-aspect link.
        if not ent_asps:
            continue
        # Show only the first link of that page, then stop reading the file.
        ent_asp = ent_asps[0]
        print (ent_asp.pageStarter, ent_asp.sectionStarter,ent_asp.anchor,"-->", ent_asp.pageTarget, ent_asp.sectionTarget)
        break