Run this notebook to produce the CoreNLP parses. They are required for the quotation feature (QU) and for the DP & TP Orig features. The last section additionally produces CoreNLP NER parses.

In [1]:
import sys
sys.path.append('../src')

from misc import open_dict, save_dict, get_file_names, get_raw_text, token_map, get_raw_text_latin

### Parse for quotations

In [2]:
from stanza.server import CoreNLPClient
def coreNLPParse_quotes(rawText, endpoint = 'http://localhost:9020'):
    '''
    Annotates the raw text of a story with the Stanford CoreNLP 'quote' annotator.
    The result is the input for the quotation feature.

    A CoreNLPClient is opened as a context manager for this single call and is
    closed again before the function returns.

    Parameters:
        rawText: the text to annotate
        endpoint: URL of the local CoreNLP endpoint handed to the client
    Returns:
        whatever client.annotate() yields for output_format='json'
        (presumably the decoded JSON response -- confirm against stanza)
    '''
    # All client options are collected in one place; the values are fixed for
    # every call except the endpoint.
    client_options = dict(
        annotators=['quote'],
        timeout=60000,  # presumably milliseconds -- confirm against stanza
        endpoint=endpoint,
        memory='6G',
        output_format='json',
        be_quiet=True,
    )

    with CoreNLPClient(**client_options) as corenlp:
        annotation = corenlp.annotate(rawText)

    return annotation

In [12]:
# Per-corpus configuration for the quotation parses.
#   rawTextDir: directory the '.txt' inputs are read from
#   annSaveDir: directory the '.p' outputs are written to
#   latin:      True -> the processing loop reads the text with get_raw_text_latin,
#               False -> with get_raw_text
# The ProppLearner entry is kept commented out (disabled), as in the original cell.
settings = {
    # 'ProppLearner': dict(
    #     rawTextDir='../data/ProppLearner/texts/',
    #     annSaveDir='../intermediate/ProppLearner/parses/quotes/',
    #     latin=False,
    # ),
    'LitBank': dict(
        rawTextDir='../data/LitBank/corefs_gold_brat/',
        annSaveDir='../intermediate/LitBank/parses_shortened/quotes/',
        latin=False,
    ),
    'CEN': dict(
        rawTextDir='../data/CEN/texts/',
        annSaveDir='../intermediate/CEN/parses/quotes/',
        latin=True,
    ),
}

In [None]:
# Quote-parse every raw text of each corpus listed in `settings`. Texts whose
# output file already exists are skipped, so an interrupted run can be resumed.
# The per-corpus dict is bound to `corpus_settings` rather than `set`, so this
# loop no longer shadows the built-in `set` in the notebook namespace.
for name, corpus_settings in settings.items():

    print(name,'started')

    rawTextDir = corpus_settings['rawTextDir']
    annSaveDir = corpus_settings['annSaveDir']

    # get_file_names presumably returns the names without their extension
    # (the extension is re-appended below) -- confirm against ../src/misc.py
    fileNames = get_file_names(rawTextDir, '.txt')

    # names that already have a saved parse in the output directory
    fileNamesDone = get_file_names(annSaveDir, '.p')

    # corpora flagged 'latin' are read with the latin-specific reader
    get_raw_text_function = get_raw_text_latin if corpus_settings['latin'] else get_raw_text

    for fileName in fileNames:

        if fileName in fileNamesDone:
            print(fileName, 'already done')
            continue

        print('starting', fileName)

        rawText = get_raw_text_function(rawTextDir + fileName + '.txt')

        ann = coreNLPParse_quotes(rawText)

        save_dict(ann, annSaveDir + fileName + '.p')

        print(fileName, 'finished')

    print(name,'finished')


### Parse for triples and dependencies

In [2]:
from stanza.server import CoreNLPClient
def coreNLPParse_quotes(rawText, endpoint = 'http://localhost:9020'):
    '''
    Annotates the raw text of a story with the Stanford CoreNLP annotators
    'ssplit', 'depparse' and 'openie'. According to the original note the
    result is meant for semantic_subject(), dependencies() and triples()
    (none of which are defined in the visible cells).

    NOTE: despite its name this definition does not run the 'quote' annotator.
    It reuses the name of the function defined in the quotation section and
    replaces it once this cell has been executed.

    Parameters:
        rawText: the text to annotate
        endpoint: URL of the local CoreNLP endpoint handed to the client
    Returns:
        whatever client.annotate() yields for output_format='json'
        (presumably the decoded JSON response -- confirm against stanza)
    '''
    # All client options are collected in one place; the values are fixed for
    # every call except the endpoint.
    client_options = dict(
        annotators=['ssplit', 'depparse', 'openie'],
        timeout=60000,  # presumably milliseconds -- confirm against stanza
        endpoint=endpoint,
        memory='6G',
        output_format='json',
        be_quiet=True,
    )

    with CoreNLPClient(**client_options) as corenlp:
        annotation = corenlp.annotate(rawText)

    return annotation

In [3]:
# Per-corpus configuration for the dependency / triple parses.
#   rawTextDir: directory the '.txt' inputs are read from
#   annSaveDir: directory the '.p' outputs are written to
#   latin:      True -> the processing loop reads the text with get_raw_text_latin,
#               False -> with get_raw_text
# The ProppLearner entry is kept commented out (disabled), as in the original cell.
settings = {
    # 'ProppLearner': dict(
    #     rawTextDir='../data/ProppLearner/texts/',
    #     annSaveDir='../intermediate/ProppLearner/parses/coreNLP/',
    #     latin=False,
    # ),
    'LitBank': dict(
        rawTextDir='../data/LitBank/corefs_gold_brat/',
        annSaveDir='../intermediate/LitBank/parses_shortened/coreNLP/',
        latin=False,
    ),
    'CEN': dict(
        rawTextDir='../data/CEN/texts/',
        annSaveDir='../intermediate/CEN/parses/coreNLP/',
        latin=True,
    ),
}

In [None]:
# Run the dependency / triple parse on every raw text of each corpus listed in
# `settings`. Texts whose output file already exists are skipped, so an
# interrupted run can be resumed.
# coreNLPParse_quotes resolves to whichever definition was executed last; run
# the definition cell of this section first.
# The per-corpus dict is bound to `corpus_settings` rather than `set`, so this
# loop no longer shadows the built-in `set` in the notebook namespace.
for name, corpus_settings in settings.items():

    print(name,'started')

    rawTextDir = corpus_settings['rawTextDir']
    annSaveDir = corpus_settings['annSaveDir']

    # get_file_names presumably returns the names without their extension
    # (the extension is re-appended below) -- confirm against ../src/misc.py
    fileNames = get_file_names(rawTextDir, '.txt')

    # names that already have a saved parse in the output directory
    fileNamesDone = get_file_names(annSaveDir, '.p')

    # corpora flagged 'latin' are read with the latin-specific reader
    get_raw_text_function = get_raw_text_latin if corpus_settings['latin'] else get_raw_text

    for fileName in fileNames:

        if fileName in fileNamesDone:
            print(fileName, 'already done')
            continue

        print('starting', fileName)

        rawText = get_raw_text_function(rawTextDir + fileName + '.txt')

        ann = coreNLPParse_quotes(rawText)

        save_dict(ann, annSaveDir + fileName + '.p')

        print(fileName, 'finished')

    print(name,'finished')

### Parses for NER

In [2]:
from stanza.server import CoreNLPClient
def coreNLPParse_quotes(rawText, endpoint = 'http://localhost:9020'):
    '''
    Annotates the raw text of a story with the Stanford CoreNLP 'ner' annotator.

    NOTE: despite its name this definition does not run the 'quote' annotator.
    It reuses the name of the functions defined in the earlier sections and
    replaces them once this cell has been executed.

    Parameters:
        rawText: the text to annotate
        endpoint: URL of the local CoreNLP endpoint handed to the client
    Returns:
        whatever client.annotate() yields for output_format='json'
        (presumably the decoded JSON response -- confirm against stanza)
    '''
    # All client options are collected in one place; the values are fixed for
    # every call except the endpoint.
    client_options = dict(
        annotators=['ner'],
        timeout=60000,  # presumably milliseconds -- confirm against stanza
        endpoint=endpoint,
        memory='6G',
        output_format='json',
        be_quiet=True,
    )

    with CoreNLPClient(**client_options) as corenlp:
        annotation = corenlp.annotate(rawText)

    return annotation

In [3]:
# Per-corpus configuration for the NER parses.
#   rawTextDir: directory the '.txt' inputs are read from
#   annSaveDir: directory the '.p' outputs are written to
#   latin:      True -> the processing loop reads the text with get_raw_text_latin,
#               False -> with get_raw_text
settings = {
    'ProppLearner': dict(
        rawTextDir='../data/ProppLearner/texts/',
        annSaveDir='../intermediate/ProppLearner/parses/coreNLP_ner/',
        latin=False,
    ),
    'LitBank': dict(
        rawTextDir='../data/LitBank/corefs_gold_brat/',
        annSaveDir='../intermediate/LitBank/parses_shortened/coreNLP_ner/',
        latin=False,
    ),
    'CEN': dict(
        rawTextDir='../data/CEN/texts/',
        annSaveDir='../intermediate/CEN/parses/coreNLP_ner/',
        latin=True,
    ),
}

In [None]:
# Run the NER parse on every raw text of each corpus listed in `settings`.
# Texts whose output file already exists are skipped, so an interrupted run
# can be resumed.
# coreNLPParse_quotes resolves to whichever definition was executed last; run
# the definition cell of this section first.
# The per-corpus dict is bound to `corpus_settings` rather than `set`, so this
# loop no longer shadows the built-in `set` in the notebook namespace.
for name, corpus_settings in settings.items():

    # progress line matching the closing 'finished' message below
    print(name,'started')

    rawTextDir = corpus_settings['rawTextDir']
    annSaveDir = corpus_settings['annSaveDir']

    # get_file_names presumably returns the names without their extension
    # (the extension is re-appended below) -- confirm against ../src/misc.py
    fileNames = get_file_names(rawTextDir, '.txt')

    # names that already have a saved parse in the output directory
    fileNamesDone = get_file_names(annSaveDir, '.p')

    # corpora flagged 'latin' are read with the latin-specific reader
    get_raw_text_function = get_raw_text_latin if corpus_settings['latin'] else get_raw_text

    for fileName in fileNames:

        if fileName in fileNamesDone:
            print(fileName, 'already done')
            continue

        print('starting', fileName)

        rawText = get_raw_text_function(rawTextDir + fileName + '.txt')

        ann = coreNLPParse_quotes(rawText)

        save_dict(ann, annSaveDir + fileName + '.p')

        print(fileName, 'finished')

    print(name,'finished')