This notebook scrapes character and animacy labels from Jahan's labelled chains (CEN and ProppLearner). It then matches the labels to the indexed coreference chains: CEN AllenNLP corefs, ProppLearner AllenNLP corefs, and ProppLearner gold corefs.


In [1]:
import numpy as np
from collections import Counter

import sys
sys.path.append('../src/')

from misc import open_dict, get_file_names, get_raw_text_latin, get_ref_expressions, scrape_char_and_animacy_labels, scrape_char_and_animacy_labels_new
import json

#### ProppLearner Gold Corefs

In [4]:
### Transfer character and animacy labels onto the gold coref chains read from
### the xml-derived files: each chain takes the labels of the first labelled
### chain head whose text equals the chain's name.

n = 15
offset = 0

for storyNum in range(1 + offset, n + 1 + offset):

    # gold standard coref chains for this story
    pFileName = "story" + str(storyNum) + ".p"
    corefs = open_dict('ProppLearner/corefs_gold_new_format/' + pFileName)

    # "gold standard" referring expressions for this story
    jsonFileName = "story" + str(storyNum) + ".json"
    with open("../reproduction_of_jahan/intermediate/from_jahan_coref_file/ReferringExpressions/" + jsonFileName, 'r') as f:
        refExpressions = json.load(f)

    # character / animacy labels, indexed in step with refExpressions below
    npFileName = "story" + str(storyNum) + '.npy'
    charLabels = np.load("../reproduction_of_jahan/intermediate/from_jahan_coref_file/CharacterLabels/"+npFileName)
    animLabels = np.load("../reproduction_of_jahan/intermediate/from_jahan_coref_file/Animacy/"+npFileName)

    # One entry per chain. Everything starts at 0, so chains without a name
    # and chains without a matching referring expression simply stay 0.
    char_labels = np.zeros(len(corefs['clusters']))
    anim_labels = np.zeros_like(char_labels)

    for i, chain in enumerate(corefs['clusters']):
        if chain['name'] is None:
            continue

        # position of the first referring expression equal to the chain name
        # (lazy: stops scanning at the first hit), or None when nothing matches
        j = next(
            (k for k, refExp in enumerate(refExpressions)
             if chain['name'] == refExp[0].strip()),
            None,
        )
        if j is not None:
            char_labels[i] = charLabels[j]
            anim_labels[i] = animLabels[j]

    # save labels as a feature (npFileName already ends in '.npy')
    np.save('../intermediate/ProppLearner/from_gold_corefs/character_labels_gold/' + npFileName, char_labels)
    np.save('../intermediate/ProppLearner/from_gold_corefs/animacy_labels_gold/' + npFileName, anim_labels)

#### ProppLearner and CEN AllenNLP corefs [OLD, USE SECTION BELOW]

In [1]:
# Label the CEN AllenNLP coref chains: each chain takes the character and
# animacy labels of the first labelled referring expression whose text equals
# the chain's (cleaned) canonical name.
corefsDir = 'CEN/corefs_allen/'
labelledChainsDir = 'CEN/coref_heads_labelled/'

fileNames = get_file_names(corefsDir, '.p')

for fileName in fileNames:

    corefs = open_dict(corefsDir + fileName + '.p')

    # labelled referring expressions with their character / animacy labels
    labChains = get_raw_text_latin(labelledChainsDir + fileName + '.txt')
    refExpressions, charLabels, animLabels = get_ref_expressions(labChains)

    # One entry per chain. Everything starts at 0, so skipped and unmatched
    # chains simply stay 0.
    char_labels = np.zeros(len(corefs['clusters']))
    anim_labels = np.zeros_like(char_labels)

    for i, chain in enumerate(corefs['clusters']):
        canonName = chain['name']

        # a missing name, or a name that is only "'s", is not a character
        if canonName is None or canonName.strip() == "'s":
            continue

        # drop a trailing " 's" from longer canonical names
        if len(canonName) > 3 and canonName.endswith(" 's"):
            canonName = canonName[:-3]

        # position of the first referring expression equal to the name
        # (lazy: stops scanning at the first hit), or None when nothing matches
        j = next(
            (k for k, refExp in enumerate(refExpressions)
             if canonName == refExp[0].strip()),
            None,
        )
        if j is not None:
            char_labels[i] = charLabels[j]
            anim_labels[i] = animLabels[j]

    np.save('../intermediate/CEN/from_allenNLP_corefs/character_labels_scraped/' + fileName, char_labels)
    np.save('../intermediate/CEN/from_allenNLP_corefs/animacy_labels_scraped/' + fileName, anim_labels)

NameError: name 'get_file_names' is not defined

In [8]:
# Label the ProppLearner AllenNLP coref chains: each chain takes the character
# and animacy labels of the first labelled referring expression whose text
# equals the chain's name.
corefsDir = 'ProppLearner/corefs_allen/'
labelledChainsDir = 'ProppLearner/coref_heads_labelled/'

fileNames = get_file_names(labelledChainsDir, '.txt')

for fileName in fileNames:

    # the coref files (and the saved outputs) use the lower-cased file name,
    # while the labelled-chain .txt files use the name as listed
    fileNameLower = fileName.lower()

    corefs = open_dict(corefsDir + fileNameLower + '.p')

    # labelled referring expressions with their character / animacy labels
    labChains = get_raw_text_latin(labelledChainsDir + fileName + '.txt')
    refExpressions, charLabels, animLabels = get_ref_expressions(labChains)

    # One entry per chain. Everything starts at 0, so chains without a name
    # and chains without a match simply stay 0.
    char_labels = np.zeros(len(corefs['clusters']))
    anim_labels = np.zeros_like(char_labels)

    for i, chain in enumerate(corefs['clusters']):
        if chain['name'] is None:
            continue

        # position of the first referring expression equal to the chain name
        # (lazy: stops scanning at the first hit), or None when nothing matches
        j = next(
            (k for k, refExp in enumerate(refExpressions)
             if chain['name'] == refExp[0].strip()),
            None,
        )
        if j is not None:
            char_labels[i] = charLabels[j]
            anim_labels[i] = animLabels[j]

    np.save('../intermediate/ProppLearner/from_allenNLP_corefs/character_labels_scraped/' + fileNameLower, char_labels)
    np.save('../intermediate/ProppLearner/from_allenNLP_corefs/animacy_labels_scraped/' + fileNameLower, anim_labels)

#### ProppLearner and CEN AllenNLP corefs

In [2]:
### ProppLearner Allen corefs
# Directory settings for the ProppLearner run. These are plain globals that
# the scraper call in the following cell picks up, so run this cell first.

# AllenNLP coref chains for ProppLearner
corefsDir = 'ProppLearner/corefs_allen/'
# labelled coref chain heads for ProppLearner
labelledChainsDir = 'ProppLearner/coref_heads_labelled/'
# NOTE(review): presumably the directory the scraped labels are written under;
# confirm against the scraper helper in misc
featuresDir = '../intermediate/ProppLearner/from_allenNLP_corefs/'


# Call to the non-"_new" scraper, left disabled; the following cell calls
# scrape_char_and_animacy_labels_new with the same arguments instead.
# scrape_char_and_animacy_labels(corefsDir, labelledChainsDir, featuresDir, caseDiff=True)


In [3]:
# Run the scraper for ProppLearner. It reads corefsDir, labelledChainsDir and
# featuresDir as currently defined in the kernel, so the ProppLearner settings
# cell must be the most recent one to have set them.
# NOTE(review): caseDiff=True presumably tells the helper that coref file names
# differ in letter case from the labelled-chain file names; confirm in misc.
scrape_char_and_animacy_labels_new(corefsDir, labelledChainsDir, featuresDir, caseDiff=True)

In [4]:
### CEN Allen corefs
# Directory settings for the CEN run. These overwrite the globals of the same
# name from the ProppLearner run and are picked up by the scraper call in the
# following cell, so run this cell first.

# AllenNLP coref chains for CEN
corefsDir = 'CEN/corefs_allen/'
# labelled coref chain heads for CEN
labelledChainsDir = 'CEN/coref_heads_labelled/'
# NOTE(review): presumably the directory the scraped labels are written under;
# confirm against the scraper helper in misc
featuresDir = '../intermediate/CEN/from_allenNLP_corefs/'


# Call to the non-"_new" scraper, left disabled; the following cell calls
# scrape_char_and_animacy_labels_new with the same arguments instead.
# scrape_char_and_animacy_labels(corefsDir, labelledChainsDir, featuresDir, caseDiff=False)

In [5]:
# Run the scraper for CEN. It reads corefsDir, labelledChainsDir and
# featuresDir as currently defined in the kernel, so the CEN settings cell
# must be the most recent one to have set them.
# NOTE(review): caseDiff=False presumably means coref and labelled-chain file
# names already agree in letter case for CEN; confirm in misc.
scrape_char_and_animacy_labels_new(corefsDir, labelledChainsDir, featuresDir, caseDiff=False)