In [8]:
import json
import numpy as np

import sys
sys.path.append('../src')

from misc import open_dict, save_dict, get_file_names, get_raw_text, get_ref_expressions, get_raw_text_latin, open_list, save_list
from eval_functions import get_all_variations_catchall, get_all_variations
from coreference_resolution import get_canonical_character_name_from_list

#### ProppLearner and CEN

In [26]:

# CHAIN HEADS ONLY
# For each labelled coreference file, keep the first referring expression of
# every chain whose label equals 1 and save those heads as one JSON list.

for dataset in ['CEN', 'ProppLearner']:

    corefsLabeledJahanDir = dataset + '/coref_heads_labelled/'
    charGoldListDir = dataset + '/char_list_gold/'

    # CEN files go through get_raw_text_latin, every other dataset through get_raw_text.
    get_raw_text_func = get_raw_text_latin if dataset == 'CEN' else get_raw_text

    fileNames = get_file_names(corefsLabeledJahanDir, '.txt')
    for fileName in fileNames:
        chains = get_raw_text_func(corefsLabeledJahanDir + fileName + '.txt')
        refExpressions, charLabels, _ = get_ref_expressions(chains)

        # charLabels is indexed in parallel with refExpressions.
        characterList = [
            refs[0]
            for i, refs in enumerate(refExpressions)
            if charLabels[i] == 1.
        ]

        save_list(characterList, charGoldListDir + fileName + '.json')

In [16]:
# Most frequent referring expression
# For each labelled coreference file, reduce every chain whose label equals 1
# to a single canonical name (via get_canonical_character_name_from_list) and
# save the resulting names as one JSON list per file.

for dataset in ['CEN', 'ProppLearner']:

    corefsLabeledJahanDir = dataset + '/coref_heads_labelled/'
    charGoldListDir = dataset + '/char_list_gold_most_frequent/'
    fileNames = get_file_names(corefsLabeledJahanDir, '.txt')

    # CEN files go through get_raw_text_latin, every other dataset through get_raw_text.
    if dataset == 'CEN':
        get_raw_text_func = get_raw_text_latin
    else:
        get_raw_text_func = get_raw_text

    for fileName in fileNames:
        chains = get_raw_text_func(corefsLabeledJahanDir + fileName + '.txt')
        refExpressions, charLabels, _ = get_ref_expressions(chains)

        # charLabels is indexed in parallel with refExpressions.
        canonNames = []
        for i, refs in enumerate(refExpressions):
            if charLabels[i] == 1.:
                canonNames.append(get_canonical_character_name_from_list(refs))

        # Save the names computed in this loop iteration, not a list left over
        # in the kernel from some other cell.
        save_list(canonNames, charGoldListDir + fileName + '.json')

In [3]:
# FULL CHAINS
# For each labelled coreference file, keep every chain whose label equals 1
# in full (all of its referring expressions) and save them as one JSON list.

for dataset in ['CEN', 'ProppLearner']:

    corefsLabeledJahanDir = dataset + '/coref_heads_labelled/'
    charGoldListDir = dataset + '/char_list_gold_full_chains/'

    # CEN files go through get_raw_text_latin, every other dataset through get_raw_text.
    get_raw_text_func = get_raw_text_latin if dataset == 'CEN' else get_raw_text

    fileNames = get_file_names(corefsLabeledJahanDir, '.txt')
    for fileName in fileNames:
        chains = get_raw_text_func(corefsLabeledJahanDir + fileName + '.txt')
        refExpressions, charLabels, _ = get_ref_expressions(chains)

        # charLabels is indexed in parallel with refExpressions.
        characterList = [
            refs for i, refs in enumerate(refExpressions) if charLabels[i] == 1.
        ]

        save_list(characterList, charGoldListDir + fileName + '.json')

#### LitBank

In [2]:
# Input: gold coreference files; the '.p' files found here define which stories are processed.
corefsFile = 'LitBank/corefs_gold_new_format/'
fileNames = get_file_names(corefsFile, '.p')

# Output: directory that receives the gold character lists.
charGoldListDir = 'LitBank/char_list_gold/'

# Lookup tables loaded from disk: the ids-and-titles dict and the two
# character-list dicts (Spark and Cliffs), presumably all keyed by story id.
litBankDict = open_dict("LitBank/characters/litbank_ids_and_titles_dict.p")
characterDictSpark = open_dict("LitBank/characters/litbank_character_lists_from_spark.p")
characterDictCliffs = open_dict("LitBank/characters/litbank_character_lists_from_cliffs.p")

In [3]:
# Build one entry per LitBank key that has a character list in at least one
# of the two sources. Each entry holds the chosen 'characterList' and the
# value stored for that key in litBankDict under 'fileName'.
# Keys that appear in neither source are skipped.
characters_all = {}

for key in litBankDict:
    inSpark = key in characterDictSpark
    inCliffs = key in characterDictCliffs

    if inSpark and inCliffs:
        # Both sources have a list: keep the longer one. max() returns the
        # first maximal item, so the Spark list wins when the lengths tie.
        chosenList = max(characterDictSpark[key], characterDictCliffs[key], key=len)
    elif inSpark:
        chosenList = characterDictSpark[key]
    elif inCliffs:
        chosenList = characterDictCliffs[key]
    else:
        continue

    characters_all[key] = {
                            'characterList':chosenList,
                            'fileName':litBankDict[key]
                            }

In [4]:
# Write one character list per LitBank coreference file.
for fileName in fileNames:
    # The text before the first underscore of the file name is the integer
    # key used to look the story up in characters_all.
    storyID = int(fileName.partition('_')[0])

    characters = characters_all[storyID]
    characterList = characters['characterList']

    outputPath = charGoldListDir + fileName + '.json'
    save_list(characterList, outputPath)


In [5]:
# Spot check: load one saved character list back from disk for manual inspection.
alice = open_list('LitBank/char_list_gold/11_alices_adventures_in_wonderland.json')

### Get rid of characters who don't appear in the first 2000 tokens

In [4]:
# Directory holding the saved LitBank character lists; fileNames is rebound
# here to whatever get_file_names returns for the '.json' files in it.
charListsDir = 'LitBank/char_list_gold/'
fileNames = get_file_names(charListsDir, '.json')

In [6]:
# Keep only the characters that have at least one name variation occurring in
# the story's '_brat.txt' text, and write that shortened list to
# LitBank/char_list_gold/. The unfiltered input list is read from
# LitBank/char_list_gold_extended/.

# Debug filter: only this story is processed; every other file is skipped.
onlyFileName = '1400_great_expectations'

for fileName in fileNames:

    if fileName != onlyFileName:
        continue

    print(fileName)
    # get extended char list
    charList = open_list('LitBank/char_list_gold_extended/' + fileName + '.json')

    # get variations for each char in char list
    # (one group of variations per character, in the same order as charList)
    variations = get_all_variations_catchall(charList)

    # get shortened raw text
    # NOTE(review): newlines are removed without inserting a space, so a name
    # that is split across a line break cannot match -- confirm this is intended.
    shortenedText = get_raw_text('LitBank/corefs_gold_brat/' + fileName + '_brat.txt')
    shortenedText = shortenedText.replace('\n','')

    # if a char name variation is in shortened text, add character to shortened list
    charListShortened = []

    for character, charVariations in zip(charList, variations):
        names = (variation.strip() for variation in charVariations)
        # An empty string is a substring of any text, so blank variations are
        # ignored instead of letting them keep every character.
        if any(name and name in shortenedText for name in names):
            charListShortened.append(character)

    # save shortened char list
    save_list(charListShortened, 'LitBank/char_list_gold/' + fileName + '.json')



1400_great_expectations


In [8]:
charList

['Pip',
 'Joe Gargery',
 'Mrs. Joe Gargery',
 'Biddy',
 'Uncle Pumblechook',
 'Dolge Orlick',
 'Mr. Wopsle (Mr. Waldengarver)',
 'Mr. Trabb',
 "Mr. Trabb's Boy",
 'Mr. and Mrs. Hubble',
 "Mr. Wopsle's Great-Aunt",
 'Squires',
 'Philip Pirrip, late of this parish',
 'Georgiana, wife of the above',
 'Alexander, Bartholomew, Abraham, Tobias, Roger',
 'Miss Havisham',
 'Estella',
 'Mrs. Camilla, Mr. Raymond (Cousin Raymond, Mr. Camilla), Sarah Pocket, Georgiana Pocket',
 'Mr. Jaggers',
 'John Wemmick',
 'Molly',
 'Aged Parent (Aged P.)',
 'Miss Skiffins',
 'Mr. Skiffins',
 'Herbert Pocket (Pale Young Gentleman)',
 'Clara Barley',
 'Bill Barley (Gruffandgrim)',
 'Mrs. Whimple',
 'Startop',
 'Bentley Drummle',
 'Matthew Pocket',
 'Mrs. Pocket (Belinda)',
 'Sophia, Flopson, and Millers',
 'Mrs. Brandley',
 'Mrs. Coiler',
 'The Avenger (Pepper)',
 'Clarriker',
 'the Jack',
 'Mary Anne',
 'Magwitch',
 'Compeyson (Second Convict)',
 'Arthur',
 'Colonel',
 'Sally',
 'Stranger at the Three Jolly B

In [9]:
charListShortened

['Pip',
 'Joe Gargery',
 'Mrs. Joe Gargery',
 'Philip Pirrip, late of this parish',
 'Georgiana, wife of the above',
 'Alexander, Bartholomew, Abraham, Tobias, Roger',
 'Mrs. Camilla, Mr. Raymond (Cousin Raymond, Mr. Camilla), Sarah Pocket, Georgiana Pocket']