In [1]:
import csv
import glob

import sys
sys.path.append('../src/')

from misc import open_dict, get_raw_text, save_dict
from format_corefs_LitBank import get_coreference_chains_from_brat_annotation_file, litbank_to_allen_indices_map, cut_tokenized_document, add_allenNLP_indices_to_coref_dict, format_coref_dict, get_id_and_title, get_character_labels

##### Get list of files to process

In [2]:
# Paths of all LitBank source texts. glob does not sort its results, so the
# processing order below follows whatever order the filesystem yields.
filePaths =  glob.glob("LitBank/texts/*.txt")

In [3]:
# Load the three lookups used below: the "cliffs" and "spark" character-list
# dicts and the LitBank ID/title dict. All three are keyed by the story ID that
# get_id_and_title() extracts from a text path.
characterDictCliffs, characterDictSpark, litBankDict = (
    open_dict(dictPath)
    for dictPath in (
        "LitBank/characters/litbank_character_lists_from_cliffs.p",
        "LitBank/characters/litbank_character_lists_from_spark.p",
        "LitBank/characters/litbank_ids_and_titles_dict.p",
    )
)

In [4]:
# Story IDs that have a character list in at least one of the two sources.
have_characters = set(characterDictSpark) | set(characterDictCliffs)

In [5]:
# Keep only the texts whose story ID has a character list, stored as bare file
# stems: directory prefix dropped, and everything from the first '.' onward dropped.
fileNamesToProcess = []

for textPath in filePaths:
    storyID, _ = get_id_and_title(textPath)

    # stories without any character list are not processed
    if storyID not in have_characters:
        continue

    baseName = textPath.split('/')[-1]
    fileNamesToProcess.append(baseName.split('.')[0])

##### Process files

In [6]:
# For every selected story: read the gold brat annotations, map LitBank character
# offsets onto AllenNLP token indices, truncate the tokenized document to the
# annotated span, and save both the truncated tokens and the reformatted coref dict.
for fileName in fileNamesToProcess:

    print(fileName, 'started')
    # open annotated brat file (tab-separated); newline='' lets the csv module do
    # its own line-ending handling, as its documentation requires for file objects
    with open('LitBank/corefs_gold_brat/' + fileName + '_brat.ann', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        brat = list(reader)

    # get coref chain annotations and put into dict
    annotations_by_label = get_coreference_chains_from_brat_annotation_file(brat)

    # convert indices from litbank character indices to allenNLP token indices
    tokensAllen = open_dict('LitBank/tokenized/' + fileName + '.p')
    bratInput = get_raw_text('LitBank/corefs_gold_brat/' + fileName + '_brat.txt')
    tokenDict, tokenNum = litbank_to_allen_indices_map(tokensAllen, bratInput)

    # shorten annotated text file to match length annotated in LitBank
    tokenizedCut = cut_tokenized_document(tokenNum, tokensAllen)
    save_dict(tokenizedCut, 'LitBank/tokenized_shortened/' + fileName + '.p')

    # add AllenNLP indices to annotations dict
    annotations_by_label_allenIndices = add_allenNLP_indices_to_coref_dict(annotations_by_label, tokenDict)

    # get into same format as other coref dicts
    # NOTE(review): this passes annotations_by_label, not the
    # annotations_by_label_allenIndices result computed just above. That is only
    # correct if add_allenNLP_indices_to_coref_dict updates its argument in place;
    # the helper is not visible from this notebook -- TODO confirm.
    coref_dict_final = format_coref_dict(annotations_by_label, tokenizedCut)

    # save coref dict
    save_dict(coref_dict_final, 'LitBank/corefs_gold_new_format/' + fileName + '.p')



45_anne_of_green_gables started
766_david_copperfield started
158_emma started
2084_the_way_of_all_flesh started
1023_bleak_house started
74_the_adventures_of_tom_sawyer started
113_the_secret_garden started
2775_the_good_soldier started
768_wuthering_heights started
550_silas_marner started
145_middlemarch started
120_treasure_island started
215_the_call_of_the_wild started
105_persuasion started
1400_great_expectations started
27_far_from_the_madding_crowd started
155_the_moonstone started
2891_howards_end started
219_heart_of_darkness started
514_little_women started
217_sons_and_lovers started
2489_moby_dick started
174_the_picture_of_dorian_gray started
1342_pride_and_prejudice started
599_vanity_fair started
11_alices_adventures_in_wonderland started
541_the_age_of_innocence started
209_the_turn_of_the_screw started
24_o_pioneers started
730_oliver_twist started
345_dracula started
77_the_house_of_the_seven_gables started
33_the_scarlet_letter started
32_herland started
543_main_

### Label each coreference chain according to whether it refers to a character

##### Prep

In [2]:
# Load the character-list dicts ("cliffs" and "spark") and the LitBank ID/title
# dict for this section. The cells below index all three by story ID.
characterDictCliffs, characterDictSpark, litBankDict = (
    open_dict(dictPath)
    for dictPath in (
        "LitBank/characters/litbank_character_lists_from_cliffs.p",
        "LitBank/characters/litbank_character_lists_from_spark.p",
        "LitBank/characters/litbank_ids_and_titles_dict.p",
    )
)

In [3]:
# Swap story titles for story file names in litBankDict: for every text file
# whose story ID is already a key of litBankDict, the value becomes the file's
# stem (directory prefix and everything from the first '.' onward removed).
filePaths = glob.glob("LitBank/texts/*.txt")

for filePath in filePaths:

    storyID, _ = get_id_and_title(filePath)

    # direct dict lookup; no need to scan every key looking for an equal one
    if storyID in litBankDict:
        litBankDict[storyID] = filePath.split('/')[-1].split('.')[0]


# NOTE: this writes back to the same file litBankDict was loaded from, so on
# disk the matched entries now hold file names where they held titles.
save_dict(litBankDict, "LitBank/characters/litbank_ids_and_titles_dict.p")

    

In [4]:
### get dict of stories with filenames and characters
# characters_all maps story ID -> {'characterList': ..., 'fileName': ...} for
# every story in litBankDict that has a character list from at least one source.
characters_all = {}

for key in litBankDict:
    in_spark = key in characterDictSpark
    in_cliffs = key in characterDictCliffs

    if in_spark and in_cliffs:
        if key == 2814:
            # story 2814 is pinned to the Cliffs list; the reason is not
            # recorded in this notebook -- TODO confirm it is still wanted
            characterList = characterDictCliffs[key]
        else:
            # Keep the longer of the two lists, measuring each source's own
            # list (the Spark length must not be used for both sides).
            # max() returns the first of equally long candidates, so Spark
            # wins ties.
            characterList = max(characterDictSpark[key], characterDictCliffs[key], key=len)
    elif in_spark:
        characterList = characterDictSpark[key]
    elif in_cliffs:
        characterList = characterDictCliffs[key]
    else:
        # no character list from either source: the story is left out
        continue

    characters_all[key] = {
        'characterList': characterList,
        'fileName': litBankDict[key]
    }

##### Match coreference chains with characters

In [11]:
# Directory pairs (coref input, character-label output), one per coref source.
# The "gold" input directory is the one the LitBank processing loop saves into;
# the "allen" one presumably holds AllenNLP-predicted chains -- TODO confirm.
corefsDir_gold, charLabelsDir_gold = (
    'LitBank/corefs_gold_new_format/',
    '../intermediate/LitBank/from_gold_corefs/character_labels_scraped/',
)

corefsDir_allen, charLabelsDir_allen = (
    'LitBank/corefs_allen/',
    '../intermediate/LitBank/from_allenNLP_corefs/character_labels_scraped/',
)

# Active pair used by the labelling loop; switch both names together.
corefsDir, charLabelsDir = corefsDir_allen, charLabelsDir_allen

In [12]:
import numpy as np

In [13]:
# For each story: load its coreference chains from corefsDir, label them against
# the story's character list, and save the labels under charLabelsDir.
for story in characters_all.values():
    characters, fileName = story['characterList'], story['fileName']

    corefs = open_dict(corefsDir + fileName + '.p')
    characterLabelsScraped = get_character_labels(corefs, characters)

    # np.save adds the ".npy" suffix itself when the target name lacks it
    np.save(charLabelsDir + fileName, characterLabelsScraped)
