In [1]:
from allennlp.predictors.predictor import Predictor
import numpy as np
import glob

import sys
sys.path.append('../src')

from misc import open_dict, save_dict, get_file_names, get_raw_text, token_map, open_list, save_list
from character_features import SRL_feat, dep_feat, ner_feat, SRL_feat, len_feat, CN_feat, WN_feat, disp_feat, QU_feat, const_feat, openie_feat
from character_features_original import SRL_feat_original, dep_feat_original, ner_feat_original, openie_feat_original, CN_feat_original, WN_feat_original, openie_feat_coreNLP, ner_feat_coreNLP, dep_feat_coreNLP
from animacy import categorise_coref_chains

from settings import settings


#### Construct features (most)

In [2]:
# Build and save the per-document feature arrays for every entry in `settings`.
#
# For each settings entry this cell:
#   1. unless the entry is flagged 'OrigOnly', computes the SS, DP, NE, TP and
#      CP features from the stored parses and saves one array per document;
#   2. for every entry, computes the CL feature and the "Orig*" feature variants.
#
# Output paths are passed to np.save without an extension; np.save appends
# '.npy' itself. The output directories are assumed to exist already.
for cfg in settings.values():
    # The entry is called `cfg` (not `set`) so the builtin `set` is not
    # shadowed in the kernel namespace for cells that run afterwards.

    tokensDir = cfg['tokensDir']
    corefDir = cfg['corefDir']
    featuresDir = cfg['featuresDir']
    parsesDir = cfg['parsesDir']

    # Output directories, one per feature family.
    ssDir = featuresDir + 'SS/'
    dpDir = featuresDir + 'DP/'
    neDir = featuresDir + 'NE/'
    tpDir = featuresDir + 'TP/'
    clDir = featuresDir + 'CL/'
    cnDir = featuresDir + 'CN/'
    wnDir = featuresDir + 'WN/'
    cdDir = featuresDir + 'CD/'
    cpDir = featuresDir + 'CP/'

    # Output directories for the "original" feature variants.
    ssOrigDir = featuresDir + 'OrigSS/'
    dpOrigDir = featuresDir + 'OrigDP/'
    neOrigDir = featuresDir + 'OrigNE/'
    tpOrigDir = featuresDir + 'OrigTP/'
    cnOrigDir = featuresDir + 'OrigCN/'
    wnOrigDir = featuresDir + 'OrigWN/'

    # Input directories holding the stored parses.
    srlParsesDir = parsesDir + 'SRL-bert/'
    depParsesDir = parsesDir + 'deps-biaffine/'
    nerParsesDir = parsesDir + 'NER-elmo/'
    openParsesDir = parsesDir + 'openIE/'
    constParsesDir = parsesDir + 'constituency/'

    # The coreference directory determines which documents get processed.
    fileNames = get_file_names(corefDir, '.p')

    # Truthiness test, in line with the OrigOnly guards of the later cells.
    if not cfg['OrigOnly']:

        for fileName in fileNames:

            # get corefs and document needed to construct feature
            pFileName = fileName + ".p"

            document = open_dict(tokensDir + pFileName)
            sentences = document['sents']
            corefs = open_dict(corefDir + pFileName)

            # Only the 'parses' entry of each stored parse file is used here.
            srlParses = open_dict(srlParsesDir + pFileName)['parses']
            depParses = open_dict(depParsesDir + pFileName)['parses']
            nerParses = open_dict(nerParsesDir + pFileName)['parses']
            openParses = open_dict(openParsesDir + pFileName)['parses']
            constParses = open_dict(constParsesDir + pFileName)['parses']

            # SS
            srlFeat = SRL_feat(srlParses, sentences, corefs)
            np.save(ssDir + fileName, srlFeat)

            # DP
            depFeat = dep_feat(depParses, sentences, corefs)
            np.save(dpDir + fileName, depFeat)

            # NE
            nerFeat = ner_feat(nerParses, sentences, corefs)
            np.save(neDir + fileName, nerFeat)

            # TP
            openFeat = openie_feat(openParses, sentences, corefs)
            np.save(tpDir + fileName, openFeat)

            # CP
            constFeat = const_feat(constParses, sentences, corefs)
            np.save(cpDir + fileName, constFeat)

            # The CD, CN and WN features below are currently disabled.
            # # CD
            # cdFeat = disp_feat(corefs, document)
            # np.save(cdDir + fileName, cdFeat)

            # # CN
            # cnFeat = CN_feat(corefs)
            # np.save(cnDir + fileName, cnFeat)

            # # WN
            # wnFeat = WN_feat(corefs)
            # np.save(wnDir + fileName, wnFeat)


    # Features computed for every settings entry, OrigOnly or not.
    for fileName in fileNames:
        pFileName = fileName + ".p"

        corefs = open_dict(corefDir + pFileName)

        # NOTE(review): the *_original extractors receive the whole stored
        # parse object, not its 'parses' entry as in the loop above --
        # presumably they unwrap it themselves; confirm.
        nerParses = open_dict(nerParsesDir + pFileName)
        openParses = open_dict(openParsesDir + pFileName)
        depParses = open_dict(depParsesDir + pFileName)
        srlParses = open_dict(srlParsesDir + pFileName)

        # CL
        lenFeat = len_feat(corefs)
        np.save(clDir + fileName, lenFeat)

        # original features
        dpOrigFeat = dep_feat_original(depParses, corefs)
        np.save(dpOrigDir + fileName, dpOrigFeat)

        ssOrigFeat = SRL_feat_original(srlParses, corefs)
        np.save(ssOrigDir + fileName, ssOrigFeat)

        neOrigFeat = ner_feat_original(nerParses, corefs)
        np.save(neOrigDir + fileName, neOrigFeat)

        tpOrigFeat = openie_feat_original(openParses, corefs)
        np.save(tpOrigDir + fileName, tpOrigFeat)

        # The original CN and WN features below are currently disabled.
        # cnOrigFeat = CN_feat_original(corefs)
        # np.save(cnOrigDir + fileName, cnOrigFeat)

        # wnOrigFeat = WN_feat_original(corefs)
        # np.save(wnOrigDir + fileName, wnOrigFeat)


#### Construct Quotation Feature

In [6]:
# Build and save the QU (quotation) feature for every settings entry that is
# not flagged 'OrigOnly'. One array is saved per document under
# <featuresDir>/QU/; np.save appends the '.npy' extension itself.
for cfg in settings.values():
    # The entry is called `cfg` (not `set`) so the builtin `set` is not
    # shadowed in the kernel namespace for cells that run afterwards.

    if cfg['OrigOnly']:
        continue

    tokensDir = cfg['tokensDir']
    corefDir = cfg['corefDir']
    featuresDir = cfg['featuresDir']
    parsesDir = cfg['parsesDir']

    quDir = featuresDir + 'QU/'

    coreNLPParseDir = parsesDir + 'quotes/'

    fileNames = get_file_names(corefDir, '.p')

    for fileName in fileNames:

        # Progress output: one line per processed document.
        print(fileName)
        pFileName = fileName + ".p"

        tokenized = open_dict(tokensDir + pFileName)
        corefs = open_dict(corefDir + pFileName)

        # Some datasets store their annotation files with a '_brat' suffix;
        # the settings flag says which naming scheme applies.
        if cfg['bratExtentionOnCoreNLPParseFiles']:
            annPath = coreNLPParseDir + fileName + '_brat.p'
        else:
            annPath = coreNLPParseDir + pFileName
        ann = open_dict(annPath)

        quFeat = QU_feat(corefs, ann, tokenized)

        np.save(quDir + fileName, quFeat)

345_dracula
345_dracula


#### Run Animacy Classifier

In [7]:
# Run categorise_coref_chains on every document of every settings entry that
# is not flagged 'OrigOnly', and save the returned labels (one array per
# document) under <featuresDir>/animacy_labels_classifier/.
for cfg in settings.values():
    # The entry is called `cfg` (not `set`) so the builtin `set` is not
    # shadowed in the kernel namespace; the settings key is not needed here,
    # so only the values are iterated.

    if cfg['OrigOnly']:
        continue

    tokensDir = cfg['tokensDir']
    corefDir = cfg['corefDir']
    featuresDir = cfg['featuresDir']
    parsesDir = cfg['parsesDir']

    fileNames = get_file_names(corefDir, '.p')
    animacyLabelsDir = featuresDir + 'animacy_labels_classifier/'

    srlParsesDir = parsesDir + 'SRL-bert/'
    nerParsesDir = parsesDir + 'NER-elmo/'

    for fileName in fileNames:

        pFileName = fileName + ".p"

        # Coreference chains are stored under the 'clusters' key.
        corefs = open_dict(corefDir + pFileName)
        corefChains = corefs['clusters']

        nerParsesDict = open_dict(nerParsesDir + pFileName)
        nerParses = nerParsesDict['parses']

        srlParsesDict = open_dict(srlParsesDir + pFileName)
        srlParses = srlParsesDict['parses']

        document = open_dict(tokensDir + pFileName)
        sentences = document['sents']

        labels = categorise_coref_chains(corefChains, nerParses, srlParses, sentences)

        # np.save appends '.npy' to the path.
        np.save(animacyLabelsDir + fileName, labels)



#### Old Features

In [8]:
# Build and save the CoreNLP-based "old" features (dependency, triple and
# named-entity) for every document of every settings entry. Unlike the other
# cells, this one does not look at the 'OrigOnly' flag.
for name, cfg in settings.items():
    # The entry is called `cfg` (not `set`) so the builtin `set` is not
    # shadowed in the kernel namespace for cells that run afterwards.

    corefDir = cfg['corefDir']
    featuresDir = cfg['featuresDir']
    coreNLPParseDir = cfg['parsesDir'] + 'coreNLP/'
    coreNLPNERParseDir = cfg['parsesDir'] + 'coreNLP_ner/'

    fileNames = get_file_names(corefDir, '.p')

    for fileName in fileNames:

        corefs = open_dict(corefDir + fileName + ".p")

        # Some datasets store their annotation files with a '_brat' suffix.
        if cfg['bratExtentionOnCoreNLPParseFiles']:
            annFileName = fileName + '_brat'
        else:
            annFileName = fileName

        # Entries whose settings key contains 'ProppLearner' keep their
        # CoreNLP annotation as a .json file read with open_list; all others
        # use a .p file read with open_dict.
        if 'ProppLearner' in name:
            ann = open_list(coreNLPParseDir + annFileName + ".json")
        else:
            ann = open_dict(coreNLPParseDir + annFileName + ".p")

        ann_ner = open_dict(coreNLPNERParseDir + annFileName + '.p')

        ### dep feature
        depsFeat = dep_feat_coreNLP(ann, corefs)
        np.save(featuresDir + 'OrigDP_coreNLP/' + fileName, depsFeat)

        ### trip feature
        tripFeat = openie_feat_coreNLP(ann, corefs)
        np.save(featuresDir + 'OrigTP_coreNLP/' + fileName, tripFeat)

        ### ne features
        nerFeat = ner_feat_coreNLP(ann_ner, corefs)
        np.save(featuresDir + 'OrigNE_coreNLP/' + fileName, nerFeat)

        # Progress output: one line per processed document.
        print(fileName, 'done')

345_dracula done
345_dracula done


### Testing

In [22]:
# Load the tokenized file for 345_dracula so it can be inspected interactively.
tokenized = open_dict('../data/LitBank/tokenized_shortened/345_dracula.p')

In [23]:
# Number of entries under the 'tokens' key of the `tokenized` object
# currently bound in the kernel.
len(tokenized['tokens'])

2232

In [21]:
# Print the name of every tokenized document holding more than 2500 tokens.
tokenizedDir = '../data/LitBank/tokenized_shortened/'
fileNames = get_file_names(tokenizedDir, '.p')

for fileName in fileNames:
    tokenized = open_dict(tokenizedDir + fileName + '.p')
    exceedsLimit = len(tokenized['tokens']) > 2500
    if exceedsLimit:
        print(fileName)

In [13]:
# Print the cluster list of every coreference file that has fewer than 30
# clusters.
corefsAllenDir = '../data/LitBank/corefs_allen/'
fileNames = get_file_names(corefsAllenDir, '.p')

for fileName in fileNames:
    corefs = open_dict(corefsAllenDir + fileName + '.p')
    clusters = corefs['clusters']
    if len(clusters) >= 30:
        continue
    print(clusters)

[{'mentions': [{'position': [20, 26], 'text': 'the Academy of Music in New York'}, {'position': [86, 89], 'text': 'the sociable old Academy'}, {'position': [93, 93], 'text': 'it'}, {'position': [127, 127], 'text': 'it'}, {'position': [129, 129], 'text': 'its'}, {'position': [137, 137], 'text': 'its'}, {'position': [292, 293], 'text': 'the Academy'}, {'position': [585, 586], 'text': 'the Academy'}], 'name': 'its'}, {'mentions': [{'position': [25, 26], 'text': 'New York'}, {'position': [110, 111], 'text': 'New York'}, {'position': [419, 420], 'text': 'New York'}, {'position': [461, 465], 'text': "Newland Archer 's New York"}, {'position': [1389, 1391], 'text': "New York 's"}, {'position': [1696, 1698], 'text': 'old New York'}, {'position': [1697, 1698], 'text': 'New York'}, {'position': [1741, 1742], 'text': 'New York'}, {'position': [1824, 1825], 'text': 'New York'}], 'name': 'New York'}, {'mentions': [{'position': [13, 14], 'text': 'Christine Nilsson'}, {'position': [157, 159], 'text':

In [5]:
# Display the `corefs` object currently bound in the kernel.
# NOTE(review): nothing in this cell assigns `corefs`, so what is shown
# depends on which earlier cell ran last -- re-check after Restart & Run All.
corefs

{'tokenizedDocument': ['CHAPTER',
  'I',
  'JONATHAN',
  'HARKER',
  "'S",
  'JOURNAL',
  '(',
  '_Kept',
  'in',
  'shorthand._',
  ')'],
 'clusters': []}