In [57]:
import numpy as np

import sys
sys.path.append('../src')

from misc import open_dict, save_dict, get_file_names, get_raw_text, token_map, open_list, save_list
from character_features import SRL_feat, dep_feat, ner_feat, SRL_feat, len_feat, CN_feat, WN_feat, disp_feat, QU_feat, const_feat, get_arg0_positions, get_nsubj_positions, get_PER_positions, get_arg0s_const
from character_features_original import SRL_feat_original, dep_feat_original, ner_feat_original, openie_feat_original, CN_feat_original, WN_feat_original, openie_feat_coreNLP, ner_feat_coreNLP, dep_feat_coreNLP
from animacy import categorise_coref_chains

from settings import settings

### Combine features

In [82]:
def _load_story_features(featureNames, featuresDir, storyNum):
    '''
    Loads every requested feature file for one story and stacks them row-wise.
    Parameters:
        featureNames - list of feature names (each is a sub-directory of featuresDir)
        featuresDir - directory prefix; joined by plain string concatenation, so it must end with '/'
        storyNum - story number used to build the file name 'story<storyNum>.npy'
    Returns:
        np array with the rows of all feature files for this story (np.atleast_2d is applied
        to each file, so a 1-D file contributes exactly one row)
    '''
    npFileName = 'story' + str(storyNum) + '.npy'
    rows = [np.atleast_2d(np.load(featuresDir + name + '/' + npFileName)) for name in featureNames]
    return np.concatenate(rows)


def combine_features(featureNames, featuresDir, corpus = 'ProppLearner', numStories = 46, skipStories = (34,)):
    '''
    Combines specified features into a single array, which can be used by an SVM
    Parameters:
        featureNames - list containing feature names (i.e. feature FILE names) to be combined
        featuresDir - directory where feature files will be found (must end with '/', the
                      path is built as featuresDir + featureName + '/story<k>.npy')
        corpus - corpus identifier; only 'ProppLearner' is implemented
        numStories - stories 1..numStories are loaded (default 46, the previously hard-coded value)
        skipStories - story numbers to leave out (default (34,), the previously hard-coded skip)
    Returns:
        featuresCombined - np array; features are stacked as rows per story, stories are
                           joined along the column axis, and the result is transposed, so
                           with one 1-D file per feature the shape is
                           (total entries over all stories, len(featureNames))
    Raises:
        ValueError - if corpus is not supported, featureNames is empty, or no story is left to load
    '''

    # Previously an unknown corpus only printed a message and then failed on an
    # unbound local at the return statement; fail early with a clear error instead.
    if corpus != 'ProppLearner':
        raise ValueError("combine_features: unsupported corpus %r (only 'ProppLearner' is implemented)" % (corpus,))

    if len(featureNames) == 0:
        raise ValueError('combine_features: featureNames must contain at least one feature name')

    skipped = set(skipStories)

    # Collect one block per story and concatenate once at the end, instead of
    # re-allocating the growing array for every story.
    perStory = [
        _load_story_features(featureNames, featuresDir, storyNum)
        for storyNum in range(1, numStories + 1)
        if storyNum not in skipped
    ]

    if not perStory:
        raise ValueError('combine_features: no stories left to load')

    return np.concatenate(perStory, axis = 1).transpose()

In [3]:
# Experiment configurations keyed by experiment name. Each entry lists the feature
# (sub-directory) names to combine, the directory that holds them, and the results file.
# NOTE(review): 'ProppLearner_from_gold' currently carries exactly the same features_dir
# and results_file_path as 'ProppLearner_from_allen' - this looks like a copy-paste
# leftover; confirm the intended gold-coref paths before relying on this entry.
experiments = {
    'ProppLearner_from_allen': dict(
        feature_names=['SS', 'DP', 'CN', 'CD', 'CL', 'NE', 'WN', 'TR', 'animacy_labels_scraped', 'character_labels_scraped'],
        features_dir="../intermediate/ProppLearner/from_allenNLP_corefs/",
        results_file_path="../results/ProppLearner/from_allenNLP_corefs.txt",
    ),
    'ProppLearner_from_gold': dict(
        feature_names=['SS', 'DP', 'CN', 'CD', 'CL', 'NE', 'WN', 'TR', 'animacy_labels_scraped', 'character_labels_scraped'],
        features_dir="../intermediate/ProppLearner/from_allenNLP_corefs/",
        results_file_path="../results/ProppLearner/from_allenNLP_corefs.txt",
    ),
}

In [6]:
# Select the allenNLP-coref experiment and unpack the two settings the
# feature-combination cells below read (featureNames, featuresDir).
exp = experiments['ProppLearner_from_allen']
featureNames, featuresDir = exp['feature_names'], exp['features_dir']


In [69]:

# Scratch walk-through of the first step of combine_features: same constants and the
# same path construction, using featureNames / featuresDir from the experiment cell.
n = 46
offset = 0

# the first feature from story 1 will start the array
npFileName = 'story' + str(1) + '.npy'
featuresAll = np.atleast_2d(np.load(featuresDir + featureNames[0] + '/' + npFileName))


In [70]:
# Sanity check: after loading only the first feature of story 1 (recorded output: (1, 20)).
featuresAll.shape

(1, 20)

In [71]:

# iterate through remaining features in story 1 and add to array
# (np.concatenate defaults to axis 0, so every feature file adds rows to featuresAll;
# npFileName is still the story-1 file name set in the previous cell)
for i in range(1, len(featureNames)):
    feature = np.atleast_2d(np.load(featuresDir + featureNames[i] + '/' + npFileName))
    featuresAll = np.concatenate((featuresAll, feature))


In [72]:
# Sanity check: one row per feature name for story 1 (recorded output: (10, 20)).
featuresAll.shape

(10, 20)

In [51]:
# Scratch check for a single story (story 4): stack all of its feature files row-wise
# into featuresThisStory without touching featuresAll.
npFileName = 'story' + str(4) + '.npy'

featuresThisStory = np.atleast_2d(np.load(featuresDir + featureNames[0] + '/' + npFileName))

for i in range(1, len(featureNames)):
    feature = np.atleast_2d(np.load(featuresDir + featureNames[i] + '/' + npFileName))
    featuresThisStory = np.concatenate((featuresThisStory, feature))

# Disabled experiment: joining along axis 0 (rows) instead of axis 1 as the loop cell does.
# print(featuresThisStory.shape)
# featuresAll = np.concatenate((featuresAll, featuresThisStory), axis = 0)

In [81]:

# iterate through all remaining features in all remaining stories and add to array
# NOTE: this cell extends featuresAll in place, so running it a second time appends
# every story again; rebuild featuresAll with the story-1 cells before re-running.
# NOTE(review): the recorded output lists per-story shapes, but no print call is present
# here - presumably left over from an earlier version of the cell.
for storyNum in range(2 + offset,(n+1 + offset)):

    # story 34 is left out, the same exclusion combine_features applies
    if storyNum == 34:
        continue

    npFileName = 'story' + str(storyNum) + '.npy'

    featuresThisStory = np.atleast_2d(np.load(featuresDir + featureNames[0] + '/' + npFileName))

    # stack this story's remaining feature files as additional rows
    for i in range(1, len(featureNames)):
        feature = np.atleast_2d(np.load(featuresDir + featureNames[i] + '/' + npFileName))
        featuresThisStory = np.concatenate((featuresThisStory, feature))

    # stories are joined along the column axis
    featuresAll = np.concatenate((featuresAll, featuresThisStory), axis = 1)

    # np.save(outputDir + "story" + str(storyNum), features)


(10, 20)
(10, 32)
(10, 34)
(10, 29)
(10, 33)
(10, 34)
(10, 36)
(10, 36)
(10, 23)
(10, 40)
(10, 32)
(10, 47)
(10, 36)
(10, 55)
(10, 36)
(10, 29)
(10, 25)
(10, 44)
(10, 22)
(10, 34)
(10, 28)
(10, 37)
(10, 35)
(10, 40)
(10, 43)
(10, 59)
(10, 64)
(10, 32)
(10, 24)
(10, 50)
(10, 56)
(10, 65)
(10, 112)
(10, 68)
(10, 94)
(10, 95)
(10, 83)
(10, 102)
(10, 67)
(10, 64)
(10, 86)
(10, 48)
(10, 33)
(10, 49)


In [12]:
# NOTE(review): `features` is not assigned in any cell shown here (it only appears in a
# commented-out np.save call), so this cell relies on leftover kernel state and would
# presumably fail after Restart & Run All - confirm or remove.
features.shape

(10, 49)

In [83]:
# Build the combined feature matrix with the function version of the scratch cells above.
featuresAllStories = combine_features(featureNames, featuresDir)


In [84]:
# Sanity check of the transposed result (recorded output: (2131, 10), i.e. 10 columns
# for the 10 feature names).
featuresAllStories.shape

(2131, 10)

### coref chains


In [9]:
# Build a quick human-readable view of the gold coreference chains: for every '.p' file
# in corefsGoldDir, one string per chain made of the chain's mention texts, each
# followed by four spaces.
corefsGoldDir = '../data/ProppLearner/corefs_gold_new_format/'
fileNames = get_file_names(corefsGoldDir, '.p')  # presumably returns names without the extension, since '.p' is re-appended below - confirm


# maps file name -> list of flattened chain strings
goldCorefs = { }
for fileName in fileNames:

    corefs = open_dict(corefsGoldDir + fileName + '.p')
    simplified = []

    for chain in corefs['clusters']:
        refExps = ''
        for mention in chain['mentions']:
            refExps += mention['text']
            refExps += '    '
        simplified.append(refExps)


    goldCorefs[fileName] = simplified


In [10]:
# Inspect the flattened gold chains of story 1.
goldCorefs['story1']

['A dragon    he    he    the dragon    He    his    he    he    he    dragon    the dragon    the dragon    him    he    he    he    the wicked dragon    the dragon    the dragon    the dragon    him    dragon    the dragon    himself    your    you    The dragon    he    him    him    the dragon    me    I    I    the dragon    the dragon    he    the dragon    you    your    The dragon    him    him    ',
 'Kiev    the city of Kiev    Kiev    his land    Kiev    Kiev    ',
 'heavy tribute from the people - a lovely maiden from every house , whom he then devoured    ',
 'the people    ',
 'a lovely maiden from every house , whom he then devoured    ',
 'every house    ',
 "it    the fate of the tsar 's daughter to go to the dragon    ",
 "the tsar 's daughter    her    her    her    she    a beauty    her    the princess    The princess    her    The princess    her    She    her    their daughter    her    The princess    the princess    she    her    her    the princess    ",
 'tsa

### features

In [58]:
# Input locations for the feature cells below: tokenised stories, allenNLP coref
# chains, and one sub-directory of parsesDir per parser output.
tokensDir = '../data/ProppLearner/tokenized/'
corefDir = '../data/ProppLearner/corefs_allen/'
parsesDir = '../intermediate/ProppLearner/parses/'

srlParsesDir = parsesDir + 'SRL-bert/'
depParsesDir = parsesDir + 'deps-biaffine/'
nerParsesDir = parsesDir + 'NER-elmo/'
openParsesDir = parsesDir + 'openIE/'
constParsesDir = parsesDir + 'constituency/'


In [59]:
# Load everything needed for story 1: the tokenised document, its coref chains, and
# the 'parses' entry of each parser output file.
fileName = 'story1'
pFileName = 'story1.p'

document = open_dict(tokensDir + pFileName)
corefs = open_dict(corefDir + pFileName)
sentences = document['sents']
tokens = document['tokens']

srlParses = open_dict(srlParsesDir + pFileName)['parses']
depParses = open_dict(depParsesDir + pFileName)['parses']
nerParses = open_dict(nerParsesDir + pFileName)['parses']
openParses = open_dict(openParsesDir + pFileName)['parses']
constParses = open_dict(constParsesDir + pFileName)['parses']



#### NER and OpenIE features

In [60]:
# NER feature for the story currently loaded (nerParses, sentences, corefs come from the cell above).
nerFeat = ner_feat(nerParses, sentences, corefs)

In [61]:
# Reload the working state for story 7, overwriting the story-1 variables above.
pFileName = 'story7.p'
fileName = 'story7'

document = open_dict(tokensDir + pFileName)
corefs = open_dict(corefDir + pFileName)
sentences = document['sents']
tokens = document['tokens']

# NOTE(review): unlike the story-1 cell, nerParses / depParses / srlParses keep the whole
# loaded object here (no ['parses']); presumably the *_original / *_coreNLP helpers
# expect that form - confirm before passing them to ner_feat / dep_feat / SRL_feat.
nerParses = open_dict(nerParsesDir + pFileName)
openieParses = open_dict(openParsesDir + pFileName)['parses']
depParses = open_dict(depParsesDir + pFileName)
srlParses = open_dict(srlParsesDir + pFileName)
annNER = open_dict('../intermediate/ProppLearner/parses/coreNLP_ner/' + pFileName)
# same path as the corefs load a few lines up, so this second load is redundant
corefs = open_dict(corefDir + pFileName)
ann = open_list('../intermediate/ProppLearner/parses/coreNLP/' + fileName + '.json')

In [62]:
from character_features_original import ner_feat_coreNLP, get_PER_list_core, get_people, ner_feat_original, get_dependency_subjs, get_chain_head, dep_feat_original, get_dependencies_list_core, dep_feat_coreNLP, get_triple_subjects, openie_feat_original, openie_feat_coreNLP, get_triple_subjects_core

In [11]:
# Inspect the subjects found by the original helper; the recorded output shows
# [int, int, subject text] entries (presumably sentence index and verb index - confirm).
get_triple_subjects(openieParses)

[[0, 0, 'This soldier'],
 [0, 1, 'This soldier'],
 [3, 1, 'His belly'],
 [7, 0, 'the hardtack the soldier'],
 [9, 0, 'the soldier'],
 [9, 1, 'the soldier'],
 [10, 0, 'the Unclean One with the appearance of an old man'],
 [13, 0, 'The devil'],
 [13, 1, 'the soldier'],
 [14, 1, "'s do a trade"],
 [14, 2, "'s"],
 [19, 1, 'whoever'],
 [19, 3, 'whoever looks at it'],
 [19, 4, 'whoever looks at it'],
 [21, 0, 'The soldier'],
 [21, 1, 'The soldier'],
 [23, 0, 'The Unclean One'],
 [23, 1, 'The Unclean One'],
 [23, 2, 'The Unclean One'],
 [23, 4, 'things'],
 [25, 5, 'this fiddle'],
 [27, 0, 'the soldier'],
 [29, 1, 'The soldier'],
 [31, 0, 'the Unclean One'],
 [35, 0, 'The soldier'],
 [35, 1, 'The soldier'],
 [35, 5, 'this unknown old man'],
 [37, 0, 'The devil'],
 [37, 1, 'a troika of fine horses'],
 [38, 0, 'in'],
 [40, 0, 'The soldier'],
 [40, 3, 'by his eyes'],
 [48, 0, 'The soldier'],
 [48, 1, 'The soldier'],
 [48, 2, 'they all'],
 [48, 3, 'they all'],
 [48, 6, 'leave'],
 [48, 7, 'that he 

In [48]:
# Debugging prototype of get_triple_subject_positions: same logic, plus prints of each
# verb description, the chosen subject tag and the subject tokens.
positions = []
offset = 0  # number of tokens in the sentences already processed (document-level index shift)
for i, parse in enumerate(openieParses):

    for j, verb in enumerate(parse['verbs']):

        print(verb['description'])

        # collect the distinct role labels of this verb; split('-')[-1] keeps only the
        # last component of each tag (e.g. 'B-ARG0' -> 'ARG0', 'B-ARGM-LOC' -> 'LOC')
        tags = []
        for tag in verb['tags']:
            tags.append(tag.split('-')[-1])
            tags = list(set(tags))

        # drop the 'O' (outside) label; note this loop and the one below reuse the name j
        for j in range(len(tags)-1, -1, -1):
            if tags[j] == 'O':
                del tags[j]
        
        # fewer than three distinct roles: not treated as a triple
        if len(tags) < 3:
            continue

        # the lowest-numbered ARG0..ARG4 present is taken as the subject role
        # NOTE(review): if none of them is present, subjTag keeps its value from a previous
        # verb (or is undefined on the first one), so the print below can be stale or fail.
        for j in range(0,5):
            if 'ARG' + str(j) in tags:
                subjTag = 'ARG' + str(j)
                break
        print(subjTag)

        for k, tag in enumerate(verb['tags']):
            if tag == 'B-'+subjTag:

                count = 1

                # extend over the following 'I-' tags of the same role
                while k + count < len(verb['tags']):
                    if verb['tags'][k + count] != 'I-' + subjTag:
                        break
                    count += 1

                # inclusive [start, end] token positions, shifted by offset
                positions.append([k + offset, k + offset + count -1])
        
                for pos in range(k + offset, k + offset + count):
                    print(document['tokens'][pos] + ' ', end='')

                # only the first span of the subject role is used
                break
        print()

    offset += len(sentences[i]['tokens'])






[ARG0: This soldier] [V: got] [ARG1: permission to go on leave] .
ARG0
This soldier 
[ARG0: This soldier] got permission to [V: go] [ARG2: on leave] .
ARG0
This soldier 
[ARG1: He] [V: got] [ARG2: ready] and set off along the way .
ARG1
He 
[ARG1: He] got ready and [V: set] off [ARGM-LOC: along the way] .
ARG1
He 
[ARG0: He] [V: walked] and walked , but nowhere did he see any water and he wanted to wet his hardtack and eat a little along the way and road .
[ARG0: He] walked and [V: walked] , but nowhere did he see any water and he wanted to wet his hardtack and eat a little along the way and road .
He walked and walked , but nowhere [V: did] he see any water and he wanted to wet his hardtack and eat a little along the way and road .
He walked and walked , but nowhere did [ARG0: he] [V: see] [ARG1: any water] and he wanted to wet his hardtack and eat a little along the way and road .
ARG0
he 
He walked and walked , but nowhere did he see any water and [ARG0: he] [V: wanted] [ARG1: to we

In [51]:

def get_triple_subject_positions(openieParses, sentences):
    '''
    Finds the token span of the subject of every OpenIE triple.
    A verb counts as a triple when its tags contain at least three distinct role labels
    (ignoring 'O'); its subject is the first span of the lowest-numbered role among
    ARG0..ARG4 that is present.
    Parameters:
        openieParses - list with one parse per sentence; each parse has a 'verbs' list whose
                       items carry a 'tags' list of BIO tags (one per token of the sentence)
        sentences - list of sentences, each with a 'tokens' list; used to turn
                    sentence-level token indices into document-level ones
    Returns:
        positions - list of [start, end] document-level token positions (end inclusive),
                    one per triple whose subject span was found
    '''

    positions = []
    offset = 0  # tokens in all sentences before the current one

    for i, parse in enumerate(openieParses):

        for verb in parse['verbs']:

            verbTags = verb['tags']

            # distinct role labels; split('-')[-1] keeps only the last component of a tag,
            # e.g. 'B-ARG0' -> 'ARG0' and 'B-ARGM-LOC' -> 'LOC'
            roles = {tag.split('-')[-1] for tag in verbTags}
            roles.discard('O')

            if len(roles) < 3:
                continue

            subjTag = None
            for argNum in range(0, 5):
                if 'ARG' + str(argNum) in roles:
                    subjTag = 'ARG' + str(argNum)
                    break

            # A verb can have three roles and still no ARG0..ARG4 (e.g. V plus two ARGM
            # modifiers). Previously subjTag was then unbound (first verb) or left over
            # from an earlier verb; such verbs have no subject span, so skip them.
            if subjTag is None:
                continue

            beginTag = 'B-' + subjTag
            insideTag = 'I-' + subjTag

            # the role may only occur with an extra prefix (e.g. 'B-R-ARG0'); no plain span then
            if beginTag not in verbTags:
                continue

            # only the first span of the subject role is used
            start = verbTags.index(beginTag)
            end = start
            while end + 1 < len(verbTags) and verbTags[end + 1] == insideTag:
                end += 1

            positions.append([start + offset, end + offset])

        offset += len(sentences[i]['tokens'])

    return positions

In [55]:
# Inspect the [start, end] subject spans found for the currently loaded story.
get_triple_subject_positions(openieParses, sentences)

[[0, 1],
 [0, 1],
 [9, 9],
 [9, 9],
 [27, 27],
 [32, 32],
 [32, 32],
 [32, 32],
 [48, 49],
 [64, 64],
 [73, 73],
 [73, 73],
 [81, 81],
 [81, 81],
 [81, 81],
 [103, 106],
 [115, 115],
 [115, 115],
 [126, 127],
 [126, 127],
 [142, 151],
 [181, 182],
 [188, 189],
 [206, 209],
 [206, 206],
 [211, 211],
 [218, 218],
 [233, 233],
 [238, 238],
 [247, 247],
 [257, 257],
 [265, 265],
 [279, 279],
 [286, 286],
 [286, 289],
 [286, 289],
 [301, 301],
 [305, 305],
 [310, 311],
 [310, 311],
 [323, 323],
 [332, 332],
 [332, 332],
 [342, 344],
 [342, 344],
 [342, 344],
 [360, 360],
 [364, 364],
 [368, 368],
 [380, 380],
 [388, 388],
 [399, 399],
 [405, 406],
 [408, 408],
 [422, 423],
 [427, 427],
 [437, 437],
 [452, 452],
 [452, 452],
 [457, 457],
 [463, 463],
 [463, 463],
 [477, 477],
 [477, 477],
 [485, 486],
 [492, 492],
 [497, 497],
 [506, 506],
 [506, 506],
 [520, 522],
 [525, 525],
 [530, 530],
 [540, 540],
 [540, 540],
 [567, 567],
 [574, 575],
 [574, 575],
 [583, 583],
 [583, 583],
 [583, 583]

In [63]:
def openie_feat(openieParses, sentences, corefChains):
    '''
    OpenIE feature: for each coreference chain, the number of its mentions whose
    position equals the position of some triple subject, z-score normalised over all chains.
    Parameters:
        openieParses - OpenIE parses, passed on to get_triple_subject_positions
        sentences - sentences of the document, passed on to get_triple_subject_positions
        corefChains - dict with a 'clusters' list; every cluster has a 'mentions' list whose
                      items carry a 'position' compared (with ==) against the subject spans
    Returns:
        np array with one value per cluster; all zeros when the counts have zero
        standard deviation
    '''
    subjectSpans = get_triple_subject_positions(openieParses, sentences)
    clusters = corefChains['clusters']

    # per chain: how many mentions coincide with at least one triple subject span
    counts = np.array(
        [
            sum(
                1
                for mention in cluster['mentions']
                if any(mention['position'] == span for span in subjectSpans)
            )
            for cluster in clusters
        ],
        dtype=float,
    )

    spread = np.std(counts)
    if spread == 0:
        # no variation between chains: normalising would divide by zero
        return np.zeros(len(clusters))

    return (counts - np.mean(counts)) / spread

In [64]:
# OpenIE feature for the currently loaded story (openieParses, sentences, corefs from the loading cell).
feat = openie_feat(openieParses, sentences, corefs)

In [65]:
# Inspect the normalised feature values, one per coref chain.
feat

array([ 5.30960016, -0.29680995, -0.29680995, -0.24074585,  1.94575409,
       -0.18468174, -0.12861764, -0.29680995, -0.29680995, -0.29680995,
       -0.18468174, -0.24074585, -0.18468174, -0.24074585, -0.12861764,
       -0.01648944, -0.29680995, -0.29680995, -0.29680995,  0.26383106,
       -0.24074585, -0.24074585, -0.29680995, -0.29680995, -0.24074585,
       -0.24074585, -0.24074585, -0.24074585, -0.18468174, -0.29680995,
       -0.29680995, -0.29680995, -0.24074585, -0.24074585])