In [1]:
### This notebook contains rough code to analyse the makeup of different datasets

In [1]:
import numpy as np
import sys
sys.path.append('../src')

from SVM_functions import param_selection, train_and_evaluate_model, combine_features
from misc import save_dict, get_file_names, open_dict
from eval_functions import get_ref_exps_from_coref_dict
from settings import settings

#### Get num coref chains & num labelled 'character' in each dataset

In [7]:
# For every experiment configuration, report how many coreference chains it
# has and how many of those carry the 'character' label.
for expName, expSettings in settings.items():

    featuresDir = expSettings['featuresDir']
    fileNames = get_file_names(expSettings['corefDir'])
    labelDirs = [expSettings['character labels dir extention']]

    # After the transpose the first axis is what gets counted as chains below.
    y = combine_features(featuresDir, labelDirs, fileNames).transpose()

    numChains = int(y.shape[0])
    numCharacters = int(np.sum(y))

    print(expName)
    print('num coref chains:', numChains)
    print('num characters:', numCharacters)
    print('frac characters:', round(numCharacters / numChains, 2))
    print()


ProppLearner_from_gold
num coref chains: 1633
num characters: 123
frac characters: 0.08

ProppLearner_from_allen
num coref chains: 2266
num characters: 702
frac characters: 0.31

ProppLearner_from_heads_only
num coref chains: 1912
num characters: 564
frac characters: 0.29

LitBank_from_gold
num coref chains: 2849
num characters: 66
frac characters: 0.02

LitBank_from_allen
num coref chains: 1348
num characters: 48
frac characters: 0.04

CEN_from_allen
num coref chains: 1900
num characters: 185
frac characters: 0.1

CEN_from_heads_only
num coref chains: 17251
num characters: 436
frac characters: 0.03



### Get num tokens in each text

In [2]:
# Dataset name -> sub-folder (under ../data/<dataset>/) holding its tokenised texts.
datasets = {'CEN':'tokenized', 'ProppLearner':'tokenized', 'LitBank':'tokenized_shortened'}

# One list of per-text token counts per dataset, in the iteration order of `datasets`.
lens = []

for dataset, folderName in datasets.items():
    tokensDir = "../data/" + dataset + '/' + folderName + '/'
    fileNames = get_file_names(tokensDir, '.p')

    # Number of entries under the 'tokens' key of each loaded file.
    lengths = [
        len(open_dict(tokensDir + fileName + '.p')['tokens'])
        for fileName in fileNames
    ]

    # Both counts are printed so a mismatch between files and lengths is visible.
    print(len(fileNames))
    print(len(lengths))
    lens.append(lengths)

    



30
30
46
46
38
38


In [4]:
# Average token count per text, one line of output per entry of `lens`.
for datasetLengths in lens:
    totalTokens = sum(datasetLengths)
    numTexts = len(datasetLengths)
    print(totalTokens / numTexts)

4414.066666666667
2370.021739130435
2076.684210526316


In [25]:
# Token-count threshold used to split the texts into two groups.
x = 6300

gt = 0  # texts with at least `x` tokens
lt = 0  # texts with fewer than `x` tokens
for lengths in lens:
    # Tally this dataset in one pass; everything not >= x counts as "less".
    numAtLeast = sum(1 for l in lengths if l >= x)
    gt += numAtLeast
    lt += len(lengths) - numAtLeast

print('greater',gt)
print('less',lt)

greater 7
less 107


In [None]:
# Mean number of tokens per text for the first entry of `lens`.
# len() requires its argument: a bare len() call always raises TypeError.
print(sum(lens[0]) / len(lens[0]))

In [16]:
# How many token counts the current `lengths` list holds.
# NOTE(review): the recorded output of this cell is
# "TypeError: 'int' object is not callable", which suggests `len` (or `print`)
# had been rebound to an int in that kernel session -- confirm by re-running
# on a fresh kernel.
numLengths = len(lengths)
print(numLengths)

TypeError: 'int' object is not callable

In [21]:
### ProppLearner gold: token counts restricted to files numbered 1-15
datasets = {'ProppLearner':'tokenized'}

# One list of per-text token counts per dataset (a single dataset here).
lens = []

for dataset, folderName in datasets.items():
    tokensDir = "../data/" + dataset + '/' + folderName + '/'
    fileNames = get_file_names(tokensDir, '.p')

    lengths = []
    for fileName in fileNames:
        # The number follows a 5-character prefix (presumably 'story' -- confirm
        # against the data folder); only numbers 1..15 are kept.
        storyNum = int(fileName[5:])
        if 1 <= storyNum <= 15:
            tokenized = open_dict(tokensDir + fileName + '.p')
            lengths.append(len(tokenized['tokens']))

    # Files found vs. files actually kept after the filter.
    print(len(fileNames))
    print(len(lengths))
    lens.append(lengths)

    


46
15


In [22]:
# Display the current `lengths` list (the per-file token counts most recently
# collected by the counting loop).
lengths

[754,
 932,
 1499,
 1591,
 2187,
 2169,
 2288,
 866,
 1121,
 1227,
 1522,
 1501,
 2128,
 1679,
 1818]

In [None]:
# Placeholder cell: only emits an empty line.
print()

### Of animate, how many are character and how many are not character? What's the breakdown for inanimate? (ProppLearner gold)

In [9]:
# Root folder of the features derived from the gold ProppLearner corefs.
featuresDir = '../intermediate/ProppLearner/from_gold_corefs/'

# The files present in the animacy-label folder define which stories are loaded.
fileNames = get_file_names(featuresDir + 'animacy_labels_gold/', '.npy')

# Load both label sets for the same files, animacy first, then character.
animLabels, charLabels = (
    combine_features(featuresDir, [labelDir], fileNames)
    for labelDir in ('animacy_labels_gold', 'character_labels_gold')
)

In [14]:
# 2x2 contingency counts of animacy label vs. character label.
animChar = animNotChar = 0
notAnimChar = notAnimNotChar = 0

# Only the first row of each label array is compared, element by element.
for animLabel, charLabel in zip(animLabels[0], charLabels[0]):
    isAnimate = animLabel == 1.
    isCharacter = charLabel == 1.

    if isAnimate and isCharacter:
        animChar += 1
    elif isAnimate:
        animNotChar += 1
    elif isCharacter:
        notAnimChar += 1
    else:
        notAnimNotChar += 1
     

In [16]:
# Animate row: character, not character, row total.
numAnimate = animChar + animNotChar
print(animChar, animNotChar, numAnimate)

# Inanimate row: character, not character.
print(notAnimChar, notAnimNotChar)

123 207 330
0 1303


### Quality of AllenNLP coref chains, qualitative analysis

corefs gold new format

vs

corefs allennlp (first 15 stories)

In [47]:
# The three lists below are parallel: index 0 is the gold-coref variant,
# index 1 the AllenNLP variant.

# Input coreference dictionaries.
corefsDir = [
    '../data/ProppLearner/corefs_gold_new_format/',
    '../data/ProppLearner/corefs_allen/',
]

# Per-chain character labels matching each coref source.
featuresDir = [
    '../intermediate/ProppLearner/from_gold_corefs/character_labels_gold/',
    '../intermediate/ProppLearner/from_allenNLP_corefs/character_labels_scraped/',
]

# Stories to process are taken from the gold coref folder.
fileNames = get_file_names(corefsDir[0], '.p')

# Output folders for the human-readable text dumps.
HRDir = [
    '../data/ProppLearner/HR_corefs_gold_new_format/',
    '../data/ProppLearner/HR_corefs_allen/',
]

In [48]:
# Write a human-readable dump of the coreference chains of every story, once
# for each coref source, so the two can be compared side by side.
for fileName in fileNames:

    # i indexes the parallel lists corefsDir / featuresDir / HRDir
    # (0 = gold corefs, 1 = AllenNLP corefs).
    for i in range(2):

        corefs = open_dict(corefsDir[i] + fileName + '.p')
        charLabels = np.load(featuresDir[i] + fileName + '.npy')

        refs = get_ref_exps_from_coref_dict(corefs)

        # One output line per chain: "<character label> | <expr> | <expr> ...".
        lines = []
        for refNum, ref in enumerate(refs):
            expressions = ''.join(' | ' + str(r) for r in ref)
            lines.append(str(charLabels[refNum]) + expressions + '\n')
        stringToSave = ''.join(lines)

        # The context manager closes the file even if the write raises.
        with open(HRDir[i] + fileName + '.txt', "w") as text_file:
            text_file.write(stringToSave)


In [49]:
    # Regenerate the human-readable coref dump for one story only, using
    # index 1 of the parallel lists corefsDir / featuresDir / HRDir
    # (the AllenNLP variant).
    # NOTE(review): this whole cell is indented one level; it was recorded as
    # executed (In [49]), so the notebook evidently accepts it -- kept as-is.
    for i in [1]:

        fileName = 'story21'

        corefs = open_dict(corefsDir[i] + fileName + '.p')
        charLabels = np.load(featuresDir[i] + fileName + '.npy')

        refs = get_ref_exps_from_coref_dict(corefs)

        # One output line per chain: "<character label> | <expr> | <expr> ...".
        lines = []
        for refNum, ref in enumerate(refs):
            expressions = ''.join(' | ' + str(r) for r in ref)
            lines.append(str(charLabels[refNum]) + expressions + '\n')
        stringToSave = ''.join(lines)

        # The context manager closes the file even if the write raises.
        with open(HRDir[i] + fileName + '.txt', "w") as text_file:
            text_file.write(stringToSave)