In [1]:
import tensorflow as tf
tf.enable_eager_execution()
tf.executing_eagerly()


True

In [2]:
import pickle
import os

In [3]:
class RecordPrep: 
    
    def __init__(self, _vocab_corpus_path, _sense_corpus_path, _tf_records_path):
        self.vocab_corpus_path = _vocab_corpus_path
        self.sense_corpus_path = _sense_corpus_path
        self.tf_records_path = _tf_records_path
    
    def sequence_to_tf_example(self, sequence_1, sequence_2):
            ex = tf.train.SequenceExample()
            # A non-sequential feature of our example
            sequence_length_1 = len(sequence_1) # list of word ids
            sequence_length_2 = len(sequence_2) # list of sense ids
            if sequence_length_1 != sequence_length_2:
                raise Exception("Sequence lengths not equal: %d, %d" % (sequence_length_1, sequence_length_2))
            ex.context.feature["length_1"].int64_list.value.append(sequence_length_1)
            ex.context.feature["length_2"].int64_list.value.append(sequence_length_2)


            # Feature lists for the two sequential features of our example
            fl_tokens_1 = ex.feature_lists.feature_list["vocab_ids"]
            fl_tokens_2 = ex.feature_lists.feature_list["sense_ids"]

            for token in sequence_1:
                fl_tokens_1.feature.add().int64_list.value.append(token)        

            for token in sequence_2:
                fl_tokens_2.feature.add().int64_list.value.append(token)

            return ex
   
    @staticmethod
    def parse(ex):
        '''
        Explain to TF how to go froma  serialized example back to tensors
        :param ex:
        :return:
        '''
        context_features = {
            "length_1": tf.FixedLenFeature([], dtype=tf.int64),
            "length_2": tf.FixedLenFeature([], dtype=tf.int64)
        }
        sequence_features = {
            "vocab_ids": tf.FixedLenSequenceFeature([], dtype=tf.int64),
            "sense_ids": tf.FixedLenSequenceFeature([], dtype=tf.int64),

        }

        # Parse the example (returns a dictionary of tensors)
        context_parsed, sequence_parsed = tf.parse_single_sequence_example(
            serialized=ex,
            context_features=context_features,
            sequence_features=sequence_features
        )
        return {"seq_1": sequence_parsed["vocab_ids"], "length_1": context_parsed["length_1"],
                "seq_2": sequence_parsed["sense_ids"], "length_2": context_parsed["length_2"]}
    
    def import_corpus_pickles(self):
        
        vocab_files = os.listdir(self.vocab_corpus_path)
        sense_files = os.listdir(self.sense_corpus_path)
        
        
        for _file in zip(vocab_files, sense_files):
            if _file[0].split('/')[-1] != _file[1].split('/')[-1]:
                raise Exception("Vocab and sense files do not match \n%s\n%s" % 
                                (_file[0].split('/')[-1], _file[1].split('/')[-1]))
            else:
                vocab_document, sense_document = self.import_document_pickles(self.vocab_corpus_path +'/' + _file[0],
                                                                              self.sense_corpus_path + '/' + _file[1])
                self.serialize_doc(vocab_document, sense_document, _file[0])
                
            
        return vocab_document, sense_document
    
    def import_document_pickles(self, vocab_corpus_path, sense_corpus_path):

        vocab_document = pickle.load(open(vocab_corpus_path, 'rb'))
        sense_document = pickle.load(open(sense_corpus_path, 'rb'))

        return vocab_document, sense_document
    
    
    def serialize_doc(self, vocab_list, sense_list, f_name):
        print(len(vocab_list))
        record_filename = self.tf_records_path + '/' + f_name + '.tfrecord'
        print("Serializing file %s into tfrecord, number of examples: %d" % (f_name, len(vocab_list)))
        
        with open(record_filename, 'w') as f:
            writer = tf.python_io.TFRecordWriter(f.name)
            for sentence in zip(vocab_list, sense_list):
                #print(sentence[0], sentence[1])
                example = self.sequence_to_tf_example(sentence[0], sentence[1])
                writer.write(example.SerializeToString())
    
    def read_dataset(self):
        # Read a tf record file. This makes a dataset of raw TFRecords
        dataset = tf.data.TFRecordDataset(os.listdir(self.tf_records_path))
        # Apply/map the parse function to every record. Now the dataset is a bunch of dictionaries of Tensors
        dataset =  dataset.map(self.parse,num_parallel_calls=5)
        #Shuffle the dataset
        #dataset = dataset.shuffle(buffer_size=10000)
        
        return dataset

        

In [4]:
sense_test_dir = '/Users/daniel/Desktop/Research/WSD_Data/ontonotes-release-5.0/api/test_sense'
vocab_test_dir = '/Users/daniel/Desktop/Research/WSD_Data/ontonotes-release-5.0/api/test_vocab'
tf_test_dir = '/Users/daniel/Desktop/Research/WSD_Data/ontonotes-release-5.0/api/test_tf'
def test_class(test_dir1, test_dir2, test_dir3):
    rprep = RecordPrep(test_dir1, test_dir2, test_dir3)
    rprep.import_corpus_pickles()
    _dset = rprep.read_dataset()
    iterator = tf.data.Iterator.from_structure(_dset.output_types,
                                               _dset.output_shapes)
    training_init_op = iterator.make_initializer(_dset)

    # This is an op that gets the next element from the iterator
    next_element = iterator.get_next()

    return next_element, training_init_op, iterator

In [None]:
n_elem, init_op, iterator = test_class(vocab_test_dir, sense_test_dir, tf_test_dir)

In [5]:
import_documents_pickles('/Users/daniel/Desktop/Research/WSD_Data/ontonotes-release-5.0/api/corpus/ids/pickles/bc#cctv#00#cctv_0000@all@cctv@bc@en@on.pickle')

NameError: name 'import_documents_pickles' is not defined

In [6]:
record_iterator = tf.python_io.tf_record_iterator(path='/Users/daniel/Desktop/Research/WSD_Data/ontonotes-release-5.0/api/test_tf/bc#cctv#00#cctv_0000@all@cctv@bc@en@on.pickle.tfrecord')

In [7]:
i = 0
for string_record in record_iterator:
    example = tf.train.Example()
    i += 1
    example.ParseFromString(string_record)
    
print(i)

214


In [None]:
seq1 = [1,2,3,4,5]
seq2 = [6,7,8,9,10,11]

In [None]:
def serialize_doc(vocab_list, sense_list):
    examples = []
    print(len(vocab_list))
    for sentence in zip(vocab_list, sense_list):
        #print(sentence[0], sentence[1])
        ex = sequence_to_tf_example(sentence[0], sentence[1])
        examples.append(ex)
    print(len(examples))
    return examples
    

In [None]:
vocab_list, sense_list = import_documents_pickles("/Users/daniel/Desktop/Research/WSD_Data/ontonotes-release-5.0/api/corpus/ids/pickles/bc#cctv#00#cctv_0000@all@cctv@bc@en@on.pickle",
                        "/Users/daniel/Desktop/Research/WSD_Data/ontonotes-release-5.0/api/corpus/senses/pickles/bc#cctv#00#cctv_0000@all@cctv@bc@en@on.pickle")

In [None]:
my_ex

In [None]:
sequences = [[1, 2, 3], [4, 5, 1], [1, 2]]
label_sequences = [[0, 1, 0], [1, 0, 0], [1, 1]]
 
def make_example(sequence, labels):
    # The object we return
    ex = tf.train.SequenceExample()
    # A non-sequential feature of our example
    sequence_length = len(sequence)
    ex.context.feature["length"].int64_list.value.append(sequence_length)
    # Feature lists for the two sequential features of our example
    fl_tokens = ex.feature_lists.feature_list["tokens"]
    fl_labels = ex.feature_lists.feature_list["labels"]
    for token, label in zip(sequence, labels):
        nested = fl_tokens.feature.add(ex.feature_lists.feature_list["nested"])
        for token in sequence:
            nested.feature.add().int64_list.value.append(token)
        fl_labels.feature.add().int64_list.value.append(label)
    return ex

In [None]:
exmpls = serialize_doc(vocab_list, sense_list)

In [None]:
path1 = "/Users/daniel/Desktop/Research/WSD_Data/ontonotes-release-5.0/api/corpus/senses/pickles/bc#cctv#00#cctv_0000@all@cctv@bc@en@on.pickle"

In [None]:
path1.split('/')[-1].replace('.pickle', '')

In [30]:

sense_ids = tf.contrib.eager.Variable([[1,2,3], [1,5,0], [1,5,7], [2,3,10], [2,0,0]])
related = tf.contrib.eager.Variable([[0],[0,1],[0,2],[1,0], [1,2,8,4,5],[2,1,5], [3,0]], [1,5,7,9], [4,5] )

ValueError: Can't convert non-rectangular Python sequence to Tensor.

In [26]:
b

<tf.Variable 'Variable:0' shape=(5, 3) dtype=int32, numpy=
array([[ 1,  2,  3],
       [ 1,  5,  0],
       [ 1,  5,  7],
       [ 2,  3, 58],
       [ 2,  0,  0]], dtype=int32)>

In [27]:
e = tf.nn.embedding_lookup(b, [0,2])
print(e)

tf.Tensor(
[[1 2 3]
 [1 5 7]], shape=(2, 3), dtype=int32)


In [28]:
tf.nn.embedding_lookup(b, e[0])

<tf.Tensor: id=65, shape=(3, 3), dtype=int32, numpy=
array([[ 1,  5,  0],
       [ 1,  5,  7],
       [ 2,  3, 58]], dtype=int32)>

In [None]:
import xml.etree.ElementTree as ET

In [None]:
tree = ET.parse('/Users/daniel/Desktop/Research/WSD_Data/ontonotes-release-5.0/data/files/data/english/metadata/sense-inventories/absorb-v.xml')
root = tree.getroot()

In [None]:
for child in root.findall('sense'):
    onto_sense = child.get('n')
    print('onto: ', int(onto_sense))
    for s in child.findall('mappings/wn'):
        if s.text is not None:
            wn_sense = [int(a) for a in s.text.split(',')]
            print(wn_sense)


In [None]:
import os
path = '/Users/daniel/Desktop/Research/WSD_Data/ontonotes-release-5.0/data/files/data/english/metadata/sense-inventories'
files = os.listdir('/Users/daniel/Desktop/Research/WSD_Data/ontonotes-release-5.0/data/files/data/english/metadata/sense-inventories')
for f in files[:10]:
    print("%s/%s" %(path,f))

In [18]:
t = tf.contrib.eager.Variable(tf.ones([2,4]))

In [23]:
t[1] = tf.contrib.eager.Variable(tf.zeros([1,4]))

TypeError: 'ResourceVariable' object does not support item assignment

In [24]:
print(3
     )

3
