In [1]:
import sys
sys.path.insert(0,'../models/')
sys.path.insert(0,'../datasets/')
sys.path.insert(0,'..')

import pandas as pd
import numpy as np
import json
import tensorflow as tf
from subprocess import Popen, PIPE, STDOUT
import re

from models import PropbankEncoder
import config

INPUT_DIR = '../datasets/binaries/'
PROPBANK_GLO50_PATH = '{:}deep_glo50.pickle'.format(INPUT_DIR)

<h1><center>Structured Predictions Network CWIS SRL (BR)</center></h1>

<center>In this notebook we solve the semantic role labeling task using structured predictions networks.</center>

## 1. Builds a "human friendly" version of the dataset

In [2]:
dfgs = pd.read_csv('../datasets/csvs/gs.csv', index_col=0, sep=',', encoding='utf-8')
column_files = [
    '../datasets/csvs/column_chunks/chunks.csv',
    '../datasets/csvs/column_predmarker/predicate_marker.csv',
    '../datasets/csvs/column_shifts_ctx_p/form.csv',
    '../datasets/csvs/column_shifts_ctx_p/gpos.csv',
    '../datasets/csvs/column_shifts_ctx_p/lemma.csv',
    '../datasets/csvs/column_t/t.csv',
    '../datasets/csvs/column_iob/iob.csv'
]

for col_f in column_files:
    _df = pd.read_csv(col_f, index_col=0, encoding='utf-8')
    dfgs = pd.concat((dfgs, _df), axis=1)

DISPLAY_COLUMNS = ['ID', 'P', 'FORM', 'ARG', 'T', 
                   'CHUNK_ID', 'CHUNK_START', 'CHUNK_FINISH', 'CHUNK_LEN', 'CHUNK_CANDIDATE_ID']            
dfgs[DISPLAY_COLUMNS].head(33)    

Unnamed: 0_level_0,ID,P,FORM,ARG,T,CHUNK_ID,CHUNK_START,CHUNK_FINISH,CHUNK_LEN,CHUNK_CANDIDATE_ID
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1,1,Brasília,*,*,1,0,1,1,0
1,2,1,Pesquisa_Datafolha,(A0*,A0,2,1,4,3,35
2,3,1,publicada,*,A0,2,1,4,3,35
3,4,1,hoje,*),A0,2,1,4,3,35
4,5,1,revela,(V*),V,3,4,5,1,126
5,6,1,um,(A1*,A1,4,5,32,27,181
6,7,1,dado,*,A1,4,5,32,27,181
7,8,1,supreendente,*,A1,4,5,32,27,181
8,9,1,:,*,A1,4,5,32,27,181
9,10,1,recusando,*,A1,4,5,32,27,181


## 2. Gets encodings

Propbank Encoder holds an indexed version of propbank dataset an answers to FOUR different dataformats: 
* CAT: this is the raw categorical data.
* EMB: tokens are embedding using GloVe embeddings.
* HOT: onehot encoding of the words and tokens.
* IDX: dense indexed representations.

In [3]:
# LOAD ENCODER
propbank_encoder = PropbankEncoder.recover(PROPBANK_GLO50_PATH)
db = propbank_encoder.db
lex2idx = propbank_encoder.lex2idx
idx2lex = propbank_encoder.idx2lex

# FOR TEXTUAL DATA ONLY

lex2tok = propbank_encoder.lex2tok
tok2idx = propbank_encoder.tok2idx
embeddings = propbank_encoder.embeddings

In [4]:
print('attributes\t',
       len(db),
      '\n',             
      'records\t',
       len(db['ARG'].keys()))

attributes	 44 
 records	 141730


In [5]:
def filter_type(ds_type, db):
    '''Filters only records from train dataset
    '''
    ds_types = ('train', 'test', 'valid')
    if ds_type not in ds_types:
        _msg = 'ds_type must be in {:} got {:}'
        _msg = _msg.format(ds_types, ds_type)
        raise ValueError(_msg)
    elif ds_type in ('train',):
        lb = 0 
        ub = config.DATASET_TRAIN_SIZE
    elif ds_type in ('test',):        
        lb = config.DATASET_TRAIN_SIZE
        ub = lb + config.DATASET_VALID_SIZE         
    elif ds_type in ('valid',):                
        lb = config.DATASET_TRAIN_SIZE + config.DATASET_VALID_SIZE
        ub = lb + config.DATASET_TEST_SIZE         

    sel_keys_ = {key_ for key_, prop_ in db['P'].items() if prop_ > lb and prop_ <= ub}

    return {
                attr_:{ idx_: i_
                        for idx_, i_ in dict_.items() if idx_ in sel_keys_
                      }        
                for attr_, dict_  in db.items()
            }

def reindex_prop(db):
    '''Reindex db by propositions creating a nested dict in which the
        outer key is the proposition
        
        Converts the inner dicts into numpy arrays
    '''
    
    triple_list = []
    prev_prop = -1
    for idx, prop in db['P'].items():
        if prev_prop != prop:
            if idx > 0:
                ub = idx-1
                triple_list.append((lb, ub, prev_prop))
            lb = idx
        prev_prop = prop
    triple_list.append((lb, ub, prev_prop))
            

        
    prop_set = set(db['P'].values())
    return { prop_:
                    {
                        attr_:{ idx_: dict_[idx_]
                                for idx_ in range(lb_, ub_ + 1, 1)
                          }        
                        for attr_, dict_ in db.items() if attr_ not in ('P',)
                    }
             for lb_, ub_, prop_ in  triple_list
            }, {prop_: ub_ - lb_ + 1 for lb_, ub_, prop_ in  triple_list}   

In [6]:
traindb  = filter_type('train', db)
print('attributes\t',
       len(traindb),
      '\n',             
      'records\t',
       len(traindb['ARG'].keys()))

attributes	 44 
 records	 123846


In [7]:
traindb1, proplen_dict = reindex_prop(traindb)
print('attributes\t',
       len(traindb1[1]) + 1,
      '\n',             
      'records\t',
       sum([len(d['ARG']) for p, d in traindb1.items()]))

attributes	 44 
 records	 123837


In [22]:
def get_inputs(db1, proplen_dict, propid):
    '''Generate inputs
    '''
    propdb = db1[propid] # nested dict of columns and idx value
    proplen = proplen_dict[propid]
    if 'CHUNK_SPACE' not in propdb:
        propdb['CHUNK_SPACE'] = generate_chunk_space(proplen)

    word  = list(propdb['FORM'].values())
    pos   = list(propdb['GPOS'].values())
    chunk_type  = list(propdb['T'].values())
    chunk_start, chunk_finish = propdb['CHUNK_SPACE']
    
    return word, pos, chunk_type, chunk_start, chunk_finish
            
def generate_chunk_space(n):
    '''Generates all possible spaces for chunks
    '''
    start_list = []
    end_list = []
    for i in range(n):
        for j in range(i,n,1):
            start_list.append(i)
            end_list.append(j+1)
    return  start_list, end_list

def get_outputs(db1, t2idx, propid):
    ''' Generate outputs
    '''
    propdb = db1[propid] # nested dict of columns and idx value
    if 'OUTPUTS' not in propdb: 
        chunk_list = zip(propdb['CHUNK_CANDIDATE_ID'].values(), 
                         propdb['T'].values())
        chunk_list = list(set(chunk_list))

        n_targets = len(t2idx)        
        propdb['OUTPUTS'] = [ i * n_targets + j for i, j in chunk_list]

    return propdb['OUTPUTS']

In [23]:
%%timeit
propid = 2
word, gpos, chunk_type, chunk_start, chunk_finish = get_inputs(traindb1, proplen_dict, propid)
y = get_outputs(traindb1, lex2idx['T'], propid)
# worst proposition 1120 size 92!

2.69 µs ± 94.6 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


 ## MODEL

In [24]:
propid = 1
word, gpos, chunk_type, chunk_start, chunk_finish = get_inputs(traindb1, proplen_dict, propid)
y = get_outputs(traindb1, lex2idx['T'], propid)

_start  = chunk_start * len(lex2idx['T'])
_finish = chunk_finish * len(lex2idx['T'])
print([(_start[y_], _finish[y_]) for y_ in y])

[(0, 1), (2, 21), (4, 18), (12, 30), (25, 26)]


In [25]:
import struct_perc.colored_weighted_interval_scheduling as cwis
import struct_perc.weighted_interval_scheduling as wis
import struct_perc.utils as spu

 ## Tensorflow Graph

In [26]:
vocab_size = len(lex2idx['FORM']) + 1
embed_size = 50

n_pos = len(lex2idx['GPOS'])
# n_type = len(lex2idx['T'])
n_classes  = len(lex2idx['T'])

tf.reset_default_graph()

# word index and gpos 
tf_words = tf.placeholder(tf.int32, shape=(None,1))
tf_pos = tf.placeholder(tf.int32, shape=(None,1))
# t_x_type = tf.placeholder(tf.int32, shape=(None,1))

# índices de inicio de intervalo
tf_s = tf.placeholder(tf.int32, shape=(None,1))
# índices de fim de intervalo
tf_f = tf.placeholder(tf.int32, shape=(None,1))

# replicamos os indicies de inicio e fim para cada classe de chunk possivel
tf_sc = tf.reshape(
      tf.tile(tf_s,  [1, n_classes]), [-1,1])
tf_fc = tf.reshape(
      tf.tile(tf_f,  [1, n_classes]), [-1,1])

# n_features = (embed_size + n_pos + n_type)
n_features = (embed_size + n_pos)
# hidden_features = 300
W_shape = (n_features, n_classes)
EMBS = tf.constant(embeddings)
# tf_token = tf.Variable(initial_value=None, expected_shape=(embed_size,), dtype=tf.float32, trainable=False)

# geramos os paramteros do modelo
with tf.variable_scope("model"):
    W = tf.Variable(
        tf.random_normal(W_shape, 0, 1/np.sqrt(n_features * n_classes), name='W')
    )
    b = tf.Variable(
        tf.random_normal((n_classes,1), 0, 1/np.sqrt(n_classes), name='b')
    )
    

# tf_token = tf.nn.embedding_lookup(tf_embeddings, id) 
# Recuperamos os embeddings de cada palavra
tf_word_features = tf.gather_nd(EMBS, tf_words)

tf_pos_flat = tf.reshape(tf_pos, [-1])
tf_pos_features = tf.one_hot(tf_pos_flat, depth=n_pos)

# t_x_type_flat = tf.reshape(t_x_type,[-1])
# t_type_features = tf.one_hot(t_x_type_flat, depth=n_type)

# t_tok_features = tf.concat((t_word_features,t_pos_features,t_type_features),axis=1)
tf_tok_features = tf.concat((tf_word_features,tf_pos_features),axis=1)

# a partir das features do intervalo computamos o score
tf_scores = tf.matmul(tf_tok_features, W) + b

tf_pred = tf.argmax(tf_scores, axis=1)


 ## Tensorflow test session

In [32]:
propid  = 1
words, gpos, chunk_type, chunk_start, chunk_finish = get_inputs(traindb1, proplen_dict, propid)
y = get_outputs(traindb1, lex2idx['T'], propid)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    a = sess.run(tf_tok_features, feed_dict={
        tf_words:np.transpose(np.array([words])),
        tf_pos:np.transpose(np.array([gpos])),
    })
print(a.shape)

InvalidArgumentError: flat indices[10, :] = [12812] does not index into param (shape: [12038,50]).
	 [[Node: GatherNd = GatherNd[Tindices=DT_INT32, Tparams=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](Const, _arg_Placeholder_0_0)]]

Caused by op 'GatherNd', defined at:
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2802, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-26-befffd5df2b5>", line 45, in <module>
    tf_word_features = tf.gather_nd(Embs, tf_words)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1971, in gather_nd
    "GatherNd", params=params, indices=indices, name=name)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/Users/Varela/anaconda/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): flat indices[10, :] = [12812] does not index into param (shape: [12038,50]).
	 [[Node: GatherNd = GatherNd[Tindices=DT_INT32, Tparams=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](Const, _arg_Placeholder_0_0)]]
