In [1]:
import sys
sys.path.insert(0,'../models/')
sys.path.insert(0,'../datasets/')
sys.path.insert(0,'..')

import pandas as pd
import numpy as np
import json
from subprocess import Popen, PIPE, STDOUT
import re
from collections import defaultdict

import tensorflow as tf
import tqdm
from models import PropbankEncoder
import config

INPUT_DIR = '../datasets/binaries/'
PROPBANK_GLO50_PATH = '{:}deep_glo50.pickle'.format(INPUT_DIR)
PEARL_SRLEVAL_PATH = '../srlconll-1.1/bin/srl-eval.pl'

<h1><center>Structured Prediction Network CWIS SRL (BR)</center></h1>

<center>In this notebook we solve the semantic role labeling task using structured predictions networks.</center>

## 1. Builds a "human friendly" version of the dataset

In [2]:
dfgs = pd.read_csv('../datasets/csvs/gs.csv', index_col=0, sep=',', encoding='utf-8')
column_files = [
    '../datasets/csvs/column_chunks/chunks.csv',
    '../datasets/csvs/column_predmarker/predicate_marker.csv',
    '../datasets/csvs/column_shifts_ctx_p/form.csv',
    '../datasets/csvs/column_shifts_ctx_p/gpos.csv',
    '../datasets/csvs/column_shifts_ctx_p/lemma.csv',
    '../datasets/csvs/column_t/t.csv',
    '../datasets/csvs/column_iob/iob.csv'
]

for col_f in column_files:
    _df = pd.read_csv(col_f, index_col=0, encoding='utf-8')
    dfgs = pd.concat((dfgs, _df), axis=1)

DISPLAY_COLUMNS = ['ID', 'P', 'FORM', 'ARG', 'T', 
                   'CHUNK_ID', 'CHUNK_START', 'CHUNK_FINISH', 'CHUNK_LEN', 'CHUNK_CANDIDATE_ID']            
dfgs[DISPLAY_COLUMNS].head(33)    

Unnamed: 0_level_0,ID,P,FORM,ARG,T,CHUNK_ID,CHUNK_START,CHUNK_FINISH,CHUNK_LEN,CHUNK_CANDIDATE_ID
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1,1,Brasília,*,*,1,0,1,1,0
1,2,1,Pesquisa_Datafolha,(A0*,A0,2,1,4,3,35
2,3,1,publicada,*,A0,2,1,4,3,35
3,4,1,hoje,*),A0,2,1,4,3,35
4,5,1,revela,(V*),V,3,4,5,1,126
5,6,1,um,(A1*,A1,4,5,32,27,181
6,7,1,dado,*,A1,4,5,32,27,181
7,8,1,supreendente,*,A1,4,5,32,27,181
8,9,1,:,*,A1,4,5,32,27,181
9,10,1,recusando,*,A1,4,5,32,27,181


## 2. Gets encodings

Propbank Encoder holds an indexed version of propbank dataset an answers to FOUR different dataformats: 
* CAT: this is the raw categorical data.
* EMB: tokens are embedding using GloVe embeddings.
* HOT: onehot encoding of the words and tokens.
* IDX: dense indexed representations.

In [3]:
# LOAD ENCODER
propbank_encoder = PropbankEncoder.recover(PROPBANK_GLO50_PATH)
db = propbank_encoder.db
lex2idx = propbank_encoder.lex2idx
idx2lex = propbank_encoder.idx2lex

# FOR TEXTUAL DATA ONLY
lex2tok = propbank_encoder.lex2tok
tok2idx = propbank_encoder.tok2idx
embeddings = propbank_encoder.embeddings

n_targets = len(lex2idx['T'])

In [4]:
print('attributes\t',
       len(db),
      '\n',             
      'records\t',
       len(db['ARG'].keys()))

attributes	 44 
 records	 141730


In [5]:
def filter_type(ds_type, db):
    '''Filters only records from train dataset
    '''
    ds_types = ('train', 'test', 'valid')
    if ds_type not in ds_types:
        _msg = 'ds_type must be in {:} got {:}'
        _msg = _msg.format(ds_types, ds_type)
        raise ValueError(_msg)
    elif ds_type in ('train',):
        lb = 0 
        ub = config.DATASET_TRAIN_SIZE
    elif ds_type in ('test',):        
        lb = config.DATASET_TRAIN_SIZE
        ub = lb + config.DATASET_VALID_SIZE         
    elif ds_type in ('valid',):                
        lb = config.DATASET_TRAIN_SIZE + config.DATASET_VALID_SIZE
        ub = lb + config.DATASET_TEST_SIZE         

    sel_keys_ = {key_ for key_, prop_ in db['P'].items() if prop_ > lb and prop_ <= ub}

    return {
                attr_:{ idx_: i_
                        for idx_, i_ in dict_.items() if idx_ in sel_keys_
                      }        
                for attr_, dict_  in db.items()
            }

def make_propositions_dict(db):
    '''Reindex db by propositions creating a nested dict in which the
        outer key is the proposition        
    '''
    
    triple_list = []
    prev_prop = -1
    for idx, prop in db['P'].items():
        if prev_prop != prop:
            if idx > 0:
                ub = idx-1
                triple_list.append((lb, ub, prev_prop))
            lb = idx
        prev_prop = prop
    triple_list.append((lb, ub, prev_prop))
            

        
    prop_set = set(db['P'].values())
    return { prop_:
                    {
                        attr_:{ idx_: dict_[idx_]
                                for idx_ in range(lb_, ub_ + 1, 1)
                          }        
                        for attr_, dict_ in db.items() if attr_ not in ('P',)
                    }
             for lb_, ub_, prop_ in  triple_list
            }, {prop_: ub_ - lb_ + 1 for lb_, ub_, prop_ in  triple_list}   


def numpfy_propositions_dict(prop_dict, proplen_dict):
    '''Converts inner dict examples into numpy arrays
    '''
    prop_dict_ = defaultdict(dict)    
    for prop, columns_dict in prop_dict.items():
        len_ = proplen_dict[prop]
        shape_ = (len_, 1)
        for column, values_dict in columns_dict.items():
            tuple_list = [idx_value 
                          for idx_value in values_dict.items()]
            
            tuple_list = sorted(tuple_list, key=lambda x: x[0])            
            # Converts lexicon (raw/indexed) into token (embedded/indexed)
            if (('FORM' in column) or ('LEMMA' in column)):
                values_list = [tok2idx[lex2tok[idx2lex[column][tuple_[1]]]]                
                                   for tuple_ in tuple_list]
            else:
                values_list = [tuple_[1] for tuple_ in tuple_list]
            
            prop_dict_[prop][column]  = np.array(values_list).reshape(shape_)
    
    return prop_dict_        


In [6]:
traindb  = filter_type('train', db)
print('attributes\t',
       len(traindb),
      '\n',             
      'records\t',
       len(traindb['ARG'].keys()),
       '\n',             
      'vocab\t',
        max([form for _, form in traindb['FORM'].items()]))

attributes	 44 
 records	 123846 
 vocab	 13289


In [7]:
prop_dict, proplen_dict = make_propositions_dict(traindb)
print('attributes\t',
       len(prop_dict[1]) + 1,
      '\n',             
      'records\t',
       sum([len(d['ARG']) for p, d in prop_dict.items()]),
        '\n',             
      'vocab\t',
        max([form for _, prop in prop_dict.items() for _, form in prop['FORM'].items()]))

attributes	 44 
 records	 123837 
 vocab	 13289


In [8]:
prop_dict1 = numpfy_propositions_dict(prop_dict, proplen_dict)
print('attributes\t',
       len(prop_dict1[1]) + 1,
      '\n',             
      'records\t',
       sum([len_ for _, len_ in proplen_dict.items()]),
        '\n',             
      'vocab\t',
        max([max(form) for _, prop in prop_dict1.items() for form in prop['FORM']]))

attributes	 44 
 records	 123837 
 vocab	 12037


In [9]:
def get_inputs(db1, propid):
    '''Generate inputs
    '''
    propdb = db1[propid] # nested dict of columns and idx value
    if 'CHUNK_SPACE' not in propdb:
        proplen = len(propdb['ID'])
        propdb['CHUNK_SPACE'] = generate_chunk_space(proplen)

    word    = propdb['FORM']
    ctx_p_left  = propdb['FORM_CTX_P-1']
    ctx_p0  = propdb['FORM_CTX_P+0']
    ctx_p_right  = propdb['FORM_CTX_P+1']
    
    marker  = propdb['MARKER']
    pos     = propdb['GPOS']
    chunk_type  = propdb['T']
    chunk_start, chunk_finish = propdb['CHUNK_SPACE']
    
    return word, ctx_p_left, ctx_p0, ctx_p_right, marker, pos, chunk_type, chunk_start, chunk_finish
            
def generate_chunk_space(n):
    '''Generates all possible spaces for chunks
    '''
    start_list = []
    end_list = []
    for i in range(n):
        for j in range(i,n,1):
            start_list.append(i)
            end_list.append(j+1)
    shape_ = (len(start_list), 1)
    start_ = np.array(start_list).reshape(shape_)
    finish_ = np.array(end_list).reshape(shape_)
    return start_, finish_
            

def get_outputs(db1, propid, n_targets):
    ''' Generate outputs
    '''
    propdb = db1[propid] # nested dict of columns and idx value
    if 'OUTPUTS' not in propdb: 
        id_type = np.concatenate(
            ( propdb['CHUNK_CANDIDATE_ID'], propdb['T']), axis=1
        )

        id_type = np.unique( id_type, axis=0)
        propdb['OUTPUTS'] = id_type[:,0] * n_targets + id_type[:,1]

    return propdb['OUTPUTS']

In [10]:
%%timeit
propid = 1120
word, ctx_p_left, ctx_p0, ctx_p_right, marker, pos, chunk_type, chunk_start, chunk_finish = get_inputs(prop_dict1, propid)
targets = get_outputs(prop_dict1, propid, n_targets)
# worst proposition 1120 size 92!

888 ns ± 11 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


 ## MODEL

In [11]:
propid = 1
word, ctx_p_left, ctx_p0, ctx_p_right, marker, pos, chunk_type, chunk_start, chunk_finish = get_inputs(prop_dict1,  propid)
targets = get_outputs(prop_dict1, propid, n_targets)
# proplen = proplen_dict[propid]
# y = y.reshape((proplen,1))
print(targets)
_start  = np.repeat(chunk_start, n_targets)
_finish = np.repeat(chunk_finish, n_targets)
print(_start)
print(_finish)
print(list(zip(_start[targets].flatten(), _finish[targets].flatten())))


[    0  1261 20160  4571  6518]
[ 0  0  0 ..., 32 32 32]
[ 1  1  1 ..., 33 33 33]
[(0, 1), (1, 4), (32, 33), (4, 5), (5, 32)]


In [12]:
import struct_perc.colored_weighted_interval_scheduling as cwis
import struct_perc.weighted_interval_scheduling as wis
import struct_perc.utils as spu

 ## Tensorflow Graph

In [15]:
# vocab_size = len(lex2idx['FORM']) + 1
# embed_size = 50

# n_pos = len(lex2idx['GPOS'])
# # n_type = len(lex2idx['T'])
# n_classes  = len(lex2idx['T'])

# tf.reset_default_graph()

# # word index and gpos 
# tf_words = tf.placeholder(tf.int32, shape=(None,1))
# tf_pos = tf.placeholder(tf.int32, shape=(None,1))
# # t_x_type = tf.placeholder(tf.int32, shape=(None,1))

# # índices de inicio de intervalo
# tf_s = tf.placeholder(tf.int32, shape=(None,1))
# # índices de fim de intervalo
# tf_f = tf.placeholder(tf.int32, shape=(None,1))

# # replicamos os indicies de inicio e fim para cada classe de chunk possivel
# tf_sc = tf.reshape(
#       tf.tile(tf_s,  [1, n_classes]), [-1,1])
# tf_fc = tf.reshape(
#       tf.tile(tf_f,  [1, n_classes]), [-1,1])

# # n_features = (embed_size + n_pos + n_type)
# n_features = (embed_size + n_pos)
# # hidden_features = 300
# W_shape = (n_features, n_classes)
# EMBS = tf.constant(embeddings)
# # tf_token = tf.Variable(initial_value=None, expected_shape=(embed_size,), dtype=tf.float32, trainable=False)

# # geramos os paramteros do modelo
# with tf.variable_scope("model"):
#     W = tf.Variable(
#         tf.random_normal(W_shape, 0, 1/np.sqrt(n_features * n_classes), name='W')
#     )
#     b = tf.Variable(
#         tf.random_normal((n_classes,), 0, 1/np.sqrt(n_classes), name='b')
#     )
    

# # tf_token = tf.nn.embedding_lookup(tf_embeddings, id) 
# # Recuperamos os embeddings de cada palavra
# tf_word_features = tf.gather_nd(EMBS, tf_words)

# tf_pos_flat = tf.reshape(tf_pos, [-1])
# tf_pos_features = tf.one_hot(tf_pos_flat, depth=n_pos)

# # t_x_type_flat = tf.reshape(t_x_type,[-1])
# # t_type_features = tf.one_hot(t_x_type_flat, depth=n_type)

# # X = tf.concat((t_word_features,t_pos_features,t_type_features),axis=1)
# tf_tok_features = tf.concat((tf_word_features,tf_pos_features),axis=1)

# # a partir das features do intervalo computamos o score
# tf_scores = tf.matmul(tf_tok_features, W) + b

# tf_pred = tf.argmax(tf_scores, axis=1)


In [13]:
# Parameters
vocab_size = len(tok2idx)
embed_size = 50

n_pos = len(lex2idx['GPOS'])
n_classes = len(lex2idx['T'])
n_features = embed_size * 4 + 1 + n_pos

n_hidden = 100
# W_shape = (n_features, n_hidden)
W_shape = (n_hidden, n_hidden)
b_shape = (1, n_hidden)

W_interval_shape = (2 * n_hidden, n_classes)
# W_shape = (hidden_features, n_classes)
b_interval_shape = (1, n_classes)

# word index 
X_words = tf.placeholder(tf.int64, shape=(None,1), name='word')

# predicate context index (left, predicate, right)
X_ctx_p_left = tf.placeholder(tf.int64, shape=(None,1), name='ctx_p_left')
X_ctx_p = tf.placeholder(tf.int64, shape=(None,1), name='ctx_p0')
X_ctx_p_right = tf.placeholder(tf.int64, shape=(None,1), name='ctx_p_right')

# POS tagging feature
X_pos = tf.placeholder(tf.int64, shape=(None,1), name='gpos')
X_marker = tf.cast( tf.placeholder(tf.int64, shape=(None,1), name='marker'), tf.float32 )
EMBS = tf.Variable(embeddings, trainable=False)

# Embedded representation
with tf.variable_scope("features"):
    EMBS_words = tf.gather_nd(EMBS, X_words, name='word_features')

    EMBS_ctx_pleft = tf.gather_nd(EMBS, X_ctx_p_left, name='EMBS_ctx_pleft')
    EMBS_ctx_p0 = tf.gather_nd(EMBS, X_ctx_p, name='EMBS_ctx_p0')
    EMBS_ctx_pright = tf.gather_nd(EMBS, X_ctx_p_right, name='EMBS_ctx_pright')

    X_pos_flat = tf.reshape(X_pos, [-1], name='gpos_flat')
    X_pos_onehot = tf.one_hot(X_pos_flat, depth=n_pos, name='gpos_onehot')

    X = tf.concat((EMBS_words, EMBS_ctx_pleft, EMBS_ctx_p0,
                   EMBS_ctx_pright, X_pos_onehot, X_marker),
                  axis=1, name='X')
    X_batch = tf.expand_dims(X, 0)

with tf.variable_scope('gru', reuse=tf.AUTO_REUSE):

    fw = tf.nn.rnn_cell.GRUCell(num_units=n_hidden / 2)
    bw = tf.nn.rnn_cell.GRUCell(num_units=n_hidden / 2)
    
    Wo = tf.Variable(tf.truncated_normal(W_shape, stddev=1.0 / np.sqrt(n_features * n_hidden)), name='W' )
    bo = tf.Variable(tf.zeros(b_shape, dtype=tf.float32), name='b')

    hidden_outputs, states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=fw,
        cell_bw=bw,
        inputs=X_batch,
        dtype=tf.float32
    )
    hidden_fw, hidden_bw = hidden_outputs
    Ho = tf.concat((tf.squeeze(hidden_fw, axis=0) ,tf.squeeze(hidden_fw, axis=0)),axis=1)

    Z = tf.nn.tanh( tf.matmul( Ho, Wo ) + bo, name='hidden_layer' )

# Those are the interval parameters
with tf.variable_scope("interval"):
    W_interval = tf.Variable(tf.random_normal(W_interval_shape, mean=0.0, stddev=1.0 / np.sqrt(1.0 * n_hidden * n_classes)), name='W_interval')
    b_interval = tf.Variable(tf.zeros(b_interval_shape, dtype=tf.float32), name='b_interval')
    
    # begin of interval
    IntervalStart = tf.placeholder(tf.int32, shape=(None,1))
    # end of interval
    IntervalFinish = tf.placeholder(tf.int32, shape=(None,1))

# features from intervals
IntervalFinishZ = tf.gather_nd(Z, IntervalFinish-1)
IntervalStartZ = tf.gather_nd(Z, IntervalStart)

IntervalZ = tf.concat((IntervalFinishZ, IntervalStartZ), axis=1)
IntervalScores = tf.matmul(IntervalZ, W_interval) + b_interval

ScoresFlat = tf.reshape(IntervalScores, (-1,1)) # column array n_classes * ((len + 1 ) * len) / 2 
ScoresMean = tf.reduce_mean(ScoresFlat) # scalar
ScoresDiff = ScoresFlat - ScoresMean   # centralize data --> mean zero

ScoresStd = tf.sqrt(tf.reduce_sum(ScoresDiff * ScoresDiff))
ScoresOp = ScoresDiff /( ScoresStd + 1e-8 )

 ## Tensorflow test session

In [14]:
propid  = 1
words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_type, chunk_start, chunk_finish = get_inputs(prop_dict1, propid)
targets = get_outputs(prop_dict1, propid, n_targets)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
#     out_scores_interval = sess.run(scores_interval, feed_dict={
    arg_list = [Z, IntervalZ, IntervalStartZ, IntervalFinishZ, W_interval, b_interval, IntervalScores]
#     ZZ, scores, start_scores, finish_scores = sess.run(arg_list, feed_dict={
    data_list = sess.run(arg_list, feed_dict={
        X_words:words,
        X_ctx_p_left: ctx_p_left,
        X_ctx_p: ctx_p0,
        X_ctx_p_right: ctx_p_right,        
        X_marker: marker,                
        X_pos:gpos,
        IntervalStart: chunk_start,
        IntervalFinish: chunk_finish
    })
    ZZ, scores, start_scores, finish_scores, Wi, bi, inteval_scores = data_list
    # flat gives the score for each candidate

print(ZZ.shape, scores.shape, start_scores.shape, finish_scores.shape, Wi.shape, bi.shape, inteval_scores.shape)


(33, 100) (561, 200) (561, 100) (561, 100) (200, 36) (1, 36) (561, 36)


In [15]:
print(chunk_start.shape, chunk_finish.shape)

print(ZZ.shape)

(561, 1) (561, 1)
(33, 100)


In [16]:
start_scores_test = np.array([ZZ[s,:] for s in chunk_start[:,0].tolist()])
print(np.allclose(start_scores_test, start_scores))
finish_scores_test = np.array([ZZ[s-1,:] for s in chunk_finish[:,0].tolist()])
print(np.allclose(finish_scores_test, finish_scores))
print('Must be true', np.allclose(start_scores_test[0, :], finish_scores_test[0,:]))
print('Must be false', np.allclose(start_scores_test[1, :], finish_scores_test[1,:]))
scores_test = np.concatenate((finish_scores_test, start_scores_test), axis=1)
print('Must be true', np.allclose(scores_test[0, :], scores[0,:]))
print('Must be true', np.allclose(np.matmul(scores_test,  Wi) + bi, inteval_scores))
# start_scores_test = np.concatenate(start_scores_list, axis=1)
# print(start_scores_test.shape)


True
True
Must be true True
Must be false False
Must be true True
Must be true True


In [17]:
def pred(sess, x_words, x_ctx_p_left, x_ctx_p0, x_ctx_p_right, x_marker, x_pos, x_chunk_start, x_chunk_finish):
    scores = sess.run(ScoresOp,feed_dict={
        X_words: x_words,
        X_ctx_p_left: x_ctx_p_left,
        X_ctx_p: x_ctx_p0,
        X_ctx_p_right: x_ctx_p_right,        
        X_marker: x_marker,                
        X_pos:x_pos,
        IntervalStart: x_chunk_start,
        IntervalFinish: x_chunk_finish
    })
    # scores is a ((proplen + 1) * (proplen) / 2) * n_classes  
    starts = np.repeat(x_chunk_start, n_classes).reshape((-1,1))
    ends = np.repeat(x_chunk_finish, n_classes).reshape((-1,1))
    
    ck_len = len(x_chunk_start)
    colors = np.array(list(np.arange(n_classes))*ck_len)

    # Finds best allocation given the scores and the chunk_space
    r_int = cwis.compute_schedule(starts.flatten(), ends.flatten(), scores, colors) # index of the cadidates of predicted solution
    r_ext = list(zip(starts[r_int].flatten(),ends[r_int].flatten(), colors[r_int].flatten())) # from integer to triple
    return r_int, r_ext

## Testing prediction

In [18]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
p, pe = pred(sess, words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_start, chunk_finish)
print(p)
print(pe)

[20184 20112 20004 19860 19680 19464 19212 18924 18600 18240 17828 17396
 16928 16424 15884 15324 14712 14064 13380 12660 11904 11112 10284  9420
  8520  7584  6612  5604  4560  3480  2364  1212    24]
[(32, 33, 24), (31, 32, 24), (30, 31, 24), (29, 30, 24), (28, 29, 24), (27, 28, 24), (26, 27, 24), (25, 26, 24), (24, 25, 24), (23, 24, 24), (22, 23, 8), (21, 22, 8), (20, 21, 8), (19, 20, 8), (18, 19, 8), (17, 18, 24), (16, 17, 24), (15, 16, 24), (14, 15, 24), (13, 14, 24), (12, 13, 24), (11, 12, 24), (10, 11, 24), (9, 10, 24), (8, 9, 24), (7, 8, 24), (6, 7, 24), (5, 6, 24), (4, 5, 24), (3, 4, 24), (2, 3, 24), (1, 2, 24), (0, 1, 24)]


## Training code

In [None]:
# t_y_int = tf.placeholder(tf.int32, shape=(None,))
# t_y_rshp = tf.reshape(t_y_int, (-1,1))
# t_margin_int = tf.placeholder(tf.float32, shape=())

# t_margin_values = tf.ones(tf.shape(t_y_rshp))*t_margin_int
# t_margin_scores = tf.scatter_nd(t_y_rshp, -t_margin_values, tf.shape(t_scores_flat))
# t_scores_flat_w_margin = t_scores_flat + t_margin_scores

In [19]:
# indices of the correct intervals
T = tf.placeholder(tf.int32, shape=(None,), name='T')
L = tf.placeholder(tf.int32, shape=(), name='L')
# I = tf.to_int32(tf.range(L), name='indices')

T_flat = tf.reshape(T, (-1,1), name='T_flat') # column array
MarginFactor = tf.placeholder(tf.float32)
MarginIndex = tf.ones(tf.shape(T_flat)) * MarginFactor
MarginScores = tf.scatter_nd(T_flat, -MarginIndex, tf.shape(ScoresOp)) # oppposite of gather_nd
ScoresWithMargin = ScoresOp + MarginScores


In [None]:
# # índices dos intervalos computados pelo Weighted Interval Scheduling
# t_p_int = tf.placeholder(tf.int32, shape=(None,))
# # índices dos intervalos corretos
# # t_y_int = tf.placeholder(tf.int32, shape=(None,))

# # formatamos os indices dos intervalos preditos e corretos
# t_p_rshp = tf.reshape(t_p_int,(-1,1))
# # t_y_rshp = tf.reshape(t_y_int,(-1,1))

# # score dos intervalos preditos
# t_scores_int_p = tf.gather_nd(t_scores_flat_w_margin, t_p_rshp)
# # score dos intervalos corretos
# t_scores_int_y = tf.gather_nd(t_scores_flat_w_margin, t_y_rshp)

# ## função de custo do perceptron estruturado
# # WIS
# t_cost_int = tf.reduce_sum(t_scores_int_p) - tf.reduce_sum(t_scores_int_y)
# t_cost = t_cost_int

# # # gradiente descendente no custo do perceptron estruturado
# optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
# # optimizer = tf.train.GradientDescentOptimizer(0.003)
# train = optimizer.minimize(t_cost)

In [20]:
# indices containing the predicted labels from Weighted Interval Scheduling
Y = tf.placeholder(tf.int32, shape=(None,), name='predictions')
Y_flat = tf.reshape(Y, (-1, 1))

# score da estrutura predita
ScoreY = tf.gather_nd(ScoresWithMargin, Y_flat, name='predicted_score')
# score da estrutura correta
ScoreT = tf.gather_nd(ScoresWithMargin, T_flat, name='target_score')

# função de custo do perceptron estruturado
CostOp = tf.reduce_sum(ScoreY) - tf.reduce_sum(ScoreT)

# gradiente descendente no custo do perceptron estruturado
Optimizer = tf.train.AdamOptimizer(0.001)
TrainOp = Optimizer.minimize(CostOp)

In [21]:
predictions, _ = pred(sess, words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_start, chunk_finish)
print(targets.shape)
print(targets)
print(predictions.shape)
print(predictions)

(5,)
[    0  1261 20160  4571  6518]
(33,)
[20184 20112 20004 19860 19680 19464 19212 18924 18600 18240 17828 17396
 16928 16424 15884 15324 14712 14064 13380 12660 11904 11112 10284  9420
  8520  7584  6612  5604  4560  3480  2364  1212    24]


 ## Testing cost operation

In [24]:
predictions, chunk_ext = pred(sess, words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_start, chunk_finish)

interval_start = np.repeat(chunk_start, n_classes).reshape((-1,1))
interval_finish = np.repeat(chunk_finish, n_classes).reshape((-1,1))

scores_, cost_ = sess.run([ScoresOp, CostOp], feed_dict={
    X_words: words,
    X_ctx_p_left: ctx_p_left,
    X_ctx_p: ctx_p0,
    X_ctx_p_right: ctx_p_right,        
    X_marker: marker,        
    X_pos: gpos,
    IntervalStart: interval_start,
    IntervalFinish: interval_finish,
    T: targets.flatten(),
    Y: predictions.flatten(),        
    L: proplen_dict[propid],
    MarginFactor:0.01})


print(scores_.shape)
print(cost_)
print(np.max(predictions))
print(np.sum(scores_[predictions]) - np.sum(scores_[targets]))

(727056, 1)
0.112375
20184
0.0623754


In [25]:
colors = np.array(list(np.arange(n_classes))*len(chunk_start))
r_int = cwis.compute_schedule(interval_start.flatten(), interval_finish.flatten(), scores_, colors) 
print(np.mean(scores_[r_int]), ' ', np.mean(scores_[targets]))

0.00194049   -0.000565748


In [26]:
np.sum(scores_[predictions].flatten())

0.059546709

In [27]:
np.sum(scores_[targets].flatten())

-0.0028287403

In [28]:
print(sorted(predictions))
print(sorted(r_int))

[24, 1212, 2364, 3480, 4560, 5604, 6612, 7584, 8520, 9420, 10284, 11112, 11904, 12660, 13380, 14064, 14712, 15324, 15884, 16424, 16928, 17396, 17828, 18240, 18600, 18924, 19212, 19464, 19680, 19860, 20004, 20112, 20184]
[24, 1212, 2364, 3480, 4560, 5604, 6612, 7584, 8520, 9420, 10284, 11112, 11904, 12660, 13380, 14064, 14712, 15324, 15900, 16440, 16944, 17412, 17844, 18240, 18600, 18924, 19212, 19464, 19680, 19860, 20004, 20112, 20184]


In [32]:
propid  = 1
words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_type, chunk_start, chunk_finish = get_inputs(prop_dict1, propid)
targets = get_outputs(prop_dict1, propid, n_targets)

starts = np.repeat(chunk_start,n_classes).reshape((-1,1))
ends = np.repeat(chunk_finish, n_classes).reshape((-1,1))

scores_ = sess.run(ScoresOp, feed_dict={
    X_words: words,
    X_ctx_p_left: ctx_p_left,
    X_ctx_p: ctx_p0,
    X_ctx_p_right: ctx_p_right,        
    X_marker: marker,        
    X_pos: gpos,
    IntervalStart: chunk_start,
    IntervalFinish: chunk_finish,
    T: targets.flatten(),
    Y: predictions.flatten(),        
    MarginFactor:0.01
})

colors = np.repeat(np.arange(n_classes), len(chunk_start))
predictions = cwis.compute_schedule(starts.flatten(), ends.flatten(), scores_, colors) 

cost_ = sess.run(CostOp, feed_dict={
    X_words: words,
    X_ctx_p_left: ctx_p_left,
    X_ctx_p: ctx_p0,
    X_ctx_p_right: ctx_p_right,        
    X_marker: marker,        
    X_pos: gpos,
    IntervalStart: chunk_start,
    IntervalFinish: chunk_finish,
    T:targets.flatten(),
    Y:predictions.flatten(),
    MarginFactor:0.01
})

print(cost_)
print(np.sum(scores_[predictions]) - np.sum(scores_[targets]))

0.55983
0.50983


In [33]:
print(scores_.shape)

(20196, 1)


## Training Single Proposition

In [36]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

propid = 1
words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_type, chunk_start, chunk_finish = get_inputs(prop_dict1, propid)
targets = get_outputs(prop_dict1, propid, n_targets)

starts = np.repeat(chunk_start,n_classes).reshape((-1,1))
ends = np.repeat(chunk_finish, n_classes).reshape((-1,1))

for i in range(10000):
    
    scores_ = sess.run(ScoresOp, feed_dict={
        X_words: words,
        X_ctx_p_left: ctx_p_left,
        X_ctx_p: ctx_p0,
        X_ctx_p_right: ctx_p_right,        
        X_marker: marker,        
        X_pos: gpos,
        IntervalStart: chunk_start,
        IntervalFinish: chunk_finish,
        T: targets.flatten(),
        Y: predictions.flatten(),        
        MarginFactor:0.01
    })
    
    predictions, chunk_ext = pred(sess, words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_start, chunk_finish)    
    cost_ = sess.run(CostOp, feed_dict={
        X_words: words,
        X_ctx_p_left: ctx_p_left,
        X_ctx_p: ctx_p0,
        X_ctx_p_right: ctx_p_right,        
        X_marker: marker,        
        X_pos: gpos,
        IntervalStart: chunk_start,
        IntervalFinish: chunk_finish,
        T: targets.flatten(),
        Y: predictions.flatten(),        
        MarginFactor:0.01
    })
    
    predictions_score = np.sum(scores_[predictions])
    targets_score = np.sum(scores_[targets])
        
    sess.run(TrainOp, feed_dict={
        X_words: words,
        X_ctx_p_left: ctx_p_left,
        X_ctx_p: ctx_p0,
        X_ctx_p_right: ctx_p_right,        
        X_marker: marker,        
        X_pos: gpos,
        IntervalStart: chunk_start,
        IntervalFinish: chunk_finish,
        T: targets.flatten(),
        Y: predictions.flatten(),        
        MarginFactor:0.01})
    
    colors = np.repeat(np.arange(n_classes), len(chunk_start))
    predictions = cwis.compute_schedule(starts.flatten(), ends.flatten(), scores_, colors) 

    targets_set = set(targets.flatten())
    predictions_set = set(predictions.flatten())
    yp_common = targets_set.intersection(predictions_set)
    yp_total = targets_set.union(predictions_set)
    
    acc_int = len(yp_common)/len(yp_total)
    
    if i % 25 == 0:
        print(acc_int, ' ', cost_)
    
    if cost_ < 0:
        break

    if acc_int == 1:
        print(acc_int, ' ', cost_, ' learnt at epoch ', i)
        break


0.0   0.580276
0.0   0.258573
0.08571428571428572   0.145245
0.02702702702702703   0.139344
0.02702702702702703   0.0995918
0.05   0.0651398
0.17647058823529413   0.0280508
0.6666666666666666   0.0100175
1.0   9.31323e-10  learnt at epoch  177


In [37]:
print(list(zip(starts[predictions].flatten(), ends[predictions].flatten(), colors[predictions])))

[(32, 33, 35), (5, 32, 11), (4, 5, 8), (1, 4, 2), (0, 1, 0)]


 ## ConLL evaluation scripts

In [38]:
def tag_to_conll(sess, prop_dict, propid, idx2lex):
    gold_list = []
    eval_list = []
        
    words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_type, chunk_start, chunk_finish = get_inputs(prop_dict, propid)
    targets = get_outputs(prop_dict, propid, n_targets)

    predictions, chunk_ext = pred(sess, words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_start, chunk_finish)    


    n_words = len(words)
    pred_array = prop_dict[propid]['PRED']
    pred_array = pred_array.flatten()
    
    arg_array = prop_dict[propid]['ARG']
    arg_array = arg_array.flatten()
    
    pred_list = [idx2lex['PRED'][i] for i in pred_array.tolist()]
    gold_list_ = [idx2lex['ARG'][i] for i in arg_array.tolist()]
    
    gold_list += list(zip(pred_list, gold_list_))

    arg_list_ = []
    for triple_ in sorted(chunk_ext, key= lambda x: x[0]):
        lb, ub, arg = triple_            
        chunk_list_ = [idx2lex['T'][arg] if i == lb else '*' for i in range(lb, ub)] 
        if idx2lex['T'][arg] != '*':
            chunk_list_[0] = '({:}*'.format(chunk_list_[0])
            chunk_list_[-1] = '{:})'.format(chunk_list_[-1])

        arg_list_ += chunk_list_
        
    eval_list += list(zip(pred_list, arg_list_))
    eval_list.append(None)
    gold_list.append(None)
    return gold_list, eval_list

testamos o modelo na frase usada para treino

In [39]:
propid = 1
gold_list, eval_list = tag_to_conll(sess, prop_dict1, propid, idx2lex)
for i in range(proplen_dict[propid]):
    if gold_list[i] and eval_list[i]:
        print('{:}\t{:}\t{:}\t{:}'.format(*gold_list[i], *eval_list[i]))
    else:
        print('\n')
    

print(lex2idx['T'])

-	*	-	*
-	(A0*	-	(A0*
-	*	-	*
-	*)	-	*)
revelar	(V*)	revelar	(V*)
-	(A1*	-	(A1*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*)	-	*)
-	*	-	*
{'*': 0, 'A0': 1, 'A1': 2, 'A2': 3, 'A3': 4, 'A4': 5, 'A5': 6, 'AM-ADV': 7, 'AM-CAU': 8, 'AM-DIR': 9, 'AM-DIS': 10, 'AM-EXT': 11, 'AM-LOC': 12, 'AM-MED': 13, 'AM-MNR': 14, 'AM-NEG': 15, 'AM-PNC': 16, 'AM-PRD': 17, 'AM-REC': 18, 'AM-TMP': 19, 'C-A0': 20, 'C-A1': 21, 'C-A2': 22, 'C-A3': 23, 'C-AM-ADV': 24, 'C-AM-CAU': 25, 'C-AM-DIS': 26, 'C-AM-EXT': 27, 'C-AM-LOC': 28, 'C-AM-MNR': 29, 'C-AM-NEG': 30, 'C-AM-PNC': 31, 'C-AM-PRD': 32, 'C-AM-TMP': 33, 'C-V': 34, 'V': 35}


In [41]:
def evaluate(gold_list, eval_list, verbose=True):
    gold_path = 'train_gold.conll'    
    eval_path = 'train_eval.conll'

    with open(gold_path, mode='w') as f:        
        for tuple_ in gold_list:
            if tuple_ is None:
                f.write('\n')
            else:
                f.write('{:}\t{:}\n'.format(*tuple_))

    with open(eval_path, mode='w') as f:        
        for tuple_ in eval_list:
            if tuple_ is None:
                f.write('\n')
            else:
                f.write('{:}\t{:}\n'.format(*tuple_))

    pipe = Popen(['perl',PEARL_SRLEVAL_PATH, gold_path, eval_path], stdout=PIPE, stderr=PIPE)

    txt, err = pipe.communicate()
    txt = txt.decode('UTF-8')
    err = err.decode('UTF-8')
    
    if verbose:
        print(txt)

    float_list = re.findall(r'(\d+.\d+)', txt)
    f1 = float(float_list[3]) if len(float_list) > 3 else -1 
    return f1



In [42]:
propid = 1
gold_list, eval_list = tag_to_conll(sess, prop_dict1, propid, idx2lex)
f1_score = evaluate(gold_list, eval_list, verbose=True)
print('f1_score: ', f1_score)

Number of Sentences    :           1
Number of Propositions :           1
Percentage of perfect props : 100.00

              corr.  excess  missed    prec.    rec.      F1
------------------------------------------------------------
   Overall        2       0       0   100.00  100.00  100.00
----------
        A0        1       0       0   100.00  100.00  100.00
        A1        1       0       0   100.00  100.00  100.00
------------------------------------------------------------
         V        1       0       0   100.00  100.00  100.00
------------------------------------------------------------

f1_score:  100.0


In [43]:
import time
start = time.time()
print(1111)
gold_list, eval_list = tag_to_conll(sess, prop_dict1, propid, idx2lex)
f1_score = evaluate(gold_list, eval_list, verbose=True)
end = time.time()
print('tempo para avaliar: ', (end-start), 's')
print('f1_score: ', f1_score)

1111
Number of Sentences    :           1
Number of Propositions :           1
Percentage of perfect props : 100.00

              corr.  excess  missed    prec.    rec.      F1
------------------------------------------------------------
   Overall        2       0       0   100.00  100.00  100.00
----------
        A0        1       0       0   100.00  100.00  100.00
        A1        1       0       0   100.00  100.00  100.00
------------------------------------------------------------
         V        1       0       0   100.00  100.00  100.00
------------------------------------------------------------

tempo para avaliar:  0.6595838069915771 s
f1_score:  100.0


## Training

In [55]:
sess= tf.Session()
sess.run(tf.global_variables_initializer())

epochs = 1000
# indices = np.arange(config.DATASET_TRAIN_SIZE)
indices = np.arange(50) + 1

best_score = 0
saver = tf.train.Saver()

for j in range(epochs):
    np.random.shuffle(indices)
    total_err = 0
    total_size = 0
    gold_list = []
    eval_list = [] 
    for i, propid in enumerate(indices):     
        try: 
            words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_type, chunk_start, chunk_finish = get_inputs(prop_dict1, propid)
            targets = get_outputs(prop_dict1, propid, n_targets)
        except KeyError:
            print(propid)

        predictions, chunk_ext = pred(sess, words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_start, chunk_finish)

        _, cost = sess.run([TrainOp, CostOp], feed_dict={
            X_words: words,
            X_ctx_p_left: ctx_p_left,
            X_ctx_p: ctx_p0,
            X_ctx_p_right: ctx_p_right,        
            X_marker: marker,        
            X_pos: gpos,
            IntervalStart: chunk_start,
            IntervalFinish: chunk_finish,
            T: targets.flatten(),
            Y: predictions.flatten(),        
            MarginFactor:0.01})

        total_err += len(set(predictions.tolist()) targets.tolist())
        total_size += predictions.shape[0]
        
        gold_list_, eval_list_ = tag_to_conll(sess, prop_dict1, propid, idx2lex)

        gold_list += gold_list_
        eval_list += eval_list_

        if i % 25 == 0:
            print('Iteration:', i + j*len(indices) ,'\tepoch:', j, ' \tacc:', 1 - total_err/total_size, '\tcost:', cost)

        
    f1_score = evaluate(gold_list, eval_list, verbose=False)    
    if f1_score > best_score:
        best_score = f1_score
        f1_score = evaluate(gold_list, eval_list, verbose=True)    
        save_path = saver.save(sess, "/tmp/model_spn-pt.ckpt")
        
    if best_score > .95:
        print('best_score:')
        break
    print('Epoch:', j, ' \tf1_score:', f1_score, ' \tbest_score:', best_score)



Iteration: 0 	epoch: 0  	acc: 0.9375 	cost: 0.568005
Iteration: 25 	epoch: 0  	acc: 0.95211786372 	cost: 0.170991
Epoch: 0  	f1_score: 0.0  	best_score: 0




Iteration: 50 	epoch: 1  	acc: 0.96 	cost: 0.18601
Iteration: 75 	epoch: 1  	acc: 0.953068592058 	cost: 0.175636
Number of Sentences    :          50
Number of Propositions :          50
Percentage of perfect props :   0.00

              corr.  excess  missed    prec.    rec.      F1
------------------------------------------------------------
   Overall        4     606     121     0.66    3.20    1.09
----------
        A0        1      14      27     6.67    3.57    4.65
        A1        2      93      45     2.11    4.26    2.82
        A2        1      45      11     2.17    8.33    3.45
        A3        0       3       2     0.00    0.00    0.00
        A4        0     100       2     0.00    0.00    0.00
        A5        0       9       0     0.00    0.00    0.00
    AM-ADV        0      32       4     0.00    0.00    0.00
    AM-CAU        0       4       4     0.00    0.00    0.00
    AM-DIR        0      19       0     0.00    0.00    0.00
    AM-DIS        0      43     



Iteration: 100 	epoch: 2  	acc: 0.973684210526 	cost: 0.147291
Iteration: 125 	epoch: 2  	acc: 0.95600676819 	cost: 0.197758
Epoch: 2  	f1_score: 0.14  	best_score: 0.66
Iteration: 150 	epoch: 3  	acc: 0.923076923077 	cost: 0.101777




Iteration: 175 	epoch: 3  	acc: 0.953153153153 	cost: 0.0933091
Epoch: 3  	f1_score: -1  	best_score: 0.66




Iteration: 200 	epoch: 4  	acc: 0.96875 	cost: 0.121876
Iteration: 225 	epoch: 4  	acc: 0.952641165756 	cost: 0.0879981
Epoch: 4  	f1_score: -1  	best_score: 0.66




Iteration: 250 	epoch: 5  	acc: 0.964285714286 	cost: 0.105846
Iteration: 275 	epoch: 5  	acc: 0.949317738791 	cost: 0.0898303
Epoch: 5  	f1_score: -1  	best_score: 0.66




Iteration: 300 	epoch: 6  	acc: 0.947368421053 	cost: 0.102984
Iteration: 325 	epoch: 6  	acc: 0.939110070258 	cost: 0.0711961
Epoch: 6  	f1_score: -1  	best_score: 0.66




Iteration: 350 	epoch: 7  	acc: 0.96 	cost: 0.108778
Iteration: 375 	epoch: 7  	acc: 0.937649880096 	cost: 0.0682027
Epoch: 7  	f1_score: -1  	best_score: 0.66




Iteration: 400 	epoch: 8  	acc: 0.96875 	cost: 0.0977199
Iteration: 425 	epoch: 8  	acc: 0.950664136622 	cost: 0.0759887
Epoch: 8  	f1_score: -1  	best_score: 0.66
Iteration: 450 	epoch: 9  	acc: 0.833333333333 	cost: 0.0471078




Iteration: 475 	epoch: 9  	acc: 0.928767123288 	cost: 0.0673258
Epoch: 9  	f1_score: -1  	best_score: 0.66




Iteration: 500 	epoch: 10  	acc: 0.933333333333 	cost: 0.0487361
Iteration: 525 	epoch: 10  	acc: 0.889273356401 	cost: 0.0389142
Epoch: 10  	f1_score: -1  	best_score: 0.66
Iteration: 550 	epoch: 11  	acc: 0.333333333333 	cost: 0.0192447




Iteration: 575 	epoch: 11  	acc: 0.890675241158 	cost: 0.0485354
Epoch: 11  	f1_score: -1  	best_score: 0.66




Iteration: 600 	epoch: 12  	acc: 0.972972972973 	cost: 0.125892
Iteration: 625 	epoch: 12  	acc: 0.732394366197 	cost: 0.0571445
Epoch: 12  	f1_score: -1  	best_score: 0.66
Iteration: 650 	epoch: 13  	acc: -4.0 	cost: 0.0558225




Iteration: 675 	epoch: 13  	acc: 0.715976331361 	cost: 0.0618007
Epoch: 13  	f1_score: -1  	best_score: 0.66
Iteration: 700 	epoch: 14  	acc: -4.0 	cost: 0.0562314




Iteration: 725 	epoch: 14  	acc: 0.722826086957 	cost: 0.0412992
Epoch: 14  	f1_score: -1  	best_score: 0.66




Iteration: 750 	epoch: 15  	acc: 0.952380952381 	cost: 0.0877968
Iteration: 775 	epoch: 15  	acc: 0.766393442623 	cost: 0.044323
Epoch: 15  	f1_score: -1  	best_score: 0.66
Iteration: 800 	epoch: 16  	acc: 0.833333333333 	cost: 0.0615832




Iteration: 825 	epoch: 16  	acc: 0.738095238095 	cost: 0.0750299
Epoch: 16  	f1_score: -1  	best_score: 0.66
Iteration: 850 	epoch: 17  	acc: -4.0 	cost: 0.0518702




Iteration: 875 	epoch: 17  	acc: 0.658536585366 	cost: 0.0457314
Epoch: 17  	f1_score: -1  	best_score: 0.66




Iteration: 900 	epoch: 18  	acc: 0.5 	cost: 0.0529566
Iteration: 925 	epoch: 18  	acc: 0.716577540107 	cost: 0.0415998
Number of Sentences    :          50
Number of Propositions :          50
Percentage of perfect props :   0.00

              corr.  excess  missed    prec.    rec.      F1
------------------------------------------------------------
   Overall        5     102     120     4.67    4.00    4.31
----------
        A0        0      13      28     0.00    0.00    0.00
        A1        3      42      44     6.67    6.38    6.52
        A2        0      10      12     0.00    0.00    0.00
        A3        0       6       2     0.00    0.00    0.00
        A4        0       0       2     0.00    0.00    0.00
    AM-ADV        0       2       4     0.00    0.00    0.00
    AM-CAU        0       3       4     0.00    0.00    0.00
    AM-DIS        1       2       2    33.33   33.33   33.33
    AM-EXT        0       7       0     0.00    0.00    0.00
    AM-LOC        1       

In [56]:
print(predictions.shape)

(5,)


(5,)


In [57]:
predictions

array([3240, 2667, 2278, 1990,  144])

In [58]:
print(targets)

[ 180 2319 2771 2991 3240]
