In [1]:
import sys
sys.path.insert(0,'../models/')
sys.path.insert(0,'../datasets/')
sys.path.insert(0,'..')

import pandas as pd
import numpy as np
import json
from subprocess import Popen, PIPE, STDOUT
import re
from collections import defaultdict

import tensorflow as tf
import tqdm
from models import PropbankEncoder
import config

INPUT_DIR = '../datasets/binaries/'
PROPBANK_GLO50_PATH = '{:}deep_glo50.pickle'.format(INPUT_DIR)
PEARL_SRLEVAL_PATH = '../srlconll-1.1/bin/srl-eval.pl'

<h1><center>Structured Prediction Network CWIS SRL (BR)</center></h1>

<center>In this notebook we solve the semantic role labeling task using structured predictions networks.</center>

## 1. Builds a "human friendly" version of the dataset

In [2]:
dfgs = pd.read_csv('../datasets/csvs/gs.csv', index_col=0, sep=',', encoding='utf-8')
column_files = [
    '../datasets/csvs/column_chunks/chunks.csv',
    '../datasets/csvs/column_predmarker/predicate_marker.csv',
    '../datasets/csvs/column_shifts_ctx_p/form.csv',
    '../datasets/csvs/column_shifts_ctx_p/gpos.csv',
    '../datasets/csvs/column_shifts_ctx_p/lemma.csv',
    '../datasets/csvs/column_t/t.csv',
    '../datasets/csvs/column_iob/iob.csv'
]

for col_f in column_files:
    _df = pd.read_csv(col_f, index_col=0, encoding='utf-8')
    dfgs = pd.concat((dfgs, _df), axis=1)

DISPLAY_COLUMNS = ['ID', 'P', 'FORM', 'ARG', 'T', 
                   'CHUNK_ID', 'CHUNK_START', 'CHUNK_FINISH', 'CHUNK_LEN', 'CHUNK_CANDIDATE_ID']            
dfgs[DISPLAY_COLUMNS].head(33)    

Unnamed: 0_level_0,ID,P,FORM,ARG,T,CHUNK_ID,CHUNK_START,CHUNK_FINISH,CHUNK_LEN,CHUNK_CANDIDATE_ID
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1,1,Brasília,*,*,1,0,1,1,0
1,2,1,Pesquisa_Datafolha,(A0*,A0,2,1,4,3,35
2,3,1,publicada,*,A0,2,1,4,3,35
3,4,1,hoje,*),A0,2,1,4,3,35
4,5,1,revela,(V*),V,3,4,5,1,126
5,6,1,um,(A1*,A1,4,5,32,27,181
6,7,1,dado,*,A1,4,5,32,27,181
7,8,1,supreendente,*,A1,4,5,32,27,181
8,9,1,:,*,A1,4,5,32,27,181
9,10,1,recusando,*,A1,4,5,32,27,181


## 2. Gets encodings

Propbank Encoder holds an indexed version of propbank dataset an answers to FOUR different dataformats: 
* CAT: this is the raw categorical data.
* EMB: tokens are embedding using GloVe embeddings.
* HOT: onehot encoding of the words and tokens.
* IDX: dense indexed representations.

In [3]:
# LOAD ENCODER
propbank_encoder = PropbankEncoder.recover(PROPBANK_GLO50_PATH)
db = propbank_encoder.db
lex2idx = propbank_encoder.lex2idx
idx2lex = propbank_encoder.idx2lex

# FOR TEXTUAL DATA ONLY
lex2tok = propbank_encoder.lex2tok
tok2idx = propbank_encoder.tok2idx
embeddings = propbank_encoder.embeddings

n_targets = len(lex2idx['T'])

In [4]:
print('attributes\t',
       len(db),
      '\n',             
      'records\t',
       len(db['ARG'].keys()))

attributes	 44 
 records	 141730


In [5]:
def filter_type(ds_type, db):
    '''Filters only records from train dataset
    '''
    ds_types = ('train', 'test', 'valid')
    if ds_type not in ds_types:
        _msg = 'ds_type must be in {:} got {:}'
        _msg = _msg.format(ds_types, ds_type)
        raise ValueError(_msg)
    elif ds_type in ('train',):
        lb = 0 
        ub = config.DATASET_TRAIN_SIZE
    elif ds_type in ('test',):        
        lb = config.DATASET_TRAIN_SIZE
        ub = lb + config.DATASET_VALID_SIZE         
    elif ds_type in ('valid',):                
        lb = config.DATASET_TRAIN_SIZE + config.DATASET_VALID_SIZE
        ub = lb + config.DATASET_TEST_SIZE         

    sel_keys_ = {key_ for key_, prop_ in db['P'].items() if prop_ > lb and prop_ <= ub}

    return {
                attr_:{ idx_: i_
                        for idx_, i_ in dict_.items() if idx_ in sel_keys_
                      }        
                for attr_, dict_  in db.items()
            }

def make_propositions_dict(db):
    '''Reindex db by propositions creating a nested dict in which the
        outer key is the proposition        
    '''
    
    triple_list = []
    prev_prop = -1
    for idx, prop in db['P'].items():
        if prev_prop != prop:
            if idx > 0:
                ub = idx-1
                triple_list.append((lb, ub, prev_prop))
            lb = idx
        prev_prop = prop
    triple_list.append((lb, ub, prev_prop))
            

        
    prop_set = set(db['P'].values())
    return { prop_:
                    {
                        attr_:{ idx_: dict_[idx_]
                                for idx_ in range(lb_, ub_ + 1, 1)
                          }        
                        for attr_, dict_ in db.items() if attr_ not in ('P',)
                    }
             for lb_, ub_, prop_ in  triple_list
            }, {prop_: ub_ - lb_ + 1 for lb_, ub_, prop_ in  triple_list}   


def numpfy_propositions_dict(prop_dict, proplen_dict):
    '''Converts inner dict examples into numpy arrays
    '''
    prop_dict_ = defaultdict(dict)    
    for prop, columns_dict in prop_dict.items():
        len_ = proplen_dict[prop]
        shape_ = (len_, 1)
        for column, values_dict in columns_dict.items():
            tuple_list = [idx_value 
                          for idx_value in values_dict.items()]
            
            tuple_list = sorted(tuple_list, key=lambda x: x[0])            
            # Converts lexicon (raw/indexed) into token (embedded/indexed)
            if (('FORM' in column) or ('LEMMA' in column)):
                values_list = [tok2idx[lex2tok[idx2lex[column][tuple_[1]]]]                
                                   for tuple_ in tuple_list]
            else:
                values_list = [tuple_[1] for tuple_ in tuple_list]
            
            prop_dict_[prop][column]  = np.array(values_list).reshape(shape_)
    
    return prop_dict_        


In [6]:
traindb  = filter_type('train', db)
print('attributes\t',
       len(traindb),
      '\n',             
      'records\t',
       len(traindb['ARG'].keys()),
       '\n',             
      'vocab\t',
        max([form for _, form in traindb['FORM'].items()]))

attributes	 44 
 records	 123846 
 vocab	 13289


In [7]:
prop_dict, proplen_dict = make_propositions_dict(traindb)
print('attributes\t',
       len(prop_dict[1]) + 1,
      '\n',             
      'records\t',
       sum([len(d['ARG']) for p, d in prop_dict.items()]),
        '\n',             
      'vocab\t',
        max([form for _, prop in prop_dict.items() for _, form in prop['FORM'].items()]))

attributes	 44 
 records	 123837 
 vocab	 13289


In [8]:
prop_dict1 = numpfy_propositions_dict(prop_dict, proplen_dict)
print('attributes\t',
       len(prop_dict1[1]) + 1,
      '\n',             
      'records\t',
       sum([len_ for _, len_ in proplen_dict.items()]),
        '\n',             
      'vocab\t',
        max([max(form) for _, prop in prop_dict1.items() for form in prop['FORM']]))

attributes	 44 
 records	 123837 
 vocab	 12037


In [10]:
def get_inputs(db1, propid):
    '''Generate inputs
    '''
    propdb = db1[propid] # nested dict of columns and idx value
    if 'CHUNK_SPACE' not in propdb:
        proplen = len(propdb['ID'])
        propdb['CHUNK_SPACE'] = generate_chunk_space(proplen)

    word    = propdb['FORM']
    ctx_p_left  = propdb['FORM_CTX_P-1']
    ctx_p0  = propdb['FORM_CTX_P+0']
    ctx_p_right  = propdb['FORM_CTX_P+1']
    
    marker  = propdb['MARKER']
    pos     = propdb['GPOS']
    chunk_type  = propdb['T']
    chunk_start, chunk_finish = propdb['CHUNK_SPACE']
    
    return word, ctx_p_left, ctx_p0, ctx_p_right, marker, pos, chunk_type, chunk_start, chunk_finish
            
def generate_chunk_space(n):
    '''Generates all possible spaces for chunks
    '''
    start_list = []
    end_list = []
    for i in range(n):
        for j in range(i,n,1):
            start_list.append(i)
            end_list.append(j+1)
    shape_ = (len(start_list), 1)
    start_ = np.array(start_list).reshape(shape_)
    finish_ = np.array(end_list).reshape(shape_)
    return start_, finish_
            

def get_outputs(db1, propid, n_targets):
    ''' Generate outputs
    '''
    propdb = db1[propid] # nested dict of columns and idx value
    if 'OUTPUTS' not in propdb: 
        id_type = np.concatenate(
            ( propdb['CHUNK_CANDIDATE_ID'], propdb['T']), axis=1
        )

        id_type = np.unique( id_type, axis=0)
        propdb['OUTPUTS'] = id_type[:,0] * n_targets + id_type[:,1]

    return propdb['OUTPUTS']

In [11]:
%%timeit
propid = 1120
word, ctx_p_left, ctx_p0, ctx_p_right, marker, pos, chunk_type, chunk_start, chunk_finish = get_inputs(prop_dict1, propid)
targets = get_outputs(prop_dict1, propid, n_targets)
# worst proposition 1120 size 92!

938 ns ± 18.7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


 ## MODEL

In [12]:
propid = 1
word, ctx_p_left, ctx_p0, ctx_p_right, marker, pos, chunk_type, chunk_start, chunk_finish = get_inputs(prop_dict1,  propid)
targets = get_outputs(prop_dict1, propid, n_targets)
# proplen = proplen_dict[propid]
# y = y.reshape((proplen,1))
print(targets)
_start  = np.repeat(chunk_start, n_targets)
_finish = np.repeat(chunk_finish, n_targets)
print(_start)
print(_finish)
print(list(zip(_start[targets].flatten(), _finish[targets].flatten())))


[    0  1261 20160  4571  6518]
[ 0  0  0 ..., 32 32 32]
[ 1  1  1 ..., 33 33 33]
[(0, 1), (1, 4), (32, 33), (4, 5), (5, 32)]


In [13]:
import struct_perc.colored_weighted_interval_scheduling as cwis
import struct_perc.weighted_interval_scheduling as wis
import struct_perc.utils as spu

 ## Tensorflow Graph

In [15]:
# vocab_size = len(lex2idx['FORM']) + 1
# embed_size = 50

# n_pos = len(lex2idx['GPOS'])
# # n_type = len(lex2idx['T'])
# n_classes  = len(lex2idx['T'])

# tf.reset_default_graph()

# # word index and gpos 
# tf_words = tf.placeholder(tf.int32, shape=(None,1))
# tf_pos = tf.placeholder(tf.int32, shape=(None,1))
# # t_x_type = tf.placeholder(tf.int32, shape=(None,1))

# # índices de inicio de intervalo
# tf_s = tf.placeholder(tf.int32, shape=(None,1))
# # índices de fim de intervalo
# tf_f = tf.placeholder(tf.int32, shape=(None,1))

# # replicamos os indicies de inicio e fim para cada classe de chunk possivel
# tf_sc = tf.reshape(
#       tf.tile(tf_s,  [1, n_classes]), [-1,1])
# tf_fc = tf.reshape(
#       tf.tile(tf_f,  [1, n_classes]), [-1,1])

# # n_features = (embed_size + n_pos + n_type)
# n_features = (embed_size + n_pos)
# # hidden_features = 300
# W_shape = (n_features, n_classes)
# EMBS = tf.constant(embeddings)
# # tf_token = tf.Variable(initial_value=None, expected_shape=(embed_size,), dtype=tf.float32, trainable=False)

# # geramos os paramteros do modelo
# with tf.variable_scope("model"):
#     W = tf.Variable(
#         tf.random_normal(W_shape, 0, 1/np.sqrt(n_features * n_classes), name='W')
#     )
#     b = tf.Variable(
#         tf.random_normal((n_classes,), 0, 1/np.sqrt(n_classes), name='b')
#     )
    

# # tf_token = tf.nn.embedding_lookup(tf_embeddings, id) 
# # Recuperamos os embeddings de cada palavra
# tf_word_features = tf.gather_nd(EMBS, tf_words)

# tf_pos_flat = tf.reshape(tf_pos, [-1])
# tf_pos_features = tf.one_hot(tf_pos_flat, depth=n_pos)

# # t_x_type_flat = tf.reshape(t_x_type,[-1])
# # t_type_features = tf.one_hot(t_x_type_flat, depth=n_type)

# # X = tf.concat((t_word_features,t_pos_features,t_type_features),axis=1)
# tf_tok_features = tf.concat((tf_word_features,tf_pos_features),axis=1)

# # a partir das features do intervalo computamos o score
# tf_scores = tf.matmul(tf_tok_features, W) + b

# tf_pred = tf.argmax(tf_scores, axis=1)


In [14]:
# Parameters
vocab_size = len(tok2idx)
embed_size = 50

n_pos = len(lex2idx['GPOS'])
n_classes = len(lex2idx['T'])
n_features = embed_size * 4 + 1 + n_pos

n_hidden = 100
# W_shape = (n_features, n_hidden)
W_shape = (n_hidden, n_hidden)
b_shape = (1, n_hidden)

W_interval_shape = (2 * n_hidden, n_classes)
# W_shape = (hidden_features, n_classes)
b_interval_shape = (1, n_classes)

# word index 
X_words = tf.placeholder(tf.int64, shape=(None,1), name='word')

# predicate context index (left, predicate, right)
X_ctx_p_left = tf.placeholder(tf.int64, shape=(None,1), name='ctx_p_left')
X_ctx_p = tf.placeholder(tf.int64, shape=(None,1), name='ctx_p0')
X_ctx_p_right = tf.placeholder(tf.int64, shape=(None,1), name='ctx_p_right')

# POS tagging feature
X_pos = tf.placeholder(tf.int64, shape=(None,1), name='gpos')
X_marker = tf.cast( tf.placeholder(tf.int64, shape=(None,1), name='marker'), tf.float32 )
EMBS = tf.Variable(embeddings, trainable=False)

# Embedded representation
with tf.variable_scope("features"):
    EMBS_words = tf.gather_nd(EMBS, X_words, name='word_features')

    EMBS_ctx_pleft = tf.gather_nd(EMBS, X_ctx_p_left, name='EMBS_ctx_pleft')
    EMBS_ctx_p0 = tf.gather_nd(EMBS, X_ctx_p, name='EMBS_ctx_p0')
    EMBS_ctx_pright = tf.gather_nd(EMBS, X_ctx_p_right, name='EMBS_ctx_pright')

    X_pos_flat = tf.reshape(X_pos, [-1], name='gpos_flat')
    X_pos_onehot = tf.one_hot(X_pos_flat, depth=n_pos, name='gpos_onehot')

    X = tf.concat((EMBS_words, EMBS_ctx_pleft, EMBS_ctx_p0,
                   EMBS_ctx_pright, X_pos_onehot, X_marker),
                  axis=1, name='X')
    X_batch = tf.expand_dims(X, 0)

with tf.variable_scope('gru', reuse=tf.AUTO_REUSE):

    fw = tf.nn.rnn_cell.GRUCell(num_units=n_hidden / 2)
    bw = tf.nn.rnn_cell.GRUCell(num_units=n_hidden / 2)
    
    Wo = tf.Variable(tf.truncated_normal(W_shape, stddev=1.0 / np.sqrt(n_features * n_hidden)), name='W' )
    bo = tf.Variable(tf.zeros(b_shape, dtype=tf.float32), name='b')

#     h1, last_states =tf.nn.bidirectional_dynamic_rnn(
#         cell_fw=lstmcellf,
#         cell_bw=lstmcellb,
#         dtype=tf.float32,
#         inputs=t_tok_features_shp
#     )

    

    hidden_outputs, states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=fw,
        cell_bw=bw,
        inputs=X_batch,
        dtype=tf.float32
    )
    hidden_fw, hidden_bw = hidden_outputs
    Ho = tf.concat((tf.squeeze(hidden_fw, axis=0) ,tf.squeeze(hidden_fw, axis=0)),axis=1)
#     Hs = tf.scan(
#                 lambda a, x: tf.matmul(x, Wo),
#                 Ho, initializer=tf.matmul(Ho[0], Wo)) + bo
#     Z = tf.nn.tanh( tf.matmul( tf.squeeze(outputs, axis=0), W) + b, name='hidden_layer' )
#     Z = tf.nn.tanh( Hs, name='hidden_layer' )
    Z = tf.nn.tanh( tf.matmul( Ho, Wo ) + bo, name='hidden_layer' )

# Those are the interval parameters
with tf.variable_scope("interval"):
    W_interval = tf.Variable(tf.random_normal(W_interval_shape, mean=0.0, stddev=1.0 / np.sqrt(1.0 * n_hidden * n_classes)), name='W_interval')
    b_interval = tf.Variable(tf.zeros(b_interval_shape, dtype=tf.float32), name='b_interval')
    
    # begin of interval
    i_s = tf.placeholder(tf.int32, shape=(None,1))
    # end of interval
    i_f = tf.placeholder(tf.int32, shape=(None,1))
# features from intervals
i_f_score = tf.gather_nd(Z, i_f-1)
i_s_score = tf.gather_nd(Z, i_s)

i_score = tf.concat((i_f_score, i_s_score), axis=1)
scores_interval = tf.matmul(i_score, W_interval) + b_interval

scores_flat = tf.reshape(scores_interval, (-1,1)) # column matrix
scores_mean = tf.reduce_mean(scores_flat) # scalar
scores_diff = scores_flat - scores_mean   # centralize data --> mean zero

scores_std = tf.sqrt(tf.reduce_sum(scores_diff * scores_diff))
scores_op = scores_diff /( scores_std + 1e-8 )


# a partir das features do intervalo computamos o score
# score_op = tf.matmul(X, W_interval, name='xW') + b_interval

# predict_op = tf.argmax(scores_op, axis=1, name='Prediction')

 ## Tensorflow test session

In [15]:
propid  = 1
words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_type, chunk_start, chunk_finish = get_inputs(prop_dict1, propid)
targets = get_outputs(prop_dict1, propid, n_targets)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out_scores_interval = sess.run(scores_interval, feed_dict={
        X_words:words,
        X_ctx_p_left: ctx_p_left,
        X_ctx_p: ctx_p0,
        X_ctx_p_right: ctx_p_right,        
        X_marker: marker,                
        X_pos:gpos,
        i_s: chunk_start,
        i_f: chunk_finish
    })
    # flat gives the score for each candidate
print(out_scores_interval.shape)


(561, 36)


In [17]:
def pred(sess, x_words, x_ctx_p_left, x_ctx_p0, x_ctx_p_right, x_marker, x_pos, x_chunk_start, x_chunk_finish):
    scores = sess.run(scores_op,feed_dict={
        X_words: x_words,
        X_ctx_p_left: x_ctx_p_left,
        X_ctx_p: x_ctx_p0,
        X_ctx_p_right: x_ctx_p_right,        
        X_marker: x_marker,                
        X_pos:x_pos,
        i_s: x_chunk_start,
        i_f: x_chunk_finish
    })
    starts = np.repeat(x_chunk_start, n_classes).reshape((-1,1))
    ends = np.repeat(x_chunk_finish, n_classes).reshape((-1,1))
    
    ck_len = len(x_chunk_start)
    colors = np.array(list(np.arange(n_classes))*ck_len)
    
    r_int = cwis.compute_schedule(starts.flatten(), ends.flatten(), scores, colors) # index of the cadidates of predicted solution
    r_ext = list(zip(starts[r_int].flatten(),ends[r_int].flatten(), colors[r_int].flatten())) # from integer to triple
    return r_int, r_ext

## Testing prediction

In [18]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
p, pe = pred(sess, words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_start, chunk_finish)
print(p)
print(pe)

[20180 20108 20000 19856 19676 19460 19208 18920 18596 18236 17840 17408
 16940 16436 15896 15320 14708 14060 13376 12656 11900 11108 10280  9416
  8516  7580  6608  5600  4556  3476  2360  1208    20]
[(32, 33, 20), (31, 32, 20), (30, 31, 20), (29, 30, 20), (28, 29, 20), (27, 28, 20), (26, 27, 20), (25, 26, 20), (24, 25, 20), (23, 24, 20), (22, 23, 20), (21, 22, 20), (20, 21, 20), (19, 20, 20), (18, 19, 20), (17, 18, 20), (16, 17, 20), (15, 16, 20), (14, 15, 20), (13, 14, 20), (12, 13, 20), (11, 12, 20), (10, 11, 20), (9, 10, 20), (8, 9, 20), (7, 8, 20), (6, 7, 20), (5, 6, 20), (4, 5, 20), (3, 4, 20), (2, 3, 20), (1, 2, 20), (0, 1, 20)]


## Training code

In [None]:
# t_y_int = tf.placeholder(tf.int32, shape=(None,))
# t_y_rshp = tf.reshape(t_y_int, (-1,1))
# t_margin_int = tf.placeholder(tf.float32, shape=())

# t_margin_values = tf.ones(tf.shape(t_y_rshp))*t_margin_int
# t_margin_scores = tf.scatter_nd(t_y_rshp, -t_margin_values, tf.shape(t_scores_flat))
# t_scores_flat_w_margin = t_scores_flat + t_margin_scores

In [19]:
# indices of the correct intervals
T = tf.placeholder(tf.int32, shape=(None,), name='T')
L = tf.placeholder(tf.int32, shape=(), name='L')
I = tf.to_int32(tf.range(L), name='indices')

T_flat = tf.reshape(T, (-1,1), name='T_flat')
Margin_interval = tf.placeholder(tf.float32)
Margin_values = tf.ones(tf.shape(T_flat)) * Margin_interval
Margin_scores_flat = tf.scatter_nd(T_flat, -Margin_values, tf.shape(scores_op)) # oppposite of gather_nd
Scores_flat_with_margin = scores_op + Margin_scores_flat


In [None]:
# # índices dos intervalos computados pelo Weighted Interval Scheduling
# t_p_int = tf.placeholder(tf.int32, shape=(None,))
# # índices dos intervalos corretos
# # t_y_int = tf.placeholder(tf.int32, shape=(None,))

# # formatamos os indices dos intervalos preditos e corretos
# t_p_rshp = tf.reshape(t_p_int,(-1,1))
# # t_y_rshp = tf.reshape(t_y_int,(-1,1))

# # score dos intervalos preditos
# t_scores_int_p = tf.gather_nd(t_scores_flat_w_margin, t_p_rshp)
# # score dos intervalos corretos
# t_scores_int_y = tf.gather_nd(t_scores_flat_w_margin, t_y_rshp)

# ## função de custo do perceptron estruturado
# # WIS
# t_cost_int = tf.reduce_sum(t_scores_int_p) - tf.reduce_sum(t_scores_int_y)
# t_cost = t_cost_int

# # # gradiente descendente no custo do perceptron estruturado
# optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
# # optimizer = tf.train.GradientDescentOptimizer(0.003)
# train = optimizer.minimize(t_cost)

In [42]:
# indices containing the predicted labels from Weighted Interval Scheduling
Y = tf.placeholder(tf.int32, shape=(None,), name='predictions')
Y_flat = tf.reshape(Y, (-1, 1))

# score da estrutura predita
ScoreY = tf.gather_nd(Scores_flat_with_margin, Y_flat, name='predicted_score')
# score da estrutura correta
ScoreT = tf.gather_nd(Scores_flat_with_margin, T_flat, name='target_score')

# função de custo do perceptron estruturado
cost_op = tf.reduce_sum(ScoreY) - tf.reduce_sum(ScoreT)

# gradiente descendente no custo do perceptron estruturado
optimizer = tf.train.AdamOptimizer(0.001)
train_op = optimizer.minimize(cost_op)

In [21]:
predictions, _ = pred(sess, words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_start, chunk_finish)
print(targets.shape)
print(targets)
print(predictions.shape)
print(predictions)

(5,)
[    0  1261 20160  4571  6518]
(33,)
[20180 20108 20000 19856 19676 19460 19208 18920 18596 18236 17840 17408
 16940 16436 15896 15320 14708 14060 13376 12656 11900 11108 10280  9416
  8516  7580  6608  5600  4556  3476  2360  1208    20]


 ## Testing cost operation

In [22]:
predictions, chunk_ext = pred(sess, words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_start, chunk_finish)

interval_start = np.repeat(chunk_start, n_classes).reshape((-1,1))
interval_finish = np.repeat(chunk_finish, n_classes).reshape((-1,1))

scores_, cost_ = sess.run([scores_op, cost_op], feed_dict={
    X_words: words,
    X_ctx_p_left: ctx_p_left,
    X_ctx_p: ctx_p0,
    X_ctx_p_right: ctx_p_right,        
    X_marker: marker,        
    X_pos: gpos,
    i_s: interval_start,
    i_f: interval_finish,
    T: targets.flatten(),
    Y: predictions.flatten(),        
    L: proplen_dict[propid],
    Margin_interval:0.1})


print(scores_.shape)
print(cost_)
print(np.max(predictions))
print(np.sum(scores_[predictions]) - np.sum(scores_[targets]))

(727056, 1)
0.562447
20180
0.0624465


In [23]:
colors = np.array(list(np.arange(n_classes))*len(chunk_start))
r_int = cwis.compute_schedule(interval_start.flatten(), interval_finish.flatten(), scores_, colors) 
print(np.mean(scores_[r_int]), ' ', np.mean(scores_[targets]))

0.0018212   -0.000469358


In [24]:
np.sum(scores_[predictions].flatten())

0.060099695

In [25]:
np.sum(scores_[targets].flatten())

-0.0023467897

In [26]:
print(sorted(predictions))
print(sorted(r_int))

[20, 1208, 2360, 3476, 4556, 5600, 6608, 7580, 8516, 9416, 10280, 11108, 11900, 12656, 13376, 14060, 14708, 15320, 15896, 16436, 16940, 17408, 17840, 18236, 18596, 18920, 19208, 19460, 19676, 19856, 20000, 20108, 20180]
[20, 1208, 2360, 3476, 4556, 5600, 6608, 7580, 8516, 9416, 10280, 11108, 11900, 12656, 13376, 14060, 14708, 15320, 15896, 16436, 16940, 17408, 17840, 18236, 18596, 18920, 19208, 19460, 19676, 19856, 20000, 20108, 20180]


In [27]:
propid  = 1
words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_type, chunk_start, chunk_finish = get_inputs(prop_dict1, propid)
targets = get_outputs(prop_dict1, propid, n_targets)

starts = np.repeat(chunk_start,n_classes).reshape((-1,1))
ends = np.repeat(chunk_finish, n_classes).reshape((-1,1))

scores_ = sess.run(scores_op, feed_dict={
    X_words: words,
    X_ctx_p_left: ctx_p_left,
    X_ctx_p: ctx_p0,
    X_ctx_p_right: ctx_p_right,        
    X_marker: marker,        
    X_pos: gpos,
    i_s: chunk_start,
    i_f: chunk_finish,
    T: targets.flatten(),
    Y: predictions.flatten(),        
    Margin_interval:0.1
})

colors = np.repeat(np.arange(n_classes), len(chunk_start))
predictions = cwis.compute_schedule(starts.flatten(), ends.flatten(), scores_, colors) 

cost_ = sess.run(cost_op, feed_dict={
    X_words: words,
    X_ctx_p_left: ctx_p_left,
    X_ctx_p: ctx_p0,
    X_ctx_p_right: ctx_p_right,        
    X_marker: marker,        
    X_pos: gpos,
    i_s: chunk_start,
    i_f: chunk_finish,
    T:targets.flatten(),
    Y:predictions.flatten(),
    Margin_interval:0.1
})

print(cost_)
print(np.sum(scores_[predictions]) - np.sum(scores_[targets]))

1.08781
0.587809


In [28]:
print(scores_.shape)

(20196, 1)


## Training Single Proposition

In [29]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

propid = 1
words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_type, chunk_start, chunk_finish = get_inputs(prop_dict1, propid)
targets = get_outputs(prop_dict1, propid, n_targets)

starts = np.repeat(chunk_start,n_classes).reshape((-1,1))
ends = np.repeat(chunk_finish, n_classes).reshape((-1,1))

for i in range(10000):
    
    scores_ = sess.run(scores_op, feed_dict={
        X_words: words,
        X_ctx_p_left: ctx_p_left,
        X_ctx_p: ctx_p0,
        X_ctx_p_right: ctx_p_right,        
        X_marker: marker,        
        X_pos: gpos,
        i_s: chunk_start,
        i_f: chunk_finish,
        T: targets.flatten(),
        Y: predictions.flatten(),        
        Margin_interval:0.1
    })
    
    predictions, chunk_ext = pred(sess, words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_start, chunk_finish)    
    cost_ = sess.run(cost_op, feed_dict={
        X_words: words,
        X_ctx_p_left: ctx_p_left,
        X_ctx_p: ctx_p0,
        X_ctx_p_right: ctx_p_right,        
        X_marker: marker,        
        X_pos: gpos,
        i_s: chunk_start,
        i_f: chunk_finish,
        T: targets.flatten(),
        Y: predictions.flatten(),        
        Margin_interval:0.1
    })
    
    predictions_score = np.sum(scores_[predictions])
    targets_score = np.sum(scores_[targets])
        
    sess.run(train_op, feed_dict={
        X_words: words,
        X_ctx_p_left: ctx_p_left,
        X_ctx_p: ctx_p0,
        X_ctx_p_right: ctx_p_right,        
        X_marker: marker,        
        X_pos: gpos,
        i_s: chunk_start,
        i_f: chunk_finish,
        T: targets.flatten(),
        Y: predictions.flatten(),        
        Margin_interval:0.1})
    
    colors = np.repeat(np.arange(n_classes), len(chunk_start))
    predictions = cwis.compute_schedule(starts.flatten(), ends.flatten(), scores_, colors) 

    targets_set = set(targets.flatten())
    predictions_set = set(predictions.flatten())
    yp_common = targets_set.intersection(predictions_set)
    yp_total = targets_set.union(predictions_set)
    
    acc_int = len(yp_common)/len(yp_total)
    
    if i % 25 == 0:
        print(acc_int, ' ', cost_)
    
    if cost_ < 0:
        break

    if acc_int == 1:
        print(acc_int, ' ', cost_, ' learnt at epoch ', i)
        break


0.0   0.961165
0.02702702702702703   0.598262
0.02702702702702703   0.520023
0.09375   0.258072
0.13333333333333333   0.326198
0.1875   0.210761
0.08571428571428572   0.286066
0.08571428571428572   0.280022
0.08571428571428572   0.265583
0.08571428571428572   0.243902
0.23076923076923078   0.204181
1.0   0.0  learnt at epoch  265


In [30]:
print(list(zip(starts[predictions].flatten(), ends[predictions].flatten(), colors[predictions])))

[(32, 33, 35), (5, 32, 11), (4, 5, 8), (1, 4, 2), (0, 1, 0)]


 ## ConLL evaluation scripts

In [31]:
def tag_to_conll(sess, prop_dict, propid, idx2lex):
    gold_list = []
    eval_list = []
        
    words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_type, chunk_start, chunk_finish = get_inputs(prop_dict, propid)
    targets = get_outputs(prop_dict, propid, n_targets)

    predictions, chunk_ext = pred(sess, words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_start, chunk_finish)    


    n_words = len(words)
    pred_array = prop_dict[propid]['PRED']
    pred_array = pred_array.flatten()
    
    arg_array = prop_dict[propid]['ARG']
    arg_array = arg_array.flatten()
    
    pred_list = [idx2lex['PRED'][i] for i in pred_array.tolist()]
    gold_list_ = [idx2lex['ARG'][i] for i in arg_array.tolist()]
    
    gold_list += list(zip(pred_list, gold_list_))

    arg_list_ = []
    for triple_ in sorted(chunk_ext, key= lambda x: x[0]):
        lb, ub, arg = triple_            
        chunk_list_ = [idx2lex['T'][arg] if i == lb else '*' for i in range(lb, ub)] 
        if idx2lex['T'][arg] != '*':
            chunk_list_[0] = '({:}*'.format(chunk_list_[0])
            chunk_list_[-1] = '{:})'.format(chunk_list_[-1])

        arg_list_ += chunk_list_
        
    eval_list += list(zip(pred_list, arg_list_))
    eval_list.append(None)
    gold_list.append(None)
    return gold_list, eval_list

testamos o modelo na frase usada para treino

In [32]:
propid = 1
gold_list, eval_list = tag_to_conll(sess, prop_dict1, propid, idx2lex)
for i in range(proplen_dict[propid]):
    if gold_list[i] and eval_list[i]:
        print('{:}\t{:}\t{:}\t{:}'.format(*gold_list[i], *eval_list[i]))
    else:
        print('\n')
    

print(lex2idx['T'])

-	*	-	*
-	(A0*	-	(A0*
-	*	-	*
-	*)	-	*)
revelar	(V*)	revelar	*
-	(A1*	-	(A1*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*	-	*
-	*)	-	*)
-	*	-	*
{'*': 0, 'A0': 1, 'A1': 2, 'A2': 3, 'A3': 4, 'A4': 5, 'A5': 6, 'AM-ADV': 7, 'AM-CAU': 8, 'AM-DIR': 9, 'AM-DIS': 10, 'AM-EXT': 11, 'AM-LOC': 12, 'AM-MED': 13, 'AM-MNR': 14, 'AM-NEG': 15, 'AM-PNC': 16, 'AM-PRD': 17, 'AM-REC': 18, 'AM-TMP': 19, 'C-A0': 20, 'C-A1': 21, 'C-A2': 22, 'C-A3': 23, 'C-AM-ADV': 24, 'C-AM-CAU': 25, 'C-AM-DIS': 26, 'C-AM-EXT': 27, 'C-AM-LOC': 28, 'C-AM-MNR': 29, 'C-AM-NEG': 30, 'C-AM-PNC': 31, 'C-AM-PRD': 32, 'C-AM-TMP': 33, 'C-V': 34, 'V': 35}


In [None]:
def evaluate(gold_list, eval_list, verbose=True):
    gold_path = 'train_gold.conll'    
    eval_path = 'train_eval.conll'

    with open(gold_path, mode='w') as f:        
        for tuple_ in gold_list:
            if tuple_ is None:
                f.write('\n')
            else:
                f.write('{:}\t{:}\n'.format(*tuple_))

    with open(eval_path, mode='w') as f:        
        for tuple_ in eval_list:
            if tuple_ is None:
                f.write('\n')
            else:
                f.write('{:}\t{:}\n'.format(*tuple_))

    pipe = Popen(['perl',PEARL_SRLEVAL_PATH, gold_path, eval_path], stdout=PIPE, stderr=PIPE)

    txt, err = pipe.communicate()
    txt = txt.decode('UTF-8')
    err = err.decode('UTF-8')
    
    if verbose:
        print(txt)

    float_list = re.findall(r'(\d+.\d+)', txt)
    f1 = float(re.findall(r'(\d+.\d+)', txt)[3]) if len(float_list) > 3 else -1 
    return f1



In [34]:
propid = 1
gold_list, eval_list = tag_to_conll(sess, prop_dict1, propid, idx2lex)
f1_score = evaluate(gold_list, eval_list, verbose=True)
print('f1_score: ', f1_score)

Number of Sentences    :           1
Number of Propositions :           1
Percentage of perfect props : 100.00

              corr.  excess  missed    prec.    rec.      F1
------------------------------------------------------------
   Overall        2       0       0   100.00  100.00  100.00
----------
        A0        1       0       0   100.00  100.00  100.00
        A1        1       0       0   100.00  100.00  100.00
------------------------------------------------------------
         V        0       0       1     0.00    0.00    0.00
------------------------------------------------------------

f1_score:  100.0


In [35]:
import time
start = time.time()
print(1111)
gold_list, eval_list = tag_to_conll(sess, prop_dict1, propid, idx2lex)
f1_score = evaluate(gold_list, eval_list, verbose=True)
end = time.time()
print('tempo para avaliar: ', (end-start), 's')
print('f1_score: ', f1_score)

1111
Number of Sentences    :           1
Number of Propositions :           1
Percentage of perfect props : 100.00

              corr.  excess  missed    prec.    rec.      F1
------------------------------------------------------------
   Overall        2       0       0   100.00  100.00  100.00
----------
        A0        1       0       0   100.00  100.00  100.00
        A1        1       0       0   100.00  100.00  100.00
------------------------------------------------------------
         V        0       0       1     0.00    0.00    0.00
------------------------------------------------------------

tempo para avaliar:  0.660865068435669 s
f1_score:  100.0


## Training

In [43]:
sess= tf.Session()
sess.run(tf.global_variables_initializer())

epochs = 100
# indices = np.arange(config.DATASET_TRAIN_SIZE)
indices = np.arange(50) + 1

best = 0
saver = tf.train.Saver()

for j in range(epochs):
    np.random.shuffle(indices)
    total_err = 0
    total_size = 0
    gold_list = []
    eval_list = [] 
    for i, propid in enumerate(indices):     
        try: 
            words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_type, chunk_start, chunk_finish = get_inputs(prop_dict1, propid)
            targets = get_outputs(prop_dict1, propid, n_targets)
        except KeyError:
            print(propid)

        predictions, chunk_ext = pred(sess, words, ctx_p_left, ctx_p0, ctx_p_right, marker, gpos, chunk_start, chunk_finish)

        _, cost = sess.run([train_op, cost_op], feed_dict={
            X_words: words,
            X_ctx_p_left: ctx_p_left,
            X_ctx_p: ctx_p0,
            X_ctx_p_right: ctx_p_right,        
            X_marker: marker,        
            X_pos: gpos,
            i_s: chunk_start,
            i_f: chunk_finish,
            T: targets.flatten(),
            Y: predictions.flatten(),        
            Margin_interval:0.1})

        total_err += np.sum(predictions.flatten()!= targets.flatten())
        total_size += len(predictions.flatten())
        
        gold_list_, eval_list_ = tag_to_conll(sess, prop_dict1, propid, idx2lex)

        gold_list += gold_list_
        eval_list += eval_list_

        if i % 25 == 0:
            print('Iteration:', i,'\tepoch:', j, ' \tacc:', 1 - total_err/total_size, '\tcost:', cost)
        
    f1_score = evaluate(gold_list, eval_list, verbose=False)    
    if f1_score > best:
        best = f1_score
        f1_score = evaluate(gold_list, eval_list, verbose=True)    
        save_path = saver.save(sess, "/tmp/model_spn-pt.ckpt")
        
    print('Epoch:', j, ' \tf1_score:', f1_score)



Iteration: 0 	epoch: 0  	acc: 0.96 	cost: 0.674129
Iteration: 25 	epoch: 0  	acc: 0.948207171315 	cost: 0.235144
Number of Sentences    :          50
Number of Propositions :          50
Percentage of perfect props :   0.00

              corr.  excess  missed    prec.    rec.      F1
------------------------------------------------------------
   Overall        1       4     124    20.00    0.80    1.54
----------
        A0        0       1      28     0.00    0.00    0.00
        A1        1       3      46    25.00    2.13    3.92
        A2        0       0      12     0.00    0.00    0.00
        A3        0       0       2     0.00    0.00    0.00
        A4        0       0       2     0.00    0.00    0.00
    AM-ADV        0       0       4     0.00    0.00    0.00
    AM-CAU        0       0       4     0.00    0.00    0.00
    AM-DIS        0       0       3     0.00    0.00    0.00
    AM-LOC        0       0       7     0.00    0.00    0.00
    AM-MNR        0       0     



Iteration: 25 	epoch: 1  	acc: 0.948514851485 	cost: 0.504724
Number of Sentences    :          50
Number of Propositions :          50
Percentage of perfect props :   0.00

              corr.  excess  missed    prec.    rec.      F1
------------------------------------------------------------
   Overall        4     210     121     1.87    3.20    2.36
----------
        A0        1      40      27     2.44    3.57    2.90
        A1        3     139      44     2.11    6.38    3.17
        A2        0       0      12     0.00    0.00    0.00
        A3        0       0       2     0.00    0.00    0.00
        A4        0       0       2     0.00    0.00    0.00
    AM-ADV        0       0       4     0.00    0.00    0.00
    AM-CAU        0       0       4     0.00    0.00    0.00
    AM-DIS        0      17       3     0.00    0.00    0.00
    AM-LOC        0       0       7     0.00    0.00    0.00
    AM-MNR        0       1       1     0.00    0.00    0.00
    AM-NEG        0   




Iteration: 25 	epoch: 2  	acc: 0.950286806883 	cost: -0.217334
Epoch: 2  	f1_score: 0.6




Iteration: 0 	epoch: 3  	acc: 0.95652173913 	cost: 0.145946
Iteration: 25 	epoch: 3  	acc: 0.952898550725 	cost: 0.156299
Epoch: 3  	f1_score: 0.0




Iteration: 0 	epoch: 4  	acc: 0.933333333333 	cost: 0.194091
Iteration: 25 	epoch: 4  	acc: 0.946391752577 	cost: -0.305584
Epoch: 4  	f1_score: 0.55
Iteration: 0 	epoch: 5  	acc: 0.923076923077 	cost: 0.294442




Iteration: 25 	epoch: 5  	acc: 0.951127819549 	cost: 0.141609
Epoch: 5  	f1_score: 1.17




Iteration: 0 	epoch: 6  	acc: 0.969696969697 	cost: 0.0692396


KeyboardInterrupt: 