# Cross-lingual "Paraphrase" Classifier using Child-Sum TreeLSTMs

Currently only for German-English and Spanish-English language pairs

In [1]:
# boilerplate
import codecs
import functools
import os
import sys
import re
import math
import tempfile
import zipfile
import scipy.stats
import pandas as pd
from nltk.tree import Tree
from nltk.parse import DependencyGraph

from nltk.tokenize import sexpr
import numpy as np
from six.moves import urllib
import tensorflow as tf
import tensorflow_fold as td
# Alternative session set-up (soft placement + device-placement logging),
# left here commented out:
# sess = tf.Session(config=tf.ConfigProto(
#     allow_soft_placement=True, log_device_placement=True))
# Single TensorFlow session shared by the later cells: variable
# initialisation, training, evaluation and checkpoint save/restore all run
# through `sess`.
sess = tf.Session()

## 1. Load data

File path for the [bilingual word embeddings](http://www.aclweb.org/anthology/W15-1521).

In [2]:
# Root folder containing the bilingual embeddings ("bipara/") and the TreeLSTM
# data ("treelstm/"). This is a machine-specific absolute path: edit it to
# match the local checkout before running the notebook.
data_dir = '/home/cliffrwong/Documents/code/'
bipara_dir = os.path.join(data_dir, 'bipara/')
treelstm_dir = os.path.join(data_dir, 'treelstm/')

# Choose between German (de) or Spanish (es) language pairs
lang = "de"

if lang == "de":
    # Dependency parses and HTER labels for the German-English pair.
    sem_dir = os.path.join(treelstm_dir, "data/de_dep/")
    bipara_dir += "de/"
    full_bipara_en_path = os.path.join(bipara_dir, 'unsup.512.en')
    full_bipara_de_path = os.path.join(bipara_dir, 'unsup.512.de')
elif lang == "es":
    sem_dir = treelstm_dir + "data/es/"
    bipara_dir += "es/"
    # bipara_dir already ends in "es/" here, so the file names are joined
    # directly (this matches how filter_bipara builds the same paths).
    full_bipara_en_path = os.path.join(bipara_dir, 'out.en')
    full_bipara_de_path = os.path.join(bipara_dir, 'out.es')
else:
    # A plain string cannot be raised; use a real exception type.
    raise ValueError("language not available")
    

Filter out words in the bilingual word2vec matrix that are not present in the training, dev, or test data

In [3]:
# Directory holding the train/dev/test sentence files of the selected
# language pair; filter_bipara() builds its vocabulary from these files.
if lang == "de":
    parseLangDir = os.path.join(treelstm_dir, "data/de/")
elif lang == "es":
    parseLangDir = os.path.join(treelstm_dir, "data/es")
else:
    # Fail loudly: only printing would leave parseLangDir undefined and
    # surface later as a NameError inside filter_bipara().
    raise ValueError('language not available')
        
def filter_bipara(curLang):
    """Write a vocabulary-filtered copy of one bilingual embedding file.

    The vocabulary is the set of lower-cased, whitespace-separated tokens
    (backslashes removed) found in the train/dev/test sentence files under
    ``parseLangDir``: the ``.src`` files when ``curLang`` is ``"en"``, the
    ``.mt`` files otherwise. Every non-empty line of the full embedding file
    whose first space-separated field is in that vocabulary is copied to
    ``filtered_bipara_<curLang>.txt`` inside ``bipara_dir``.

    Args:
        curLang: Language code used as the embedding-file suffix
            ("en", "de" or "es" in this notebook).

    Raises:
        ValueError: If the notebook-level ``lang`` is neither "de" nor "es",
            so the embedding file name is unknown.

    Side effects:
        Writes the filtered file and prints how many embedding lines were
        read and written.
    """
    # Resolve the embedding file first so an unsupported language fails
    # before any file is read (previously this ended in an unbound local).
    embedding_prefixes = {"de": 'unsup.512.', "es": 'out.'}
    if lang not in embedding_prefixes:
        raise ValueError("language not available: %r" % (lang,))
    full_bipara_path = os.path.join(bipara_dir, embedding_prefixes[lang] + curLang)
    filtered_bipara_path = os.path.join(bipara_dir, 'filtered_bipara_{0}.txt'.format(curLang))

    # English is the source side of every pair; the other language is the MT side.
    set2 = ".src" if curLang == "en" else ".mt"

    vocab = set()
    for set1 in ['train', 'dev', 'test']:
        parseFile = os.path.join(parseLangDir, set1, set1) + set2
        # Decode explicitly as UTF-8, like the embedding files below, instead
        # of depending on the locale's default encoding.
        with codecs.open(parseFile, 'r', encoding='utf-8') as fin:
            for line in fin:
                vocab.update(x.lower() for x in line.strip().replace('\\', '').split())

    nread = 0
    nwrote = 0
    with codecs.open(full_bipara_path, encoding='utf-8') as f:
        with codecs.open(filtered_bipara_path, 'w', encoding='utf-8') as out:
            for line in f:
                nread += 1
                line = line.strip()
                if not line:
                    continue
                # NOTE(review): the vocabulary is lower-cased but the embedding
                # key is compared as-is, so entries containing upper-case
                # letters never match -- confirm this is intended.
                if line.split(u' ', 1)[0] in vocab:
                    out.write(line + '\n')
                    nwrote += 1
    print('read %s lines, wrote %s' % (nread, nwrote))

Filter out words that don't appear in the dataset, since the full dataset is a bit large (5GB). This is purely a performance optimization and has no effect on the final results.

In [4]:
# Filter the English embeddings first, then those of the other language in
# the selected pair.
filter_bipara('en')
if lang in ("de", "es"):
    filter_bipara(lang)
else:
    # raise("error") would itself fail with a TypeError because a string is
    # not an exception; raise a proper exception instead.
    raise ValueError("error")

read 40055 lines, wrote 4727
read 95196 lines, wrote 5152


In [5]:
# Presumably marks that dependency parses are the tree input in use.
# NOTE(review): this flag is not read by any other code visible in this
# notebook -- confirm whether it is still needed.
dependParse = True

Load the filtered word embeddings into a matrix and build a dict from words to indices into the matrix. Add a random embedding vector for out-of-vocabulary words.

In [6]:
def load_embeddings(lang):
  """Read the filtered embedding file of `lang` into a matrix and an index.

  Every line of `filtered_bipara_<lang>.txt` (inside `bipara_dir`) is split
  at its first space into a word and the text of its vector components.

  Returns:
    A pair `(matrix, word_idx)`: `matrix` stacks one float32 row per line
    plus one extra, uniformly random row in [-0.05, 0.05) at the end for
    unknown words; `word_idx` maps each lower-cased word to its row number.
  """
  print('loading word embeddings from lang %s' % lang)
  path = os.path.join(bipara_dir, 'filtered_bipara_{0}.txt'.format(lang))
  rows = []
  word_idx = {}
  with codecs.open(path, encoding='utf-8') as handle:
    for row_number, entry in enumerate(handle):
      token, components = entry.split(u' ', 1)
      word_idx[token.lower()] = row_number
      rows.append(np.array(components.split(), dtype=np.float32))
  # Extra row at the end: random embedding shared by all unknown words.
  unknown_row = np.random.uniform(-0.05, 0.05, rows[0].shape).astype(np.float32)
  rows.append(unknown_row)
  return np.stack(rows), word_idx

In [7]:
# English is always the source side; the target side follows the selected pair.
src_weight_matrix, src_word_idx = load_embeddings('en')
if lang in ("de", "es"):
    target_weight_matrix, target_word_idx = load_embeddings(lang)
else:
    # A bare string cannot be raised; use a real exception type.
    raise ValueError('error')

loading word embeddings from lang en
loading word embeddings from lang de


### Load the Dependency Trees (from Google's SyntaxNet)

In [8]:
def readDepTree(set1):
    """Load the source and MT dependency trees of one data split.

    Reads ``<sem_dir>/<set1>/<set1>_dep.src`` and ``<set1>_dep.mt``. Lines are
    accumulated until a whitespace-only line; the accumulated text is then
    parsed with ``nltk.parse.DependencyGraph`` and converted with ``.tree()``.

    Args:
        set1: Split name; "train", "dev" and "test" are used in this notebook.

    Returns:
        A two-element list ``[src_trees, mt_trees]``, each a list with one
        tree per sentence, in file order.
    """
    split_dir = os.path.join(sem_dir, set1) + "/"
    results = []
    for fext in ['src', 'mt']:
        setResult = []
        fileIn = split_dir + set1 + "_dep." + fext
        with open(fileIn, 'r') as fin:
            curSent = ""
            for line in fin:
                if not line.isspace():
                    curSent += line
                elif curSent:
                    # Blank line closes the current sentence. Repeated blank
                    # lines are skipped rather than parsed as empty graphs.
                    setResult.append(DependencyGraph(curSent).tree())
                    curSent = ""
            if curSent:
                # The file did not end with a blank line: keep the final
                # sentence instead of silently dropping it.
                setResult.append(DependencyGraph(curSent).tree())
        results.append(setResult)
    return results

In [9]:
# Parse all three splits; each result is a [src_trees, mt_trees] pair.
train_tree, dev_tree, test_tree = [
    readDepTree(split_name) for split_name in ("train", "dev", "test")]

Generate the training examples from the dependency trees and convert the label HTER values into an array representation. See TreeLSTM paper for more details

In [11]:
# Number of output classes of the classifier (bins of the HTER scale).
NUM_CLASSES = 5


def convDist(y):
    """Convert a score on the 0-100 scale into a 5-bin target distribution.

    The score is divided by 25, so the bins correspond to 0, 25, 50, 75 and
    100. A score lying between two bins splits a total mass of 1 linearly
    between them; a scaled score of 4 or more puts all mass on the last bin.

    Args:
        y: The score; anything accepted by ``float()``.

    Returns:
        A list of NUM_CLASSES numbers.
    """
    scaled = float(y) / 25
    if scaled >= 4:
        return [0, 0, 0, 0, 1]
    floor_val = math.floor(scaled)
    lower = int(floor_val)
    dist = [0 for _ in range(NUM_CLASSES)]
    # Mass of the upper neighbour is the fractional part; the lower
    # neighbour receives the remainder.
    dist[lower + 1] = scaled - floor_val
    dist[lower] = floor_val - scaled + 1
    return dist

def load_examples(tree, dataSet):
    """Pair every (source, MT) tree with its HTER label.

    Reads one float per line from ``<sem_dir><dataSet>/<dataSet>.hter`` and
    walks it in parallel with ``tree[0]`` and ``tree[1]``. For "de" the value
    is used as read; for "es" it is multiplied by 100. For any other ``lang``
    no example is produced.

    Args:
        tree: Two-element sequence ``[src_trees, mt_trees]``.
        dataSet: Split name used to locate the label file.

    Returns:
        A list of tuples ``(label capped at 100, convDist(label), src_tree,
        mt_tree)``.
    """
    label_path = sem_dir + dataSet + "/" + dataSet + ".hter"
    examples = []
    with codecs.open(label_path, encoding='utf-8') as label_file:
        for src_tree, mt_tree, raw in zip(tree[0], tree[1], label_file):
            score = float(raw)
            if lang == "de":
                scaled = score
            elif lang == "es":
                scaled = score * 100
            else:
                continue
            # The scalar label is capped at 100; convDist gets the uncapped value.
            examples.append((min(scaled, 100), convDist(scaled), src_tree, mt_tree))
    return examples

In [12]:
# Attach the HTER labels to the parsed trees of each split.
train_trees, dev_trees, test_trees = [
    load_examples(split_trees, split_name)
    for split_trees, split_name in (
        (train_tree, "train"), (dev_tree, "dev"), (test_tree, "test"))]

In [13]:
# Sanity check: element 2 of an example tuple is the source-side tree
# (see load_examples); show it for the first training example.
print(train_trees[0][2])


(press
  (reverse To (direction the (scrolling of)))
  ,
  (key the minus sign ( - ))
  .)


## 2. Build the Model (Tensorflow Fold)

This part of the model deals with building the TreeLSTM to represent the input text as a vector.

In [14]:
class ChildSumTreeLSTMCell(tf.contrib.rnn.BasicLSTMCell):
  """LSTM cell variant whose state argument carries pre-aggregated child data.

  Unlike a sequential LSTM step, `state` is unpacked as `(child_h_sum, fc)`;
  in this notebook those are produced by `tree2vec2` (sum of the children's
  hidden states, and sum of the forget-gated children's cell states). The
  forget gate is therefore not computed here: only the input gate, the
  candidate update and the output gate are.
  """

  def __init__(self, num_units, keep_prob=1.0):
    """Create the cell.

    Args:
      num_units: Size of the hidden/cell vectors (passed to BasicLSTMCell).
      keep_prob: Keep probability for dropout on the candidate update `u`;
        either a Python float or a tensor.
    """
    super(ChildSumTreeLSTMCell, self).__init__(num_units)
    self._keep_prob = keep_prob

  def __call__(self, inputs, state, scope=None):
    """Compute the node's hidden output and new state.

    Args:
      inputs: Input vector of the node (the word embedding in this notebook).
      state: Pair `(child_h_sum, fc)` as described on the class.
      scope: Optional variable scope; defaults to the class name.

    Returns:
      `(new_h, LSTMStateTuple(new_c, new_h))`.
    """
    with tf.variable_scope(scope or type(self).__name__):
        child_h_sum, fc = state  
        # One linear layer over [inputs, child_h_sum] yields the three
        # pre-activations (input gate, candidate update, output gate).
        concat = tf.contrib.layers.linear(
            tf.concat([inputs, child_h_sum], 1), 3 * self._num_units)
    
        i, u, o = tf.split(value=concat, num_or_size_splits=3, axis=1)
      
        u = self._activation(u)
        # Dropout is applied whenever keep_prob is a tensor (its value is not
        # known while the graph is built) or a float below 1.
        if not isinstance(self._keep_prob, float) or self._keep_prob < 1:
            u = tf.nn.dropout(u, self._keep_prob)

        # fc already contains the forget-gated child cell states.
        new_c = fc + (tf.sigmoid(i) * u)
        new_h = self._activation(new_c) * tf.sigmoid(o)

        new_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)
      
        return new_h, new_state

In [15]:
# Scalar dropout keep-probability. It defaults to 1.0 when not fed; the
# training feed dict supplies KEEP_PROB for it.
keep_prob_ph = tf.placeholder_with_default(1.0, [])

The LSTM Layer with the ChildSumLSTMCell. Change the "name_or_scope" to be the same or different for the src and tgt trees for shared or separate parameters.

In [16]:
# Size of the TreeLSTM hidden and cell vectors.
lstm_num_units = 150
# Source-side TreeLSTM: the child-sum cell wrapped with input/output dropout,
# all driven by the same keep-probability placeholder.
src_tree_lstm = td.ScopedLayer(
      tf.contrib.rnn.DropoutWrapper(
          ChildSumTreeLSTMCell(lstm_num_units, keep_prob=keep_prob_ph),
          input_keep_prob=keep_prob_ph, output_keep_prob=keep_prob_ph),
      name_or_scope='src_tree_lstm')

# Target-side TreeLSTM, built identically.
# NOTE(review): this layer also passes name_or_scope='src_tree_lstm', yet the
# trainable-variable listing printed further down shows two separate sets
# ("src_tree_lstm/..." and "src_tree_lstm_2/..."), so reusing the string does
# not appear to share parameters -- confirm the intended behaviour.
tgt_tree_lstm = td.ScopedLayer(
      tf.contrib.rnn.DropoutWrapper(
          ChildSumTreeLSTMCell(lstm_num_units, keep_prob=keep_prob_ph),
          input_keep_prob=keep_prob_ph, output_keep_prob=keep_prob_ph),
      name_or_scope='src_tree_lstm')

The word embedding for the src and target language

In [17]:
# Frozen (non-trainable) embedding tables initialised from the loaded
# matrices; the two positional arguments are the matrix's row and column counts.
src_word_embedding = td.Embedding(
    src_weight_matrix.shape[0],
    src_weight_matrix.shape[1],
    initializer=src_weight_matrix,
    name='src_word_embedding',
    trainable=False)

target_word_embedding = td.Embedding(
    target_weight_matrix.shape[0],
    target_weight_matrix.shape[1],
    initializer=target_weight_matrix,
    name='tgt_word_embedding',
    trainable=False)

Forward Declaration for recursively traversing the TreeLSTM

In [18]:
# Placeholders for the recursive "embed a subtree" blocks. They are called
# inside src_logits_and_state()/target_logits_and_state() and bound to the
# real blocks later via resolve_to().
src_embed_subtree = td.ForwardDeclaration(name='src_embed_subtree')
target_embed_subtree = td.ForwardDeclaration(name='target_embed_subtree')

In [19]:
def combLinear(x, y, scope=None):
    """Sigmoid of a linear layer over the concatenation of `x` and `y`.

    Used through `ffc` as the per-child gate in `tree2vec2`. The layer has
    `lstm_num_units` outputs and its variables live in `scope`.
    """
    with tf.variable_scope(scope):
        joined = tf.concat([x, y], 1)
        projected = tf.contrib.layers.linear(joined, lstm_num_units)
        return tf.sigmoid(projected)


# Fold layer wrapper so the gate's variables are created under the 'ffc' scope.
ffc = td.ScopedLayer(combLinear, name_or_scope='ffc')

def tree2vec2():
     """Build the Fold block that aggregates child states for the LSTM cell.

     Input, as assembled by the leaf/node cases of the callers, is a pair
     `(x, (seq0, seq1))`: `x` is the node's word embedding and `seq0`/`seq1`
     are two parallel sequences taken from items 0 and 1 of each child's
     state (cell and hidden state respectively, given that the cell returns
     `LSTMStateTuple(new_c, new_h)`).

     Output is `(x, (sum(seq1), sum_k ffc(x, seq1[k]) * seq0[k]))`:
       * item 0 passes `x` through unchanged;
       * the first inner item sums the children's hidden states;
       * the second inner item broadcasts `x` against every child hidden
         state, applies the `ffc` gate to each pair, multiplies the gate
         element-wise with the matching child cell state, and sums.
     This is the `(inputs, (child_h_sum, fc))` shape consumed by
     `ChildSumTreeLSTMCell.__call__`.
     """
     return td.AllOf(td.GetItem(0), 
             td.AllOf(td.GetItem(1) >> td.GetItem(1) >> td.Sum(),
                     td.AllOf(td.AllOf(td.GetItem(0) >> td.Broadcast(), 
                                       td.GetItem(1) >> td.GetItem(1)) >>
                              td.Zip() >> td.Map(ffc),
                              td.GetItem(1) >> td.GetItem(0)) >> 
                     td.Zip() >> td.Map(td.Function(tf.multiply)) >> td.Sum()))


In [20]:
def src_word2vec():
    """Build the Fold block mapping a source-language word to its embedding.

    The word is lower-cased and looked up in `src_word_idx`; words that are
    missing map to the last row of `src_weight_matrix`, which
    `load_embeddings` appends as the random out-of-vocabulary vector.
    """
    # Take the OOV row from the matrix itself rather than len(src_word_idx):
    # the dict can be shorter than the matrix if lower-casing ever collapses
    # two embedding lines onto one key, and would then point at a real word.
    unknown_idx = src_weight_matrix.shape[0] - 1

    def lookup_src_word(word):
        return src_word_idx.get(word.lower(), unknown_idx)

    return td.InputTransform(lookup_src_word) >> td.Scalar('int32') >> src_word_embedding

def target_word2vec():
    """Build the Fold block mapping a target-language word to its embedding.

    The word is lower-cased and looked up in `target_word_idx`; words that
    are missing map to the last row of `target_weight_matrix`, which
    `load_embeddings` appends as the random out-of-vocabulary vector.
    """
    # Take the OOV row from the matrix itself rather than len(target_word_idx):
    # the dict can be shorter than the matrix if lower-casing ever collapses
    # two embedding lines onto one key, and would then point at a real word.
    unknown_idx = target_weight_matrix.shape[0] - 1

    def lookup_target_word(word):
        return target_word_idx.get(word.lower(), unknown_idx)

    return td.InputTransform(lookup_target_word) >> td.Scalar('int32') >> target_word_embedding


In [21]:
def zero_state():
    """Build the child-state input used for leaf words (which have no children).

    A one-element tuple holding `td.Zeros` of the LSTM state size is piped
    into the same `AllOf(Map(GetItem(0)), Map(GetItem(1)))` regrouping that
    the node case applies to real children.
    NOTE(review): presumably this yields a single all-zero child, so the sums
    in tree2vec2 come out as zero -- confirm against the Fold semantics for
    tuples of blocks. The state size is read from `src_tree_lstm` for both
    languages; both layers are built with `lstm_num_units`.
    """
    return ((td.Zeros(src_tree_lstm.state_size,),) >>
            td.AllOf(td.Map(td.GetItem(0)), td.Map(td.GetItem(1))))

def src_logits_and_state():
  """Build the recursive Fold block embedding a source-language tree.

  A leaf (a bare word) is embedded and paired with the zero child state. An
  inner node embeds its label and recursively embeds each child through
  `src_embed_subtree`, regrouping the children's states into two parallel
  sequences. Either case then flows through `tree2vec2` and the source
  TreeLSTM.

  Returns:
    A block whose output is item 1 of the LSTM layer's result, i.e. the
    node's state pair (cell state, hidden state).
  """
  leaf_case = td.AllOf(src_word2vec(), zero_state())
  node_case = td.AllOf(
      td.InputTransform(lambda x: x.label()) >> src_word2vec(),
      td.InputTransform(list) >> td.Map(src_embed_subtree()) >>
      td.AllOf(td.Map(td.GetItem(0)), td.Map(td.GetItem(1))))

  # Dispatch on the object itself rather than on str(type(x)): the printed
  # type name depends on the NLTK module layout and the Python version, and
  # never matches Tree subclasses.
  tree2vec = td.OneOf(lambda x: isinstance(x, Tree),
                      {True: node_case, False: leaf_case})
  return tree2vec >> tree2vec2() >> src_tree_lstm >> td.GetItem(1)

def target_logits_and_state():
  """Build the recursive Fold block embedding a target-language tree.

  A leaf (a bare word) is embedded and paired with the zero child state. An
  inner node embeds its label and recursively embeds each child through
  `target_embed_subtree`, regrouping the children's states into two parallel
  sequences. Either case then flows through `tree2vec2` and the target
  TreeLSTM.

  Returns:
    A block whose output is item 1 of the LSTM layer's result, i.e. the
    node's state pair (cell state, hidden state).
  """
  leaf_case = td.AllOf(target_word2vec(), zero_state())
  node_case = td.AllOf(
      td.InputTransform(lambda x: x.label()) >> target_word2vec(),
      td.InputTransform(list) >> td.Map(target_embed_subtree()) >>
      td.AllOf(td.Map(td.GetItem(0)), td.Map(td.GetItem(1))))

  # Dispatch on the object itself rather than on str(type(x)): the printed
  # type name depends on the NLTK module layout and the Python version, and
  # never matches Tree subclasses.
  tree2vec = td.OneOf(lambda x: isinstance(x, Tree),
                      {True: node_case, False: leaf_case})
  return tree2vec >> tree2vec2() >> tgt_tree_lstm >> td.GetItem(1)

In [23]:
# Top-level Fold block. Its four fields line up with the example tuples built
# by load_examples: scalar label, NUM_CLASSES-way target distribution,
# source tree and target tree.
model =  td.Record((td.Scalar('float32'), 
                    td.Vector(NUM_CLASSES, dtype='float32'), 
                    src_logits_and_state(), 
                    target_logits_and_state()))

Resolve the forward declarations: bind each forward-declared block to the function that is applied whenever the declaration is encountered during the recursive traversal.

In [24]:
# Close the recursion: each forward declaration now stands for a freshly
# built tree-embedding block of the matching language. This must run before
# the model is compiled.
src_embed_subtree.resolve_to(src_logits_and_state())
target_embed_subtree.resolve_to(target_logits_and_state())

Compile the tensorflow fold model

In [25]:
# Compile the Fold block; `compiler` later provides the output tensors, the
# loom input tensor and the helpers that turn examples into feed data.
compiler = td.Compiler.create(model)
# Print the block's input/output types as a quick structural check.
print('input type: %s' % model.input_type)
print('output type: %s' % model.output_type)

input type: PyObjectType()
output type: TupleType(TensorType((), 'float32'), TensorType((5,), 'float32'), TupleType(TensorType((150,), 'float32'), TensorType((150,), 'float32')), TupleType(TensorType((150,), 'float32'), TensorType((150,), 'float32')))


## 3. Build the Model (Tensorflow)

This part of the model deals with the classifier layer and calculating the losses and other metrics

Training Parameters

In [26]:
# Step size handed to the Adagrad optimizer.
LEARNING_RATE = 0.03

# Dropout keep-probability fed to keep_prob_ph while training (1.0 keeps everything).
KEEP_PROB = 1.0
# Number of examples per training batch (used by train_epoch).
BATCH_SIZE = 25
# Number of passes over the training set in the training loop.
EPOCHS = 20
# Scale of the L2 penalty added to the loss.
REGLAMBDA = 1e-3

Use the Adagrad Optimizer from the TreeLSTM paper. 

In [27]:
# Feed dict reused for every training step; train_step() adds the current
# batch to it under the compiler's loom input tensor.
train_feed_dict = {keep_prob_ph: KEEP_PROB}
# Adagrad optimizer with the configured learning rate.
opt = tf.train.AdagradOptimizer(LEARNING_RATE)


Return the outputs to Tensorflow to build the rest of the model for the classifier layer and calculate the losses.

In [28]:
# Flattened outputs of the compiled model: scalar label, target distribution,
# then the (cell, hidden) state of the source tree and of the target tree.
label, pk, lVecC, lVecH, rVecC, rVecH = compiler.output_tensors
# Pair features built from the two root hidden states: element-wise product
# and absolute difference (note that `hplus` holds the absolute difference).
hx = lVecH * rVecH
hplus = tf.abs(lVecH - rVecH)
temp = tf.concat([hx,hplus], 1)

# Hidden layer of 50 sigmoid units over the pair features.
hs = tf.contrib.layers.fully_connected(temp, 50, 
                                       activation_fn=tf.nn.sigmoid,
                                        biases_initializer=tf.constant_initializer(0.01),
                                        scope="hs_FC")
# Predicted distribution over the NUM_CLASSES bins (softmax output).
pth = tf.contrib.layers.fully_connected(hs, NUM_CLASSES, 
                                       activation_fn=tf.nn.softmax,
                                        biases_initializer=tf.constant_initializer(0.01),
                                        scope="pth_FC")
                    


# L2 penalty over every trainable variable whose name does not contain "bias".
# NOTE(review): tf.nn.l2_loss is documented as sum(t ** 2) / 2, so together
# with REGLAMBDA/2 the effective coefficient would be REGLAMBDA/4 -- confirm
# that this is intended.
l2 = tf.constant(REGLAMBDA/2) * sum(
    tf.nn.l2_loss(tf_var)
        for tf_var in tf.trainable_variables() 
        if not ("bias" in tf_var.name)
        )
# List the trainable variables (useful to check which parameters exist).
for tf_var in tf.trainable_variables():
    print(tf_var)

# KL divergence between the target distribution pk and the prediction pth;
# 1e-5 is added inside both logs to avoid log(0). The divergence is summed
# (not averaged) over the batch and the L2 penalty is added once.
diff = tf.log(pk+tf.constant(1e-5))-tf.log(pth+tf.constant(1e-5))
loss = tf.reduce_sum(pk*diff)+l2

# Training op: compute the gradients of the loss and apply them with Adagrad.
grads_and_vars = opt.compute_gradients(loss)
train = opt.apply_gradients(grads_and_vars)
# Saver used by the training loop to checkpoint and by the test cell to restore.
saver = tf.train.Saver()

Tensor("ffc/fully_connected/weights/read:0", shape=(662, 150), dtype=float32)
Tensor("ffc/fully_connected/biases/read:0", shape=(150,), dtype=float32)
Tensor("src_tree_lstm/fully_connected/weights/read:0", shape=(662, 450), dtype=float32)
Tensor("src_tree_lstm/fully_connected/biases/read:0", shape=(450,), dtype=float32)
Tensor("src_tree_lstm_2/fully_connected/weights/read:0", shape=(662, 450), dtype=float32)
Tensor("src_tree_lstm_2/fully_connected/biases/read:0", shape=(450,), dtype=float32)
Tensor("hs_FC/weights/read:0", shape=(300, 50), dtype=float32)
Tensor("hs_FC/biases/read:0", shape=(50,), dtype=float32)
Tensor("pth_FC/weights/read:0", shape=(50, 5), dtype=float32)
Tensor("pth_FC/biases/read:0", shape=(5,), dtype=float32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Calculate the predicted value, the mean absolute error (MAE), and the root mean square error (RMSE).

In [29]:
# Score attached to each output class: 0, 25, 50, 75, 100.
r = tf.constant(list(range(0,101,25)), dtype='float32')
# Predicted score = expectation of the class scores under the predicted
# distribution pth.
pred = tf.reduce_sum(tf.multiply(r, pth), axis=1)
# Mean absolute error and root mean squared error between label and prediction.
mae = tf.reduce_mean(tf.abs(label-pred))
rmse = tf.sqrt(tf.reduce_mean((label-pred)*(label-pred)))


The TF graph is now complete; initialize the variables.

In [30]:
# Initialise all graph variables. Re-running this cell resets the model to
# its initial values.
sess.run(tf.global_variables_initializer())

## Training the model

Start by defining a function that does a single step of training on a batch and returns the loss.

In [31]:
def train_step(batch):
  """Run one optimizer update on `batch` and return that batch's loss.

  The batch is stored in the shared `train_feed_dict` under the compiler's
  loom input tensor before the training op and the loss are evaluated.
  """
  train_feed_dict[compiler.loom_input_tensor] = batch
  fetched = sess.run([train, loss], train_feed_dict)
  # fetched[0] is the (valueless) result of the training op.
  return fetched[1]

Now similarly for an entire epoch of training.

In [32]:
def train_epoch(train_set_local):
  """Train on every batch of `train_set_local` and return the summed batch losses."""
  epoch_loss = 0
  for batch in td.group_by_batches(train_set_local, BATCH_SIZE):
    epoch_loss += train_step(batch)
  return epoch_loss

Transform the training and development set data into loom inputs for tensorflow fold

In [33]:
# Convert the training examples into loom inputs once, so that every epoch
# can batch and shuffle them without rebuilding.
train_set = compiler.build_loom_inputs(train_trees)

In [34]:
# The development set is evaluated in one go, so build its feed dict once up front.
dev_feed_dict = compiler.build_feed_dict(dev_trees)

Evaluate the dev set with Pearson r, Spearman rho, mae, rmse

In [35]:
def dev_eval(epoch, train_loss):
  """Evaluate on the development set, print the metrics, return Pearson r.

  Args:
    epoch: Current epoch number (accepted but not used in the report).
    train_loss: Training loss of the epoch, echoed in the printed line.

  Returns:
    The Pearson correlation between dev predictions and dev labels.
  """
  fetches = [mae, rmse, label, pred, loss]
  dev_mae, dev_rmse, dev_label, dev_pred, dev_loss = sess.run(fetches, dev_feed_dict)

  # Both scipy functions return (statistic, p-value); only the statistic is used.
  pearson_r = scipy.stats.pearsonr(dev_pred, dev_label)[0]
  spearman_rho = scipy.stats.spearmanr(dev_pred, dev_label)[0]

  report = (pearson_r, spearman_rho, dev_mae, dev_rmse, dev_loss, train_loss)
  print('dev results: pearson: %.3e, spearman: %.3e, mae: %.3e, rmse: %.3e, loss: %.3e, train_loss: %.3e'
        % report)
  return pearson_r

This loop trains the model and saves the best seen model so far.

In [36]:
best_dev_pearson = 0.0
# NOTE: this re-binds the notebook-level `data_dir` to the checkpoint folder
# (machine-specific absolute path; edit to match the local set-up).
data_dir = '/home/cliffrwong/Documents/code/testTF_Fold/models/'
save_path = os.path.join(data_dir, 'qualest_similarity_model')
# Defined up front so that later cells see None, rather than hitting a
# NameError, when no epoch improves on the initial best score.
checkpoint_path = None
# Create the checkpoint folder up front so saving cannot fail on a missing
# directory.
if not os.path.isdir(data_dir):
    os.makedirs(data_dir)

# Train for EPOCHS passes, reshuffling each time; checkpoint whenever the dev
# Pearson correlation improves on the best value seen so far.
for epoch, shuffled in enumerate(td.epochs(train_set, EPOCHS, shuffle=True), 1):
  train_loss = train_epoch(shuffled)
  dev_pearson = dev_eval(epoch, train_loss)
  if dev_pearson > best_dev_pearson:
    best_dev_pearson = dev_pearson
    checkpoint_path = saver.save(sess, save_path, global_step=epoch)
    print('model saved in file: %s' % checkpoint_path)

dev results: pearson: 2.944e-01, spearman: 3.194e-01, mae: 1.457e+01, rmse: 1.983e+01, loss: 7.449e+02, train_loss: 9.602e+03
model saved in file: /home/cliffrwong/Documents/code/testTF_Fold/models/qualest_similarity_model-1
dev results: pearson: 3.175e-01, spearman: 3.546e-01, mae: 1.460e+01, rmse: 1.934e+01, loss: 7.118e+02, train_loss: 9.383e+03
model saved in file: /home/cliffrwong/Documents/code/testTF_Fold/models/qualest_similarity_model-2
dev results: pearson: 3.242e-01, spearman: 3.556e-01, mae: 1.436e+01, rmse: 2.000e+01, loss: 7.492e+02, train_loss: 9.092e+03
model saved in file: /home/cliffrwong/Documents/code/testTF_Fold/models/qualest_similarity_model-3
dev results: pearson: 3.669e-01, spearman: 4.006e-01, mae: 1.453e+01, rmse: 1.896e+01, loss: 7.019e+02, train_loss: 8.798e+03
model saved in file: /home/cliffrwong/Documents/code/testTF_Fold/models/qualest_similarity_model-4
dev results: pearson: 3.680e-01, spearman: 3.984e-01, mae: 1.416e+01, rmse: 1.915e+01, loss: 7.046e+

KeyboardInterrupt: 

## Evaluate on Test Set

In [37]:
# Reload the weights of the most recently saved checkpoint (the best dev
# Pearson seen during training). `checkpoint_path` is only assigned inside
# the training loop when a checkpoint is written.
saver.restore(sess, checkpoint_path)

See how we did.

In [38]:
# scipy.stats is already imported in the notebook's first cell, so it is not
# re-imported here.

# Evaluate the restored model on the whole test set in one run.
test_feed_dict = compiler.build_feed_dict(test_trees)
test_mae, test_rmse, test_label, test_pred, test_loss = sess.run([mae, rmse, label, pred, loss], test_feed_dict)

# Both scipy functions return (statistic, p-value); only the statistic is reported.
pearson = scipy.stats.pearsonr(test_pred, test_label)
spearman = scipy.stats.spearmanr(test_pred, test_label)

print('test results: pearson: %.3e, spearman: %.3e, mae: %.3e, rmse: %.3e, loss: %.3e'
        % (pearson[0], spearman[0], test_mae, test_rmse, test_loss))
    

test results: pearson: 3.559e-01, spearman: 3.767e-01, mae: 1.426e+01, rmse: 1.918e+01, loss: 1.606e+03
