# CNTK 302b: Evaluating ReasoNet for Machine Comprehension with the CNN Dataset



## Data preparation

### Download data


In [15]:
import os
import sys
sys.path.insert(0, "../Examples/LanguageUnderstanding/")
from ReasoNet.prepare_cnn_data import file_exists,merge_files,download_cnn
# Root folder of the ReasoNet example data and the two merged raw text files.
data_root = "../Examples/LanguageUnderstanding/ReasoNet/Data"
raw_train_data = os.path.join(data_root, "cnn/training.txt")
raw_test_data = os.path.join(data_root, "cnn/test.txt")

# Download the CNN data unless both merged files are already present.
if not file_exists(raw_train_data) or not file_exists(raw_test_data):
  download_cnn(data_root)

# Merge each question folder into its single raw text file (training first, then test).
for question_dir, merged_file in (("cnn/questions/training", raw_train_data),
                                  ("cnn/questions/test", raw_test_data)):
  merge_files(os.path.join(data_root, question_dir), merged_file)
print("All necessary data are downloaded to {0}".format(data_root))

All necessary data are downloaded to ../Examples/LanguageUnderstanding/ReasoNet/Data


### Convert to CNTK Text Format


In [16]:
from ReasoNet.wordvocab import *
# Locations of the vocabulary file and of the corpora in CNTK Text Format.
vocab_path=os.path.join(data_root, "cnn/cnn.vocab")
# train_ctf is kept for reference only; this evaluation notebook never builds it.
train_ctf=os.path.join(data_root, "cnn/training.ctf")
test_ctf=os.path.join(data_root, "cnn/test.ctf")
vocab_size=101000
# Guard on the artifacts this cell actually works with: the vocabulary file
# (loaded again by the prediction cell) and the test corpus. The previous guard
# tested train_ctf, which is never created here, so the conversion was redone
# on every run unless the training corpus happened to exist already.
# NOTE(review): assumes build_vocab writes the vocabulary to vocab_path -- confirm.
if not (file_exists(vocab_path) and file_exists(test_ctf)):
  # The vocabulary is built from the raw training text, then used to encode the test text.
  entity_vocab, word_vocab = Vocabulary.build_vocab(raw_train_data, vocab_path, vocab_size)
  Vocabulary.build_corpus(entity_vocab, word_vocab, raw_test_data, test_ctf)
print("Data conversion finished.")

Data conversion finished.


### Download model

In [17]:
import io
import os
import re
import requests
import sys
import shutil
def download_model(src, target):
  """
  Download the file at URL `src` and store it at the local path `target`.

  Args:
    src: URL of the model file to fetch.
    target: local file path to write; missing parent directories are created.

  Raises:
    Whatever `raise_for_status()` raises when the server answers with an HTTP
    error status; in that case `target` is not created.
  """
  target_dir = os.path.dirname(target)
  # dirname() is '' for a bare file name, and os.makedirs('') would raise,
  # so only create the directory when there is one to create.
  if target_dir and not os.path.exists(target_dir):
    os.makedirs(target_dir)
  print("Start to download model data from {0} to {1}".format(src, target))
  response = requests.get(src)
  # Fail before opening the target so an HTTP error page is never saved as the
  # model (a saved error page would make later "file exists" checks skip the download).
  response.raise_for_status()
  with open(target, mode='wb') as outf:
    outf.write(response.content)

  print("Finished to download {0} to {1}".format(src, target))

# Remote location of the ReasoNet model file and the local path it is saved to.
model_src = "http://cntk.ai/jup/models/reasonet/model_cnn.epoch.00.bin"
model_path = "models/model_cnn.epoch.00.bin"

# Fetch the model only when there is no local copy yet.
model_is_local = file_exists(model_path)
if not model_is_local:
  download_model(src=model_src, target=model_path)
print("Succeeded to download model to local.")

Succeeded to download model to local.


## Basic CNTK imports

In [18]:
import sys
from datetime import datetime
import numpy as np
import cntk
from cntk import device
from cntk.ops import sequence, element_times, reshape, greater, slice, hardmax, input

# Name of an environment variable defined in CNTK's test infrastructure
envvar = 'CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'

def is_test():
    """Return True when the CNTK test-infrastructure variable is present in the environment."""
    return envvar in os.environ

# When the notebook is being tested, TEST_DEVICE names the target device.
# Only GPU is supported at the moment, so a CPU test run is rejected.
test_device = os.environ.get('TEST_DEVICE')
if test_device is not None:
    if test_device == 'cpu':
        raise ValueError('This notebook is currently not support on CPU') 
    cntk.device.set_default_device(cntk.device.gpu(0))
cntk.device.set_default_device(cntk.device.gpu(0))

True

### Predict
After we get the model, we can use it to predict answers given a paragraph and a query. 

In [19]:
from ReasoNet.reasonet import *
def predict(model, params):
  """
  Build the prediction function on top of the given model.

  Args:
    model: object exposing `arguments` (named inputs; 'context' and
      'entity_ids_mask' must be among them) and `outputs`, whose last entry
      is used as the source of the answer values.
    params: object providing `entity_dim` and `vocab_dim`.

  Returns:
    The hardmax node named 'pred_max' computed over the per-entity sums.
  """
  # Index the model's inputs by name.
  model_args = {arg.name:arg for arg in model.arguments}
  # Not used below; the lookup still raises KeyError if the model has no 'context' input.
  context = model_args['context']
  entity_ids_mask = model_args['entity_ids_mask']
  # NOTE(review): the node name 'condidion' looks like a typo for 'condition';
  # it is a runtime name, so it is left untouched here.
  entity_condition = greater(entity_ids_mask, 0, name='condidion')
  # Get all the entities in the paragraph via the gather operator, which will create a new dynamic sequence axis
  entities_all = sequence.gather(entity_condition, entity_condition, name='entities_all')
  # The generated dynamic axis has the same length as the input entity id sequence,
  # so we assign it as the entity id's dynamic axis.
  entity_ids = input(shape=(params.entity_dim), is_sparse=True,
                              dynamic_axes=entities_all.dynamic_axes, name='entity_ids')
  # Not used below.
  wordvocab_dim = params.vocab_dim
  # Gather the model's last output at the entity positions, then scatter it using entities_all.
  answers = sequence.scatter(sequence.gather(model.outputs[-1], entity_condition), entities_all, name='Final_Ans')
  # The id zero is used for unknown tokens, and entity ids start with 1.
  # So we will trim the first column in the entity id matrix
  entity_id_matrix = slice(reshape(entity_ids, params.entity_dim), -1, 1, params.entity_dim)
  # Weight the entity id matrix by the answers and sum over the sequence.
  expand_pred = sequence.reduce_sum(element_times(answers, entity_id_matrix))
  pred_max = hardmax(expand_pred, name='pred_max')
  return pred_max


#### Mapping the prediction to entities
The prediction result is a one-hot vector in which 1 means the entity at that position is the predicted answer and 0 means it is not. To make the prediction result readable, we need to convert that vector to an entity id and map it back to the real entity.

In [20]:
import sys
import os
import cntk.device as device
import numpy as np
import math
from cntk import load_model

def pred_cnn_model(model_path, max_num=5):
  """
  Evaluate the model stored at `model_path` on the first CNN test instances
  and print every prediction next to the expected answer.

  Args:
    model_path: path of the saved model, handed to `load_model`.
    max_num: evaluation stops once at least this many sequences have been
      processed. Defaults to 5, the value that used to be hard-coded.

  Returns:
    None. Progress dots, timing and the per-instance report go to stdout.
  """
  import time  # local import: `time` is not among the imports at the top of this cell

  logger.init("cnn_test")
  test_path = os.path.join(data_root, "cnn/test.ctf")
  # Upper bound of the evaluation loop; it is compared with the accumulated num_samples.
  test_size=2291183
  vocab_path = os.path.join(data_root, "cnn/cnn.vocab")
  vocab_dim = 101585
  entity_dim = 586
  hidden_dim=256
  max_rl_steps=5
  embedding_dim=300
  att_dim = 384
  minibatch_size=1
  share_rnn = True

  test_data = create_reader(test_path, vocab_dim, entity_dim, False)
  embedding_init = None

  params = model_params(vocab_dim = vocab_dim, entity_dim = entity_dim, hidden_dim = hidden_dim,
                        embedding_dim = embedding_dim, attention_dim=att_dim, max_rl_steps = max_rl_steps,
                        embedding_init = embedding_init, dropout_rate = 0.2, share_rnn_param = share_rnn)

  # Only the entity table is needed to turn predicted ids back into entities.
  entity_table, _ = Vocabulary.load_vocab(vocab_path)
  model = load_model(model_path)
  predict_func = predict(model, params)
  bind = bind_data(predict_func, test_data)
  context_stream = get_context_bind_stream(bind)
  samples_sum = 0
  i = 0
  predicted_results = []
  # perf_counter gives wall-clock time on every platform; os.times().elapsed is 0 on Windows.
  start = time.perf_counter()
  while i < test_size and samples_sum < max_num:
    mbs = min(test_size - i, minibatch_size)
    mb = test_data.next_minibatch(mbs, bind)
    if not mb:
      # The reader has no more data; indexing an empty minibatch below would fail.
      break
    pred = predict_func.eval(mb)
    # Convert entity one hot vector to entity id
    ans = np.nonzero(pred)
    # Remapping entity id to real entity
    for entity_id in ans[1]:
      predicted_results.append(entity_table.lookup_by_id(entity_id))
    i += mb[context_stream].num_samples
    samples_sum += mb[context_stream].num_sequences
    sys.stdout.write('.')
    sys.stdout.flush()
  total = time.perf_counter() - start
  print("")
  print("Evaluated samples: {0} in {1} seconds".format(samples_sum, total))
  raw_test_path = os.path.join(data_root, "cnn/test.txt")
  with open(raw_test_path, 'r', encoding='utf-8') as raw:
    # Pair predictions with raw records lazily: only as many lines as there are
    # predictions are read, and nothing is indexed when there are no predictions.
    for instance_id, (predicted, record) in enumerate(zip(predicted_results, raw)):
      # Each raw record is tab separated: query, answer, document.
      fields = record.strip().split('\t')
      query = fields[0]
      answer = fields[1]
      doc = fields[2]
      print("===============")
      print("[{0}] Doc: {1}\n Query: {2}\n Answer: {3}\n Expected: {4}".\
            format(instance_id, doc, query, predicted, answer))
      print()

# Run the evaluation with the model file located by the download step.
pred_cnn_model(model_path=model_path)

Log with log file: log/cnn_test_03-30_02.24.13.log
.....
Evaluated samples: 5 in 2.7899999991059303 seconds
[0] Doc: @entity3 ( @entity2 ) @entity1 heavyweight boxing champion @entity0 has an important title defense coming up , but his thoughts continue to be dominated by the ongoing fight for democracy in @entity8 . speaking to @entity2 from his @entity3 training base ahead of the april 25 showdown with @entity12 challenger @entity11 in @entity13 , @entity0 said the crisis in his homeland has left him shocked and upset . " my country is unfortunately suffering in the war with @entity18 -- not that @entity8 tried to give any aggression to any other nation , in this particular case @entity18 , unfortunately it 's the other way around , " @entity0 told @entity2 . " i never thought that our brother folk is going to have war with us , so that @entity8 and @entity18 are going to be divided with blood , " he added . " unfortunately , we do n't know how far it 's going to go and how worse it 