In [1]:
from __future__ import absolute_import, division, print_function

import argparse
import logging
import os
import random
import glob

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from tensorboardX import SummaryWriter

from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
                                  BertForQuestionAnswering, BertTokenizer,
                                  XLMConfig, XLMForQuestionAnswering,
                                  XLMTokenizer, XLNetConfig,
                                  XLNetForQuestionAnswering,
                                  XLNetTokenizer,
                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)

from pytorch_transformers import AdamW, WarmupLinearSchedule

from utils_squad import (read_squad_examples, convert_examples_to_features,
                         RawResult, write_predictions,
                         RawResultExtended, write_predictions_extended)
from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad

#import tensorflow as tf

#from keras.backend.tensorflow_backend import set_session
#config = tf.ConfigProto()
#config.gpu_options.allow_growth = True
#config.gpu_options.visible_device_list = "0" #only the gpu 0 is allowed

# config.gpu_options.per_process_gpu_memory_fraction = 0.01

#set_session(tf.Session(config=config))

In [2]:
logger = logging.getLogger(__name__)

ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())

In [3]:
MODEL_CLASSES = {
    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
}

In [4]:
class struct():
    def __init__(self):
        self.train_file = 'train-v2.0.json'
        self.predict_file = 'dev-v2.0.json'
        # exam the new dataset
        #self.predict_file = 'test.json'
        self.model_type = 'bert'
        self.model_name = 'bert-large-uncased-whole-word-masking'
        self.task_name = 'MRPC'
        self.do_train = True
        self.do_eval = True
        self.do_lower_case = True
        self.data_dir = 'GLUE_DIR/MRPC/'
        self.max_seq_length = 128
        self.per_gpu_eval_batch_size = 8
        self.per_gpu_train_batch_size = 8
        # the default is 2, we make it quicker
        self.num_train_epochs = 3.0
        self.learning_rate = 4e-5
        self.output_dir = 'tmp/mrpc_output/'
        self.overwrite_output_dir = True
        self.overwrite_cache = True   
        self.local_rank = -1
        self.version_2_with_negative = True
        self.doc_stride = 128
        self.max_query_length = 64
        self.n_gpu = 1
        self.max_steps = -1
        self.gradient_accumulation_steps = 1
        self.weight_decay = 0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1
        self.warmup_steps = 0
        self.n_best_size = 20
        self.max_answer_length = 30
        self.verbose_logging = True
        self.logging_steps = 50
        self.save_steps = 5000
        self.fp16 = True
        self.fp16_opt_level = 'O1'
        self.seed = 42
        self.no_cuda = False
        self.evaluate_during_training = False

In [5]:
args = struct()

In [6]:
# set up device
if args.local_rank == -1 or args.no_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend='nccl')
    args.n_gpu = 1
args.device = device

In [7]:
# Load a trained model and vocabulary that you have fine-tuned
args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(args.model_name)
model = model_class.from_pretrained(args.output_dir)
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
model.to(args.device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12,

In [8]:
import json
filename1 = 'Thai_cook.json'
with open(filename1, 'r') as f:
    Thai = json.load(f)

In [96]:
# pick several data from the reviews
data = {}
idx = 1
for i in range(1,len(Thai)+1):
    key = str(i)
    content = Thai[key]['review']
    content = content.replace('(Translated by Google) ','')
    content = content.split('(Original)')[0]
    content = content.replace('\n\n','')
    content.replace('\n','')
    Thai[key]['review'] = content
    if len(content) > 0:
        data[idx] = content
        idx = idx + 1

In [35]:
context = data[3]

In [36]:
# create similar input data
test_grandson = {}
test_grandson['qas'] = [{'question':'What to eat?',
                         'id': '1',
                          'answers': [{'text':'food','answer_start': 0}],
                          'is_impossible': False},
                        {'question':'What to try?',
                         'id': '2',
                          'answers': [{'text':'food','answer_start': 0}],
                          'is_impossible': False},
                       {'question':'What is good?',
                         'id': '3',
                          'answers': [{'text':'food','answer_start': 0}],
                          'is_impossible': False},
                       {'question':'What is delicious?',
                         'id': '4',
                          'answers': [{'text':'food','answer_start': 0}],
                          'is_impossible': False},
                       {'question':'What to recommend?',
                         'id': '5',
                          'answers': [{'text':'food','answer_start': 0}],
                          'is_impossible': False},
                       {'question':'What do you prefer?',
                         'id': '6',
                          'answers': [{'text':'food','answer_start': 0}],
                          'is_impossible': False},
                       {'question':'How is the service?',
                         'id': '7',
                          'answers': [{'text':'food','answer_start': 0}],
                          'is_impossible': False},
                        {'question':'How is the price?',
                         'id': '8',
                          'answers': [{'text':'food','answer_start': 0}],
                          'is_impossible': False},
                       {'question':'How long to wait in this place?',
                         'id': '9',
                          'answers': [{'text':'food','answer_start': 0}],
                          'is_impossible': False},
                       {'question':'Is the place clean?',
                         'id': '10',
                          'answers': [{'text':'food','answer_start': 0}],
                          'is_impossible': True}]
test_grandson['context'] = context

test_child = {}
test_child['title'] =  'Thai'
test_child['paragraphs'] = [test_grandson]

test = {}
test['version'] = 'test'
test['data'] = [test_child]

In [44]:
# save test to json
with open('test.json', 'w') as outfile:
    json.dump(test, outfile)

In [45]:
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

def to_list(tensor):
    return tensor.detach().cpu().tolist()

In [46]:
# we need to change the datasets
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cached_features_file = 'cached_features_file'
    test_features_file = 'test_features_file'
    
    logger.info("Creating features from dataset file at %s", input_file)
    examples = read_squad_examples(input_file=input_file,
                                        is_training=not evaluate,
                                        version_2_with_negative=args.version_2_with_negative)
    features = convert_examples_to_features(examples=examples,
                                        tokenizer=tokenizer,
                                        max_seq_length=args.max_seq_length,
                                        doc_stride=args.doc_stride,
                                        max_query_length=args.max_query_length,
                                            is_training=not evaluate)
    
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions,
                                all_cls_index, all_p_mask)

    if output_examples:
        return dataset, examples, features
    return dataset

In [47]:
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
                      }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4],
                               'p_mask':    batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(unique_id            = unique_id,
                                           start_top_log_probs  = to_list(outputs[0][i]),
                                           start_top_index      = to_list(outputs[1][i]),
                                           end_top_log_probs    = to_list(outputs[2][i]),
                                           end_top_index        = to_list(outputs[3][i]),
                                           cls_logits           = to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id    = unique_id,
                                   start_logits = to_list(outputs[0][i]),
                                   end_logits   = to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(examples, features, all_results, args.n_best_size,
                        args.max_answer_length, output_prediction_file,
                        output_nbest_file, output_null_log_odds_file, args.predict_file,
                        model.config.start_n_top, model.config.end_n_top,
                        args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        write_predictions(examples, features, all_results, args.n_best_size,
                        args.max_answer_length, args.do_lower_case, output_prediction_file,
                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                        args.version_2_with_negative, args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)
    return results

In [48]:
# evaluate the QA for the new dataset
# evaluate the final results
args.predict_file = 'test.json'
dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

In [49]:
prefix=1
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
logger.info("***** Running evaluation {} *****".format(prefix))
logger.info("  Num examples = %d", len(dataset))
logger.info("  Batch size = %d", args.eval_batch_size)
all_results = []
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    model.eval()
    batch = tuple(t.to(args.device) for t in batch)
    with torch.no_grad():
        inputs = {'input_ids':      batch[0],
                    'attention_mask': batch[1],
                    'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
                    }
        example_indices = batch[3]
        if args.model_type in ['xlnet', 'xlm']:
            inputs.update({'cls_index': batch[4],
                            'p_mask':    batch[5]})
        outputs = model(**inputs)

    for i, example_index in enumerate(example_indices):
        eval_feature = features[example_index.item()]
        unique_id = int(eval_feature.unique_id)
        if args.model_type in ['xlnet', 'xlm']:
            # XLNet uses a more complex post-processing procedure
            result = RawResultExtended(unique_id            = unique_id,
                                        start_top_log_probs  = to_list(outputs[0][i]),
                                        start_top_index      = to_list(outputs[1][i]),
                                        end_top_log_probs    = to_list(outputs[2][i]),
                                        end_top_index        = to_list(outputs[3][i]),
                                        cls_logits           = to_list(outputs[4][i]))
        else:
            result = RawResult(unique_id    = unique_id,
                                start_logits = to_list(outputs[0][i]),
                                end_logits   = to_list(outputs[1][i]))
        all_results.append(result)

Evaluating: 100%|██████████| 1/1 [00:00<00:00, 10.31it/s]


In [50]:
# Compute predictions
output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
# assume some question does not have an answer
if args.version_2_with_negative:
    output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))

In [51]:
args.null_score_diff_threshold = 0
args.version_2_with_negative = True
if args.model_type in ['xlnet', 'xlm']:
    # XLNet uses a more complex post-processing procedure
    write_predictions_extended(examples, features, all_results, args.n_best_size,
                    args.max_answer_length, output_prediction_file,
                    output_nbest_file, output_null_log_odds_file, args.predict_file,
                    model.config.start_n_top, model.config.end_n_top,
                    args.version_2_with_negative, tokenizer, args.verbose_logging)
else:
    write_predictions(examples, features, all_results, args.n_best_size,
                    args.max_answer_length, args.do_lower_case, output_prediction_file,
                    output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                    args.version_2_with_negative, args.null_score_diff_threshold)

In [52]:
# read the answer
import json
filename1 = output_prediction_file
with open(filename1, 'r') as f:
    answer = json.load(f)

In [53]:
answer

{'1': 'Vegetarian dishes were bland and dry',
 '2': 'Pad Thai (both chicken and shrimp) and bowls were very good. Thai ice tea was delicious. Vegetarian dishes were bland and dry',
 '3': 'Pad Thai (both chicken and shrimp) and bowls were very good',
 '4': 'Thai ice tea',
 '5': '',
 '6': '',
 '7': '',
 '8': '',
 '9': '',
 '10': ''}

In [98]:
# collect all answers from reviews
import time
answer_collect = {}
for key in range(1,905):
    context = data[key]
    test_grandson = {}
    test_grandson['qas'] = [{'question':'What to eat?',
                             'id': '1',
                              'answers': [{'text':'food','answer_start': 0}],
                              'is_impossible': False},
                            {'question':'What to try?',
                             'id': '2',
                              'answers': [{'text':'food','answer_start': 0}],
                              'is_impossible': False},
                           {'question':'What is good?',
                             'id': '3',
                              'answers': [{'text':'food','answer_start': 0}],
                              'is_impossible': False},
                           {'question':'What is delicious?',
                             'id': '4',
                              'answers': [{'text':'food','answer_start': 0}],
                              'is_impossible': False},
                           {'question':'What to recommend?',
                             'id': '5',
                              'answers': [{'text':'food','answer_start': 0}],
                              'is_impossible': False},
                           {'question':'What do you prefer?',
                             'id': '6',
                              'answers': [{'text':'food','answer_start': 0}],
                              'is_impossible': False},
                           {'question':'How is the service?',
                             'id': '7',
                              'answers': [{'text':'food','answer_start': 0}],
                              'is_impossible': False},
                            {'question':'How is the price?',
                             'id': '8',
                              'answers': [{'text':'food','answer_start': 0}],
                              'is_impossible': False},
                           {'question':'How long to wait in this place?',
                             'id': '9',
                              'answers': [{'text':'food','answer_start': 0}],
                              'is_impossible': False},
                           {'question':'Is the place clean?',
                             'id': '10',
                              'answers': [{'text':'food','answer_start': 0}],
                              'is_impossible': True}]
    test_grandson['context'] = context

    test_child = {}
    test_child['title'] =  'Thai'
    test_child['paragraphs'] = [test_grandson]

    test = {}
    test['version'] = 'test'
    test['data'] = [test_child]
    with open('test.json', 'w') as outfile:
        json.dump(test, outfile)
    #time.sleep(1)
    # evaluate the QA for the new dataset
    # evaluate the final results
    args.predict_file = 'test.json'
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
    
    prefix=1
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                        'attention_mask': batch[1],
                        'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
                        }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4],
                                'p_mask':    batch[5]})
            outputs = model(**inputs)
    
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(unique_id            = unique_id,
                                            start_top_log_probs  = to_list(outputs[0][i]),
                                            start_top_index      = to_list(outputs[1][i]),
                                            end_top_log_probs    = to_list(outputs[2][i]),
                                            end_top_index        = to_list(outputs[3][i]),
                                            cls_logits           = to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id    = unique_id,
                                    start_logits = to_list(outputs[0][i]),
                                    end_logits   = to_list(outputs[1][i]))
            all_results.append(result)
            
    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    # assume some question does not have an answer
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
        
    args.null_score_diff_threshold = 0
    args.version_2_with_negative = True
    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(examples, features, all_results, args.n_best_size,
                        args.max_answer_length, output_prediction_file,
                        output_nbest_file, output_null_log_odds_file, args.predict_file,
                        model.config.start_n_top, model.config.end_n_top,
                        args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        write_predictions(examples, features, all_results, args.n_best_size,
                        args.max_answer_length, args.do_lower_case, output_prediction_file,
                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                        args.version_2_with_negative, args.null_score_diff_threshold)
        
    filename1 = output_prediction_file
    with open(filename1, 'r') as f:
        answer = json.load(f)    
    #time.sleep(1)    
    answer_collect[key] = answer

Evaluating: 100%|██████████| 2/2 [00:00<00:00, 10.59it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.11it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.45it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.47it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.48it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.51it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 10.62it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.51it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.48it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.47it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.48it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.36it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.49it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.48it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.51it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.51it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.50it/s]
Evaluating: 10

Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.40it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.41it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.42it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.22it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.44it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.40it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.42it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.42it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.28it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.25it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.20it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.32it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.40it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.38it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.43it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.37it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.33it/s]
Evaluating: 10

Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.39it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.38it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.37it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.24it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.31it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.39it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.36it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.36it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.18it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.31it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.36it/s]
Evaluating: 100%|██████████| 2/2 [00:00<00:00,  7.55it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.21it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.39it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.39it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.37it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.37it/s]
Evaluating: 10

Evaluating: 100%|██████████| 1/1 [00:00<00:00, 11.76it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.35it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.32it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.05it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.32it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.17it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.17it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.34it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.37it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.15it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.36it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 11.75it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.19it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.16it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 11.11it/s]
Evaluating: 100%|██████████| 2/2 [00:00<00:00, 11.44it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.27it/s]
Evaluating: 10

In [107]:
answer_collect[200]

{'1': '',
 '2': 'Thai iced tea, coconut juice, or traditional Thai beer',
 '3': '',
 '4': 'noodles',
 '5': 'beef noodle or the duck noodle',
 '6': 'beef noodle or the duck noodle',
 '7': '',
 '8': '',
 '9': '',
 '10': ''}

In [108]:
# save answer collection to json
with open('answer_collect.json', 'w') as outfile:
    json.dump(answer_collect, outfile)

In [1]:
import json
with open('answer_collect.json', 'r') as f:
    answer_collection = json.load(f)

In [4]:
answer_collection['1']['1']

'The appetizers'

In [6]:
To_eat = []
for i in range(len(answer_collection)):
    To_eat.append(answer_collection[str(i+1)]['1'])

In [11]:
Delicious = []
for i in range(len(answer_collection)):
    Delicious.append(answer_collection[str(i+1)]['4'])

In [29]:
Recommend = []
for i in range(len(answer_collection)):
    Recommend.append(answer_collection[str(i+1)]['5'])

In [30]:
Prefer = []
for i in range(len(answer_collection)):
    Prefer.append(answer_collection[str(i+1)]['6'])

In [31]:
Menu = To_eat + Delicious + recommend

In [40]:
Menu = []
for i in range(len(To_eat)):
    a = [To_eat[i]] + [Delicious[i]] + [Recommend[i]] + [Prefer[i]]
    Menu += list(set(a))

In [48]:
Menu = [x.lower() for x in Menu if len(x) > 0]

In [56]:
Menu_filter = []
for x in Menu:
    if 'food' not in x.split(' '):
        Menu_filter.append(x)

In [59]:
# save answer collection to json
with open('Menu_filter.json', 'w') as outfile:
    json.dump(Menu_filter, outfile)