<a href="https://colab.research.google.com/github/duanchi1230/NLP_Project_AI2_Reasoning_Challenge/blob/arc-chi/arc_challenge_BERT_base_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
### This script was created by Chi Duan for tackling the ARC Challenge dataset with the BERT base model "uncased_L-12_H-768_A-12" ###
# Load the Colab Drive helper: the datasets, the BERT checkpoint and the model
# output directory used below are all read from / written to /content/drive.
from google.colab import drive

# Mount Google Drive at /content/drive. This will prompt for authorization.
drive.mount('/content/drive')

In [0]:
# !cat /proc/cpuinfo

In [0]:
# !nvidia-smi -L
# !nvidia-smi

In [0]:
# Colab magic: select the TensorFlow 1.x runtime. The code below relies on
# TF1-only APIs such as tf.contrib and tf.logging.
%tensorflow_version 1.x

In [0]:
import sys
!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
if not 'bert_repo' in sys.path:
  sys.path += ['bert_repo']
  
import datetime
import modeling
import optimization
import run_classifier
import run_classifier_with_tfhub
import tokenization
import tensorflow_hub as hub
import json
import os
import tensorflow as tf
import numpy as np

def DataProcessor(path):
  """Read a multiple-choice JSONL file and expand it into binary examples.

  Every non-blank line of the file is parsed as one JSON object that must
  provide "id", "answerKey" and "question" (with a "stem" string and a
  "choices" list of {"text", "label"} objects). Each answer choice becomes one
  run_classifier.InputExample:
    guid   = "<question id>:<choice label>"
    text_a = the question stem
    text_b = the choice text
    label  = "1" if the choice label equals answerKey, otherwise "0"

  Args:
    path: Path of the JSONL file to read.

  Returns:
    A tuple (original_set, example, label_map):
      original_set: list of the parsed JSON objects, in file order.
      example: flat list of InputExample objects, one per answer choice.
      label_map: dict mapping question id -> {choice text: choice label}.
  """
  example = []
  original_set = []
  label_map = {}
  # The context manager closes the file even if parsing fails part-way.
  with open(path, encoding="utf-8") as jsonl_file:
    for raw_line in jsonl_file:
      # Tolerate blank lines (e.g. an extra newline at the end of the file).
      if not raw_line.strip():
        continue
      record = json.loads(raw_line)
      original_set.append(record)
      stem = record["question"]["stem"]
      l_map = {}
      for choice in record["question"]["choices"]:
        l_map[choice["text"]] = choice["label"]
        label = "1" if choice["label"] == record["answerKey"] else "0"
        example.append(run_classifier.InputExample(
            record["id"] + ":" + choice["label"], stem, choice["text"], label))
      label_map[record["id"]] = l_map
  return original_set, example, label_map

# Drive folder for project outputs (not referenced again in the code shown in this notebook).
OUTPUT_DIR = '/content/drive/My Drive/CSE576_NLP/project-arc-code'
# Name of the pre-trained BERT folder on Drive; it is used further down to
# build the bert_config.json and bert_model.ckpt paths.
# BERT_MODEL = 'uncased_L-12_H-768_A-12' #@param {type:"string"}
BERT_MODEL = "uncased_L-12_H-768_A-12"
# TF-Hub module handle; in this notebook it is only used to build the tokenizer.
BERT_MODEL_HUB_tokenizer = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
# BERT_MODEL_HUB = "https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/1"

# Build the tokenizer from the hub module above.
# NOTE(review): presumably its vocabulary/casing must match the checkpoint named by BERT_MODEL — confirm.
tokenizer = run_classifier_with_tfhub.create_tokenizer_from_hub_module(BERT_MODEL_HUB_tokenizer)
# tokenizer.tokenize("This here's an example of using the BERT tokenizer")

In [0]:
# Show the CPU devices TensorFlow can see (the list is the cell's output).
tf.config.experimental.list_physical_devices('CPU')

In [0]:
# Sanity check: show the visible GPUs, then build a small matmul pinned to a GPU.
# The list is printed explicitly because only the last expression of a cell is
# displayed automatically.
print(tf.config.experimental.list_physical_devices('GPU'))
# Pin to GPU 0, the same device model_train uses; index 2 only exists on a
# machine with at least three GPUs.
with tf.device('/device:GPU:0'):
    a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    c = tf.matmul(a, b)
c

In [0]:
def model_train(estimator, data_set, label_list, tokenizer, TRAIN_BATCH_SIZE, MAX_SEQ_LENGTH, num_train_steps):
  """Fine-tune `estimator` on `data_set` and print progress banners.

  Args:
    estimator: object exposing train(input_fn=..., max_steps=...).
    data_set: list of examples handed to run_classifier.convert_examples_to_features.
    label_list: list of label strings passed to the feature converter.
    tokenizer: tokenizer passed to the feature converter.
    TRAIN_BATCH_SIZE: only reported in the printed banner; it is not passed to
      the input function or the estimator here.
    MAX_SEQ_LENGTH: sequence length used for feature conversion and the input function.
    num_train_steps: passed to estimator.train as max_steps.

  Returns:
    None.
  """
  started_banner = '***** Started training at {} *****'
  finished_banner = '***** Finished training at {} *****'
  # Feature conversion, input pipeline construction and training all happen
  # inside a '/GPU:0' device scope, as in the original cell.
  with tf.device('/GPU:0'):
    print('MRPC/CoLA on BERT base model normally takes about 2-3 minutes. Please wait...')
    features = run_classifier.convert_examples_to_features(
        data_set, label_list, MAX_SEQ_LENGTH, tokenizer)
    print(started_banner.format(datetime.datetime.now()))
    for caption, value in (('  Num examples = {}', len(data_set)),
                           ('  Batch size = {}', TRAIN_BATCH_SIZE)):
      print(caption.format(value))
    # The step count goes through TF logging rather than print.
    tf.logging.info("  Num steps = %d", num_train_steps)
    input_fn = run_classifier.input_fn_builder(
        features=features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=input_fn, max_steps=num_train_steps)
  print(finished_banner.format(datetime.datetime.now()))

In [0]:
# Load every split. DataProcessor returns, per file: the raw records, the
# per-choice InputExamples, and a {question id: {choice text: choice label}} map.
original_dev_cha, arc_cha_dev, label_map_dev_cha = DataProcessor(
    "/content/drive/My Drive/CSE576_NLP/Data/ARC-Challenge/ARC-Challenge-Dev.jsonl")
original_test_cha, arc_cha_test, label_map_test_cha = DataProcessor(
    "/content/drive/My Drive/CSE576_NLP/Data/ARC-Challenge/ARC-Challenge-Test.jsonl")
original_train_cha, arc_cha_train, label_map_train_cha = DataProcessor(
    "/content/drive/My Drive/CSE576_NLP/Data/ARC-Challenge/ARC-Challenge-Train.jsonl")

original_train, arc_easy_train, label_map_train = DataProcessor(
    "/content/drive/My Drive/CSE576_NLP/Data/ARC-Easy/ARC-Easy-Train.jsonl")

original_openQA_dev, openQA_dev, label_map_openQA_dev = DataProcessor(
    "/content/drive/My Drive/CSE576_NLP/Data/OpenBookQA-V1-Sep2018/Data/Main/dev.jsonl")
original_openQA_test, openQA_test, label_map_openQA_test = DataProcessor(
    "/content/drive/My Drive/CSE576_NLP/Data/OpenBookQA-V1-Sep2018/Data/Main/test.jsonl")
original_openQA_train, openQA_train, label_map_openQA_train = DataProcessor(
    "/content/drive/My Drive/CSE576_NLP/Data/OpenBookQA-V1-Sep2018/Data/Main/train.jsonl")

# Augment the ARC-Challenge training examples with all three OpenBookQA
# splits, appended in the order train, test, dev.
for openQA_examples in (openQA_train, openQA_test, openQA_dev):
  arc_cha_train.extend(openQA_examples)

In [0]:
# Peek at converted examples from the OpenBookQA dev and train sets.
print(openQA_dev[0].guid, openQA_dev[0].text_a, openQA_dev[0].text_b, openQA_dev[0].label)
# NOTE(review): the next line prints guid/text_a of example 0 together with
# text_b/label of example 3 — confirm the mixed indices are intentional.
print(openQA_train[0].guid, openQA_train[0].text_a, openQA_train[3].text_b, openQA_train[3].label)

In [0]:
# Number of training examples (one per answer choice), including the OpenBookQA
# examples appended to arc_cha_train in the loading cell.
len(arc_cha_train)

In [0]:
# Fine-tuning hyper-parameters.
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 8
PREDICT_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3
MAX_SEQ_LENGTH = 128
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 10000
# NOTE(review): SAVE_SUMMARY_STEPS is defined here but is not passed to the
# RunConfig built in this cell — confirm whether it should be.
SAVE_SUMMARY_STEPS = 500
# Binary labels: DataProcessor assigns "1" to the correct answer choice and "0" to every other choice.
label_list = ["0", "1"]

# Total training steps: (number of examples / batch size) * epochs, truncated to an int.
num_train_steps = int(len(arc_cha_train) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
# Number of those steps spent in learning-rate warmup.
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)


# Read the pre-trained model's configuration and point at its checkpoint; both
# live in the Drive folder named by BERT_MODEL.
bert_config = modeling.BertConfig.from_json_file('/content/drive/My Drive/CSE576_NLP/'+BERT_MODEL+'/bert_config.json')
init_checkpoint = '/content/drive/My Drive/CSE576_NLP/'+BERT_MODEL+'/bert_model.ckpt'

# Run configuration: the directory that receives checkpoints and how often
# (in steps) a checkpoint is saved.
run_config = tf.contrib.tpu.RunConfig(
      model_dir='/content/drive/My Drive/CSE576_NLP/challenge-bert-base',
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
      # device_fn=lambda op: '/cpu:0'

)

# Build the BERT classification model_fn for the two labels in label_list,
# initialised from init_checkpoint, with TPU support switched off.
model_fn = run_classifier.model_fn_builder(
    bert_config=bert_config,
    num_labels=len(label_list),
    init_checkpoint=init_checkpoint,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=False,
    use_one_hot_embeddings=False)

# Wrap the model_fn in a TPUEstimator configured with use_tpu=False. The
# train/eval/predict batch sizes are supplied here, not by the helper functions
# that later call train/evaluate/predict.
estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=False,
      eval_on_tpu =False,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=TRAIN_BATCH_SIZE,
      eval_batch_size=EVAL_BATCH_SIZE,
      predict_batch_size=PREDICT_BATCH_SIZE)

In [0]:
# Fine-tune on arc_cha_train (ARC-Challenge train plus the appended OpenBookQA examples).
model_train(estimator, arc_cha_train, label_list, tokenizer, TRAIN_BATCH_SIZE, MAX_SEQ_LENGTH, num_train_steps)

In [0]:
def model_eval(estimator, data_set, label_list, tokenizer, EVAL_BATCH_SIZE, MAX_SEQ_LENGTH, num_train_steps, drop_remainder=False):
  """Evaluate `estimator` on `data_set` and return the evaluation result.

  Args:
    estimator: object exposing evaluate(input_fn=..., steps=...).
    data_set: list of examples handed to run_classifier.convert_examples_to_features.
    label_list: list of label strings passed to the feature converter.
    tokenizer: tokenizer passed to the feature converter.
    EVAL_BATCH_SIZE: batch size used for the printed banner and, when
      drop_remainder is True, to compute the number of full batches.
    MAX_SEQ_LENGTH: sequence length used for feature conversion and the input function.
    num_train_steps: unused; kept so existing callers keep working.
    drop_remainder: when False (default) every example is evaluated. Pass True
      only when fixed-size batches are required (e.g. on a TPU); the trailing
      partial batch is then skipped.

  Returns:
    Whatever estimator.evaluate returns.
  """
  eval_features = run_classifier.convert_examples_to_features(
      data_set, label_list, MAX_SEQ_LENGTH, tokenizer)
  print('***** Started evaluation at {} *****'.format(datetime.datetime.now()))
  print('  Num examples = {}'.format(len(data_set)))
  print('  Batch size = {}'.format(EVAL_BATCH_SIZE))

  if drop_remainder:
    # Only full batches are produced, so run exactly that many steps. Eval is
    # then slightly wrong because the last partial batch is truncated.
    eval_steps = len(data_set) // EVAL_BATCH_SIZE
  else:
    # steps=None evaluates until the (non-repeating) eval input is exhausted,
    # so no example is dropped and a data set smaller than one batch still works.
    eval_steps = None
  eval_input_fn = run_classifier.input_fn_builder(
      features=eval_features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=drop_remainder)
  result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
  print('***** Finished evaluation at {} *****'.format(datetime.datetime.now()))

  return result
# Evaluate the estimator on the ARC-Challenge dev examples and display the returned result.
eval_result = model_eval(estimator, arc_cha_dev, label_list, tokenizer, EVAL_BATCH_SIZE, MAX_SEQ_LENGTH, num_train_steps)
eval_result

In [0]:
def model_predict(estimator, data_set, label_list, tokenizer, PREDICT_BATCH_SIZE, MAX_SEQ_LENGTH, num_train_steps, drop_remainder=False):
  """Run `estimator` over `data_set` and pair each example with its prediction.

  Args:
    estimator: object exposing predict(input_fn).
    data_set: list of examples with guid, text_b and label attributes.
    label_list: list of label strings passed to the feature converter.
    tokenizer: tokenizer passed to the feature converter.
    PREDICT_BATCH_SIZE: not used inside this function; kept so existing callers keep working.
    MAX_SEQ_LENGTH: sequence length used for feature conversion and the input function.
    num_train_steps: not used inside this function; kept so existing callers keep working.
    drop_remainder: when False (default) every example gets a prediction. With
      True the trailing partial batch is skipped, so the last few examples are
      silently missing from the result.

  Returns:
    A tuple (data_set, predicted_result). predicted_result holds one dict per
    predicted example with keys "id" (the example guid), "text_b",
    "probability" (the prediction's 'probabilities' entry) and "answerKey"
    (the example's own label as a string, not a choice letter).
  """
  input_features = run_classifier.convert_examples_to_features(data_set, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(
      features=input_features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=drop_remainder)
  predictions = estimator.predict(predict_input_fn)
  # zip stops at the shorter sequence, so results stay aligned with data_set order.
  predicted_result = [
      {"id": example.guid,
       "text_b": example.text_b,
       "probability": prediction['probabilities'],
       "answerKey": str(example.label)}
      for example, prediction in zip(data_set, predictions)]
  return data_set, predicted_result

In [0]:
# Score every ARC-Challenge test choice. Note that EVAL_BATCH_SIZE is passed in
# the PREDICT_BATCH_SIZE position; both are set to 8 in the configuration cell.
data_set_test, predictions_test = model_predict(estimator, arc_cha_test, label_list, tokenizer, EVAL_BATCH_SIZE, MAX_SEQ_LENGTH, num_train_steps)

In [0]:
# predictions_test

In [0]:
def cal_accuracy(predictions, test_set):
  """Turn per-choice predictions into one answer per question and score them.

  Args:
    predictions: iterable of dicts shaped like model_predict's output. Each
      needs "id" ("<question id>:<choice label>"), "probability" (an indexable
      pair: index 0 scores label "0", index 1 scores label "1") and
      "answerKey" (the binary label of that choice).
    test_set: list of question records, each with "id" and "answerKey" (the
      gold choice label).

  Returns:
    A tuple (accuracy, predict, predicted_with_id):
      accuracy: fraction of questions in test_set answered correctly
        (0.0 for an empty test_set).
      predict: dict mapping question id -> predicted choice label.
      predicted_with_id: dict mapping question id -> list of
        [choice label, probability, answerKey], in prediction order.
  """
  # Group the per-choice predictions by question. Splitting on the LAST ":"
  # keeps question ids intact even if they contain a colon themselves.
  predicted_with_id = {}
  for line in predictions:
    question_id, _, choice_label = line["id"].rpartition(":")
    predicted_with_id.setdefault(question_id, []).append(
        [choice_label, line["probability"], line["answerKey"]])

  # For each question pick the choice with the smallest (p[0] - p[1]), i.e.
  # the one most strongly scored as label "1". min() keeps the first choice on
  # ties, so exactly one label is predicted per question.
  predict = {}
  for question_id, choices in predicted_with_id.items():
    best_choice = min(choices, key=lambda choice: choice[1][0] - choice[1][1])
    predict[question_id] = best_choice[0]

  # A question with no prediction counts as incorrect; no answers are
  # hard-coded, so the score reflects the model output only.
  score = 0
  for d in test_set:
    if predict.get(d["id"]) == d["answerKey"]:
      score = score + 1
  accuracy = score / len(test_set) if test_set else 0.0
  print(accuracy)
  return accuracy, predict, predicted_with_id

In [0]:
# Reload the raw ARC-Challenge test questions (they carry the gold answerKey)
# and score the test predictions. The with-block closes the file; blank lines
# are skipped.
with open("/content/drive/My Drive/CSE576_NLP/Data/ARC-Challenge/ARC-Challenge-Test.jsonl", "r", encoding="utf-8") as test_file:
  test_set = [json.loads(line) for line in test_file if line.strip()]
accuracy_test, predict_test, predicted_with_id_test = cal_accuracy(predictions_test, test_set)
accuracy_test

In [0]:
# Score the training examples. arc_cha_train also holds the appended OpenBookQA
# examples, so this predicts on more than the ARC-Challenge training questions.
train_data_set, predictions_train = model_predict(estimator, arc_cha_train, label_list, tokenizer, EVAL_BATCH_SIZE, MAX_SEQ_LENGTH, num_train_steps)


In [0]:
# Reload the raw ARC-Challenge training questions (they carry the gold
# answerKey) and score the training predictions. The with-block closes the
# file; blank lines are skipped.
with open("/content/drive/My Drive/CSE576_NLP/Data/ARC-Challenge/ARC-Challenge-Train.jsonl", "r", encoding="utf-8") as train_file:
  train_set = [json.loads(line) for line in train_file if line.strip()]
accuracy_train, predict_train, predicted_with_id_train = cal_accuracy(predictions_train, train_set)
accuracy_train

In [0]:
# Score every ARC-Challenge dev choice (EVAL_BATCH_SIZE fills the PREDICT_BATCH_SIZE position).
dev_data_set, predictions_dev = model_predict(estimator, arc_cha_dev, label_list, tokenizer, EVAL_BATCH_SIZE, MAX_SEQ_LENGTH, num_train_steps)


In [0]:
# Reload the raw ARC-Challenge dev questions (they carry the gold answerKey)
# and score the dev predictions. The with-block closes the file; blank lines
# are skipped.
with open("/content/drive/My Drive/CSE576_NLP/Data/ARC-Challenge/ARC-Challenge-Dev.jsonl", "r", encoding="utf-8") as dev_file:
  dev_set = [json.loads(line) for line in dev_file if line.strip()]
accuracy_dev, predict_dev, predicted_with_id_dev = cal_accuracy(predictions_dev, dev_set)
accuracy_dev