<a href="https://colab.research.google.com/github/cyyeh/kaggle/blob/master/google-qa/google_qa_shortans_albert_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries and Environment Setup

In [0]:
# Make sure Colab uses TensorFlow 2.x.
# The try/except swallows any exception raised by the magic (presumably so the
# cell also runs where the magic is unavailable — TODO confirm).
try:
  %tensorflow_version 2.x
except Exception:
  pass

TensorFlow 2.x selected.


In [0]:
import json
import os
import pandas as pd
import numpy as np

In [0]:
!pip install transformers # BertModel

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ee/fc/bd726a15ab2c66dc09306689d04da07a3770dad724f0883f0a4bfb745087/transformers-2.4.1-py3-none-any.whl (475kB)
[K     |████████████████████████████████| 481kB 2.7MB/s 
Collecting tokenizers==0.0.11
[?25l  Downloading https://files.pythonhosted.org/packages/5e/36/7af38d572c935f8e0462ec7b4f7a46d73a2b3b1a938f50a5e8132d5b2dc5/tokenizers-0.0.11-cp36-cp36m-manylinux1_x86_64.whl (3.1MB)
[K     |████████████████████████████████| 3.1MB 59.0MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 34.0MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K  

In [0]:
from transformers import AlbertTokenizer

# Prepare YES/NO Answer Dataset

In [0]:
# Mount Google Drive at /content/drive (asks for an authorization code).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


Check whether the training/testing datasets are available in your Google Drive. If they are not, run the code in the "Prepare Kaggle Dataset" section.

In [0]:
# Both pickles have to be present on the mounted Drive for the dataset to
# count as available.
required_pickles = (
  'drive/My Drive/yes_no_ans_df.pkl',
  'drive/My Drive/short_ans_raw_df.pkl',
)
if all(os.path.exists(pickle_path) for pickle_path in required_pickles):
  print("Training dataset is available!")
else:
  print("Training dataset is not found, please run code inside the 'Prepare Kaggle Dataset' section.")

Training dataset is not found, please run code inside the 'Prepare Kaggle Dataset' section.


## Prepare Kaggle Dataset for [TensorFlow 2.0 Question Answering](https://www.kaggle.com/c/tensorflow2-question-answering)

### Download Question Answering Dataset

In [0]:
import os
os.environ['KAGGLE_USERNAME'] = "chihyuyeh" # username from the json file
os.environ['KAGGLE_KEY'] = "f21b340fc8082977cbf954c80ad69ae1" # key from the json file
!kaggle competitions download -c tensorflow2-question-answering

Downloading simplified-nq-train.jsonl.zip to /content
 86% 3.84G/4.46G [00:48<00:13, 50.6MB/s]

In [0]:
!unzip simplified-nq-train.jsonl.zip
!unzip simplified-nq-test.jsonl.zip

### Generate Short Answer Raw Data Dataframe

In [0]:
# JSON-lines files, given relative to the working directory
# (presumably the ones produced by the unzip step — confirm).
train_path = 'simplified-nq-train.jsonl'
test_path = 'simplified-nq-test.jsonl'

In [0]:
def extracting_text_using_start_end_token_id(document_text, start_token, end_token):
    '''Return the whitespace-separated tokens [start_token:end_token] of a text.

    The text is split on any run of whitespace, the requested slice of tokens
    is taken, and the selected tokens are joined back with single spaces.
    '''
    words = document_text.split()
    selected_words = words[start_token:end_token]
    return ' '.join(selected_words)

In [0]:
def has_long_answer(long_answer_candidate):
  '''Tell whether a long answer annotation points at a real candidate.

  The annotation counts as present only when none of 'start_token',
  'candidate_index' and 'end_token' is -1. Fields are checked in that order
  and checking stops at the first -1.
  '''
  checked_fields = ('start_token', 'candidate_index', 'end_token')
  return all(long_answer_candidate[field] != -1 for field in checked_fields)

In [0]:
def data_cleaning_for_short_answer(
  json_obj,
  task='both',
  example_id=True):
  '''Flatten one raw question-answering record into a dict of training fields.

  Only the first annotation (json_obj['annotations'][0]) is used, and of its
  short answers only the first one.

  Parameters:
    json_obj: parsed JSON record providing 'annotations', 'question_text',
      'document_text' and, when example_id is true, 'example_id'.
    task: 'classing' (yes/no label only), 'squading' (short answer span only)
      or 'both' (default).
    example_id: whether to copy 'example_id' into the output.

  Keys of the output dictionary, in insertion order:
    'example_id'                # optional
    'question_text'
    'long_answer_text'
    'short_answer_start_token'  # exists only if task == 'both' or 'squading'
    'short_answer_end_token'    # exists only if task == 'both' or 'squading'
    'yes_no_answer'             # exists only if task == 'both' or 'classing'

  The short answer offsets are expressed relative to the first token of the
  long answer; both are -1 when the annotation has no short answer.

  Raises:
    AssertionError: if task is not one of the three supported values.
  '''
  assert task == 'classing' or task == 'squading' or task == 'both'
  annotations = json_obj['annotations'][0]
  long_answer_candidate = annotations['long_answer']
  long_ans_start = long_answer_candidate['start_token']
  long_ans_end = long_answer_candidate['end_token']

  # fields shared by every task
  new_data_d = {}
  if example_id:
    new_data_d['example_id'] = json_obj['example_id']
  new_data_d['question_text'] = json_obj['question_text']
  new_data_d['long_answer_text'] = extracting_text_using_start_end_token_id(
    json_obj['document_text'],
    long_ans_start,
    long_ans_end)

  # squading labels
  if task != 'classing':
    short_answer_candidate = annotations['short_answers']
    if short_answer_candidate:
      first_short_answer = short_answer_candidate[0]
      short_ans_start = first_short_answer['start_token'] - long_ans_start
      short_ans_end = first_short_answer['end_token'] - long_ans_start
    else:
      short_ans_start = -1
      short_ans_end = -1
    new_data_d['short_answer_start_token'] = short_ans_start
    new_data_d['short_answer_end_token'] = short_ans_end

  # classing labels
  if task != 'squading':
    new_data_d['yes_no_answer'] = annotations['yes_no_answer']
  return new_data_d

In [0]:
def create_short_answer_dataset(path):
  '''Build the short answer dataframe from a JSON-lines file.

  Every non-blank line of the file is parsed as one JSON record. Records whose
  first annotation has a long answer (see has_long_answer) are converted with
  data_cleaning_for_short_answer using its default arguments; all other
  records are dropped.

  Parameters:
    path: path of the JSON-lines file to read.

  Returns:
    pandas DataFrame with one row per kept record.
  '''
  short_answer_dataset = []
  # Read as UTF-8 explicitly instead of relying on the platform default.
  with open(path, encoding='utf-8') as f:
    for line in f:
      if not line.strip():
        continue  # tolerate blank lines, e.g. a trailing newline at EOF
      old_data_d = json.loads(line)
      if has_long_answer(old_data_d['annotations'][0]['long_answer']):
        short_answer_dataset.append(data_cleaning_for_short_answer(old_data_d))
  return pd.DataFrame(short_answer_dataset)

In [0]:
# Build the dataframe from the training file, then show its row count and
# column names as a quick sanity check.
raw_df = create_short_answer_dataset(train_path)

print(len(raw_df))
print(raw_df.columns)

### Save Short Answer Dataset to Pickle Format and Export It To Google Drive

In [0]:
# Serialize the raw dataframe to a pickle file in the working directory.
raw_df.to_pickle("./short_ans_raw_df.pkl")

In [0]:
# Copy the pickle into the Drive folder under the /content/drive mount point.
!cp ./short_ans_raw_df.pkl /content/drive/My\ Drive

### Create YES/NO Answer Dataset

In [0]:
# Reload the raw dataframe from Drive and keep only its first 5000 rows
# (presumably to keep the tokenization step below fast — confirm).
raw_df = pd.read_pickle("drive/My Drive/short_ans_raw_df.pkl")[:5000]
raw_df.head()

Unnamed: 0,example_id,question_text,long_answer_text,yes_no_answer
0,5655493461695504401,which is the most common use of opt-in e-mail ...,<P> A common example of permission marketing i...,NONE
1,5328212470870865242,how i.met your mother who is the mother,"<P> Tracy McConnell , better known as `` The M...",NONE
2,4435104480114867852,what type of fertilisation takes place in humans,<P> The process of fertilization involves a sp...,NONE
3,5289242154789678439,who had the most wins in the nfl,<P> Active quarterback Tom Brady holds the rec...,NONE
4,-2500044561429484630,who played mantis guardians of the galaxy 2,<P> Pom Klementieff ( born 3 May 1986 ) is a F...,NONE


### Original Version

In [0]:
def get_question_tokens(tokenizer, question_text):
  '''Tokenize the question and wrap the result as [CLS] ... [SEP].'''
  return ['[CLS]', *tokenizer.tokenize(question_text), '[SEP]']


def get_long_answer_tokens(tokenizer, question_tokens,long_answer_text):
  '''Append the tokenized long answer and a closing [SEP] to the question tokens.

  A new list is returned; question_tokens itself is left untouched.
  '''
  answer_tokens = tokenizer.tokenize(long_answer_text)
  return [*question_tokens, *answer_tokens, '[SEP]']


def get_long_answer_tokens_and_start_end_tokens(
  tokenizer, 
  question_tokens,
  long_answer_text,
  short_answer_start_token, 
  short_answer_end_token
  ):
  '''Tokenize the long answer and locate the short answer among the new tokens.

  The long answer is split on whitespace and cut into three pieces: the words
  before the short answer, the short answer itself, and the words after it.
  Each piece is tokenized on its own, so the positions where the short answer
  begins and ends in the final token list can be read off directly.

  Returns:
    (tokens, label_start_token, label_end_token), where tokens is
    question_tokens followed by the three tokenized pieces and a closing
    '[SEP]', and the two labels are the token list lengths measured right
    before and right after the short answer piece was added.
  '''
  words = long_answer_text.split()
  text_before = ' '.join(words[:short_answer_start_token])
  text_answer = ' '.join(words[short_answer_start_token:short_answer_end_token])
  text_after = ' '.join(words[short_answer_end_token:])

  tokens = list(question_tokens)
  tokens += tokenizer.tokenize(text_before)
  label_start_token = len(tokens)
  tokens += tokenizer.tokenize(text_answer)
  label_end_token = len(tokens)
  tokens += tokenizer.tokenize(text_after)
  tokens.append('[SEP]')
  return tokens, label_start_token, label_end_token


def generate_short_ans_feature(row, task, MAX_LENGTH = 512, tokenizer=None):
  '''Turn one row of the short answer dataframe into fixed-length model inputs.

  Parameters:
    row: object exposing `question_text` and `long_answer_text` attributes,
      plus `yes_no_answer` (unless task == 'squading') and
      `short_answer_start_token` / `short_answer_end_token`
      (unless task == 'classing').
    task: 'classing', 'squading' or 'both'; any value other than the first
      two is handled like 'both'.
    MAX_LENGTH: every returned sequence is truncated or padded to this length.
    tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
      When None, the module-level `tokenizer` is used, which is how this
      function found its tokenizer before the parameter existed.

  Returns:
    dict with keys
      'label_yes_no'        # only if task != 'squading'
      'token_ids'
      'segment_ids'         # 0 for question and padding, 1 for the answer part
      'mask_ids'            # 1 for real tokens, 0 for padding
      'label_start_tokens'  # only if task != 'classing'
      'label_end_tokens'    # only if task != 'classing'
    Both span labels are 0 when the row has no short answer (start == -1) or
    when the span does not fit before the final [SEP] after truncation.

  Raises:
    ValueError: if no tokenizer is passed and no module-level one exists.
    KeyError: if `row.yes_no_answer` is not 'YES', 'NO' or 'NONE'.
  '''
  if tokenizer is None:
    # backward compatibility: fall back to the notebook-global tokenizer
    tokenizer = globals().get('tokenizer')
    if tokenizer is None:
      raise ValueError(
        "no tokenizer given and no module-level `tokenizer` is defined")
  label_yes_no_map = {
    'YES': 0,
    'NO': 1,
    'NONE': 2
  }
  short_ans_feature_dict = {}
  question_text = row.question_text
  long_answer_text = row.long_answer_text
  if task != 'squading':
    short_ans_feature_dict['label_yes_no'] = label_yes_no_map[row.yes_no_answer]
  if task != 'classing':
    short_answer_start_token = row.short_answer_start_token
    short_answer_end_token = row.short_answer_end_token
  question_tokens = get_question_tokens(tokenizer, question_text)
  sentence_A_len = len(question_tokens)
  # tokenize the long answer as a whole if no short answer entity exists
  # (`or` short-circuits, so the span variables are never read for 'classing')
  if task == 'classing' or short_answer_start_token == -1:
    tokens = get_long_answer_tokens(
      tokenizer,
      question_tokens,
      long_answer_text)
    label_start_token = 0
    label_end_token = 0
  # otherwise cut the long answer into 3 chunks to locate the short answer span
  else:
    tokens, label_start_token, label_end_token = get_long_answer_tokens_and_start_end_tokens(
      tokenizer,
      question_tokens,
      long_answer_text,
      short_answer_start_token,
      short_answer_end_token
      )
  sentence_len = len(tokens)
  # apply truncating, keeping [SEP] as the last token
  if sentence_len > MAX_LENGTH:
    tokens = tokens[:MAX_LENGTH-1] + ['[SEP]']
    sentence_len = MAX_LENGTH
  if label_end_token > MAX_LENGTH - 1: # should not exceed last token [SEP]
    label_start_token = 0
    label_end_token = 0
  # A question longer than MAX_LENGTH would otherwise make segment_ids longer
  # than the (truncated) token sequence.
  sentence_A_len = min(sentence_A_len, sentence_len)
  # create segment_id and mask_id
  segment_ids = sentence_A_len * [0] + (sentence_len - sentence_A_len) * [1]
  mask_ids = sentence_len * [1]
  # apply padding
  pad_len = MAX_LENGTH - sentence_len
  if pad_len > 0:
    # Use the tokenizer's own padding token: the ALBERT vocabulary names it
    # '<pad>', so a literal '[PAD]' would be converted to the unknown-token id.
    pad_token = getattr(tokenizer, 'pad_token', None) or '[PAD]'
    tokens = tokens + pad_len * [pad_token]
    segment_ids = segment_ids + pad_len * [0]
    mask_ids = mask_ids + pad_len * [0]
  token_ids = tokenizer.convert_tokens_to_ids(tokens)
  # collect outputs
  short_ans_feature_dict['token_ids'] = token_ids
  short_ans_feature_dict['segment_ids'] = segment_ids
  short_ans_feature_dict['mask_ids'] = mask_ids
  if task != 'classing':
    short_ans_feature_dict['label_start_tokens'] = label_start_token
    short_ans_feature_dict['label_end_tokens'] = label_end_token
  return short_ans_feature_dict


def create_short_ans_features(
  raw_df, 
  tokenizer, 
  task='classing',
  max_length=512):
  '''Tokenize every row of the short answer dataframe.

  parameters:
  raw_df: short answer dataframe
  tokenizer: tokenizer handed to generate_short_ans_feature for every row
  task: 'classing' (default): yes/no answer; 'squading': short answer entity;
    'both': yes/no answer and short answer entity
  max_length: length every token/segment/mask sequence is truncated or
    padded to
  returns:
  dataframe of tokenized short answer dataset
  raises:
  AssertionError if task is unknown or raw_df lacks a column the task needs
  '''
  # assertions
  assert task == 'classing' or task == 'squading' or task == 'both'
  required_columns = []
  if task != 'squading':
    required_columns.append("yes_no_answer")
  if task != 'classing':
    required_columns.append("short_answer_start_token")
    required_columns.append("short_answer_end_token")
  for column in required_columns:
    assert column in raw_df.columns, "missing column: " + column

  dict_list = [
    generate_short_ans_feature(
      row, task, MAX_LENGTH=max_length, tokenizer=tokenizer)
    for _, row in raw_df.iterrows()
  ]
  return pd.DataFrame(dict_list)

In [0]:
# Load the tokenizer of the pretrained 'albert-base-v2' checkpoint.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

In [0]:
# NOTE(review): task='squading' produces span labels (label_start_tokens /
# label_end_tokens) and no yes/no label, although the result is stored as
# `yes_no_ans_df` — confirm which task is intended here.
yes_no_ans_df = create_short_ans_features(raw_df, tokenizer, task='squading')
yes_no_ans_df.head()

1 loop, best of 3: 36.6 s per loop


### Save YES/NO Answer Dataset to Pickle Format and Export It To Google Drive

In [0]:
# Save the tokenized feature dataframe to a pickle file in the working directory.
yes_no_ans_df.to_pickle("./yes_no_ans_df.pkl")

In [0]:
# Copy the pickle into the Drive folder under the /content/drive mount point.
!cp ./yes_no_ans_df.pkl /content/drive/My\ Drive