<a href="https://colab.research.google.com/github/cyyeh/kaggle/blob/master/google-qa/google_qa_yesno_albert_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries and Environment Setup

In [61]:
!pip install transformers # AlbertTokenizer



In [0]:
import json
import os
import pandas as pd

# Prepare YES/NO Answer Dataset

In [4]:
# Mount Google Drive at /content/drive so the generated pickles can be
# stored there; Colab prompts for an authorization code on first run.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


Check whether the preprocessed training datasets are already available in your Google Drive. If they are not, run the code in the "Prepare Kaggle Dataset" section.

In [99]:
# Drive is mounted at the absolute path /content/drive and the pickles are
# copied to /content/drive/My Drive, so check the same absolute location
# instead of a path relative to the current working directory.
if os.path.exists('/content/drive/My Drive/yes_no_ans_df.pkl') and \
os.path.exists('/content/drive/My Drive/short_ans_raw_df.pkl'):
  print("Training dataset is available!")
else:
  print("Training dataset is not found, please run code inside the 'Prepare Kaggle Dataset' section.")

Training dataset is available!


## Prepare Kaggle Dataset for [TensorFlow 2.0 Question Answering](https://www.kaggle.com/c/tensorflow2-question-answering)

### Download Question Answering Dataset

In [6]:
import os
os.environ['KAGGLE_USERNAME'] = "chihyuyeh" # username from the json file
os.environ['KAGGLE_KEY'] = "f21b340fc8082977cbf954c80ad69ae1" # key from the json file
!kaggle competitions download -c tensorflow2-question-answering

Downloading simplified-nq-test.jsonl.zip to /content
100% 4.78M/4.78M [00:00<00:00, 49.6MB/s]

Downloading simplified-nq-train.jsonl.zip to /content
100% 4.46G/4.46G [00:58<00:00, 65.2MB/s]
100% 4.46G/4.46G [00:58<00:00, 82.0MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/18.2k [00:00<?, ?B/s]
100% 18.2k/18.2k [00:00<00:00, 15.6MB/s]


In [7]:
# Extract the train/test JSONL files from the downloaded archives into the
# current working directory.
!unzip simplified-nq-train.jsonl.zip
!unzip simplified-nq-test.jsonl.zip

Archive:  simplified-nq-train.jsonl.zip
  inflating: simplified-nq-train.jsonl  
Archive:  simplified-nq-test.jsonl.zip
  inflating: simplified-nq-test.jsonl  


### Generate Short Answer Raw Data Dataframe

In [0]:
# Paths of the extracted JSONL files, relative to the current working directory.
train_path = 'simplified-nq-train.jsonl'
test_path = 'simplified-nq-test.jsonl'

In [0]:
def extracting_text_using_start_end_token_id(document_text, start_token, end_token):
    """Return the whitespace-separated tokens of `document_text` in the
    half-open range [start_token, end_token), joined by single spaces."""
    words = document_text.split()
    selected_words = words[start_token:end_token]
    return ' '.join(selected_words)

In [0]:
def has_long_answer(long_answer_candidate):
  """Tell whether a long-answer annotation points at a real candidate.

  The annotation counts as present only when none of 'start_token',
  'candidate_index' and 'end_token' (checked in that order) equals -1.
  """
  required_fields = ('start_token', 'candidate_index', 'end_token')
  return all(long_answer_candidate[field] != -1 for field in required_fields)

In [0]:
def data_cleaning_for_short_answer(json_obj):
  """Flatten one raw example into the record used for the short-answer dataframe.

  Only the first annotation and, if any, its first short answer are used.
  Short-answer offsets are re-based so that they are relative to the start
  of the long answer; both are -1 when the annotation has no short answer.
  """
  first_annotation = json_obj['annotations'][0]
  long_answer = first_annotation['long_answer']
  long_start, long_end = long_answer['start_token'], long_answer['end_token']
  short_answers = first_annotation['short_answers']

  if short_answers:
    first_short = short_answers[0]
    # shift document-level offsets to offsets inside the long answer
    relative_start = first_short['start_token'] - long_start
    relative_end = first_short['end_token'] - long_start
  else:
    relative_start = relative_end = -1

  return {
        'example_id': json_obj['example_id'],
        'question_text': json_obj['question_text'],
        'long_answer_text': extracting_text_using_start_end_token_id(
            json_obj['document_text'], long_start, long_end
        ),
        'yes_no_answer': first_annotation['yes_no_answer'],
        'short_answer_start_token': relative_start,
        'short_answer_end_token': relative_end
  }

In [0]:
def create_short_answer_dataset(path):
  """Build the short-answer dataframe from a JSON-lines file.

  parameters:
  path: path of a .jsonl file with one JSON example per line
  returns:
  dataframe with one row per example whose first annotation has a long
  answer; examples without a long answer are dropped
  """
  short_answer_dataset = []
  # JSON text is UTF-8 by specification, so do not depend on the platform's
  # default encoding when reading it.
  with open(path, encoding='utf-8') as f:
    for line in f:
      # tolerate blank lines (e.g. a trailing newline at the end of the file)
      if not line.strip():
        continue
      old_data_d = json.loads(line)
      if has_long_answer(old_data_d['annotations'][0]['long_answer']):
        new_data_d = data_cleaning_for_short_answer(old_data_d)
        short_answer_dataset.append(new_data_d)
  return pd.DataFrame(short_answer_dataset)

In [69]:
# Build the short-answer dataframe from the training file; only examples
# that have a long answer are kept.
raw_df = create_short_answer_dataset(train_path)

print(len(raw_df))
raw_df.head()

152148


Unnamed: 0,example_id,question_text,long_answer_text,yes_no_answer,short_answer_start_token,short_answer_end_token
0,5655493461695504401,which is the most common use of opt-in e-mail ...,<P> A common example of permission marketing i...,NONE,8,17
1,5328212470870865242,how i.met your mother who is the mother,"<P> Tracy McConnell , better known as `` The M...",NONE,1,3
2,4435104480114867852,what type of fertilisation takes place in humans,<P> The process of fertilization involves a sp...,NONE,-1,-1
3,5289242154789678439,who had the most wins in the nfl,<P> Active quarterback Tom Brady holds the rec...,NONE,3,5
4,-2500044561429484630,who played mantis guardians of the galaxy 2,<P> Pom Klementieff ( born 3 May 1986 ) is a F...,NONE,1,3


### Save Short Answer Dataset to Pickle Format and Export It To Google Drive

In [0]:
# save the short-answer dataframe to a pickle file in the working directory
raw_df.to_pickle("./short_ans_raw_df.pkl")

In [0]:
# copy the pickle into the root of the mounted Google Drive
!cp ./short_ans_raw_df.pkl /content/drive/My\ Drive

### Create YES/NO Answer Dataset

In [0]:
def create_short_ans_features(raw_df, mode='1', max_length=512):
  '''
  Tokenize (question, long answer) pairs into fixed-length ALBERT inputs.

  parameters:
  raw_df: short answer dataframe with the columns question_text,
    long_answer_text, yes_no_answer, short_answer_start_token and
    short_answer_end_token; the two offsets are whitespace-token indices
    into long_answer_text and -1 marks "no short answer"
  mode: '1'(default): yes/no answer; '2': short answer entity
    (the integers 1 and 2 are accepted as well)
  max_length: number of tokens every example is truncated/padded to
  returns:
  dataframe for tokenized short answer dataset with the columns token_ids,
  segment_ids and mask_ids, plus label_yes_no (mode 1) or
  label_start_tokens / label_end_tokens (mode 2)
  '''
  from transformers import AlbertTokenizer

  tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
  # Take the special tokens from the tokenizer instead of hard-coding
  # BERT-style strings: ALBERT's padding token is '<pad>', so a literal
  # '[PAD]' does not resolve to the real padding id.
  cls_token = tokenizer.cls_token
  sep_token = tokenizer.sep_token
  pad_token = tokenizer.pad_token

  # the docstring documents the modes as 1 / 2, so accept ints and strings
  mode = str(mode)

  short_ans_features_dict = {
    'token_ids': [],
    'segment_ids': [],
    'mask_ids': []
  }

  if mode == '1':
    short_ans_features_dict['label_yes_no'] = []
  elif mode == '2':
    short_ans_features_dict['label_start_tokens'] = []
    short_ans_features_dict['label_end_tokens'] = []

  label_yes_no_map = {
    'YES': 0,
    'NO': 1,
    'NONE': 2
  }

  # Iterate over the rows themselves rather than indexing with
  # raw_df.column[i]: that lookup is label based and only works when the
  # index is exactly 0..n-1 (e.g. it fails after filtering rows).
  for row in raw_df.itertuples(index=False):
    # tokenize question text
    tokens = [cls_token] + tokenizer.tokenize(row.question_text) + [sep_token]
    sentence_A_len = len(tokens)

    # no short answer: tokenize the long answer as a whole
    if row.short_answer_start_token == -1:
      tokens = tokens + tokenizer.tokenize(row.long_answer_text) + [sep_token]
      sentence_len = len(tokens)
      label_start_token = 0
      label_end_token = 0
    # tokenize short answer span
    else:
      # cut long answer into 3 chunks: before / inside / after the short answer
      long_answer = row.long_answer_text.split()
      span_start = row.short_answer_start_token
      span_end = row.short_answer_end_token
      chunk_1 = ' '.join(long_answer[:span_start])
      chunk_2 = ' '.join(long_answer[span_start:span_end])
      chunk_3 = ' '.join(long_answer[span_end:])

      # the sub-word position of the span is the running token count
      # before and after the middle chunk (end is exclusive)
      tokens = tokens + tokenizer.tokenize(chunk_1)
      label_start_token = len(tokens)
      tokens = tokens + tokenizer.tokenize(chunk_2)
      label_end_token = len(tokens)

      tokens = tokens + tokenizer.tokenize(chunk_3) + [sep_token]
      sentence_len = len(tokens)

    # apply truncating, always keeping a closing separator
    if sentence_len > max_length:
      tokens = tokens[:max_length - 1] + [sep_token]
      sentence_len = max_length
    if label_end_token > max_length - 1: # should not exceed last token [SEP]
      label_start_token = 0
      label_end_token = 0

    # create segment_id and mask_id; clamp the question length so that a
    # question longer than max_length cannot yield more ids than tokens
    question_len = min(sentence_A_len, sentence_len)
    segment_ids = question_len * [0] + (sentence_len - question_len) * [1]
    mask_ids = sentence_len * [1]

    # apply padding
    if sentence_len < max_length:
      pad_len = max_length - sentence_len
      tokens = tokens + pad_len * [pad_token]
      segment_ids = segment_ids + pad_len * [0]
      mask_ids = mask_ids + pad_len * [0]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # yes_no_label
    yes_no_label = label_yes_no_map[row.yes_no_answer]

    # append to lists
    short_ans_features_dict['token_ids'].append(token_ids)
    short_ans_features_dict['segment_ids'].append(segment_ids)
    short_ans_features_dict['mask_ids'].append(mask_ids)
    if mode == '1':
      short_ans_features_dict['label_yes_no'].append(yes_no_label)
    elif mode == '2':
      short_ans_features_dict['label_start_tokens'].append(label_start_token)
      short_ans_features_dict['label_end_tokens'].append(label_end_token)

  return pd.DataFrame(short_ans_features_dict)

In [93]:
# Tokenize with the default mode ('1'), which adds the label_yes_no column.
yes_no_ans_df = create_short_ans_features(raw_df)
print(len(yes_no_ans_df))
yes_no_ans_df.head()

152148


Unnamed: 0,token_ids,segment_ids,mask_ids,label_yes_no
0,"[2, 56, 25, 14, 127, 757, 275, 16, 17034, 8, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
1,"[2, 184, 31, 9, 5909, 154, 449, 72, 25, 14, 44...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
2,"[2, 98, 1001, 16, 4270, 8005, 4330, 1384, 209,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
3,"[2, 72, 41, 14, 127, 4041, 19, 14, 4101, 3, 13...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
4,"[2, 72, 257, 169, 3409, 16931, 16, 14, 9358, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2


### Save YES/NO Answer Dataset to Pickle Format and Export It To Google Drive

In [0]:
# save the tokenized yes/no dataframe to a pickle file in the working directory
yes_no_ans_df.to_pickle("./yes_no_ans_df.pkl")

In [0]:
# copy the pickle into the root of the mounted Google Drive
!cp ./yes_no_ans_df.pkl /content/drive/My\ Drive