# [Module 1.5] BERT Input 변환 과정 확인
이 노트북은 Text 입력 데이터가 최종적으로 사용할 TFRecord 형태로 변환되는 과정을 보여줌. 이를 위해 Hugging Face의 Transformers를 사용함.

- S3에서 Tweet 데이터 로컬에 다운로드
- Input Text --> BERT Feature Vector 로 변환 --> TF Record로 변환
```python
    # BERT Feature Vector
    features = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id)
```
- 변환된 예시 Tweet 보여줌
---
Reference
- Chris Fregly, Antje Barth, Book, Data Science on AWS, https://www.oreilly.com/library/view/data-science-on/9781492079385/
    - Source: Data Science on Amazon Web Services
        - https://github.com/data-science-on-aws/workshop
- Transformers (https://github.com/huggingface/transformers)
    

In [1]:
import pandas as pd
import re
import boto3
import numpy as np
import os

from TweetInput import TweetInput

<h2> Download tweet data from S3 </h2>
이전 노트북에서 S3에 업로드한 데이터 파일 가져오기

In [2]:
%store -r

In [3]:
! aws s3 ls {s3_destination_path_csv}/

2020-07-21 12:28:31    2761552 tweet_file_01.csv.gz
2020-07-21 12:28:31    2749596 tweet_file_02.csv.gz
2020-07-19 13:06:40    6790624 tweet_file_test.csv


In [4]:
# Build the S3 locations of the two gzip-compressed tweet CSV files.
file_name_01, file_name_02 = (
    os.path.join(s3_destination_path_csv, csv_name)
    for csv_name in ("tweet_file_01.csv.gz", "tweet_file_02.csv.gz")
)
print("file_name_01: ", file_name_01)
print("file_name_02: ", file_name_02)

# Load each file into its own DataFrame; the files are gzip-compressed CSVs.
tweet_file_01_df = pd.read_csv(file_name_01, compression='gzip')
tweet_file_02_df = pd.read_csv(file_name_02, compression='gzip')

file_name_01:  s3://sagemaker-us-west-2-057716757052/tweet_emoticon/csv/tweet_file_01.csv.gz
file_name_02:  s3://sagemaker-us-west-2-057716757052/tweet_emoticon/csv/tweet_file_02.csv.gz


# Process BERT Input

In [5]:
!pip install --upgrade pip
!pip install -q wrapt --upgrade --ignore-installed
!pip install -q transformers==2.8.0
!pip install -q tensorflow==2.1.0

Requirement already up-to-date: pip in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (20.1.1)
[31mERROR: astroid 2.3.3 has requirement wrapt==1.11.*, but you'll have wrapt 1.12.1 which is incompatible.[0m


In [6]:
import tensorflow as tf
import collections
import json
import os
import pandas as pd
import csv
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Upper bound on the token length of an encoded sequence.
MAX_SEQ_LENGTH = 128
# Label values recognised by label_map below.
LABEL_VALUES = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Maps each label value to its position in LABEL_VALUES.
label_map = {value: position for position, value in enumerate(LABEL_VALUES)}
    
class InputFeatures(object):
    """Plain container for the BERT feature vectors of one example.

    The constructor stores its four arguments unchanged as attributes of the
    same name: ``input_ids``, ``input_mask``, ``segment_ids`` and ``label_id``.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id
        
class Input(object):
    """One raw training/test example for sequence classification."""

    def __init__(self, text, label=None):
        """Store the raw text together with its optional label.

        Args:
          text: The untokenized text of the first sequence. For single
            sequence tasks this is the only sequence that must be given.
          label: (Optional) The label of the example. Expected for train and
            dev examples; may be left as None for test examples.
        """
        self.label = label
        self.text = text
        
def convert_input(text_input):
    """Convert one input into BERT feature vectors, printing each of them.

    The Transformers tokenizer performs the preprocessing needed to match the
    data BERT was trained on:
      1. lowercasing the text (when a lowercase BERT model is used),
      2. tokenizing it (i.e. "sally says hi" -> ["sally", "says", "hi"]),
      3. breaking words into WordPieces (i.e. "calling" -> ["call", "##ing"]).

    Args:
      text_input: object exposing ``text`` (the raw string) and ``label``
        (a key of ``label_map``).

    Returns:
      An ``InputFeatures`` whose ``input_ids`` and ``input_mask`` come from
      ``tokenizer.encode_plus``, whose ``segment_ids`` is a list of
      ``MAX_SEQ_LENGTH`` zeros, and whose ``label_id`` is the mapped label.

    Raises:
      KeyError: if ``text_input.label`` is not a key of ``label_map``.
    """
    raw_text = text_input.text

    # The token list is only printed; the ids below come from encode_plus.
    print('**tokens**\n{}\n'.format(tokenizer.tokenize(raw_text)))

    encoded = tokenizer.encode_plus(raw_text,
                                    pad_to_max_length=True,
                                    max_length=MAX_SEQ_LENGTH)

    features = InputFeatures(
        input_ids=encoded['input_ids'],
        input_mask=encoded['attention_mask'],
        # Only one sequence is encoded, so every position gets segment id 0.
        segment_ids=[0] * MAX_SEQ_LENGTH,
        label_id=label_map[text_input.label])

    print('**input_ids**\n{}\n'.format(features.input_ids))
    print('**input_mask**\n{}\n'.format(features.input_mask))
    print('**segment_ids**\n{}\n'.format(features.segment_ids))
    print('**label_id**\n{}\n'.format(features.label_id))

    return features
    
# We'll need to transform our data into a format that BERT understands
# - `text` is the text we want to classify, which in this case, is the `TWEET` column in our Dataframe. 
# - `label` is the class label (0, 1, 2, 3, 4, 5, ..,9) from the `LABEL` column for our training input data

def transform_inputs_to_tfrecord(inputs):
    """Serialize every input as a ``tf.train.Example`` byte string.

    Args:
      inputs: sized iterable of objects accepted by ``convert_input``.

    Returns:
      A list with one serialized ``tf.train.Example`` per input, in input
      order. Each example carries the int64 features ``input_ids``,
      ``input_mask``, ``segment_ids`` and ``label_ids``.
    """

    def int64_feature(values):
        # Wrap a list of ints in the Feature message that Example expects.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    serialized_examples = []
    for position, example in enumerate(inputs):
        # Progress message every 10,000 inputs (including the very first).
        if position % 10000 == 0:
            print('Writing input {} of {}\n'.format(position, len(inputs)))

        features = convert_input(example)

        feature_map = collections.OrderedDict([
            ('input_ids', int64_feature(features.input_ids)),
            ('input_mask', int64_feature(features.input_mask)),
            ('segment_ids', int64_feature(features.segment_ids)),
            # label_id is a scalar, so it is wrapped in a one-element list.
            ('label_ids', int64_feature([features.label_id])),
        ])

        example_proto = tf.train.Example(
            features=tf.train.Features(feature=feature_map))
        serialized_examples.append(example_proto.SerializeToString())

    return serialized_examples
        

## Show BERT feature vectors

In [7]:
# Column names of the tweet text and of its label in the loaded DataFrames.
DATA_COLUMN = 'TWEET'
LABEL_COLUMN = 'LABEL'

# Keep only the first row of each file as a one-row DataFrame for the demo.
df01_sample = tweet_file_01_df.head(1)
df02_sample = tweet_file_02_df.head(1)


In [8]:
# Show the sampled tweet from file 01, then run it through the conversion.
print("Tweet: \n {}".format(df01_sample[DATA_COLUMN][0]))

# Wrap every row of the sample in an Input (text + label).
inputs = df01_sample.apply(
    lambda row: Input(text=row[DATA_COLUMN], label=row[LABEL_COLUMN]),
    axis=1,
)
tf_records = transform_inputs_to_tfrecord(inputs)

Tweet: 
 @atlhawks chance the rapper or kent bazemore chance3 coloringbook twins
Writing input 0 of 1

**tokens**
['@', 'at', '##l', '##hawks', 'chance', 'the', 'rapper', 'or', 'kent', 'ba', '##ze', '##more', 'chance', '##3', 'coloring', '##book', 'twins']

**input_ids**
[101, 1030, 2012, 2140, 16043, 3382, 1996, 10687, 2030, 5982, 8670, 4371, 5974, 3382, 2509, 22276, 8654, 8178, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

**input_mask**
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [9]:
# Show the sampled tweet from file 02, then run it through the conversion.
print("Tweet: \n {}".format(df02_sample[DATA_COLUMN][0]))

# Wrap every row of the sample in an Input (text + label).
inputs = df02_sample.apply(
    lambda row: Input(text=row[DATA_COLUMN], label=row[LABEL_COLUMN]),
    axis=1,
)
tf_records = transform_inputs_to_tfrecord(inputs)

Tweet: 
 that song from tokyo drift makes me wanna drift and drive super bad ass
Writing input 0 of 1

**tokens**
['that', 'song', 'from', 'tokyo', 'drift', 'makes', 'me', 'wanna', 'drift', 'and', 'drive', 'super', 'bad', 'ass']

**input_ids**
[101, 2008, 2299, 2013, 5522, 11852, 3084, 2033, 10587, 11852, 1998, 3298, 3565, 2919, 4632, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

**input_mask**
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [10]:
# Display the first element of tf_records: a serialized tf.train.Example
# byte string as returned by transform_inputs_to_tfrecord.
tf_records[0]

b'\n\xea\x03\n\x96\x01\n\x0bsegment_ids\x12\x86\x01\x1a\x83\x01\n\x80\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n\xa2\x01\n\tinput_ids\x12\x94\x01\x1a\x91\x01\n\x8e\x01e\xd8\x0f\xfb\x11\xdd\x0f\x92+\xcc\\\x8c\x18\xf1\x0f\xdbR\xcc\\\xce\x0f\xe2\x19\xed\x1b\xe7\x16\x98$f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\