# OPTION - [Module 2.0] BERT Input 변환 과정 확인
이 노트북은 Text 입력 데이터가 최종적으로 사용할 TF Record 형태로 변환되는 과정을 보여줍니다. 이를 위해 [huggingface의 Transformers](https://github.com/huggingface/transformers)를 사용합니다.

- S3에서 Tweet 데이터 로컬에 다운로드
- Input Text --> BERT Feature Vector 로 변환 --> TF Record로 변환
```python
    # BERT Feature Vector
    features = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id)
```
- 변환된 예시 Tweet 보여줌
---
Reference
- Chris Fregly, Antje Barth, Book, Data Science on AWS, https://www.oreilly.com/library/view/data-science-on/9781492079385/
    - Source: Data Science on Amazon Web Services
        - https://github.com/data-science-on-aws/workshop
- Transformers (https://github.com/huggingface/transformers)
    

In [11]:
import pandas as pd
import re
import boto3
import numpy as np
import os


<h2> Download tweet data from S3 </h2>
이전 노트북에서 업로드한 S3의 데이타 파일 가져오기

In [12]:
# Restore variables saved with %store in earlier notebooks
# (presumably including s3_destination_path_csv, used below -- confirm).
%store -r

In [13]:
# List the objects under the S3 prefix held in s3_destination_path_csv.
! aws s3 ls {s3_destination_path_csv}/

2020-08-16 13:34:55     838266 tweet_file_01.csv.gz
2020-08-16 13:34:55     838236 tweet_file_02.csv.gz


In [14]:
import posixpath

# S3 URIs always use "/" as the separator, so join with posixpath rather than
# os.path, whose separator depends on the host operating system.
file_name_01 = posixpath.join(s3_destination_path_csv, "tweet_file_01.csv.gz")
file_name_02 = posixpath.join(s3_destination_path_csv, "tweet_file_02.csv.gz")
print("file_name_01: ", file_name_01)
print("file_name_02: ", file_name_02)
# Read each gzip-compressed CSV into its own DataFrame.
tweet_file_01_df = pd.read_csv(file_name_01, compression='gzip')
tweet_file_02_df = pd.read_csv(file_name_02, compression='gzip')

file_name_01:  s3://sagemaker-ap-northeast-2-343441690612/tweet_emoticon/csv/tweet_file_01.csv.gz
file_name_02:  s3://sagemaker-ap-northeast-2-343441690612/tweet_emoticon/csv/tweet_file_02.csv.gz


# Process BERT Input

In [15]:
# Upgrade pip itself before installing the notebook's dependencies.
!pip install --upgrade pip
# Upgrade wrapt, ignoring any copy that is already installed.
!pip install -q wrapt --upgrade --ignore-installed
# Install pinned versions of transformers and tensorflow.
!pip install -q transformers==2.8.0
!pip install -q tensorflow==2.1.0

Requirement already up-to-date: pip in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (20.2.2)
[31mERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

astroid 2.3.3 requires wrapt==1.11.*, but you'll have wrapt 1.12.1 which is incompatible.[0m


In [16]:
import tensorflow as tf
import collections
import json
import os
import pandas as pd
import csv
from transformers import DistilBertTokenizer

# Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint;
# convert_input() below uses it to tokenize and encode each text.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Fixed sequence length: every input is padded to this many token ids.
MAX_SEQ_LENGTH = 32
# Raw label values accepted by the label_map lookup below.
LABEL_VALUES = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Map each raw label value to its index in LABEL_VALUES.
label_map = {label: i for (i, label) in enumerate(LABEL_VALUES)}
    
class InputFeatures:
    """Container for the BERT feature vectors of a single example.

    Each constructor argument is stored unchanged on the attribute of the
    same name: ``input_ids``, ``input_mask``, ``segment_ids``, ``label_id``.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id
        
class Input:
    """One raw example for sequence classification: a text and its label."""

    def __init__(self, text, label=None):
        """Store the example's text and label unchanged.

        Args:
          text: the untokenized text of the example.
          label: (Optional) the example's label; defaults to None when the
            caller does not supply one.
        """
        self.text, self.label = text, label
        
def convert_input(text_input):
    """Convert one ``Input`` into ``InputFeatures``, printing each stage.

    The module-level ``tokenizer`` handles tokenization and id encoding, so
    no manual preprocessing of the text is done here.

    Args:
      text_input: object with ``text`` and ``label`` attributes.

    Returns:
      InputFeatures holding the encoded ids, the attention mask, all-zero
      segment ids of length MAX_SEQ_LENGTH, and the mapped label id.

    Raises:
      KeyError: if ``text_input.label`` is not a key of ``label_map``.
    """
    # The token list is printed for illustration only; the ids used below
    # come from encode_plus.
    print('**tokens**\n{}\n'.format(tokenizer.tokenize(text_input.text)))

    # Encode the text, padding the result to MAX_SEQ_LENGTH.
    encoded = tokenizer.encode_plus(text_input.text,
                                    pad_to_max_length=True,
                                    max_length=MAX_SEQ_LENGTH)

    features = InputFeatures(
        input_ids=encoded['input_ids'],
        input_mask=encoded['attention_mask'],
        # Only one sequence per example, so every position is segment 0.
        segment_ids=[0] * MAX_SEQ_LENGTH,
        label_id=label_map[text_input.label])

    print('**input_ids**\n{}\n'.format(features.input_ids))
    print('**input_mask**\n{}\n'.format(features.input_mask))
    print('**segment_ids**\n{}\n'.format(features.segment_ids))
    print('**label_id**\n{}\n'.format(features.label_id))

    return features
    
# We'll need to transform our data into a format that BERT understands
# - `text` is the text we want to classify, which in this case is the `TWEET` column in our DataFrame.
# - `label` is the class label (0, 1, 2, ..., 9) from the `LABEL` column of our training input data

def transform_inputs_to_tfrecord(inputs):
    """Serialize every input as a ``tf.train.Example`` and collect the bytes.

    Args:
      inputs: sized iterable of objects accepted by ``convert_input``.

    Returns:
      A list with one serialized ``tf.train.Example`` per input, in input
      order. Each example carries the int64 features ``input_ids``,
      ``input_mask``, ``segment_ids`` and ``label_ids``. Nothing is written
      to disk here.
    """
    def _int64_feature(values):
        # Wrap a list of ints as an int64 tf.train.Feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    serialized = []
    for position, example_input in enumerate(inputs):
        # Progress message every 10,000 inputs (including the first).
        if position % 10000 == 0:
            print('Writing input {} of {}\n'.format(position, len(inputs)))

        feats = convert_input(example_input)

        feature_map = collections.OrderedDict([
            ('input_ids', _int64_feature(feats.input_ids)),
            ('input_mask', _int64_feature(feats.input_mask)),
            ('segment_ids', _int64_feature(feats.segment_ids)),
            # label_id is a scalar, so wrap it in a one-element list.
            ('label_ids', _int64_feature([feats.label_id])),
        ])

        example = tf.train.Example(features=tf.train.Features(feature=feature_map))
        serialized.append(example.SerializeToString())

    return serialized
        

## Show BERT feature vectors

아래는 "Tweet"의 **자연어**가 Transformer의 입력 형태인 **token, input_id, input_mask, segment_id, label_id**로 변환되는 것을 보여줍니다.

In [17]:
# Names of the columns holding the tweet text and its label.
DATA_COLUMN = 'TWEET'
LABEL_COLUMN = 'LABEL'

# Keep only the first row of each file, as a one-row DataFrame.
df01_sample = tweet_file_01_df[:1]
df02_sample = tweet_file_02_df[:1]


In [18]:
# Show the raw tweet, then wrap each sample row in an Input and convert it.
print("Tweet: \n {}".format(df01_sample.loc[0, DATA_COLUMN]))
inputs = df01_sample.apply(
    lambda row: Input(text=row[DATA_COLUMN], label=row[LABEL_COLUMN]),
    axis=1,
)
tf_records = transform_inputs_to_tfrecord(inputs)

Tweet: 
  street hood world 2016 by on soundcloud prod by yoshimi tcprt bzbrt 4 hiphop music
Writing input 0 of 1

**tokens**
['street', 'hood', 'world', '2016', 'by', 'on', 'sound', '##cl', '##oud', 'pro', '##d', 'by', 'yo', '##shi', '##mi', 'tc', '##pr', '##t', 'b', '##z', '##br', '##t', '4', 'hip', '##hop', 'music']

**input_ids**
[101, 2395, 7415, 2088, 2355, 2011, 2006, 2614, 20464, 19224, 4013, 2094, 2011, 10930, 6182, 4328, 22975, 18098, 2102, 1038, 2480, 19892, 2102, 1018, 5099, 18471, 2189, 102, 0, 0, 0, 0]

**input_mask**
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]

**segment_ids**
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

**label_id**
2



In [19]:
# Show the raw tweet, then wrap each sample row in an Input and convert it.
print("Tweet: \n {}".format(df02_sample.loc[0, DATA_COLUMN]))
inputs = df02_sample.apply(
    lambda row: Input(text=row[DATA_COLUMN], label=row[LABEL_COLUMN]),
    axis=1,
)
tf_records = transform_inputs_to_tfrecord(inputs)

Tweet: 
  r g will never let you down yesterday was amazing
Writing input 0 of 1

**tokens**
['r', 'g', 'will', 'never', 'let', 'you', 'down', 'yesterday', 'was', 'amazing']

**input_ids**
[101, 1054, 1043, 2097, 2196, 2292, 2017, 2091, 7483, 2001, 6429, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

**input_mask**
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

**segment_ids**
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

**label_id**
2



In [20]:
# Display the first serialized tf.train.Example from the previous cell.
tf_records[0]

b'\n\xba\x01\n3\n\x0bsegment_ids\x12$\x1a"\n \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n;\n\tinput_ids\x12.\x1a,\n*e\x9e\x08\x93\x08\xb1\x10\x94\x11\xf4\x11\xe1\x0f\xab\x10\xbb:\xd1\x0f\x9d2f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n2\n\ninput_mask\x12$\x1a"\n \x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n\x12\n\tlabel_ids\x12\x05\x1a\x03\n\x01\x02'