# Tweet input processing: download, inspect, and store to S3

In [1]:
import pandas as pd
import re
import boto3
import numpy as np
import os

from TweetInput import TweetInput

<h2>Download tweet data from S3</h2>
<p>Location: https://201912-mbp-gsmoon.s3.us-east-2.amazonaws.com/emoji_data/emojis.csv</p>

In [2]:
# Reload the variables that were persisted earlier with `%store`.
# NOTE(review): s3_destination_path_csv is used below but never assigned in this
# notebook, so it presumably comes from this restore — confirm which notebook stores it.
%store -r

In [3]:
# Echo the S3 prefix that the tweet CSV files are read from below.
s3_destination_path_csv


's3://sagemaker-us-east-2-057716757052/tweet_emoticon/csv'

In [4]:
# List the objects stored under the CSV prefix.
! aws s3 ls {s3_destination_path_csv}/

2020-06-22 12:33:51    6704694 tweet_file_01.csv
2020-06-22 12:33:51    6715384 tweet_file_02.csv


In [5]:
# S3 URIs always use "/" as the separator, so build them with plain string
# formatting instead of os.path.join (which follows the local OS convention).
# Stripping a trailing "/" keeps the result free of a doubled separator.
s3_csv_prefix = s3_destination_path_csv.rstrip("/")
file_name_01 = "{}/{}".format(s3_csv_prefix, "tweet_file_01.csv")
file_name_02 = "{}/{}".format(s3_csv_prefix, "tweet_file_02.csv")

# Read both CSV files from S3 into DataFrames.
tweet_file_01_df = pd.read_csv(file_name_01)
tweet_file_02_df = pd.read_csv(file_name_02)

## Make two input files

In [6]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() only appends a stray "None" after each summary.
tweet_file_01_df.info()
tweet_file_02_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116038 entries, 0 to 116037
Data columns (total 2 columns):
TWEET    116038 non-null object
LABEL    116038 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116039 entries, 0 to 116038
Data columns (total 2 columns):
TWEET    116039 non-null object
LABEL    116039 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.8+ MB
None


# Process BERT Input

In [7]:
# Install the libraries needed by the BERT preprocessing cells below.
# NOTE(review): inside a notebook `%pip install ...` is usually preferred over `!pip`
# because it targets the running kernel's environment — confirm the installed
# IPython version supports it before switching.
!pip install --upgrade pip
# NOTE(review): presumably forces a fresh wrapt so the tensorflow install does not
# trip over a pre-installed copy — confirm this is still required.
!pip install -q wrapt --upgrade --ignore-installed
# Versions are pinned.
# NOTE(review): the `pad_to_max_length` tokenizer argument used later was deprecated
# in later transformers releases — revisit that call before bumping the pin.
!pip install -q transformers==2.8.0
!pip install -q tensorflow==2.1.0

Requirement already up-to-date: pip in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (20.1.1)


In [8]:
import tensorflow as tf
import collections
import json
import os
import pandas as pd
import csv
from transformers import DistilBertTokenizer

# Load the pretrained uncased DistilBERT tokenizer; convert_input() below reads this global.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Length every encoded sequence is padded to; also passed to the tokenizer as max_length.
MAX_SEQ_LENGTH = 128
# Label values accepted by the preprocessing code.
LABEL_VALUES = list(range(10))

# Lookup from a raw label value to its zero-based position in LABEL_VALUES.
label_map = {label: i for i, label in enumerate(LABEL_VALUES)}
    
class InputFeatures:
    """
    Holder for the BERT feature vectors of a single example.

    The constructor stores its four arguments unchanged:
      input_ids:   token ids of the encoded sequence
      input_mask:  mask aligned with ``input_ids``
      segment_ids: segment ids aligned with ``input_ids``
      label_id:    index of the example's label
    """
    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id
        
class Input:
    """
    One raw example for sequence classification: a text plus an optional label.
    """
    def __init__(self, text, label=None):
        """Store the example's fields unchanged.
        Args:
          text: the untokenized text of the single sequence to classify.
          label: (Optional) the label of the example. Supply it for train and
            dev examples; it defaults to None for unlabeled (test) examples.
        """
        self.label = label
        self.text = text
        
def convert_input(text_input, verbose=True):
    """Encode one example into fixed-length BERT feature vectors.

    The Transformers tokenizer takes care of the preprocessing BERT expects:
    lowercasing (for an uncased model), tokenizing
    (e.g. "sally says hi" -> ["sally", "says", "hi"]) and splitting words into
    WordPieces (e.g. "calling" -> ["call", "##ing"]).

    Args:
        text_input: object exposing ``text`` (the raw string) and ``label``
            (a key of the module-level ``label_map``).
        verbose: when True (the default, which keeps the previous behaviour)
            print the tokens and every feature vector. Pass False when
            converting many inputs to avoid the extra tokenization pass and
            the per-input output.

    Returns:
        InputFeatures holding ``input_ids`` and ``input_mask`` from the
        tokenizer (encoded with ``max_length=MAX_SEQ_LENGTH`` and padding
        enabled), an all-zero ``segment_ids`` list of length MAX_SEQ_LENGTH,
        and the mapped ``label_id``.

    Raises:
        KeyError: if ``text_input.label`` is not a key of ``label_map``.
    """
    if verbose:
        # This separate tokenize() call exists only for display; encode_plus()
        # below tokenizes the text on its own.
        tokens = tokenizer.tokenize(text_input.text)
        print('**tokens**\n{}\n'.format(tokens))

    encode_plus_tokens = tokenizer.encode_plus(text_input.text,
                                               pad_to_max_length=True,
                                               max_length=MAX_SEQ_LENGTH)

    input_ids = encode_plus_tokens['input_ids']
    input_mask = encode_plus_tokens['attention_mask']
    # Only one sequence is encoded per input, so every position is segment 0.
    segment_ids = [0] * MAX_SEQ_LENGTH

    label_id = label_map[text_input.label]

    features = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id)

    if verbose:
        print('**input_ids**\n{}\n'.format(features.input_ids))
        print('**input_mask**\n{}\n'.format(features.input_mask))
        print('**segment_ids**\n{}\n'.format(features.segment_ids))
        print('**label_id**\n{}\n'.format(features.label_id))

    return features
    
# We'll need to transform our data into a format that BERT understands
# - `text` is the text we want to classify, which in this case is the `TWEET` column of our DataFrame.
# - `label` is the integer class label (0, 1, 2, ..., 9) from the `LABEL` column of our training input data

def transform_inputs_to_tfrecord(inputs):
    """Serialize each input into a ``tf.train.Example`` byte string.

    Args:
        inputs: sized iterable of objects accepted by ``convert_input``.

    Returns:
        A list with one serialized ``tf.train.Example`` per input, in input
        order. Each example carries the int64 features ``input_ids``,
        ``input_mask``, ``segment_ids`` and ``label_ids``.
    """
    def as_int64_feature(values):
        # Wrap a list of ints in the protobuf Feature message.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    serialized_examples = []
    for position, item in enumerate(inputs):
        # Progress message on every 10000th input (including the first).
        if position % 10000 == 0:
            print('Writing input {} of {}\n'.format(position, len(inputs)))

        vectors = convert_input(item)

        feature_map = collections.OrderedDict([
            ('input_ids', as_int64_feature(vectors.input_ids)),
            ('input_mask', as_int64_feature(vectors.input_mask)),
            ('segment_ids', as_int64_feature(vectors.segment_ids)),
            # label_id is a scalar, so wrap it in a one-element list.
            ('label_ids', as_int64_feature([vectors.label_id])),
        ])

        example = tf.train.Example(features=tf.train.Features(feature=feature_map))
        serialized_examples.append(example.SerializeToString())

    return serialized_examples
        

## Show BERT feature vectors

In [9]:
# Names of the tweet-text column and the label column in both DataFrames.
DATA_COLUMN, LABEL_COLUMN = 'TWEET', 'LABEL'

# Keep only the first two rows of each file for the demonstration cells.
df01_sample = tweet_file_01_df.iloc[:2]
df02_sample = tweet_file_02_df.iloc[:2]


In [10]:

# Wrap each sampled row of file 01 in an Input, then encode and serialize them.
inputs = df01_sample.apply(
    lambda row: Input(text=row[DATA_COLUMN], label=row[LABEL_COLUMN]),
    axis=1,
)
tf_records = transform_inputs_to_tfrecord(inputs)

Writing input 0 of 2

**tokens**
['@', 'at', '##l', '##hawks', 'chance', 'the', 'rapper', 'or', 'kent', 'ba', '##ze', '##more', 'chance', '##3', 'coloring', '##book', 'twins']

**input_ids**
[101, 1030, 2012, 2140, 16043, 3382, 1996, 10687, 2030, 5982, 8670, 4371, 5974, 3382, 2509, 22276, 8654, 8178, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

**input_mask**
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [11]:
# Same demonstration for the file 02 sample; this rebinds `inputs` and `tf_records`.
inputs = df02_sample.apply(
    lambda row: Input(text=row[DATA_COLUMN], label=row[LABEL_COLUMN]),
    axis=1,
)
tf_records = transform_inputs_to_tfrecord(inputs)

Writing input 0 of 2

**tokens**
['@', 'eli', '##ssar', '##ene', '##ee', 'lo', '##l', 'that', "'", 's', 'how', 'you', 'know', 'a', 'movie', 'is', 'good']

**input_ids**
[101, 1030, 12005, 25556, 8625, 4402, 8840, 2140, 2008, 1005, 1055, 2129, 2017, 2113, 1037, 3185, 2003, 2204, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

**input_mask**
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [12]:
# Show the first serialized tf.train.Example from the most recent transform call.
tf_records[0]

b'\n\xee\x03\n\x96\x01\n\x0bsegment_ids\x12\x86\x01\x1a\x83\x01\n\x80\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n\xa6\x01\n\tinput_ids\x12\x98\x01\x1a\x95\x01\n\x92\x01e\x86\x08\xe5]\xd4\xc7\x01\xb1C\xb2"\x88E\xdc\x10\xd8\x0f\xed\x07\x9f\x08\xd1\x10\xe1\x0f\xc1\x10\x8d\x08\xf1\x18\xd3\x0f\x9c\x11f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00