In [None]:
%autosave 0

In [None]:
!pip install transformers

In [None]:
!pip install ipywidgets widgetsnbextension pandas-profiling

In [None]:
!jupyter nbextension enable --py widgetsnbextension

In [1]:
import tensorflow as tf
import collections
import json
import os
import pandas as pd
import csv
from transformers import DistilBertTokenizer

# --- Dataset schema ---------------------------------------------------------
# Names of the DataFrame columns that hold the review id, the review text and the label.
REVIEW_ID_COLUMN = "review_id"
REVIEW_BODY_COLUMN = "review_body"
LABEL_COLUMN = "star_rating"

# Every star rating that can appear as a label, in ascending order.
LABEL_VALUES = list(range(1, 6))

# --- Tokenization -----------------------------------------------------------
# Sequence length handed to the tokenizer for every review.
max_seq_length = 64

# Pre-trained uncased DistilBERT tokenizer shared by the cells below.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

2024-01-10 08:14:57.670039: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Map each label value to its zero-based position in LABEL_VALUES (the class index).
label_map = {rating: class_index for class_index, rating in enumerate(LABEL_VALUES)}

In [3]:
print(label_map)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4}


In [4]:
class InputFeatures:
    """BERT feature vectors for a single example, plus its identifying metadata.

    Args:
        input_ids: Token ids of the example.
        input_mask: Attention mask paired with ``input_ids``.
        segment_ids: Segment (token type) ids paired with ``input_ids``.
        label_id: Class index of the example's label.
        review_id: Identifier of the review the features were built from.
        date: Timestamp string attached to the example.
        label: Original (un-mapped) label value.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):
        # Model inputs.
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids

        # Training target.
        self.label_id = label_id

        # Metadata carried along unchanged.
        self.review_id = review_id
        self.date = date
        self.label = label

In [5]:
class Input:
    """A single training/test input for sequence classification."""

    def __init__(self, text, review_id, date, label=None):
        """Build one input.

        Args:
            text: string. The untokenized text of the first sequence. For
                single-sequence tasks, only this sequence is specified.
            review_id: Identifier of the review this text belongs to.
            date: Timestamp string attached to the input.
            label: (Optional) The label of the sample. Specify it for training
                and validation samples; omit it for test samples.
        """
        self.text, self.review_id = text, review_id
        self.date, self.label = date, label


In [6]:
def convert_input(the_input, max_seq_length):
    """Convert one ``Input`` into the fixed-length ``InputFeatures`` fed to BERT.

    The module-level ``tokenizer`` does the preprocessing needed to match the
    data format BERT was trained on: lowercasing (for the lowercase model),
    tokenization (e.g. "sally says hi" -> ["sally", "says", "hi"]) and WordPiece
    splitting (e.g. "calling" -> ["call", "##ing"]).

    Args:
        the_input: ``Input`` providing ``text``, ``review_id``, ``date`` and ``label``.
        max_seq_length: Length every returned id/mask/segment list is padded or
            truncated to.

    Returns:
        An ``InputFeatures`` instance for ``the_input``.

    Raises:
        KeyError: If ``the_input.label`` is not a key of ``label_map``.
    """
    # A single encode call is enough; a separate tokenize() pass would only
    # repeat the same work for every row.
    # padding="max_length" is the non-deprecated spelling of pad_to_max_length=True.
    encoded = tokenizer.encode_plus(
        the_input.text,
        padding="max_length",
        max_length=max_seq_length,
        truncation=True,
    )

    # Vocabulary ids of the pre-trained model, one per token
    # (padded with 0 when there are fewer than `max_seq_length` tokens).
    input_ids = encoded["input_ids"]

    # 0/1 flags telling BERT which tokens to attend to; padding positions are 0.
    input_mask = encoded["attention_mask"]

    # Single-sequence tasks such as text classification always use segment id 0.
    # Two-sequence tasks (question answering, next-sentence prediction) use 1
    # for the second sequence.
    segment_ids = [0] * max_seq_length

    # Class index of this row's label (`star_rating` 1-5).
    label_id = label_map[the_input.label]

    return InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id,
        review_id=the_input.review_id,
        date=the_input.date,
        label=the_input.label,
    )


In [7]:
def _int64_feature(values):
    """Wrap a sequence of ints in a ``tf.train.Feature`` holding an ``Int64List``."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))


def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):
    """Convert inputs to BERT features, write them as a TFRecord file, and return them.

    Args:
        inputs: Sized iterable of ``Input`` objects.
        output_file: Path of the TFRecord file to write.
        max_seq_length: Sequence length passed on to ``convert_input``.

    Returns:
        A list with one dict per input holding ``input_ids``, ``input_mask``,
        ``segment_ids``, ``label_id``, ``review_id``, ``date`` and ``label``
        (the full feature record intended for the Feature Store).
        NOTE: the list keeps every record in memory for the whole run.
    """
    records = []

    # The context manager closes the writer even when a conversion raises
    # midway, so the file handle is never leaked.
    with tf.io.TFRecordWriter(output_file) as tf_record_writer:
        for (input_idx, the_input) in enumerate(inputs):
            if input_idx % 10000 == 0:
                print("Writing input {} of {}\n".format(input_idx, len(inputs)))

            features = convert_input(the_input, max_seq_length)

            # TFRecord example with input_ids, input_mask, segment_ids and label_ids.
            all_features = collections.OrderedDict()
            all_features["input_ids"] = _int64_feature(features.input_ids)
            all_features["input_mask"] = _int64_feature(features.input_mask)
            all_features["segment_ids"] = _int64_feature(features.segment_ids)
            all_features["label_ids"] = _int64_feature([features.label_id])

            tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
            tf_record_writer.write(tf_record.SerializeToString())

            # Record with every feature, to be stored in the Feature Store.
            records.append(
                {
                    "input_ids": features.input_ids,
                    "input_mask": features.input_mask,
                    "segment_ids": features.segment_ids,
                    "label_id": features.label_id,
                    "review_id": the_input.review_id,
                    "date": the_input.date,
                    "label": features.label,
                }
            )

    return records

In [8]:
from datetime import datetime, timezone
from time import strftime

# One ISO-8601 timestamp shared by every record created in this run.
# The trailing "Z" means UTC, so the clock must be read in UTC; a naive
# datetime.now() would stamp local time as if it were UTC.
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)

2024-01-10T08:15:42Z


In [9]:
import pandas as pd

# Three hand-written sample reviews; each row is [star_rating, review_id, review_body].
data = [
    [
        5,
        "ABCD12345",
        """I needed an "antivirus" application and know the quality of Norton products.  This was a no brainer for me and I am glad it was so simple to get.""",
    ],
    [
        3,
        "EFGH12345",
        """The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos.""",
    ],
    [
        1,
        "IJKL2345",
        """Terrible, none of my codes worked, and I can't uninstall it.  I think this product IS malware and viruses""",
    ],
]

df = pd.DataFrame(data, columns=["star_rating", "review_id", "review_body"])

# Use the Input class to create one sample per row of the data.
# Every sample gets the same `timestamp` as its date.
inputs = df.apply(
    lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp),
    axis=1,
)

In [10]:
df

Unnamed: 0,star_rating,review_id,review_body
0,5,ABCD12345,"I needed an ""antivirus"" application and know t..."
1,3,EFGH12345,The problem with ElephantDrive is that it requ...
2,1,IJKL2345,"Terrible, none of my codes worked, and I can't..."


In [11]:
inputs

0    <__main__.Input object at 0x7f862b219b10>
1    <__main__.Input object at 0x7f862b219bd0>
2    <__main__.Input object at 0x7f862b219cf0>
dtype: object

In [12]:
# Confirm that date is in ISO-8601 format, as required by the Feature Store specification.
print(inputs[0].date)

2024-01-10T08:15:42Z


In [13]:
output_file = "./data.tfrecord"

In [14]:
records = transform_inputs_to_tfrecord(inputs, output_file, max_seq_length)

Writing input 0 of 3





In [16]:
print(records[0])

{'input_ids': [101, 1045, 2734, 2019, 1000, 3424, 23350, 1000, 4646, 1998, 2113, 1996, 3737, 1997, 10770, 3688, 1012, 2023, 2001, 1037, 2053, 4167, 2121, 2005, 2033, 1998, 1045, 2572, 5580, 2009, 2001, 2061, 3722, 2000, 2131, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'input_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'segment_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label_id': 4, 'review_id': 'ABCD12345', 'date': '2024-01-10T08:15:42Z', 'label': 5}


In [17]:
import pandas as pd

In [18]:
df = pd.read_parquet("/mnt/amazon_reviews_2015.snappy.parquet",columns=["star_rating","review_id","review_body"])

In [19]:
# Hold out 400,000 randomly chosen reviews for validation.
# A fixed random_state makes the draw reproducible across kernel restarts.
validation_df = df.sample(n=400000, random_state=42)

In [20]:
# Draw the 2,000,000 training reviews only from rows that are NOT in the
# validation hold-out. Sampling both sets independently from `df` would let
# the same review land in both and leak validation data into training.
# Sampling index labels first avoids copying the whole frame.
# NOTE(review): assumes the index labels of `df` are unique - confirm.
train_candidates = df.index.difference(validation_df.index)
train_index = pd.Series(train_candidates).sample(n=2000000, random_state=42)
train_df = df.loc[train_index]

In [21]:
validation_df

Unnamed: 0,star_rating,review_id,review_body
11344580,4,b'R1OBPIY8OUIPS8',"b'Daniel Schroeder is a memorysmith, a special..."
5040427,2,b'R1TUJL0R14EIOI',b'We'
41300046,5,b'RJ6BFXOU5P00',b'A little goes a long way. Still have most of...
14772219,3,b'R314JO010XMCAI',b'It flowed nicely and was an interesting read...
35104823,4,b'R2VQTPNCJIGBJQ',"b""a high quality item but don't be fooled by t..."
...,...,...,...
33298676,4,b'RSRHBNKH8NSUX',"b""These were gifted to another and the recipie..."
13192025,5,b'R3E2W390716L9Q',"b""Awesome device.<br />I hated running since s..."
27853670,5,b'R2DXIHY4YGIUIB',"b'I love it. Thank you,'"
26325461,5,b'R20E2X7NNFL41',b'Just about ready to harvest. Corn looks good.'


In [22]:
train_df

Unnamed: 0,star_rating,review_id,review_body
40287687,5,b'R2I81D9GSXEODT',"b'great tub. We never used the sling, but use..."
41204959,5,b'RNFRRCK32GN5W',b'Kept me going from start to finish. A few p...
20760519,3,b'R2HFIXQLHMXPF8',b'Three stars just because I am a huge Stallon...
28134124,1,b'R2B4I8IN8JABMS',b'Tip feel off the first day of use. Complete...
5887886,5,b'R2O8E72T4Y1WLD',b'I use this nice fragrance for refreshing-dai...
...,...,...,...
39514290,5,b'R2749LXIXL021A',"b""Love this cable! I've had several others and..."
28272239,5,b'RRC34AQWEOXZO',"b'I love this, Does want it says.'"
22849981,5,b'R30G7N52K9JZUF',"b""This is my favorite baby name book. We've p..."
12957973,1,b'REKWJMG8V3Q75',"b""This is my review as of 7/28/17:<br />Worst ..."


In [23]:
# Decode raw bytes to str. Only bytes values are touched, so re-running this
# cell leaves already-decoded strings alone instead of corrupting them or raising.
train_df['review_id'] = train_df['review_id'].map(
    lambda value: value.decode("utf-8") if isinstance(value, bytes) else value
)

In [24]:
# Decode raw bytes to str, dropping undecodable bytes. Only bytes values are
# touched, so re-running this cell leaves already-decoded strings alone instead
# of corrupting them or raising.
train_df['review_body'] = train_df['review_body'].map(
    lambda value: value.decode("utf-8", "ignore") if isinstance(value, bytes) else value
)

In [25]:
train_df

Unnamed: 0,star_rating,review_id,review_body
40287687,5,R2I81D9GSXEODT,"great tub. We never used the sling, but use t..."
41204959,5,RNFRRCK32GN5W,Kept me going from start to finish. A few plo...
20760519,3,R2HFIXQLHMXPF8,Three stars just because I am a huge Stallone ...
28134124,1,R2B4I8IN8JABMS,Tip feel off the first day of use. Complete POS
5887886,5,R2O8E72T4Y1WLD,I use this nice fragrance for refreshing-daily.
...,...,...,...
39514290,5,R2749LXIXL021A,Love this cable! I've had several others and t...
28272239,5,RRC34AQWEOXZO,"I love this, Does want it says."
22849981,5,R30G7N52K9JZUF,This is my favorite baby name book. We've pur...
12957973,1,REKWJMG8V3Q75,This is my review as of 7/28/17:<br />Worst pr...


In [26]:
# Decode raw bytes to str. Only bytes values are touched, so re-running this
# cell leaves already-decoded strings alone instead of corrupting them or raising.
validation_df['review_id'] = validation_df['review_id'].map(
    lambda value: value.decode("utf-8") if isinstance(value, bytes) else value
)

In [27]:
# Decode raw bytes to str, dropping undecodable bytes. Only bytes values are
# touched, so re-running this cell leaves already-decoded strings alone instead
# of corrupting them or raising.
validation_df['review_body'] = validation_df['review_body'].map(
    lambda value: value.decode("utf-8", "ignore") if isinstance(value, bytes) else value
)

In [28]:
validation_df

Unnamed: 0,star_rating,review_id,review_body
11344580,4,R1OBPIY8OUIPS8,"Daniel Schroeder is a memorysmith, a specialis..."
5040427,2,R1TUJL0R14EIOI,We
41300046,5,RJ6BFXOU5P00,A little goes a long way. Still have most of i...
14772219,3,R314JO010XMCAI,It flowed nicely and was an interesting read. ...
35104823,4,R2VQTPNCJIGBJQ,a high quality item but don't be fooled by the...
...,...,...,...
33298676,4,RSRHBNKH8NSUX,These were gifted to another and the recipient...
13192025,5,R3E2W390716L9Q,Awesome device.<br />I hated running since sch...
27853670,5,R2DXIHY4YGIUIB,"I love it. Thank you,"
26325461,5,R20E2X7NNFL41,Just about ready to harvest. Corn looks good.


In [29]:
print(f"df Memory Usage: {df.memory_usage(deep=True).sum() / 1024**3} GB")

df Memory Usage: 11.584748897701502 GB


In [30]:
# Use the Input class to create one sample per training review.
# Walking the three columns in lockstep builds the same Series (same index,
# object dtype) as a row-wise DataFrame.apply(axis=1), without creating a
# temporary row Series for each of the millions of rows.
train_inputs = pd.Series(
    [
        Input(label=label, text=text, review_id=review_id, date=timestamp)
        for label, text, review_id in zip(
            train_df[LABEL_COLUMN], train_df[REVIEW_BODY_COLUMN], train_df[REVIEW_ID_COLUMN]
        )
    ],
    index=train_df.index,
    dtype=object,
)

In [31]:
# Use the Input class to create one sample per validation review.
# Walking the three columns in lockstep builds the same Series (same index,
# object dtype) as a row-wise DataFrame.apply(axis=1), without creating a
# temporary row Series for each of the hundreds of thousands of rows.
validation_inputs = pd.Series(
    [
        Input(label=label, text=text, review_id=review_id, date=timestamp)
        for label, text, review_id in zip(
            validation_df[LABEL_COLUMN], validation_df[REVIEW_BODY_COLUMN], validation_df[REVIEW_ID_COLUMN]
        )
    ],
    index=validation_df.index,
    dtype=object,
)

In [None]:
# !pip install pandarallel

In [None]:
# from pandarallel import pandarallel
# import os
# os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
# !pip3 install --no-cache-dir accelerate
# pandarallel.initialize(nb_workers=2, progress_bar=True, use_memory_fs=False)

In [None]:
#def func(x):
#    return lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp)

# inputs = df.parallel_apply(
#     lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp),
#     axis=1
# )

In [32]:
train_inputs

40287687    <__main__.Input object at 0x7f818173b640>
41204959    <__main__.Input object at 0x7f818173be20>
20760519    <__main__.Input object at 0x7f8746463610>
28134124    <__main__.Input object at 0x7f8746463550>
5887886     <__main__.Input object at 0x7f862b218cd0>
                              ...                    
39514290    <__main__.Input object at 0x7f8557b91fc0>
28272239    <__main__.Input object at 0x7f8557b92020>
22849981    <__main__.Input object at 0x7f8557b92080>
12957973    <__main__.Input object at 0x7f8557b920e0>
10631401    <__main__.Input object at 0x7f8557b92140>
Length: 2000000, dtype: object

In [33]:
train_output_file = "/mnt/train_data.tfrecord"

In [34]:
validation_output_file = "/mnt/validation_data.tfrecord"

In [None]:
# Confirm that date is in ISO-8601 format, as required by the Feature Store specification.
# print(train_inputs[0].date)

In [35]:
train_records = transform_inputs_to_tfrecord(train_inputs, train_output_file, max_seq_length)

Writing input 0 of 2000000





Writing input 10000 of 2000000

Writing input 20000 of 2000000

Writing input 30000 of 2000000

Writing input 40000 of 2000000

Writing input 50000 of 2000000

Writing input 60000 of 2000000

Writing input 70000 of 2000000

Writing input 80000 of 2000000

Writing input 90000 of 2000000

Writing input 100000 of 2000000

Writing input 110000 of 2000000

Writing input 120000 of 2000000

Writing input 130000 of 2000000

Writing input 140000 of 2000000

Writing input 150000 of 2000000

Writing input 160000 of 2000000

Writing input 170000 of 2000000

Writing input 180000 of 2000000

Writing input 190000 of 2000000

Writing input 200000 of 2000000

Writing input 210000 of 2000000

Writing input 220000 of 2000000

Writing input 230000 of 2000000

Writing input 240000 of 2000000

Writing input 250000 of 2000000

Writing input 260000 of 2000000

Writing input 270000 of 2000000

Writing input 280000 of 2000000

Writing input 290000 of 2000000

Writing input 300000 of 2000000

Writing input 31000

In [36]:
validation_records = transform_inputs_to_tfrecord(validation_inputs, validation_output_file, max_seq_length)

Writing input 0 of 400000

Writing input 10000 of 400000

Writing input 20000 of 400000

Writing input 30000 of 400000

Writing input 40000 of 400000

Writing input 50000 of 400000

Writing input 60000 of 400000

Writing input 70000 of 400000

Writing input 80000 of 400000

Writing input 90000 of 400000

Writing input 100000 of 400000

Writing input 110000 of 400000

Writing input 120000 of 400000

Writing input 130000 of 400000

Writing input 140000 of 400000

Writing input 150000 of 400000

Writing input 160000 of 400000

Writing input 170000 of 400000

Writing input 180000 of 400000

Writing input 190000 of 400000

Writing input 200000 of 400000

Writing input 210000 of 400000

Writing input 220000 of 400000

Writing input 230000 of 400000

Writing input 240000 of 400000

Writing input 250000 of 400000

Writing input 260000 of 400000

Writing input 270000 of 400000

Writing input 280000 of 400000

Writing input 290000 of 400000

Writing input 300000 of 400000

Writing input 310000 o

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
// Programmatically click the hidden button above; its data-commandlinker-command
// attribute names the "kernelmenu:shutdown" command.
try {
    // `const` keeps the lookup local instead of leaking an implicit global.
    const buttons = document.getElementsByClassName("sm-command-button");
    if (buttons.length > 0) {
        buttons[0].click();
    }
}
catch (err) {
    // Best effort: nothing to do if the button cannot be clicked.
}
</script>