In [1]:
#%autosave 0

In [2]:
!pip install transformers

[0m

In [3]:
!pip install ipywidgets widgetsnbextension pandas-profiling



[0m

In [4]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [5]:
import tensorflow as tf
import collections
import json
import os
import pandas as pd
import csv
from transformers import DistilBertTokenizer

# Names of the dataset columns read by the rest of the notebook.
REVIEW_ID_COLUMN = "review_id"
REVIEW_BODY_COLUMN = "review_body"
LABEL_COLUMN = "star_rating"

# Every value the label column is expected to take (star ratings 1 through 5).
LABEL_VALUES = [1, 2, 3, 4, 5]

# Length, in tokens, that convert_input passes to the tokenizer as `max_length`.
max_seq_length = 64

# Pretrained uncased DistilBERT tokenizer shared by every conversion call.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

2024-01-13 09:53:31.881946: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Map each label value to its zero-based class index, i.e. its position in LABEL_VALUES.
label_map = {rating: class_index for class_index, rating in enumerate(LABEL_VALUES)}

In [7]:
print(label_map)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4}


In [8]:
class InputFeatures:
    """BERT feature vectors for one example, together with the example's metadata.

    Every constructor argument is stored unchanged on the attribute of the same name.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):
        # Paired assignments keep the attributes in the same order as the parameters.
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id
        self.review_id, self.date = review_id, date
        self.label = label

In [9]:
class Input:
    """A single training/test example used for sequence classification."""

    def __init__(self, text, review_id, date, label=None):
        """Store the raw fields of one example.

        Args:
          text: string. Untokenized text of the first sequence. Single-sequence
            tasks specify only this sequence.
          review_id: stored unchanged as ``self.review_id``.
          date: stored unchanged as ``self.date``.
          label: (optional) label of the example. Specify it for training and
            validation examples; leave it unset for test examples.
        """
        self.text, self.review_id = text, review_id
        self.date, self.label = date, label


In [10]:
def convert_input(the_input, max_seq_length):
    """Convert one example into fixed-length BERT features.

    The preprocessing BERT expects (lower-casing for the uncased model,
    tokenization, WordPiece splitting such as "calling" -> ["call", "##ing"])
    is delegated to the module-level Transformers ``tokenizer``.

    Args:
        the_input: object exposing ``text``, ``label``, ``review_id`` and ``date``.
        max_seq_length: length every returned vector is padded or truncated to.

    Returns:
        An ``InputFeatures`` holding ``input_ids``, ``input_mask`` and
        ``segment_ids`` plus the label index and the example's metadata.

    Raises:
        KeyError: if ``the_input.label`` is not a key of ``label_map``
            (this includes the default ``label=None``).
    """
    # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
    # The text is encoded exactly once; the earlier separate `tokenize()` call
    # only fed a commented-out debug print and doubled the work per review.
    encoded = tokenizer.encode_plus(
        the_input.text,
        padding="max_length",
        max_length=max_seq_length,
        truncation=True,
    )

    return InputFeatures(
        # Vocabulary ids of the tokens; positions past the text are padding.
        input_ids=encoded["input_ids"],
        # 1 for tokens BERT should attend to, 0 for the padded positions.
        input_mask=encoded["attention_mask"],
        # Single-sequence task, so every position belongs to segment 0.
        # Two-sequence tasks (question answering, next-sentence prediction)
        # would mark the second sequence with 1.
        segment_ids=[0] * max_seq_length,
        # Class index of this row's label (`star_rating` 1-5).
        label_id=label_map[the_input.label],
        review_id=the_input.review_id,
        date=the_input.date,
        label=the_input.label,
    )


In [11]:
def _int64_feature(values):
    """Wrap a list of ints in a ``tf.train.Feature`` backed by an ``Int64List``."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))


def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):
    """Convert examples to BERT features and write them to a TFRecord file.

    Args:
        inputs: sized iterable of examples accepted by ``convert_input``.
        output_file: path of the TFRecord file to write.
        max_seq_length: sequence length forwarded to ``convert_input``.

    Returns:
        A list with one dict per example containing ``input_ids``,
        ``input_mask``, ``segment_ids``, ``label_id``, ``review_id``, ``date``
        and ``label`` (the full record intended for the Feature Store).
    """
    records = []

    # The context manager closes the writer even when a conversion raises,
    # so the file handle is never leaked on failure.
    with tf.io.TFRecordWriter(output_file) as tf_record_writer:
        for (input_idx, the_input) in enumerate(inputs):
            if input_idx % 10000 == 0:
                print("Writing input {} of {}\n".format(input_idx, len(inputs)))

            features = convert_input(the_input, max_seq_length)

            # The TFRecord carries only what the model consumes:
            # input_ids, input_mask, segment_ids and label_ids.
            all_features = collections.OrderedDict()
            all_features["input_ids"] = _int64_feature(features.input_ids)
            all_features["input_mask"] = _int64_feature(features.input_mask)
            all_features["segment_ids"] = _int64_feature(features.segment_ids)
            all_features["label_ids"] = _int64_feature([features.label_id])

            tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
            tf_record_writer.write(tf_record.SerializeToString())

            # The Feature Store record additionally keeps the review metadata.
            records.append(
                {
                    "input_ids": features.input_ids,
                    "input_mask": features.input_mask,
                    "segment_ids": features.segment_ids,
                    "label_id": features.label_id,
                    "review_id": the_input.review_id,
                    "date": the_input.date,
                    "label": features.label,
                }
            )

    return records

In [12]:
from datetime import datetime, timezone
from time import strftime

# Event time attached to every example, formatted as ISO-8601.
# The trailing "Z" designates UTC, so the clock is read in UTC; using the
# kernel's local time here would mislabel the value on any non-UTC machine.
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)

2024-01-13T09:53:42Z


In [13]:
import pandas as pd

# Three hand-written reviews, one row each: [star_rating, review_id, review_body].
data = [
    [5, "ABCD12345", """I needed an "antivirus" application and know the quality of Norton products.  This was a no brainer for me and I am glad it was so simple to get."""],
    [3, "EFGH12345", """The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos."""],
    [1, "IJKL2345", """Terrible, none of my codes worked, and I can't uninstall it.  I think this product IS malware and viruses"""],
]

df = pd.DataFrame(data, columns=["star_rating", "review_id", "review_body"])

# Build one Input example per row of the sample frame.
inputs = df.apply(
    lambda row: Input(
        text=row[REVIEW_BODY_COLUMN],
        review_id=row[REVIEW_ID_COLUMN],
        date=timestamp,
        label=row[LABEL_COLUMN],
    ),
    axis=1,
)

In [14]:
df

Unnamed: 0,star_rating,review_id,review_body
0,5,ABCD12345,"I needed an ""antivirus"" application and know t..."
1,3,EFGH12345,The problem with ElephantDrive is that it requ...
2,1,IJKL2345,"Terrible, none of my codes worked, and I can't..."


In [15]:
inputs

0    <__main__.Input object at 0x7f9585115a50>
1    <__main__.Input object at 0x7f9585115a80>
2    <__main__.Input object at 0x7f9585115d20>
dtype: object

In [16]:
# Confirm that the date is in ISO-8601 format, as the Feature Store specification requires.
print(inputs[0].date)

2024-01-13T09:53:42Z


In [17]:
output_file = "./data.tfrecord"

In [18]:
records = transform_inputs_to_tfrecord(inputs, output_file, max_seq_length)

Writing input 0 of 3





In [19]:
print(records[0]['input_ids'])

[101, 1045, 2734, 2019, 1000, 3424, 23350, 1000, 4646, 1998, 2113, 1996, 3737, 1997, 10770, 3688, 1012, 2023, 2001, 1037, 2053, 4167, 2121, 2005, 2033, 1998, 1045, 2572, 5580, 2009, 2001, 2061, 3722, 2000, 2131, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [20]:
import pandas as pd

In [21]:
# Load the reviews parquet file, reading only the three columns this notebook uses.
# NOTE(review): this rebinds `df`, replacing the three-row sample frame built earlier.
df = pd.read_parquet("/mnt/amazon_reviews_2015.snappy.parquet",columns=["star_rating","review_id","review_body"])

In [22]:
# Report, for each star rating from 1 to 5, the non-null count of every column.
for rating in range(1, 6):
    rows_with_rating = df[df['star_rating'] == rating]
    print("star_rating: ", rating)
    print(rows_with_rating.count())
    print('-------------------------')

star_rating:  1
star_rating    3253070
review_id      3253070
review_body    3253070
dtype: int64
-------------------------
star_rating:  2
star_rating    1865322
review_id      1865322
review_body    1865322
dtype: int64
-------------------------
star_rating:  3
star_rating    3130345
review_id      3130345
review_body    3130345
dtype: int64
-------------------------
star_rating:  4
star_rating    6578230
review_id      6578230
review_body    6578230
dtype: int64
-------------------------
star_rating:  5
star_rating    27078664
review_id      27078664
review_body    27078664
dtype: int64
-------------------------


In [23]:
# Balance the classes by down-sampling every star rating to the size of the
# rarest one. The size is derived from the data instead of being hard-coded,
# so the cell keeps working if the source file changes.
rating_counts = df['star_rating'].value_counts()
samples_per_rating = int(min(rating_counts.get(rating, 0) for rating in range(1, 6)))

# Bind df1..df5 explicitly rather than writing into globals(), so the names
# later cells rely on are visible to readers and static tools.
df1, df2, df3, df4, df5 = (
    df[df['star_rating'] == rating].sample(n=samples_per_rating)
    for rating in range(1, 6)
)

In [24]:
df1

Unnamed: 0,star_rating,review_id,review_body
22584665,1,b'R11FEEV0RX08PS',"b""Don't waste your money. It does break down t..."
8499323,1,b'R2UD8BKASFV8MG',b'Smell was decent. Why 1 star? Messy. It was ...
34719018,1,b'R2YIK14O62P1BC',"b'The replacement ribbon came damaged, with th..."
5104415,1,b'RYIVJY7G8U4XI',"b""Was so excited to finally get a walking foot..."
17273280,1,b'R30EBCIJTSVKCS',b'Small'
...,...,...,...
37765554,1,b'R3TOF14V8XYIHO',b'utterly garbage.<br />they are heavy! the li...
32286847,1,b'R39AV44QAU0VXO',"b""I would love this product, but apparently, I..."
26315027,1,b'R3KRTTK6YH48KK',b'It is a piece of junk. Charged it for 9 hour...
41275145,1,b'R3GZC0Z1MCS39P',b'Broke the first day I had it.'


In [25]:
df2

Unnamed: 0,star_rating,review_id,review_body
35867943,2,b'RLWA5OI84OY78',b'Return'
34468209,2,b'RGHLYUZXHOCW0',b'The ink ran out within a month.'
9300615,2,b'R39KRO7R1M3XL5',"b""This is BROWN not black. It is not clear fro..."
31113356,2,b'R2QS7EX65QJWYU',b'It will not let me watch YouTube or Netflix ...
5354466,2,b'R2L4UN65071FYI',b'Thin story line. It requires a little humor ...
...,...,...,...
23885989,2,b'RFAJUDCZ37HCB',b':('
20525928,2,b'R2NT8F6VXDTOH2',"b""Nice color! That's about all, very thin with..."
37885167,2,b'R2QFVTC4KSN1RN',b'they are not compressing enough.'
8841708,2,b'R2NKM9BZSXTEO9',b'I cannot seem to get this to open with or wi...


In [26]:
df3

Unnamed: 0,star_rating,review_id,review_body
11397796,3,b'R3BFGY96VV871D',"b""Ok, but I can tell they won't last long."""
31975168,3,b'R12LF1SV7RUIXL',b'I brought 2 pairs of these sandals: one for ...
33347225,3,b'RV3CL901HHS6Q',"b""Not a &#34;Big boy CB&#34; Limited transmiss..."
20634795,3,b'R1ZM1L2WE64EDD',"b""For my mom ... she reported a lot of dead-in..."
8726758,3,b'R2L9NU8MGLPF8L',b'I go thru at least one a year...'
...,...,...,...
20563681,3,b'R2FX3C3KM23J4R',"b""The Coincidence of Coconut Cake is a perfect..."
4903541,3,b'R39PSQC3QB5RAF',"b""The price was great, but keep in mind that t..."
39042200,3,b'R11MIY55PO3SRZ',b'NOT VERY MANY INCLUDED IN THE PACKAGE GO THI...
12876421,3,b'RI8AFU4QVP4B3',b'Has parts were it skips right when opened'


In [27]:
df4

Unnamed: 0,star_rating,review_id,review_body
33356216,4,b'RVYQ6NZZ7OW5H',"b""Good to listen to music wwhile I workout my ..."
22620591,4,b'R2ZOSK5Z853KTF',"b'Came in right on time, back emblem does not ..."
20767209,4,b'R2EYBO3KX92TDI',b'love it'
23775506,4,b'R2Z938XBO0JNEH',"b'A good,inexspensive replacement,good fit,fun..."
35561309,4,b'R16F8O8A8Q9ZLX',"b""way way long i have a big calf and im 6'2&#3..."
...,...,...,...
2616184,4,b'R3RJCQP4H3NLJO',"b'very good quality,feels slightly narrow from..."
20385450,4,b'R2K1FJY5ETCVO5',b'Amusing easy reading. Perfect reading before...
39542466,4,b'R9LOJALS7CFHI',b'perfect for a costume. They stretch for vari...
13006051,4,b'R2AWIBKC7AB2SS',b'it works well'


In [28]:
df5

Unnamed: 0,star_rating,review_id,review_body
13781824,5,b'R3FVBCSZTAXIG6',b'excellent'
32791194,5,b'R2VI1Y67KOE196',b'Very robust design and delivered very quickl...
23577726,5,b'R24D3G34K5S2K6',"b""I did not buy this for its intended purpose ..."
4212851,5,b'R2UUZGHGLWPFRF',"b'even though is not a clear cream, it wont le..."
14735108,5,b'R3AZB989IA9ZAM',"b""I have been using Bob's recipe for duck for ..."
...,...,...,...
33888054,5,b'R9UFIGYVA210V',b'Very nice.'
2324960,5,b'R1UA2OBT8N79EX',"b""so true it's funny"""
20441136,5,b'R11R7HPI8V1RZ',"b'Wow.<br />Beautiful, beautiful stuff.'"
8128716,5,b'RPIHVFOW1XEII',b'Good product as described.'


In [29]:
df_temp = pd.concat([df1,df2,df3,df4,df5])

In [30]:
df_temp.count()

star_rating    9326610
review_id      9326610
review_body    9326610
dtype: int64

In [31]:
# Draw 9,000,000 rows at random from the balanced frame; the result is in random order.
# NOTE(review): "shaffle" is a misspelling of "shuffle"; the name is kept because later cells use it.
df_shaffle = df_temp.sample(9000000)

In [32]:
# Split the 9,000,000 shuffled rows into two non-overlapping pools:
# the first 8,000,000 rows (sampled for training below) and
# the last 1,000,000 rows (sampled for validation below).
df_head = df_shaffle.head(8000000)
df_tail = df_shaffle.tail(1000000)

In [34]:
# Training set: 2,000,000 rows drawn at random from the head pool.
train_df = df_head.sample(n=2000000)
train_df

Unnamed: 0,star_rating,review_id,review_body
37499218,3,b'R24DJYGFO9F009',"b""The camera seems to have good build quality,..."
35181101,5,b'R1UNQ7TXRVQDXL',b'I had no problem at all. Very efficient!'
22435572,4,b'RJQH1Q2VY6E82',"b""A fun read on a familiar topic... Noah's ark!"""
3408094,3,b'R93IJKXTZQ4AW',b'Work great but break quite easily.'
6054982,3,b'R17L05U64NMV9X',b'Much larger than I thought and thinking abou...
...,...,...,...
34909518,4,b'RCZIXXPB2PCAB',"b'Working so far. Got most of the bubbles out,..."
26051895,3,b'R16PYOQ104PR2E',b'Easy to mount under a kitchen cabinet. Fair ...
11442245,3,b'RJ2WYZJMPVGQO',"b'Its ok, but needs to be more firmer, its to ..."
22178875,1,b'R2C3N9YSY6VDVI',"b""looks like kids chair. cheap materials and d..."


In [35]:
# Validation set: 400,000 rows drawn at random from the tail pool.
validation_df = df_tail.sample(n=400000)
validation_df

Unnamed: 0,star_rating,review_id,review_body
23800914,3,b'R2P93GYUXJOUT4',b'I like it because it has lots of different c...
26652549,1,b'R1A620YNF3OYT2',"b""Incredibly disappointed in this shirt. I had..."
5540209,1,b'R2MREOTFIJ24TE',b'How do I trust the code examples when this b...
17988972,1,b'R20E9KI6JX8VVP',"b""Cats like it but the price is a joke. Pay v..."
6122030,5,b'R36FR5TZLC3CNL',b'I loved the first installment of Nick Hall a...
...,...,...,...
24888829,4,b'R1OLCO2RYYRM81',b'Very snug fit and works as advertised. Howe...
24718260,4,b'R2NVPKLXUJITMP',"b""great for bedside table, but so small it doe..."
8506167,2,b'R3QWE70169BMNG',"b""don't know what podiatrist is cutting toenai..."
29460531,3,b'REAZVA9RV1O1G',"b""Very pretty watch! I bought this watch on Oc..."


In [36]:
train_df['review_id'] = train_df['review_id'].str.decode("utf-8")

In [37]:
# Decode the review text from bytes to str. The second argument is the error
# handler: "ignore" silently drops any bytes that are not valid UTF-8.
train_df['review_body'] = train_df['review_body'].str.decode("utf-8","ignore")

In [38]:
train_df

Unnamed: 0,star_rating,review_id,review_body
37499218,3,R24DJYGFO9F009,"The camera seems to have good build quality, a..."
35181101,5,R1UNQ7TXRVQDXL,I had no problem at all. Very efficient!
22435572,4,RJQH1Q2VY6E82,A fun read on a familiar topic... Noah's ark!
3408094,3,R93IJKXTZQ4AW,Work great but break quite easily.
6054982,3,R17L05U64NMV9X,Much larger than I thought and thinking about ...
...,...,...,...
34909518,4,RCZIXXPB2PCAB,"Working so far. Got most of the bubbles out, b..."
26051895,3,R16PYOQ104PR2E,Easy to mount under a kitchen cabinet. Fair re...
11442245,3,RJ2WYZJMPVGQO,"Its ok, but needs to be more firmer, its to so..."
22178875,1,R2C3N9YSY6VDVI,looks like kids chair. cheap materials and doe...


In [39]:
validation_df['review_id'] = validation_df['review_id'].str.decode("utf-8")

In [40]:
# Decode the review text from bytes to str. The second argument is the error
# handler: "ignore" silently drops any bytes that are not valid UTF-8.
validation_df['review_body'] = validation_df['review_body'].str.decode("utf-8","ignore")

In [41]:
validation_df

Unnamed: 0,star_rating,review_id,review_body
23800914,3,R2P93GYUXJOUT4,I like it because it has lots of different cha...
26652549,1,R1A620YNF3OYT2,Incredibly disappointed in this shirt. I had t...
5540209,1,R2MREOTFIJ24TE,How do I trust the code examples when this boo...
17988972,1,R20E9KI6JX8VVP,Cats like it but the price is a joke. Pay ver...
6122030,5,R36FR5TZLC3CNL,I loved the first installment of Nick Hall and...
...,...,...,...
24888829,4,R1OLCO2RYYRM81,Very snug fit and works as advertised. Howeve...
24718260,4,R2NVPKLXUJITMP,"great for bedside table, but so small it doesn..."
8506167,2,R3QWE70169BMNG,don't know what podiatrist is cutting toenails...
29460531,3,REAZVA9RV1O1G,Very pretty watch! I bought this watch on Octo...


In [42]:
print(f"df Memory Usage: {df.memory_usage(deep=True).sum() / 1024**3} GB")

df Memory Usage: 11.584748897701502 GB


In [43]:
# Build one Input example per training row using the Input class.
train_inputs = train_df.apply(
    lambda row: Input(
        text=row[REVIEW_BODY_COLUMN],
        review_id=row[REVIEW_ID_COLUMN],
        date=timestamp,
        label=row[LABEL_COLUMN],
    ),
    axis=1,
)

In [44]:
# Build one Input example per validation row using the Input class.
validation_inputs = validation_df.apply(
    lambda row: Input(
        text=row[REVIEW_BODY_COLUMN],
        review_id=row[REVIEW_ID_COLUMN],
        date=timestamp,
        label=row[LABEL_COLUMN],
    ),
    axis=1,
)

In [None]:
# !pip install pandarallel

In [None]:
# from pandarallel import pandarallel
# import os
# os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
# !pip3 install --no-cache-dir accelerate
# pandarallel.initialize(nb_workers=2, progress_bar=True, use_memory_fs=False)

In [None]:
#def func(x):
#    return lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp)

# inputs = df.parallel_apply(
#     lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp),
#     axis=1
# )

In [45]:
train_inputs

37499218    <__main__.Input object at 0x7f8f6b313040>
35181101    <__main__.Input object at 0x7f8f6b313d30>
22435572    <__main__.Input object at 0x7f8f6b313c40>
3408094     <__main__.Input object at 0x7f958507d300>
6054982     <__main__.Input object at 0x7f8f6b34cd00>
                              ...                    
34909518    <__main__.Input object at 0x7f94d363ace0>
26051895    <__main__.Input object at 0x7f94d363ad40>
11442245    <__main__.Input object at 0x7f94d363ada0>
22178875    <__main__.Input object at 0x7f94d363ae00>
32973432    <__main__.Input object at 0x7f94d363ae60>
Length: 2000000, dtype: object

In [46]:
train_output_file = "/mnt/train_data.tfrecord"

In [47]:
validation_output_file = "/mnt/validation_data.tfrecord"

In [None]:
# date が Feature Store の仕様に合わせて ISO-8601 になっていることを確認
# print(train_inputs[0].date)

In [48]:
train_records = transform_inputs_to_tfrecord(train_inputs, train_output_file, max_seq_length)

Writing input 0 of 2000000





Writing input 10000 of 2000000

Writing input 20000 of 2000000

Writing input 30000 of 2000000

Writing input 40000 of 2000000

Writing input 50000 of 2000000

Writing input 60000 of 2000000

Writing input 70000 of 2000000

Writing input 80000 of 2000000

Writing input 90000 of 2000000

Writing input 100000 of 2000000

Writing input 110000 of 2000000

Writing input 120000 of 2000000

Writing input 130000 of 2000000

Writing input 140000 of 2000000

Writing input 150000 of 2000000

Writing input 160000 of 2000000

Writing input 170000 of 2000000

Writing input 180000 of 2000000

Writing input 190000 of 2000000

Writing input 200000 of 2000000

Writing input 210000 of 2000000

Writing input 220000 of 2000000

Writing input 230000 of 2000000

Writing input 240000 of 2000000

Writing input 250000 of 2000000

Writing input 260000 of 2000000

Writing input 270000 of 2000000

Writing input 280000 of 2000000

Writing input 290000 of 2000000

Writing input 300000 of 2000000

Writing input 31000

In [49]:
validation_records = transform_inputs_to_tfrecord(validation_inputs, validation_output_file, max_seq_length)

Writing input 0 of 400000

Writing input 10000 of 400000

Writing input 20000 of 400000

Writing input 30000 of 400000

Writing input 40000 of 400000

Writing input 50000 of 400000

Writing input 60000 of 400000

Writing input 70000 of 400000

Writing input 80000 of 400000

Writing input 90000 of 400000

Writing input 100000 of 400000

Writing input 110000 of 400000

Writing input 120000 of 400000

Writing input 130000 of 400000

Writing input 140000 of 400000

Writing input 150000 of 400000

Writing input 160000 of 400000

Writing input 170000 of 400000

Writing input 180000 of 400000

Writing input 190000 of 400000

Writing input 200000 of 400000

Writing input 210000 of 400000

Writing input 220000 of 400000

Writing input 230000 of 400000

Writing input 240000 of 400000

Writing input 250000 of 400000

Writing input 260000 of 400000

Writing input 270000 of 400000

Writing input 280000 of 400000

Writing input 290000 of 400000

Writing input 300000 of 400000

Writing input 310000 o

In [50]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    // Declared with const so the lookup result does not leak into the page
    // as an implicit global variable.
    const shutdownButtons = document.getElementsByClassName("sm-command-button");
    // Click the first matching button; presumably the notebook front end maps
    // its data-commandlinker-command attribute to the kernel shutdown command.
    shutdownButtons[0].click();
}
catch(err) {
    // Best effort: if no such button exists, do nothing.
}
</script>