In [1]:
%autosave 0

Autosave disabled


In [2]:
!pip install transformers

[0m

In [3]:
!pip install ipywidgets widgetsnbextension pandas-profiling



[0m

In [4]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [5]:
import tensorflow as tf
import collections
import json
import os
import pandas as pd
import csv
from transformers import DistilBertTokenizer

# Fixed length, in tokens, that every encoded review is padded or truncated to.
max_seq_length = 64

# Tokenizer loaded from the pre-trained "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Names of the DataFrame columns holding the review text and the review identifier.
REVIEW_BODY_COLUMN = "review_body"
REVIEW_ID_COLUMN = "review_id"

# Name of the label column and the label values the classifier distinguishes.
LABEL_COLUMN = "star_rating"
LABEL_VALUES = [1, 2, 3, 4, 5]

2024-01-11 11:36:34.729328: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Map every label value to its zero-based position in LABEL_VALUES.
label_map = dict(zip(LABEL_VALUES, range(len(LABEL_VALUES))))

In [7]:
print(label_map)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4}


In [8]:
class InputFeatures(object):
    """BERT feature vector for a single example.

    A plain value holder: every constructor argument is stored unchanged
    on the instance under the same name.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):
        # Model inputs.
        self.input_ids, self.input_mask, self.segment_ids = input_ids, input_mask, segment_ids
        # Encoded label together with the original label value.
        self.label_id, self.label = label_id, label
        # Metadata carried along with the example.
        self.review_id, self.date = review_id, date

In [9]:
class Input(object):
    """A single training/test input for sequence classification."""

    def __init__(self, text, review_id, date, label=None):
        """Store the raw fields of one example.

        Args:
          text: string. The untokenized text of the first sequence. For
            single-sequence tasks this is the only sequence given.
          review_id: identifier of the review; stored as-is.
          date: date value attached to the example; stored as-is.
          label: (optional) the label of the example. Given for training and
            validation examples, omitted for test examples.
        """
        self.text, self.label = text, label
        self.review_id, self.date = review_id, date


In [10]:
def convert_input(the_input, max_seq_length):
    """Encode one ``Input`` as the fixed-length ``InputFeatures`` fed to BERT.

    Args:
      the_input: object exposing ``text``, ``label``, ``review_id`` and ``date``.
      max_seq_length: number of tokens every encoded sequence is padded or
        truncated to.

    Returns:
      An ``InputFeatures`` whose ``input_ids``, ``input_mask`` and
      ``segment_ids`` each have ``max_seq_length`` entries.

    Raises:
      KeyError: if ``the_input.label`` is not a key of ``label_map`` (this
        includes an unlabeled input whose label is ``None``).
    """
    # The text must be preprocessed into the format BERT was trained on:
    #   1. lower-case the text (when a lowercase BERT model is used)
    #   2. tokenize it (e.g. "sally says hi" -> ["sally", "says", "hi"])
    #   3. split words into WordPieces (e.g. "calling" -> ["call", "##ing"])
    # The Transformers tokenizer takes care of all of this in the single call
    # below, so the text is tokenized only once per input.
    encode_plus_tokens = tokenizer.encode_plus(
        the_input.text,
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        padding="max_length",
        max_length=max_seq_length,
        truncation=True,
    )

    # Vocabulary IDs of the pre-trained model, one per token
    # (padded with 0 when there are fewer than `max_seq_length` tokens).
    input_ids = encode_plus_tokens["input_ids"]

    # 0/1 flags telling BERT which tokens to attend to; the padded positions
    # of `input_ids` get 0.
    input_mask = encode_plus_tokens["attention_mask"]

    # For single-sequence tasks such as text classification the segment ID is
    # always 0; two-sequence tasks (question answering, next-sentence
    # prediction) would assign 1 to the second sequence.
    segment_ids = [0] * max_seq_length

    # Class index of this row's label (`star_rating` 1-5).
    label_id = label_map[the_input.label]

    return InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id,
        review_id=the_input.review_id,
        date=the_input.date,
        label=the_input.label,
    )


In [11]:
def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):
    """Convert inputs to BERT features, write them to a TFRecord file and return them.

    Args:
      inputs: sized iterable of ``Input`` objects (``len(inputs)`` is used for
        the progress message).
      output_file: path of the TFRecord file to write.
      max_seq_length: sequence length passed on to ``convert_input``.

    Returns:
      A list with one dict per input holding ``input_ids``, ``input_mask``,
      ``segment_ids``, ``label_id``, ``review_id``, ``date`` and ``label``.
    """
    records = []

    # The context manager closes the writer even when converting or writing an
    # input raises, so the file handle is never leaked.
    with tf.io.TFRecordWriter(output_file) as tf_record_writer:
        for (input_idx, the_input) in enumerate(inputs):
            # Progress message every 10,000 inputs.
            if input_idx % 10000 == 0:
                print("Writing input {} of {}\n".format(input_idx, len(inputs)))

            # Convert the data into the format BERT understands.
            features = convert_input(the_input, max_seq_length)

            # Build a TFRecord example containing input_ids, input_mask,
            # segment_ids and label_ids.
            all_features = collections.OrderedDict()
            all_features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
            all_features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
            all_features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
            all_features["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))

            tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
            tf_record_writer.write(tf_record.SerializeToString())

            # Record with every feature, to be stored in the Feature Store.
            records.append(
                {
                    "input_ids": features.input_ids,
                    "input_mask": features.input_mask,
                    "segment_ids": features.segment_ids,
                    "label_id": features.label_id,
                    "review_id": the_input.review_id,
                    "date": the_input.date,
                    "label": features.label,
                }
            )

    return records

In [12]:
from datetime import datetime, timezone
from time import strftime

# The trailing "Z" in the format string designates UTC, so the current time is
# taken in UTC; a local-time value would be mislabeled on any non-UTC machine.
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)

2024-01-11T11:36:51Z


In [13]:
import pandas as pd

# Three hand-written reviews; each row is [star_rating, review_id, review_body].
data = [
    [5, "ABCD12345", """I needed an "antivirus" application and know the quality of Norton products.  This was a no brainer for me and I am glad it was so simple to get."""],
    [3, "EFGH12345", """The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos."""],
    [1, "IJKL2345", """Terrible, none of my codes worked, and I can't uninstall it.  I think this product IS malware and viruses"""],
]

df = pd.DataFrame(data, columns=["star_rating", "review_id", "review_body"])

# Use the Input class to create one example per row of the data.
inputs = df.apply(
    lambda row: Input(
        text=row[REVIEW_BODY_COLUMN],
        review_id=row[REVIEW_ID_COLUMN],
        date=timestamp,
        label=row[LABEL_COLUMN],
    ),
    axis=1,
)

In [14]:
df

Unnamed: 0,star_rating,review_id,review_body
0,5,ABCD12345,"I needed an ""antivirus"" application and know t..."
1,3,EFGH12345,The problem with ElephantDrive is that it requ...
2,1,IJKL2345,"Terrible, none of my codes worked, and I can't..."


In [15]:
inputs

0    <__main__.Input object at 0x7f0f72615f60>
1    <__main__.Input object at 0x7f0f72615e40>
2    <__main__.Input object at 0x7f0f72616230>
dtype: object

In [16]:
# Confirm that date is in ISO-8601 format, as the Feature Store specification requires
print(inputs[0].date)

2024-01-11T11:36:51Z


In [17]:
output_file = "./data.tfrecord"

In [18]:
records = transform_inputs_to_tfrecord(inputs, output_file, max_seq_length)

Writing input 0 of 3





In [19]:
print(records[0]['input_ids'])

[101, 1045, 2734, 2019, 1000, 3424, 23350, 1000, 4646, 1998, 2113, 1996, 3737, 1997, 10770, 3688, 1012, 2023, 2001, 1037, 2053, 4167, 2121, 2005, 2033, 1998, 1045, 2572, 5580, 2009, 2001, 2061, 3722, 2000, 2131, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [20]:
import pandas as pd

In [21]:
# Load only the three columns used below from the reviews Parquet file.
# This rebinds `df`, replacing the small hand-written sample frame.
df = pd.read_parquet("/mnt/amazon_reviews_2015.snappy.parquet",columns=["star_rating","review_id","review_body"])

In [22]:
# Print the per-column row counts of each star rating, one output block per rating.
for i in (1, 2, 3, 4, 5):
    print("star_rating: ", i)
    print(df.loc[df['star_rating'] == i].count())
    print('-------------------------')

star_rating:  1
star_rating    3253070
review_id      3253070
review_body    3253070
dtype: int64
-------------------------
star_rating:  2
star_rating    1865322
review_id      1865322
review_body    1865322
dtype: int64
-------------------------
star_rating:  3
star_rating    3130345
review_id      3130345
review_body    3130345
dtype: int64
-------------------------
star_rating:  4
star_rating    6578230
review_id      6578230
review_body    6578230
dtype: int64
-------------------------
star_rating:  5
star_rating    27078664
review_id      27078664
review_body    27078664
dtype: int64
-------------------------


In [23]:
# Balance the classes by down-sampling every star rating to the size of the
# rarest one. The size is derived from the data instead of being hard-coded,
# and the frames are bound by plain assignment rather than through globals().
SAMPLING_SEED = 42  # fixed seed so that re-running the notebook draws the same rows

rating_counts = df['star_rating'].value_counts()
n_per_rating = int(min(rating_counts.get(rating, 0) for rating in range(1, 6)))

df1, df2, df3, df4, df5 = (
    df[df['star_rating'] == rating].sample(n=n_per_rating, random_state=SAMPLING_SEED)
    for rating in range(1, 6)
)

In [24]:
df1

Unnamed: 0,star_rating,review_id,review_body
4903618,1,b'RKFCQXLBBPRD3',b'This stuff sucked probably more than the doc...
33698210,1,b'R2RRMD19EGL0DD',"b""This cd is basically inaccurate. I'm very di..."
3102182,1,b'R3S9ROXEFA0YBH',b'Not worth your money.'
37250725,1,b'R2UTEK3SPPE8JG',b'The color scheme is beautiful. The texture i...
31682325,1,b'R2HMVTJIEEWP7I',b'Item appears used and has 2 missing screws.'
...,...,...,...
6641386,1,b'RGY4E9V1T8AU5',b'Never showed up. Waste of money.'
2246292,1,b'R25LL06YZ6BBIY',"b""didn't like, returned"""
36142430,1,b'R32SVMLMDWAVFT',"b""doesn't charge right. i would it leave it fo..."
29637306,1,b'R1D0C5NO4BGH46',b'I had the same issues as with the Freebird G...


In [25]:
df2

Unnamed: 0,star_rating,review_id,review_body
29664787,2,b'R2WJNTRLEDHK5C',b'The top of the boot is small so it is hard t...
36093020,2,b'R2RMNCP513ZG3I',"b'After 1 hour of opened, an couple of pieces ..."
16861574,2,b'R2NWWN7PNPI1D9',"b""It's a well made top but it's just a strange..."
35764654,2,b'R14YCB8RTXL5TE',"b""I like to remove the phone from the case fro..."
32246781,2,b'RCSWXPMUW6IIF',"b""The small band is pretty. The larger ring lo..."
...,...,...,...
20490415,2,b'R3JF78H9ESU1OF',"b'Story, speaking, action was way too slow. V..."
5936141,2,b'R1SKEYSE08BS6I',"b""Unfortunately, it didn't include the lyrics!..."
14411528,2,b'R3E7QYDZA9JE2P',"b""OK, so I've had them for a week and the batt..."
22671768,2,b'R3163DMNTQH35K',"b""We broke quite a few of these. A friend of m..."


In [26]:
df3

Unnamed: 0,star_rating,review_id,review_body
38369774,3,b'R21FDA9V55LSY9',b'Good laughs from the old days - film editing...
37926078,3,b'ROTXNE6PMWJUU',b'I love this cover but my husband hates it. T...
38435581,3,b'RE9HXR1AZ55CS',"b""Way too small. Seems like a decent product b..."
34621463,3,b'R3EO2KTHFNX63O',b'LIttle disappointed it leaks from the plug.....
25370923,3,b'R2E5IO61DF7DD0',"b'It fits everywhere but the hips to zip, and ..."
...,...,...,...
27255323,3,b'R1D9551FSRH4BP',b'Es una interesante propuesta para entender l...
4984989,3,b'R37O701FVLB64W',"b'It was pretty good, hard to believe it was t..."
15125812,3,b'RI2VAV4BV8HXW',"b""Its a good movie, entertaining but really fa..."
35102221,3,b'R141UJO69H8OO0',"b""I bought these to wear to work but there's n..."


In [27]:
df4

Unnamed: 0,star_rating,review_id,review_body
4831141,4,b'RO5H7WBD5L0Y9',b'Top rung'
32339917,4,b'RZJ9N02HTGPCN',"b""Having a little trouble with the zipper plus..."
16136195,4,b'R18F99AEWSE341',b'I really enjoyed this movie.'
11246491,4,b'RN59ARR9IYIIO',"b'Good product, but definitely made for runner..."
6294277,4,b'R26EJ01YWCT73L',b'Most enjoyable.'
...,...,...,...
12727919,4,b'R1GH3LBM2UQH49',"b""It's nice to have Aida in larger pieces than..."
39150390,4,b'R16GZT3GU9VWQ9',b'it is very pretty'
12803618,4,b'R2MPJ3YMAA3VML',"b'Nice and long. V is very deep, deeper than I..."
29262798,4,b'R3IHFGT0V2L147',b'The only downside is it freezes up occasiona...


In [28]:
df5

Unnamed: 0,star_rating,review_id,review_body
29634415,5,b'R2L5YNQ7ZM2RBY',"b'Thin, high-quality, protection, super soft i..."
12898871,5,b'R3OC6ACLMBLNCT',"b""I have a lot of stress in my life right now,..."
14200114,5,b'RSR7I2HQ615XM',b'Just what I expected'
34636970,5,b'R1JK42UEKER3RB',"b""It's the perfect case for the Nexus 4"""
20724568,5,b'R276AWTXA0LAH0',b'I love this waffle maker supper easy to use ...
...,...,...,...
14905399,5,b'RUF5AYPWOS4KY',"b'Sturdy and practical, good value!'"
7171554,5,b'R2HM3MLHJ9ZUJQ',b'Bosch is not another formula police show. Th...
37473834,5,b'R2LTKDGUGH5D84',b'These headphones are the best thing that hav...
21990883,5,b'R2UYQJN7WNTDB5',b'This the original exceptionally effective Ni...


In [29]:
# Stack the five per-rating samples into a single frame with equally many rows per rating.
dfx = pd.concat([df1,df2,df3,df4,df5])

In [30]:
dfx.count()

star_rating    9326610
review_id      9326610
review_body    9326610
dtype: int64

In [31]:
# validation_df = df.tail(400000)
# Draw 400,000 rows at random (without replacement) from the balanced frame for validation.
validation_df = dfx.sample(n=400000)

In [32]:
# train_df = df.head(2000000)
# Draw the 2,000,000 training rows only from rows that were NOT selected for
# validation_df. Sampling both splits independently from dfx would put some
# reviews into both sets and leak validation data into training.
train_df = dfx.drop(index=validation_df.index).sample(n=2000000)

In [33]:
validation_df

Unnamed: 0,star_rating,review_id,review_body
30892578,1,b'R14JGGFVPXKKDS',b'The black toner cartridge stained the paper ...
8847100,3,b'R1L0C7FY2K27LH',"b'I like it, but wish it were bigger'"
31949199,4,b'R21FGG0J3MSWMF',b'I am using it for my Raspberry OpenElec 5.0....
1557562,3,b'R2FM9NSH6LI12A',"b""Overall not a bad game it has Good graphics ..."
20183956,2,b'R1W3CCIMU6TFYO',b'Love the product and very quick delivery.'
...,...,...,...
15587114,4,b'R384VUPSX8R5JI',b'Great book. Sheldon Siegel is one of my top ...
6440820,2,b'RNGVTTJ90X0AF',"b""Coins disappearing from your it coins and I ..."
8776712,2,b'R224JJDFA3KCTV',"b""After reading the reviews I was excited to g..."
24658656,2,b'R1LIQ27HGIUQ4J',"b""It works, but it's very fragile. I've now b..."


In [34]:
train_df

Unnamed: 0,star_rating,review_id,review_body
10812900,5,b'R3628NTZ9P23U8',b'Good product'
31045612,5,b'R2OXAPPO8GRVTE',"b""Arrived quickly , very well packed.<br />It'..."
5989889,1,b'R2CRPFL202DJ53',b'Three weeks of use and I had problems. The c...
16291618,5,b'R7A719ZDOGLRB',b'I just purchased the iPhone6 and it was driv...
9571136,1,b'R3O691ED4K68XM',b'Will not last. Purchased from BestBuy at a v...
...,...,...,...
27894319,5,b'R3TQGER9V4S02K',b'One of the best things over used to load mag...
35551473,4,b'RRLUBZDDICYD2',"b'Great, but buy the size larger.'"
16476478,1,b'R2VO44V0LHNZ1L',b'These taste extremely awful and bitter and y...
3294004,1,b'RPZXP2SKYHFHE',b'I thought this was the new season - so my mi...


In [35]:
# review_id holds bytes values (shown as b'...' in the frame display); decode them to str.
train_df['review_id'] = train_df['review_id'].str.decode("utf-8")

In [36]:
# Decode the review text from bytes to str; "ignore" silently drops bytes that are not valid UTF-8.
train_df['review_body'] = train_df['review_body'].str.decode("utf-8","ignore")

In [37]:
train_df

Unnamed: 0,star_rating,review_id,review_body
10812900,5,R3628NTZ9P23U8,Good product
31045612,5,R2OXAPPO8GRVTE,"Arrived quickly , very well packed.<br />It's ..."
5989889,1,R2CRPFL202DJ53,Three weeks of use and I had problems. The cas...
16291618,5,R7A719ZDOGLRB,I just purchased the iPhone6 and it was drivin...
9571136,1,R3O691ED4K68XM,Will not last. Purchased from BestBuy at a ver...
...,...,...,...
27894319,5,R3TQGER9V4S02K,One of the best things over used to load magaz...
35551473,4,RRLUBZDDICYD2,"Great, but buy the size larger."
16476478,1,R2VO44V0LHNZ1L,These taste extremely awful and bitter and you...
3294004,1,RPZXP2SKYHFHE,I thought this was the new season - so my mist...


In [38]:
# review_id holds bytes values (shown as b'...' in the frame display); decode them to str.
validation_df['review_id'] = validation_df['review_id'].str.decode("utf-8")

In [39]:
# Decode the review text from bytes to str; "ignore" silently drops bytes that are not valid UTF-8.
validation_df['review_body'] = validation_df['review_body'].str.decode("utf-8","ignore")

In [40]:
validation_df

Unnamed: 0,star_rating,review_id,review_body
30892578,1,R14JGGFVPXKKDS,The black toner cartridge stained the paper wi...
8847100,3,R1L0C7FY2K27LH,"I like it, but wish it were bigger"
31949199,4,R21FGG0J3MSWMF,I am using it for my Raspberry OpenElec 5.0.5....
1557562,3,R2FM9NSH6LI12A,Overall not a bad game it has Good graphics an...
20183956,2,R1W3CCIMU6TFYO,Love the product and very quick delivery.
...,...,...,...
15587114,4,R384VUPSX8R5JI,Great book. Sheldon Siegel is one of my top fa...
6440820,2,RNGVTTJ90X0AF,Coins disappearing from your it coins and I bo...
8776712,2,R224JJDFA3KCTV,After reading the reviews I was excited to get...
24658656,2,R1LIQ27HGIUQ4J,"It works, but it's very fragile. I've now bro..."


In [41]:
print(f"df Memory Usage: {df.memory_usage(deep=True).sum() / 1024**3} GB")

df Memory Usage: 11.584748897701502 GB


In [42]:
# Use the Input class to create one example per training row.
# The three columns are iterated together with zip instead of
# DataFrame.apply(axis=1), which builds a Series object for every row and is
# very slow on millions of rows. The result is still a Series of Input objects
# carrying train_df's index.
train_inputs = pd.Series(
    [
        Input(label=label, text=text, review_id=review_id, date=timestamp)
        for label, text, review_id in zip(
            train_df[LABEL_COLUMN], train_df[REVIEW_BODY_COLUMN], train_df[REVIEW_ID_COLUMN]
        )
    ],
    index=train_df.index,
    dtype=object,
)

In [43]:
# Use the Input class to create one example per validation row.
# The three columns are iterated together with zip instead of
# DataFrame.apply(axis=1), which builds a Series object for every row and is
# very slow on large frames. The result is still a Series of Input objects
# carrying validation_df's index.
validation_inputs = pd.Series(
    [
        Input(label=label, text=text, review_id=review_id, date=timestamp)
        for label, text, review_id in zip(
            validation_df[LABEL_COLUMN], validation_df[REVIEW_BODY_COLUMN], validation_df[REVIEW_ID_COLUMN]
        )
    ],
    index=validation_df.index,
    dtype=object,
)

In [None]:
# !pip install pandarallel

In [None]:
# from pandarallel import pandarallel
# import os
# os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
# !pip3 install --no-cache-dir accelerate
# pandarallel.initialize(nb_workers=2, progress_bar=True, use_memory_fs=False)

In [None]:
#def func(x):
#    return lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp)

# inputs = df.parallel_apply(
#     lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp),
#     axis=1
# )

In [44]:
train_inputs

10812900    <__main__.Input object at 0x7f097c0af5e0>
31045612    <__main__.Input object at 0x7f097c0afe20>
5989889     <__main__.Input object at 0x7f097c0af070>
16291618    <__main__.Input object at 0x7f097c0af640>
9571136     <__main__.Input object at 0x7f0f73237fa0>
                              ...                    
27894319    <__main__.Input object at 0x7f0ebd3525f0>
35551473    <__main__.Input object at 0x7f0ebd352650>
16476478    <__main__.Input object at 0x7f0ebd3526b0>
3294004     <__main__.Input object at 0x7f0ebd352710>
31278741    <__main__.Input object at 0x7f0ebd352770>
Length: 2000000, dtype: object

In [45]:
train_output_file = "/mnt/train_data.tfrecord"

In [46]:
validation_output_file = "/mnt/validation_data.tfrecord"

In [None]:
# date が Feature Store の仕様に合わせて ISO-8601 になっていることを確認
# print(train_inputs[0].date)

In [47]:
train_records = transform_inputs_to_tfrecord(train_inputs, train_output_file, max_seq_length)

Writing input 0 of 2000000





Writing input 10000 of 2000000

Writing input 20000 of 2000000

Writing input 30000 of 2000000

Writing input 40000 of 2000000

Writing input 50000 of 2000000

Writing input 60000 of 2000000

Writing input 70000 of 2000000

Writing input 80000 of 2000000

Writing input 90000 of 2000000

Writing input 100000 of 2000000

Writing input 110000 of 2000000

Writing input 120000 of 2000000

Writing input 130000 of 2000000

Writing input 140000 of 2000000

Writing input 150000 of 2000000

Writing input 160000 of 2000000

Writing input 170000 of 2000000

Writing input 180000 of 2000000

Writing input 190000 of 2000000

Writing input 200000 of 2000000

Writing input 210000 of 2000000

Writing input 220000 of 2000000

Writing input 230000 of 2000000

Writing input 240000 of 2000000

Writing input 250000 of 2000000

Writing input 260000 of 2000000

Writing input 270000 of 2000000

Writing input 280000 of 2000000

Writing input 290000 of 2000000

Writing input 300000 of 2000000

Writing input 31000

In [48]:
validation_records = transform_inputs_to_tfrecord(validation_inputs, validation_output_file, max_seq_length)

Writing input 0 of 400000

Writing input 10000 of 400000

Writing input 20000 of 400000

Writing input 30000 of 400000

Writing input 40000 of 400000

Writing input 50000 of 400000

Writing input 60000 of 400000

Writing input 70000 of 400000

Writing input 80000 of 400000

Writing input 90000 of 400000

Writing input 100000 of 400000

Writing input 110000 of 400000

Writing input 120000 of 400000

Writing input 130000 of 400000

Writing input 140000 of 400000

Writing input 150000 of 400000

Writing input 160000 of 400000

Writing input 170000 of 400000

Writing input 180000 of 400000

Writing input 190000 of 400000

Writing input 200000 of 400000

Writing input 210000 of 400000

Writing input 220000 of 400000

Writing input 230000 of 400000

Writing input 240000 of 400000

Writing input 250000 of 400000

Writing input 260000 of 400000

Writing input 270000 of 400000

Writing input 280000 of 400000

Writing input 290000 of 400000

Writing input 300000 of 400000

Writing input 310000 o

In [49]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
// Programmatically click the hidden command button above, which is wired to
// the "kernelmenu:shutdown" command. `els` is declared with const so it stays
// local to this block instead of becoming an implicit global on the page.
try {
    const els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp: best effort only, e.g. when no such button exists on the page.
}    
</script>