In [1]:
#%autosave 0

In [2]:
!pip install transformers

[0m

In [3]:
!pip install ipywidgets widgetsnbextension pandas-profiling



[0m

In [4]:
# Enable the notebook extension shipped with the widgetsnbextension package.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [5]:
import tensorflow as tf
import collections
import json
import os
import pandas as pd
import csv
from transformers import DistilBertTokenizer

# Names of the dataset columns that the cells below read.
REVIEW_ID_COLUMN = "review_id"
REVIEW_BODY_COLUMN = "review_body"
LABEL_COLUMN = "star_rating"

# Every star rating that is accepted as a label.
LABEL_VALUES = [1, 2, 3, 4, 5]

# Length that each encoded review is padded or truncated to.
max_seq_length = 64

# Tokenizer of the uncased DistilBERT checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

2024-01-14 07:07:55.132243: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Map each star rating to its zero-based class index, in LABEL_VALUES order.
label_map = {rating: index for index, rating in enumerate(LABEL_VALUES)}

In [7]:
print(label_map)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4}


In [8]:
class InputFeatures:
    """BERT feature vector for a single example.

    A plain value holder: every constructor argument is stored unchanged
    under an attribute of the same name.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):
        # The three parallel sequences that are fed to the model.
        self.input_ids, self.input_mask, self.segment_ids = input_ids, input_mask, segment_ids
        # Class index of the label and the identifier of the source review.
        self.label_id, self.review_id = label_id, review_id
        # Remaining fields that are carried along with the features.
        self.date, self.label = date, label

In [9]:
class Input:
    """A single training/test input for sequence classification."""

    def __init__(self, text, review_id, date, label=None):
        """Store the fields of one example unchanged.

        Args:
          text: String. Untokenized text of the first sequence. For
            single-sequence tasks this is the only sequence that is given.
          review_id: Identifier of the review the text belongs to.
          date: Value carried along with the example (this notebook passes a
            timestamp string).
          label: (Optional) Label of the example. Pass it for training and
            validation examples; leave it out for test examples.
        """
        self.text, self.review_id = text, review_id
        self.date, self.label = date, label


In [10]:
def convert_input(the_input, max_seq_length):
    """Convert one `Input` into fixed-length BERT features.

    The text has to be preprocessed into the form BERT was trained on:
    lowercasing (for the lowercase model), tokenization
    (e.g. "sally says hi" -> ["sally", "says", "hi"]) and WordPiece splitting
    (e.g. "calling" -> ["call", "##ing"]). The Transformers tokenizer held in
    the module-level `tokenizer` takes care of all of this.

    Args:
      the_input: `Input` to convert. Its `label` must be a key of the
        module-level `label_map`.
      max_seq_length: Length that the id, mask and segment lists are padded
        or truncated to.

    Returns:
      An `InputFeatures` holding the encoded text, the class index of the
      label, and the `review_id`, `date` and `label` copied from `the_input`.

    Raises:
      KeyError: If `the_input.label` is not a key of `label_map`.
    """
    # A single encode_plus call tokenizes, adds the special tokens, truncates
    # and pads. `padding="max_length"` replaces the deprecated
    # `pad_to_max_length=True`.
    encoded = tokenizer.encode_plus(
        the_input.text,
        padding="max_length",
        max_length=max_seq_length,
        truncation=True,
    )

    return InputFeatures(
        # Vocabulary ids of the pretrained model, one per token (padded with 0
        # when there are fewer than `max_seq_length` tokens).
        input_ids=encoded["input_ids"],
        # 0/1 flags telling BERT which tokens to attend to; the padded
        # positions of `input_ids` get 0.
        input_mask=encoded["attention_mask"],
        # Single-sequence tasks such as text classification always use segment
        # id 0; two-sequence tasks (question answering, next-sentence
        # prediction) would use 1 for the second sequence.
        segment_ids=[0] * max_seq_length,
        # Class index of this row's label (`star_rating` 1-5).
        label_id=label_map[the_input.label],
        review_id=the_input.review_id,
        date=the_input.date,
        label=the_input.label,
    )


In [11]:
def _int64_feature(values):
    """Wrap a sequence of ints in a `tf.train.Feature`."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))


def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):
    """Convert inputs into BERT features and write them to a TFRecord file.

    Args:
      inputs: Sized iterable of `Input` objects (`len(inputs)` is used for the
        progress message).
      output_file: Path of the TFRecord file to write.
      max_seq_length: Sequence length forwarded to `convert_input`.

    Returns:
      A list with one dict per input containing every feature
      (`input_ids`, `input_mask`, `segment_ids`, `label_id`, `review_id`,
      `date`, `label`), intended for storing in the Feature Store.
    """
    records = []

    # The context manager closes the writer even when converting or writing an
    # input raises, so the file handle is never leaked.
    with tf.io.TFRecordWriter(output_file) as tf_record_writer:
        for input_idx, the_input in enumerate(inputs):
            if input_idx % 10000 == 0:
                print("Writing input {} of {}\n".format(input_idx, len(inputs)))

            features = convert_input(the_input, max_seq_length)

            # TFRecord example holding input_ids, input_mask, segment_ids and
            # label_ids.
            all_features = collections.OrderedDict()
            all_features["input_ids"] = _int64_feature(features.input_ids)
            all_features["input_mask"] = _int64_feature(features.input_mask)
            all_features["segment_ids"] = _int64_feature(features.segment_ids)
            all_features["label_ids"] = _int64_feature([features.label_id])

            tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
            tf_record_writer.write(tf_record.SerializeToString())

            # Record with all features, to be stored in the Feature Store.
            records.append(
                {
                    "input_ids": features.input_ids,
                    "input_mask": features.input_mask,
                    "segment_ids": features.segment_ids,
                    "label_id": features.label_id,
                    "review_id": features.review_id,
                    "date": features.date,
                    "label": features.label,
                }
            )

    return records

In [12]:
from datetime import datetime, timezone
from time import strftime

# The trailing "Z" designates UTC, so the time must be taken in UTC;
# a bare datetime.now() returns the machine's local time instead.
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)

2024-01-14T07:07:59Z


In [13]:
import pandas as pd

# Three hand-written reviews, one row each: [star_rating, review_id, review_body].
data = [
    [5, "ABCD12345",
     """I needed an "antivirus" application and know the quality of Norton products.  This was a no brainer for me and I am glad it was so simple to get."""],
    [3, "EFGH12345",
     """The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos."""],
    [1, "IJKL2345",
     """Terrible, none of my codes worked, and I can't uninstall it.  I think this product IS malware and viruses"""],
]

df = pd.DataFrame(data, columns=["star_rating", "review_id", "review_body"])

# Build one Input example per row of the data.
inputs = df.apply(
    lambda row: Input(
        text=row[REVIEW_BODY_COLUMN],
        review_id=row[REVIEW_ID_COLUMN],
        date=timestamp,
        label=row[LABEL_COLUMN],
    ),
    axis=1,
)

In [14]:
df

Unnamed: 0,star_rating,review_id,review_body
0,5,ABCD12345,"I needed an ""antivirus"" application and know t..."
1,3,EFGH12345,The problem with ElephantDrive is that it requ...
2,1,IJKL2345,"Terrible, none of my codes worked, and I can't..."


In [15]:
inputs

0    <__main__.Input object at 0x7f8248719f60>
1    <__main__.Input object at 0x7f8248719f90>
2    <__main__.Input object at 0x7f824871a3e0>
dtype: object

In [16]:
# Check that date is in ISO-8601 format, as the Feature Store specification requires.
print(inputs[0].date)

2024-01-14T07:07:59Z


In [17]:
output_file = "./data.tfrecord"

In [18]:
records = transform_inputs_to_tfrecord(inputs, output_file, max_seq_length)

Writing input 0 of 3





In [19]:
print(records[0]['input_ids'])

[101, 1045, 2734, 2019, 1000, 3424, 23350, 1000, 4646, 1998, 2113, 1996, 3737, 1997, 10770, 3688, 1012, 2023, 2001, 1037, 2053, 4167, 2121, 2005, 2033, 1998, 1045, 2572, 5580, 2009, 2001, 2061, 3722, 2000, 2131, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [20]:
import pandas as pd

In [21]:
df = pd.read_parquet("/mnt/amazon_reviews_2015.snappy.parquet",columns=["star_rating","review_id","review_body"])

In [22]:
# Print the per-column non-null row counts for each star rating from 1 to 5.
for rating in (1, 2, 3, 4, 5):
    print("star_rating: ", rating)
    print(df[df['star_rating'].eq(rating)].count())
    print('-------------------------')

star_rating:  1
star_rating    3253070
review_id      3253070
review_body    3253070
dtype: int64
-------------------------
star_rating:  2
star_rating    1865322
review_id      1865322
review_body    1865322
dtype: int64
-------------------------
star_rating:  3
star_rating    3130345
review_id      3130345
review_body    3130345
dtype: int64
-------------------------
star_rating:  4
star_rating    6578230
review_id      6578230
review_body    6578230
dtype: int64
-------------------------
star_rating:  5
star_rating    27078664
review_id      27078664
review_body    27078664
dtype: int64
-------------------------


In [23]:
# Balance the classes by drawing the same number of reviews for every star
# rating. The number is the size of the rarest rating, computed from the data
# instead of being hard-coded (1,865,322 for this dataset, the 2-star count).
n_per_rating = min(int((df['star_rating'] == rating).sum()) for rating in range(1, 6))

# Bind df1..df5 explicitly instead of writing into globals(), and fix
# random_state so that rerunning the notebook draws the same rows.
df1, df2, df3, df4, df5 = (
    df[df['star_rating'] == rating].sample(n=n_per_rating, random_state=42)
    for rating in range(1, 6)
)

In [24]:
df1

Unnamed: 0,star_rating,review_id,review_body
1735225,1,b'RB1QW12CCLB7B',"b'Sound was very good, but they did not fit in..."
40403483,1,b'R2KEJJOMH8VHEL',"b'On first use, the plug sleeve became loose. ..."
925117,1,b'R33XSBQBQ7PKY8',"b'It is not the brand name, imitacion'"
41494152,1,b'R2RIM3WJ1588HJ',b'they were missing the ear piece and they wer...
39693336,1,b'RU835O3I6702U',"b""Doesn't fit like another reviewer states, pl..."
...,...,...,...
40793042,1,b'R20LMUH195Q4E1',"b""For closeup work the glasses from the Dollar..."
32267446,1,b'R255N476G62JP4',"b'I loved this sandal, but the shoe fell apart..."
36282840,1,b'RVX401EC6UCLA',"b""For the price it look cheap and small . That..."
28750983,1,b'R3B10OZQJU09OS',"b""The size is to tight for this phone. My phon..."


In [25]:
df2

Unnamed: 0,star_rating,review_id,review_body
38103584,2,b'R4KVFAO7RUC5E',"b""I bought this due to the reviews. I don't s..."
30508120,2,b'R32W0L11OWPLMV',"b""I found &#34;Whiplash&#34; unpleasant, weird..."
15919034,2,b'R3R3BFTF4M07FL',b'This product does not work for me. I followe...
24556306,2,b'R2SVVCQQKC278O',"b""Ok..after reading the Fix I was not pleased,..."
38815534,2,b'R2B03WJBMJ228T',b'Very tight fit.. Could break easily.'
...,...,...,...
15828497,2,b'RVZ82MR7UG8T4',b'Very weak scent. Barely detectable when use...
25451609,2,b'R2LVFH2FBOZ1ZX',"b'a dissapontment, looks like someone took a m..."
39275301,2,b'R2TNUHYB1DK592',b'Still slips off of my head during cardio and...
38351711,2,b'R1KC3S88NWU7WW',"b""The quality sucks I'm kind of disappointed ..."


In [26]:
df3

Unnamed: 0,star_rating,review_id,review_body
35824019,3,b'RJ6AR9FY8EKM9',b'Super cute'
29655266,3,b'RQ4WI1FXL8JGC',b'There really good is listen good I like them'
1265917,3,b'R3W4QANIX6EBQY',"b""This has an after taste that I don't care fo..."
34482601,3,b'ROVJ8YH1JCOGD',b'Does not roll smooth when you apply a lot of...
13156207,3,b'R18047YBCXHVXZ',b'The shampoo seemed to work the first time I ...
...,...,...,...
38130244,3,b'R25UA7FROEZ83I',b'Accidentally ordered 2 of these. Tried to se...
13888672,3,b'R3799ET0RRAZFZ',b'I liked how the story contained verifiable f...
22116564,3,b'R34L005R1ZCKR3',"b""IT'S OK COULD BE WAAY BETTER!!!!!"""
33946168,3,b'RA23UHAKKIT43',"b'Unfortunately, I injured my neck recently. ..."


In [27]:
df4

Unnamed: 0,star_rating,review_id,review_body
16796129,4,b'R3OV8PUZWDACNU',b'The comforter is good quality for the price....
28594670,4,b'R3IA3AG6ZLZOSI',"b'This is a very sturdy, well made cover, but ..."
12483993,4,b'RAE906M01KEK9',"b""I enjoyed Estelle Ryan's &#34;. . . Connecti..."
6664553,4,b'R2A9SR9SIABF0Y',b'Even better than the first! Never sure what...
39792308,4,b'R1MEYSLK5PPEB0',b'The bag is ideal for our dog and anyone who ...
...,...,...,...
1690092,4,b'R3TMZIE9IS9G44',"b""these are great, downside not washable... b..."
29054323,4,b'R4F9AGKH97T0B',b'I would have like if it brought more than 1 ...
41720249,4,b'R2JFARRGFBCPMW',b'This item was great exactly what I needed I...
35235423,4,b'R36M44YC8I3RPW',b'beautiful'


In [28]:
df5

Unnamed: 0,star_rating,review_id,review_body
24430447,5,b'R17QUY63PAWRL3',b'Really enjoyed seeing the couples and the ch...
38240250,5,b'R1SEC88RBIHAWQ',b'These were the cheapest cables I felt comfor...
6115108,5,b'R7FFXE2NQMVAH',b'It is show good'
5874912,5,b'R28GS8BRWK0FU4',b'Great pants. Fits real well. Lightweight f...
22295688,5,b'RGC3T7TE1O43K',b'LOVED IT!! Great Love story and very exciti...
...,...,...,...
31717404,5,b'R3OSYSQYMUW2WK',b'Great'
17433047,5,b'R290XNDJIZ753E',"b'Funny, touching, quirky, yet still real. I ..."
39306745,5,b'RZU995XK3JJAS',b'Excellent product'
4666500,5,b'R30J4X1IYHYMW1',b'Best blues rock guitarist since Hendrix'


In [29]:
df_temp = pd.concat([df1,df2,df3,df4,df5])

In [30]:
df_temp.count()

star_rating    9326610
review_id      9326610
review_body    9326610
dtype: int64

In [31]:
# Shuffle the balanced frame and keep 9,000,000 of its rows. The fixed
# random_state makes the head/tail split derived from it (and the parquet files
# written from that split) identical on every run.
df_shaffle = df_temp.sample(n=9000000, random_state=42)

In [32]:
# First 8,000,000 and last 1,000,000 rows of the shuffled frame.
df_head = df_shaffle.iloc[:8000000]
df_tail = df_shaffle.iloc[-1000000:]

In [36]:
df_head.to_parquet("/mnt/amazon_reviews_2015_head.snappy.parquet")
df_tail.to_parquet("/mnt/amazon_reviews_2015_tail.snappy.parquet")

In [37]:
# Draw the training subset; the fixed random_state makes reruns pick the same rows.
train_df = df_head.sample(n=2000000, random_state=42)
train_df

Unnamed: 0,star_rating,review_id,review_body
34952828,3,b'RYYWJEH5W37ID',"b""My first tripod for my phone broke. Figured ..."
3060525,3,b'RTPJ52SM0DGWW',b'If you do a lot on eBay you may like this i...
10590515,3,b'R2QT5SB2JWCO6I',"b""These are too small to my preference. Not th..."
13356069,4,b'R2I0GSGDYNQVNN',"b""Super cute, but my two-year-old refuses to w..."
31800828,2,b'RDUZPLMYO1YPZ',"b""Not good if you want full features on a wind..."
...,...,...,...
40326391,1,b'R3QCHNJ5A8KJB',"b'As someone else has said, the picture is ver..."
9372897,2,b'R2E3X55FF7N2UW',"b""Not worth your time don't bother"""
26285013,4,b'R3JXV63TI1NLTI',"b'I like very much but, no instructions on how..."
31962344,3,b'R3RP33YBPWDNR',"b""Meh. Cold floor comes through. They're cute ..."


In [38]:
# Draw the validation subset; the fixed random_state makes reruns pick the same rows.
validation_df = df_tail.sample(n=400000, random_state=42)
validation_df

Unnamed: 0,star_rating,review_id,review_body
27493054,4,b'R1WBORAR43TLKR',"b'Fits comfortably, i am 6ft 240lbs and the si..."
4902677,1,b'RV2PJUMPUTY51',b'I feel like this was a real rip off. I have...
10575025,1,b'R3I2F8F1RGOZYE',b'Very poor quality. The stiching began fallin...
21100892,3,b'R17368KK2YNKDZ',"b""This android box loads fast but not all apps..."
23873417,3,b'RZ83O217XMG1Z',"b""I bought this to use as a taste test pack fo..."
...,...,...,...
73166,3,b'R3G3815T0GIDW9',b'Well it was slow and it took me 3 days to re...
21531078,5,b'R23W1AYPRWXO8E',b'Perfect'
30491024,1,b'R2REMV0OPLG8FB',b'Came broken. Ordered another one and the new...
41703353,5,b'R2CIQQM505KBYQ',"b""This is an excellent second addition to the ..."


In [39]:
train_df['review_id'] = train_df['review_id'].str.decode("utf-8")

In [40]:
train_df['review_body'] = train_df['review_body'].str.decode("utf-8","ignore")

In [41]:
train_df

Unnamed: 0,star_rating,review_id,review_body
34952828,3,RYYWJEH5W37ID,My first tripod for my phone broke. Figured I'...
3060525,3,RTPJ52SM0DGWW,If you do a lot on eBay you may like this it ...
10590515,3,R2QT5SB2JWCO6I,These are too small to my preference. Not the ...
13356069,4,R2I0GSGDYNQVNN,"Super cute, but my two-year-old refuses to wea..."
31800828,2,RDUZPLMYO1YPZ,Not good if you want full features on a window...
...,...,...,...
40326391,1,R3QCHNJ5A8KJB,"As someone else has said, the picture is very ..."
9372897,2,R2E3X55FF7N2UW,Not worth your time don't bother
26285013,4,R3JXV63TI1NLTI,"I like very much but, no instructions on how t..."
31962344,3,R3RP33YBPWDNR,Meh. Cold floor comes through. They're cute wi...


In [42]:
validation_df['review_id'] = validation_df['review_id'].str.decode("utf-8")

In [43]:
validation_df['review_body'] = validation_df['review_body'].str.decode("utf-8","ignore")

In [44]:
validation_df

Unnamed: 0,star_rating,review_id,review_body
27493054,4,R1WBORAR43TLKR,"Fits comfortably, i am 6ft 240lbs and the size..."
4902677,1,RV2PJUMPUTY51,I feel like this was a real rip off. I have u...
10575025,1,R3I2F8F1RGOZYE,Very poor quality. The stiching began falling ...
21100892,3,R17368KK2YNKDZ,This android box loads fast but not all apps a...
23873417,3,RZ83O217XMG1Z,I bought this to use as a taste test pack for ...
...,...,...,...
73166,3,R3G3815T0GIDW9,Well it was slow and it took me 3 days to read...
21531078,5,R23W1AYPRWXO8E,Perfect
30491024,1,R2REMV0OPLG8FB,Came broken. Ordered another one and the new o...
41703353,5,R2CIQQM505KBYQ,This is an excellent second addition to the fi...


In [45]:
print(f"df Memory Usage: {df.memory_usage(deep=True).sum() / 1024**3} GB")

df Memory Usage: 11.584748897701502 GB


In [46]:
# Build one Input example per row of the training data.
train_inputs = train_df.apply(
    lambda row: Input(
        text=row[REVIEW_BODY_COLUMN],
        review_id=row[REVIEW_ID_COLUMN],
        date=timestamp,
        label=row[LABEL_COLUMN],
    ),
    axis=1,
)

In [47]:
# Build one Input example per row of the validation data.
validation_inputs = validation_df.apply(
    lambda row: Input(
        text=row[REVIEW_BODY_COLUMN],
        review_id=row[REVIEW_ID_COLUMN],
        date=timestamp,
        label=row[LABEL_COLUMN],
    ),
    axis=1,
)

In [None]:
# !pip install pandarallel

In [None]:
# from pandarallel import pandarallel
# import os
# os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
# !pip3 install --no-cache-dir accelerate
# pandarallel.initialize(nb_workers=2, progress_bar=True, use_memory_fs=False)

In [None]:
#def func(x):
#    return lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp)

# inputs = df.parallel_apply(
#     lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp),
#     axis=1
# )

In [48]:
train_inputs

34952828    <__main__.Input object at 0x7f7d6d4c73d0>
3060525     <__main__.Input object at 0x7f7d6d4c7fa0>
10590515    <__main__.Input object at 0x7f7d6d4c7760>
13356069    <__main__.Input object at 0x7f7d6d4c6920>
31800828    <__main__.Input object at 0x7f7d6d4c4fa0>
                              ...                    
40326391    <__main__.Input object at 0x7f811382d030>
9372897     <__main__.Input object at 0x7f811382d090>
26285013    <__main__.Input object at 0x7f811382d0f0>
31962344    <__main__.Input object at 0x7f811382d150>
2615374     <__main__.Input object at 0x7f811382d1b0>
Length: 2000000, dtype: object

In [49]:
train_output_file = "/mnt/train_data.tfrecord"

In [50]:
validation_output_file = "/mnt/validation_data.tfrecord"

In [51]:
# date が Feature Store の仕様に合わせて ISO-8601 になっていることを確認
# print(train_inputs[0].date)

In [52]:
train_records = transform_inputs_to_tfrecord(train_inputs, train_output_file, max_seq_length)

Writing input 0 of 2000000





Writing input 10000 of 2000000

Writing input 20000 of 2000000

Writing input 30000 of 2000000

Writing input 40000 of 2000000

Writing input 50000 of 2000000

Writing input 60000 of 2000000

Writing input 70000 of 2000000

Writing input 80000 of 2000000

Writing input 90000 of 2000000

Writing input 100000 of 2000000

Writing input 110000 of 2000000

Writing input 120000 of 2000000

Writing input 130000 of 2000000

Writing input 140000 of 2000000

Writing input 150000 of 2000000

Writing input 160000 of 2000000

Writing input 170000 of 2000000

Writing input 180000 of 2000000

Writing input 190000 of 2000000

Writing input 200000 of 2000000

Writing input 210000 of 2000000

Writing input 220000 of 2000000

Writing input 230000 of 2000000

Writing input 240000 of 2000000

Writing input 250000 of 2000000

Writing input 260000 of 2000000

Writing input 270000 of 2000000

Writing input 280000 of 2000000

Writing input 290000 of 2000000

Writing input 300000 of 2000000

Writing input 31000

In [53]:
validation_records = transform_inputs_to_tfrecord(validation_inputs, validation_output_file, max_seq_length)

Writing input 0 of 400000

Writing input 10000 of 400000

Writing input 20000 of 400000

Writing input 30000 of 400000

Writing input 40000 of 400000

Writing input 50000 of 400000

Writing input 60000 of 400000

Writing input 70000 of 400000

Writing input 80000 of 400000

Writing input 90000 of 400000

Writing input 100000 of 400000

Writing input 110000 of 400000

Writing input 120000 of 400000

Writing input 130000 of 400000

Writing input 140000 of 400000

Writing input 150000 of 400000

Writing input 160000 of 400000

Writing input 170000 of 400000

Writing input 180000 of 400000

Writing input 190000 of 400000

Writing input 200000 of 400000

Writing input 210000 of 400000

Writing input 220000 of 400000

Writing input 230000 of 400000

Writing input 240000 of 400000

Writing input 250000 of 400000

Writing input 260000 of 400000

Writing input 270000 of 400000

Writing input 280000 of 400000

Writing input 290000 of 400000

Writing input 300000 of 400000

Writing input 310000 o

In [54]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
// Click the hidden button above, which is linked to the
// "kernelmenu:shutdown" command.
try {
    // const keeps `els` local to this block instead of leaking a global.
    const els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // Best effort: deliberately ignore any failure to find or click the button.
}    
</script>
</script>