In [1]:
#%autosave 0

In [2]:
!pip install transformers
 # Transformerは、2017年にGoogleが発表した深層学習モデルで、現在では自然言語処理の分野で最も利用されているモデルの一つです。
 # このモデルは、大規模なテキストデータを効率的に処理する能力を持っており、多くの応用例が存在します。例えば、チャットボットで
 # あるChatGPTも、Transformerモデルをベースとしています。Hugging FaceのTransformers ライブラリには、多数の学習済みモデル
 # が提供されており、これをベースにして、新たなデータセットでの学習や、モデルのパラメータ調整などのタスクを行うことができます。


[0m

In [3]:
!pip install ipywidgets widgetsnbextension pandas-profiling



[0m

In [4]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [5]:
import tensorflow as tf
import collections
import json
import os
import pandas as pd
import csv
from transformers import DistilBertTokenizer

# Length, in tokens, that every encoded review is padded or truncated to.
max_seq_length = 64

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# DistilBERT ("distilled BERT") is a model published by Hugging Face at NeurIPS 2019.
# It is a smaller, faster, lighter Transformer model based on the BERT architecture.
# It is reported to have 40% fewer parameters than BERT-base, to run 60% faster, and to
# retain 97% of BERT's performance as measured on the GLUE benchmark.
# DistilBERT is trained with knowledge distillation, a technique that compresses a large
# "teacher" model into a small "student" model. Distilling BERT yields a Transformer model
# that shares many similarities with the original BERT while being lighter and faster to run.

# DataFrame column names for the review text and the review identifier.
REVIEW_BODY_COLUMN = "review_body"
REVIEW_ID_COLUMN = "review_id"

# Column holding the class label, and the label values expected in it.
LABEL_COLUMN = "star_rating"
LABEL_VALUES = [1, 2, 3, 4, 5]

2024-01-15 12:42:57.414569: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9360] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-15 12:42:57.414634: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-15 12:42:57.414666: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1537] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-15 12:42:57.422779: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Map each label value to its zero-based position in LABEL_VALUES.
label_map = {rating: index for index, rating in enumerate(LABEL_VALUES)}

In [7]:
print(label_map)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4}


In [8]:
class InputFeatures:
    """BERT feature vectors for a single example.

    A plain value holder: every constructor argument is stored unchanged on an
    attribute of the same name (input_ids, input_mask, segment_ids, label_id,
    review_id, date, label).
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):
        # Model inputs.
        self.input_ids, self.input_mask, self.segment_ids = input_ids, input_mask, segment_ids
        # Encoded label.
        self.label_id = label_id
        # Metadata carried along with the example, plus the original label.
        self.review_id, self.date, self.label = review_id, date, label

In [9]:
class Input:
    """A single training/test input used for sequence classification.

    Attributes (each stored exactly as passed to the constructor):
      text: String. The untokenized text of the first sequence. For
        single-sequence tasks this is the only sequence that is specified.
      review_id: Identifier of the review the text belongs to.
      date: Date value attached to the example (callers in this notebook pass
        an ISO-8601 formatted string).
      label: (Optional) The label of the example. Specified for training and
        validation examples, left unspecified (None) for test examples.
    """

    def __init__(self, text, review_id, date, label=None):
        self.label = label
        self.date = date
        self.review_id = review_id
        self.text = text


In [10]:
def convert_input(the_input, max_seq_length):
    """Encode one ``Input`` into fixed-length ``InputFeatures``.

    The module-level ``tokenizer`` does the preprocessing (tokenization,
    adding the special tokens, mapping tokens to vocabulary IDs) and pads or
    truncates the result to exactly ``max_seq_length`` entries.

    Args:
      the_input: Object exposing ``text``, ``label``, ``review_id`` and ``date``.
      max_seq_length: Length every encoded sequence is padded/truncated to.

    Returns:
      An ``InputFeatures`` holding ``input_ids``, ``input_mask``,
      ``segment_ids``, ``label_id`` and the input's ``review_id``, ``date``
      and ``label``.

    Raises:
      KeyError: If ``the_input.label`` is not a key of ``label_map``.
    """
    # A single encode_plus call is enough; the text is not tokenized separately
    # beforehand because that result was never used.
    # padding="max_length" is the current spelling of the deprecated
    # pad_to_max_length=True and produces the same fixed-length output.
    encoded = tokenizer.encode_plus(
        the_input.text,
        padding="max_length",
        max_length=max_seq_length,
        truncation=True,
    )

    return InputFeatures(
        # Vocabulary IDs of the tokens, padded up to max_seq_length.
        input_ids=encoded["input_ids"],
        # 1 for real tokens, 0 for padding positions.
        input_mask=encoded["attention_mask"],
        # Single-sequence task: every position belongs to segment 0.
        segment_ids=[0] * max_seq_length,
        # Class index for this row's label, looked up in label_map.
        label_id=label_map[the_input.label],
        review_id=the_input.review_id,
        date=the_input.date,
        label=the_input.label,
    )


In [11]:
def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):
    """Convert inputs to features, write them to a TFRecord file and return them.

    Each input is converted with ``convert_input`` and serialized as a
    ``tf.train.Example`` with the int64 features ``input_ids``, ``input_mask``,
    ``segment_ids`` and ``label_ids``.

    Args:
      inputs: Sized iterable of input objects (``len()`` is called on it).
      output_file: Path of the TFRecord file to write.
      max_seq_length: Sequence length passed through to ``convert_input``.

    Returns:
      A list with one dict per input, holding ``input_ids``, ``input_mask``,
      ``segment_ids``, ``label_id``, ``review_id``, ``date`` and ``label``.
    """

    def _int64_feature(values):
        # Wrap a list of ints in the protobuf types tf.train.Example expects.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    records = []
    total = len(inputs)

    # The context manager closes the writer even when converting or writing
    # one of the inputs raises, so the file handle is never leaked.
    with tf.io.TFRecordWriter(output_file) as tf_record_writer:
        for input_idx, the_input in enumerate(inputs):
            if input_idx % 10000 == 0:
                print("Writing input {} of {}\n".format(input_idx, total))

            features = convert_input(the_input, max_seq_length)

            # TFRecord payload: input_ids, input_mask, segment_ids, label_ids.
            all_features = collections.OrderedDict()
            all_features["input_ids"] = _int64_feature(features.input_ids)
            all_features["input_mask"] = _int64_feature(features.input_mask)
            all_features["segment_ids"] = _int64_feature(features.segment_ids)
            all_features["label_ids"] = _int64_feature([features.label_id])

            tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
            tf_record_writer.write(tf_record.SerializeToString())

            # Record with every feature, intended for storage in the Feature Store.
            records.append(
                {
                    "input_ids": features.input_ids,
                    "input_mask": features.input_mask,
                    "segment_ids": features.segment_ids,
                    "label_id": features.label_id,
                    "review_id": the_input.review_id,
                    "date": the_input.date,
                    "label": features.label,
                }
            )

    return records

In [12]:
from datetime import datetime, timezone
from time import strftime

# The trailing "Z" in the format designates UTC, so the current time is taken
# in UTC rather than in the machine's local time zone.
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)

2024-01-15T12:43:07Z


In [13]:
import pandas as pd

# Three hand-written reviews, one per row: star rating, review id, review text.
data = [
    [5, "ABCD12345", """I needed an "antivirus" application and know the quality of Norton products.  This was a no brainer for me and I am glad it was so simple to get."""],
    [3, "EFGH12345", """The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos."""],
    [1, "IJKL2345", """Terrible, none of my codes worked, and I can't uninstall it.  I think this product IS malware and viruses"""],
]

df = pd.DataFrame(data, columns=["star_rating", "review_id", "review_body"])

# Build one Input object per row of the sample data.
inputs = df.apply(
    lambda row: Input(
        text=row[REVIEW_BODY_COLUMN],
        review_id=row[REVIEW_ID_COLUMN],
        date=timestamp,
        label=row[LABEL_COLUMN],
    ),
    axis=1,
)

In [14]:
df

Unnamed: 0,star_rating,review_id,review_body
0,5,ABCD12345,"I needed an ""antivirus"" application and know t..."
1,3,EFGH12345,The problem with ElephantDrive is that it requ...
2,1,IJKL2345,"Terrible, none of my codes worked, and I can't..."


In [15]:
inputs

0    <__main__.Input object at 0x7feee5f0e650>
1    <__main__.Input object at 0x7feee5f0ec20>
2    <__main__.Input object at 0x7feee5f0edd0>
dtype: object

In [16]:
# Check that date is in ISO-8601 form, in line with the Feature Store specification.
print(inputs[0].date)

2024-01-15T12:43:07Z


In [17]:
output_file = "./data.tfrecord"

In [18]:
records = transform_inputs_to_tfrecord(inputs, output_file, max_seq_length)

Writing input 0 of 3





In [19]:
print(records[0]['input_ids'])

[101, 1045, 2734, 2019, 1000, 3424, 23350, 1000, 4646, 1998, 2113, 1996, 3737, 1997, 10770, 3688, 1012, 2023, 2001, 1037, 2053, 4167, 2121, 2005, 2033, 1998, 1045, 2572, 5580, 2009, 2001, 2061, 3722, 2000, 2131, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [20]:
import pandas as pd

In [21]:
df = pd.read_parquet("/mnt/amazon_reviews_2015_small.snappy.parquet",columns=["star_rating","review_id","review_body"])

In [22]:
# Print the per-column row counts for each star rating from 1 to 5.
for rating in range(1, 6):
    rows_with_rating = df[df['star_rating'] == rating]
    print("star_rating: ", rating)
    print(rows_with_rating.count())
    print('-------------------------')

star_rating:  1
star_rating    36844
review_id      36844
review_body    36844
dtype: int64
-------------------------
star_rating:  2
star_rating    22482
review_id      22482
review_body    22482
dtype: int64
-------------------------
star_rating:  3
star_rating    37827
review_id      37827
review_body    37827
dtype: int64
-------------------------
star_rating:  4
star_rating    77271
review_id      77271
review_body    77271
dtype: int64
-------------------------
star_rating:  5
star_rating    325576
review_id      325576
review_body    325576
dtype: int64
-------------------------


In [23]:
# Balance the classes: draw the same number of reviews for every star rating,
# sized to the rarest rating (22482 rows, rating 2, in the counts printed above)
# instead of hard-coding that number.
_rating_counts = df['star_rating'].value_counts()
_samples_per_rating = min(int(_rating_counts.get(rating, 0)) for rating in range(1, 6))

# Explicit names replace writing through globals(); the fixed random_state makes
# the draw repeatable across kernel restarts.
df1, df2, df3, df4, df5 = (
    df[df['star_rating'] == rating].sample(n=_samples_per_rating, random_state=42)
    for rating in range(1, 6)
)

In [24]:
df1

Unnamed: 0,star_rating,review_id,review_body
225295,1,R2WAFFM8QVWKE1,I'm not sure why anyone would put artificial f...
246235,1,R3PBKVRSYGPQ2N,They are flimsy and difficult to get on cans.
436857,1,RSIX4RVWQUXLL,Horrible. Didn't do anything for me. Not effec...
242550,1,R2A1FCYJG2ER6N,Horrible absolutely useless very disappointed
269039,1,R2ZZH38SS81DQV,This is absolute junk. I have purchase two of...
...,...,...,...
497115,1,R3BFS3EA7ZS7TS,This powder has very unpleasant taste. It mad...
84338,1,R29D6ZJLKJ1BHT,I was very disappointed cuz it was little and ...
186060,1,R20B2NEPP6U13E,Received quickly and no issue with the seller....
147190,1,R2FXYFB5X7R96J,Was just ok


In [25]:
df2

Unnamed: 0,star_rating,review_id,review_body
53145,2,RYUP37NGSV511,Bad water taste after only a month in use
45953,2,R3FL1BEFOYYD9Q,Not enough good songs to keep my interest...
372387,2,R3509DBMSURHKP,"Way too small. Had to return it. Seems to me, ..."
248170,2,R2HI8EZHQ7K42M,the string broke off the first time I played w...
314619,2,RSWY7Y40EC9U1,So I wanted an app so I could learn a song and...
...,...,...,...
170664,2,R3KA3WNBRXFQOC,i was very disappointed with these gloves. the...
400492,2,R7T7D44MY93C3,This outfit runs too small for the size.
144463,2,RCBI6ZHLDM28J,"Well, it comes on and does it's thing as descr..."
56633,2,R2IC2BGXAIE500,i was not expecting the tier to be so small. I...


In [26]:
df3

Unnamed: 0,star_rating,review_id,review_body
470498,3,RTAWBC13XDWPO,Not the best quality but okay.
229419,3,R3SAQZKUZQZZZ2,I should've read all the reviews a little clos...
32447,3,R2UN5M26K5UCG8,I had purchased a similar product elsewhere an...
147691,3,R3HSDMA9JVXRKF,I can only trust that the program works as it ...
270525,3,RYKDD8YV88AG0,The only flaw I can find with this machine is ...
...,...,...,...
460332,3,R1WFSDY4XV8LU3,This book was just okay. I guess because Maya...
11532,3,R3AVDQL50E6TGR,I'm actually very surprised how highly rated t...
273304,3,R22RHTX1GSSEAZ,"have use it yet, but seams to be of good quali..."
337530,3,R1R6N60B2OBVBQ,Pros:<br />LATCH system.<br />Comfortable cush...


In [27]:
df4

Unnamed: 0,star_rating,review_id,review_body
124105,4,R2UX0BSW1HPRAT,**Spoiler Alert** Sexually Graphic not suitabl...
417113,4,R39KH16WVFYKFE,This would be SOOO much better if there was a ...
214602,4,R33LJCEF13D650,"Totally convincing. I took this on a cruise, a..."
39891,4,R3S4NUJZGSCJKF,Yup. good bang for the buck.
402497,4,R1DPQYWDBFXH3Y,"what is there to review ,this song is hopping"
...,...,...,...
449418,4,R1GVLZB6A87I0N,Nice and warm!
137837,4,R2OZ29ZNHCXYO1,Very nice and am very interested in this
152578,4,R2BED30B3JJMQ1,It is quite good for the price.
303191,4,RPY9E13IQN7LF,Good but no foxes in it.


In [28]:
df5

Unnamed: 0,star_rating,review_id,review_body
112124,5,R2LQZI7JR932JJ,Washed and ironed beautifully. I folded them ...
167159,5,R6DZV1J88E76A,I really have not worn any yet but they look good
97650,5,R3TRZRHOX54MBM,Easy to use fun to play free app im just typin...
432347,5,RHJZUJRAXH9T6,"Received, great shape"
241992,5,R1H7DTHXKVO0LV,I love the product. The color is vibrant and l...
...,...,...,...
484244,5,RM69MHWP3ON37,Like it.
147442,5,R2NCNDCAS7P1DF,This games is fun and addicting! I recommend t...
44075,5,R10XE92TUO3BXC,This is very cool!!!
234687,5,R2Y2NIBNU50Z4Q,Love em' so far. They seem like they are well ...


In [29]:
df_temp = pd.concat([df1,df2,df3,df4,df5])

In [30]:
df_temp.count()

star_rating    112410
review_id      112410
review_body    112410
dtype: int64

In [31]:
# Shuffle the balanced set and keep 100000 rows. The fixed seed keeps the
# head/tail split derived from this frame identical across re-runs.
df_shaffle = df_temp.sample(100000, random_state=42)

In [32]:
df_head = df_shaffle.head(80000)
df_tail = df_shaffle.tail(20000)

In [33]:
df_head.to_parquet("/mnt/amazon_reviews_2015_small_head.snappy.parquet")
df_tail.to_parquet("/mnt/amazon_reviews_2015_small_tail.snappy.parquet")

In [34]:
# Draw 80000 rows from df_head in random order; seeded so the order is repeatable.
train_df = df_head.sample(n=80000, random_state=42)
train_df

Unnamed: 0,star_rating,review_id,review_body
490796,3,RLZADDLF207LO,It is moisturizing but I haven't noticed any i...
241513,5,R2PLJZVKTED5M1,Really great product for dry skin. A little t...
19001,1,R3TLB69YYK6N2C,"Decent, but the madnetic button doesn't work a..."
485196,4,RH7QMQNXJ7NE7,Fit my dryer well.
187423,2,R295XHUX70JO3X,I us bought 2 of these sleeping bags for campi...
...,...,...,...
431267,5,R36YLWFQQ8R108,"good information to know, I do not totally agr..."
18505,2,R1DHAFIS7FE9LF,The individual floating shaving heads are held...
243446,2,RJYS1N8PWMXRJ,Not worth the trouble!! Doesn't shave very clo...
329286,3,RERN4ZWC3XR0O,Very trendy look and material was amazing.But ...


In [35]:
# Draw 20000 rows from df_tail in random order; seeded so the order is repeatable.
validation_df = df_tail.sample(n=20000, random_state=42)
validation_df

Unnamed: 0,star_rating,review_id,review_body
420604,5,R1CRPVY253Y7NS,These are the only pet blankets we will use fo...
101752,5,RN2FSD3R56ZTA,Our 9 year old dog loves it. We went to the we...
390680,3,R385VVPZ8KHKHJ,The package was damaged and the toy was a dirt...
337584,1,RA7C8RBB8UQVR,Arrived broken
2184,2,R1V3A7UVWF7RO6,Falls off a lot.. And doesn't absorb
...,...,...,...
245975,1,R37P8FSRDJF7F2,CHIHUAHUA REVIEW:<br /><br />Our Chihuahua is ...
261787,4,RLOD2FPQKQ0FX,"They work well with my phone, and they're nice..."
126557,3,R2IBICCIHKL620,This book was kind of<br />hard to stay into f...
10417,2,RB1XVC52FPM20,It notably slowed down my computer.


In [36]:
print(f"df Memory Usage: {df.memory_usage(deep=True).sum() / 1024**3} GB")

df Memory Usage: 0.1716369530186057 GB


In [37]:
# Use the Input class to build one example per row of the training data.
train_inputs = train_df.apply(
    lambda row: Input(
        text=row[REVIEW_BODY_COLUMN],
        review_id=row[REVIEW_ID_COLUMN],
        date=timestamp,
        label=row[LABEL_COLUMN],
    ),
    axis=1,
)

In [38]:
# Use the Input class to build one example per row of the validation data.
validation_inputs = validation_df.apply(
    lambda row: Input(
        text=row[REVIEW_BODY_COLUMN],
        review_id=row[REVIEW_ID_COLUMN],
        date=timestamp,
        label=row[LABEL_COLUMN],
    ),
    axis=1,
)

In [39]:
# !pip install pandarallel

In [40]:
# from pandarallel import pandarallel
# import os
# os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
# !pip3 install --no-cache-dir accelerate
# pandarallel.initialize(nb_workers=2, progress_bar=True, use_memory_fs=False)

In [41]:
#def func(x):
#    return lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp)

# inputs = df.parallel_apply(
#     lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp),
#     axis=1
# )

In [42]:
train_inputs

490796    <__main__.Input object at 0x7feea6286b00>
241513    <__main__.Input object at 0x7feea6284f10>
19001     <__main__.Input object at 0x7feea62864a0>
485196    <__main__.Input object at 0x7feea6285de0>
187423    <__main__.Input object at 0x7feea6286380>
                            ...                    
431267    <__main__.Input object at 0x7feea56879d0>
18505     <__main__.Input object at 0x7feea5687a30>
243446    <__main__.Input object at 0x7feea5687a90>
329286    <__main__.Input object at 0x7feea5687af0>
222631    <__main__.Input object at 0x7feea5687b50>
Length: 80000, dtype: object

In [43]:
train_output_file = "/mnt/train_data_small.tfrecord"

In [44]:
validation_output_file = "/mnt/validation_data_small.tfrecord"

In [45]:
# date が Feature Store の仕様に合わせて ISO-8601 になっていることを確認
# print(train_inputs[0].date)

In [46]:
train_records = transform_inputs_to_tfrecord(train_inputs, train_output_file, max_seq_length)

Writing input 0 of 80000





Writing input 10000 of 80000

Writing input 20000 of 80000

Writing input 30000 of 80000

Writing input 40000 of 80000

Writing input 50000 of 80000

Writing input 60000 of 80000

Writing input 70000 of 80000



In [47]:
validation_records = transform_inputs_to_tfrecord(validation_inputs, validation_output_file, max_seq_length)

Writing input 0 of 20000

Writing input 10000 of 20000



In [48]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
// Programmatically click the first element with class "sm-command-button";
// this is presumably the hidden "Shutdown Kernel" button declared just above
// this script (verify no earlier element on the page shares the class).
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // Deliberately ignored: any error raised by the lookup or the click is swallowed.
}    
</script>