In [2]:
%autosave 0

Autosave disabled


In [3]:
!pip install transformers

[0m

In [4]:
!pip install ipywidgets widgetsnbextension pandas-profiling



[0m

In [5]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [6]:
import tensorflow as tf
import collections
import json
import os
import pandas as pd
import csv
from transformers import DistilBertTokenizer

max_seq_length = 64

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

REVIEW_BODY_COLUMN = "review_body"
REVIEW_ID_COLUMN = "review_id"

LABEL_COLUMN = "star_rating"
LABEL_VALUES = [1, 2, 3, 4, 5]

2024-01-07 13:58:19.749701: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9360] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-07 13:58:19.749761: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-07 13:58:19.749825: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1537] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-07 13:58:19.763237: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
label_map = {}
for (i, label) in enumerate(LABEL_VALUES):
    label_map[label] = i

In [8]:
print(label_map)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4}


In [9]:
class InputFeatures(object):
    """BERT特徴量ベクトル"""

    def __init__(self, input_ids, input_mask, segment_ids, label_id, review_id, date, label):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        self.review_id = review_id
        self.date = date
        self.label = label


In [10]:
class Input(object):
    """シーケンス分類で用いるトレーニング/テストの単一の入力"""

    def __init__(self, text, review_id, date, label=None):
        """入力のコンストラクタ
        Args:
          text: 文字列。トークン化されていない一つ目のシーケンスのテキスト。
            単一シーケンスのタスクではこのシーケンスのみを指定する。
          label: (オプショナル) 文字列。サンプルのラベル。トレーニングや検証用のサンプルでは指定する。
            テスト用のサンプルでは指定しない。
        """
        self.text = text
        self.review_id = review_id
        self.date = date
        self.label = label


In [47]:
def convert_input(the_input, max_seq_length):
    # まず、BERTが学習したデータ形式と合うようにデータを前処理する。
    # 1. テキストを小文字にする（BERT lowercaseモデルを用いる場合）
    # 2. トークン化する（例、"sally says hi" -> ["sally", "says", "hi"]）
    # 3. 単語をWordPieceに分割（例、"calling" -> ["call", "##ing"]）
    #
    # この辺りの処理はTransformersライブラリのトークナイザーがまかなってくれます。

    tokens = tokenizer.tokenize(the_input.text)
    tokens.insert(0, '[CLS]')
    tokens.append('[SEP]')
    # print("**{} tokens**\n{}\n".format(len(tokens), tokens))

    encode_plus_tokens = tokenizer.encode_plus(
        the_input.text,
        pad_to_max_length=True,
        max_length=max_seq_length,
        truncation=True
    )
    
    # 事前学習済みBERTの語彙ID。トークンを表す。（トークン数が `max_seq_length` 未満であれば0をパディングする）
    input_ids = encode_plus_tokens["input_ids"]

    # BERTがどのトークンに注目するかを0/1で指定。`input_ids` のパディング部分のベクトル要素には0を割り当てる。
    input_mask = encode_plus_tokens["attention_mask"]

    # テキスト分類のような単一シーケンスのタスクではセグメントIDは常に0とする。質問回答や次文予測のような2シーケンスタスクの場合は1を割り当てる。
    segment_ids = [0] * max_seq_length

    # それぞれのトレーニングデータの行のラベル（`star_rating` 1〜5）
    label_id = label_map[the_input.label]

    features = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id,
        review_id=the_input.review_id,
        date=the_input.date,
        label=the_input.label,
    )

    # print("**{} input_ids**\n{}\n".format(len(features.input_ids), features.input_ids))
    # print("**{} input_mask**\n{}\n".format(len(features.input_mask), features.input_mask))
    # print("**{} segment_ids**\n{}\n".format(len(features.segment_ids), features.segment_ids))
    # print("**label_id**\n{}\n".format(features.label_id))
    # print("**review_id**\n{}\n".format(features.review_id))
    # print("**date**\n{}\n".format(features.date))
    # print("**label**\n{}\n".format(features.label))

    return features


In [39]:
def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):
    # データをBERTが理解できるフォーマットに変換する
    records = []
    tf_record_writer = tf.io.TFRecordWriter(output_file)

    for (input_idx, the_input) in enumerate(inputs):
        if input_idx % 10000 == 0:
            print("Writing input {} of {}\n".format(input_idx, len(inputs)))

        features = convert_input(the_input, max_seq_length)

        all_features = collections.OrderedDict()

        # input_ids、input_mask、segment_ids、label_idsを含んだTFRecordを作成
        all_features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
        all_features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
        all_features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
        all_features["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))

        tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
        tf_record_writer.write(tf_record.SerializeToString())

        # Feature Storeに格納する、すべての特徴量を含んだレコードを作成
        records.append(
            {
                "input_ids": features.input_ids,
                "input_mask": features.input_mask,
                "segment_ids": features.segment_ids,
                "label_id": features.label_id,
                "review_id": the_input.review_id,
                "date": the_input.date,
                "label": features.label,
            }
        )

    tf_record_writer.close()

    return records

In [40]:
from datetime import datetime
from time import strftime

# timestamp = datetime.now().replace(microsecond=0).isoformat()
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)

2024-01-07T14:10:22Z


In [41]:
import pandas as pd

data = [
    [
        5,
        "ABCD12345",
        """I needed an "antivirus" application and know the quality of Norton products.  This was a no brainer for me and I am glad it was so simple to get.""",
    ],
    [
        3,
        "EFGH12345",
        """The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos.""",
    ],
    [
        1,
        "IJKL2345",
        """Terrible, none of my codes worked, and I can't uninstall it.  I think this product IS malware and viruses""",
    ],
]

df = pd.DataFrame(data, columns=["star_rating", "review_id", "review_body"])

# Input クラスを使用して、データからサンプルを作成する。
inputs = df.apply(
    lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp),
    axis=1,
)

In [42]:
df

Unnamed: 0,star_rating,review_id,review_body
0,5,ABCD12345,"I needed an ""antivirus"" application and know t..."
1,3,EFGH12345,The problem with ElephantDrive is that it requ...
2,1,IJKL2345,"Terrible, none of my codes worked, and I can't..."


In [43]:
inputs

0    <__main__.Input object at 0x7f38818226e0>
1    <__main__.Input object at 0x7f38818211b0>
2    <__main__.Input object at 0x7f3881823dc0>
dtype: object

In [44]:
# date が Feature Store の仕様に合わせて ISO-8601 になっていることを確認
print(inputs[0].date)

2024-01-07T14:10:22Z


In [45]:
output_file = "./data.tfrecord"

In [48]:
records = transform_inputs_to_tfrecord(inputs, output_file, max_seq_length)

Writing input 0 of 3



In [49]:
import pandas as pd

In [50]:
df = pd.read_parquet("/mnt/amazon_reviews_2015.snappy.parquet",columns=["star_rating","review_id","review_body"])

In [51]:
df

Unnamed: 0,star_rating,review_id,review_body
0,5,b'R2C20GSMIOZYVP',b'I have made multiple purchases of this prosc...
1,3,b'RPI30SPP1J9U9',"b""I am not sure if it's a product or storage p..."
2,1,b'RKYY2ZQGUV06L',"b""I was hoping this had a stronger taste than..."
3,5,b'RKYYAEA9G3CD4',b'Awesome Tea!'
4,4,b'R17ZQPU555KVR6',"b""This tasty spread tastes just like a melted ..."
...,...,...,...
41905626,5,b'R2341YPSNIJ3NB',"b""I got this case for my violin, as my old one..."
41905627,5,b'R34HOANGHY4878',b'just as excpected'
41905628,4,b'R3APW14Y9V4QOP',"b""It has ten really cool sounds, the beat step..."
41905629,5,b'R18BIAZS3JP0MI',b'very clear sound thank you amazon i recommen...


In [52]:
df['review_id'] = df['review_id'].str.decode("utf-8")

In [53]:
df['review_body'] = df['review_body'].str.decode("utf-8","ignore")

In [54]:
df

Unnamed: 0,star_rating,review_id,review_body
0,5,R2C20GSMIOZYVP,I have made multiple purchases of this prosciu...
1,3,RPI30SPP1J9U9,I am not sure if it's a product or storage pro...
2,1,RKYY2ZQGUV06L,I was hoping this had a stronger taste than r...
3,5,RKYYAEA9G3CD4,Awesome Tea!
4,4,R17ZQPU555KVR6,This tasty spread tastes just like a melted Re...
...,...,...,...
41905626,5,R2341YPSNIJ3NB,"I got this case for my violin, as my old one w..."
41905627,5,R34HOANGHY4878,just as excpected
41905628,4,R3APW14Y9V4QOP,"It has ten really cool sounds, the beat step w..."
41905629,5,R18BIAZS3JP0MI,very clear sound thank you amazon i recommende...


In [55]:
print(f"df Memory Usage: {df.memory_usage(deep=True).sum() / 1024**3} GB")

df Memory Usage: 13.552809612825513 GB


In [None]:
!pip install pandarallel

In [None]:
from pandarallel import pandarallel
import os
os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'

pandarallel.initialize(nb_workers=2, progress_bar=True, use_memory_fs=False)

In [56]:
# Input クラスを使用して、データからサンプルを作成する。
inputs = df.apply(
    lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp),
    axis=1,
)

In [None]:
#import os
#os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
#!export TMPDIR=/var/tmp
#!pip3 install --no-cache-dir accelerate

In [None]:
#def func(x):
#    return lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp)

inputs = df.parallel_apply(
    lambda x: Input(label=x[LABEL_COLUMN], text=x[REVIEW_BODY_COLUMN], review_id=x[REVIEW_ID_COLUMN], date=timestamp),
    axis=1
)

In [57]:
inputs

0           <__main__.Input object at 0x7f3881822020>
1           <__main__.Input object at 0x7f3881820f70>
2           <__main__.Input object at 0x7f38818232e0>
3           <__main__.Input object at 0x7f3881823a60>
4           <__main__.Input object at 0x7f38818234c0>
                              ...                    
41905626    <__main__.Input object at 0x7f335263d960>
41905627    <__main__.Input object at 0x7f335263d9c0>
41905628    <__main__.Input object at 0x7f335263da20>
41905629    <__main__.Input object at 0x7f335263da80>
41905630    <__main__.Input object at 0x7f335263dae0>
Length: 41905631, dtype: object

In [58]:
output_file = "/mnt/data.tfrecord"

In [59]:
# date が Feature Store の仕様に合わせて ISO-8601 になっていることを確認
print(inputs[0].date)

2024-01-07T14:10:22Z


In [None]:
records = transform_inputs_to_tfrecord(inputs, output_file, max_seq_length)

Writing input 0 of 41905631





Writing input 10000 of 41905631

Writing input 20000 of 41905631

Writing input 30000 of 41905631

Writing input 40000 of 41905631

Writing input 50000 of 41905631

Writing input 60000 of 41905631

Writing input 70000 of 41905631

Writing input 80000 of 41905631

Writing input 90000 of 41905631

Writing input 100000 of 41905631

Writing input 110000 of 41905631

Writing input 120000 of 41905631

Writing input 130000 of 41905631

Writing input 140000 of 41905631

Writing input 150000 of 41905631

Writing input 160000 of 41905631

Writing input 170000 of 41905631

Writing input 180000 of 41905631

Writing input 190000 of 41905631

Writing input 200000 of 41905631

Writing input 210000 of 41905631

Writing input 220000 of 41905631

Writing input 230000 of 41905631

Writing input 240000 of 41905631

Writing input 250000 of 41905631

Writing input 260000 of 41905631

Writing input 270000 of 41905631

Writing input 280000 of 41905631

Writing input 290000 of 41905631

Writing input 300000 of

Writing input 2390000 of 41905631

Writing input 2400000 of 41905631

Writing input 2410000 of 41905631

Writing input 2420000 of 41905631

Writing input 2430000 of 41905631

Writing input 2440000 of 41905631

Writing input 2450000 of 41905631

Writing input 2460000 of 41905631

Writing input 2470000 of 41905631

Writing input 2480000 of 41905631

Writing input 2490000 of 41905631

Writing input 2500000 of 41905631

Writing input 2510000 of 41905631

Writing input 2520000 of 41905631

Writing input 2530000 of 41905631

Writing input 2540000 of 41905631

Writing input 2550000 of 41905631

Writing input 2560000 of 41905631

Writing input 2570000 of 41905631

Writing input 2580000 of 41905631

Writing input 2590000 of 41905631

Writing input 2600000 of 41905631

Writing input 2610000 of 41905631

Writing input 2620000 of 41905631

Writing input 2630000 of 41905631

Writing input 2640000 of 41905631

Writing input 2650000 of 41905631

Writing input 2660000 of 41905631

Writing input 267000

Writing input 4740000 of 41905631

Writing input 4750000 of 41905631

Writing input 4760000 of 41905631

Writing input 4770000 of 41905631

Writing input 4780000 of 41905631

Writing input 4790000 of 41905631

Writing input 4800000 of 41905631

Writing input 4810000 of 41905631

Writing input 4820000 of 41905631

Writing input 4830000 of 41905631

Writing input 4840000 of 41905631

Writing input 4850000 of 41905631

Writing input 4860000 of 41905631

Writing input 4870000 of 41905631

Writing input 4880000 of 41905631

Writing input 4890000 of 41905631

Writing input 4900000 of 41905631

Writing input 4910000 of 41905631

Writing input 4920000 of 41905631

Writing input 4930000 of 41905631

Writing input 4940000 of 41905631

Writing input 4950000 of 41905631

Writing input 4960000 of 41905631

Writing input 4970000 of 41905631

Writing input 4980000 of 41905631

Writing input 4990000 of 41905631

Writing input 5000000 of 41905631

Writing input 5010000 of 41905631

Writing input 502000

Writing input 7090000 of 41905631

Writing input 7100000 of 41905631

Writing input 7110000 of 41905631

Writing input 7120000 of 41905631

Writing input 7130000 of 41905631

Writing input 7140000 of 41905631

Writing input 7150000 of 41905631

Writing input 7160000 of 41905631

Writing input 7170000 of 41905631

Writing input 7180000 of 41905631

Writing input 7190000 of 41905631

Writing input 7200000 of 41905631

Writing input 7210000 of 41905631

Writing input 7220000 of 41905631

Writing input 7230000 of 41905631

Writing input 7240000 of 41905631

Writing input 7250000 of 41905631

Writing input 7260000 of 41905631

Writing input 7270000 of 41905631

Writing input 7280000 of 41905631

Writing input 7290000 of 41905631

Writing input 7300000 of 41905631

Writing input 7310000 of 41905631

Writing input 7320000 of 41905631

Writing input 7330000 of 41905631

Writing input 7340000 of 41905631

Writing input 7350000 of 41905631

Writing input 7360000 of 41905631

Writing input 737000

Writing input 9440000 of 41905631

Writing input 9450000 of 41905631

Writing input 9460000 of 41905631

Writing input 9470000 of 41905631

Writing input 9480000 of 41905631

Writing input 9490000 of 41905631

Writing input 9500000 of 41905631

Writing input 9510000 of 41905631

Writing input 9520000 of 41905631

Writing input 9530000 of 41905631

Writing input 9540000 of 41905631

Writing input 9550000 of 41905631

Writing input 9560000 of 41905631

Writing input 9570000 of 41905631

Writing input 9580000 of 41905631

Writing input 9590000 of 41905631

Writing input 9600000 of 41905631

Writing input 9610000 of 41905631

Writing input 9620000 of 41905631

Writing input 9630000 of 41905631

Writing input 9640000 of 41905631

Writing input 9650000 of 41905631

Writing input 9660000 of 41905631

Writing input 9670000 of 41905631

Writing input 9680000 of 41905631

Writing input 9690000 of 41905631

Writing input 9700000 of 41905631

Writing input 9710000 of 41905631

Writing input 972000

Writing input 11740000 of 41905631

Writing input 11750000 of 41905631

Writing input 11760000 of 41905631

Writing input 11770000 of 41905631

Writing input 11780000 of 41905631

Writing input 11790000 of 41905631

Writing input 11800000 of 41905631

Writing input 11810000 of 41905631

Writing input 11820000 of 41905631

Writing input 11830000 of 41905631

Writing input 11840000 of 41905631

Writing input 11850000 of 41905631

Writing input 11860000 of 41905631

Writing input 11870000 of 41905631

Writing input 11880000 of 41905631

Writing input 11890000 of 41905631

Writing input 11900000 of 41905631

Writing input 11910000 of 41905631

Writing input 11920000 of 41905631

Writing input 11930000 of 41905631

Writing input 11940000 of 41905631

Writing input 11950000 of 41905631

Writing input 11960000 of 41905631

Writing input 11970000 of 41905631

Writing input 11980000 of 41905631

Writing input 11990000 of 41905631

Writing input 12000000 of 41905631

Writing input 12010000 of 41

In [None]:
import re

my_bytes = b'apple,banana,kiwi'

m = re.search("apple", my_bytes.decode('utf-8'))


In [None]:
m