# 데이터 불러오기

In [1]:
from datasets import load_dataset

In [2]:
# pip install tqdm
# pip install ipywidgets
dataset = load_dataset("imdb")

In [3]:
type(dataset)

datasets.dataset_dict.DatasetDict

In [4]:
print(dataset) # __str__(self)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [5]:
dataset = dataset.shuffle(seed=7902)

In [6]:
train_dataset = dataset['train']
test_dataset = dataset['test']

In [7]:
type(train_dataset)
print(train_dataset) # __str__(self)

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


# 데이터 전처리

In [8]:
from transformers import BertTokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

In [9]:
# 전처리 함수 정의
def preprocess_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

# 데이터셋 전처리
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

# 모델 불러오기

In [10]:
from transformers import TFBertForSequenceClassification
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
import tensorflow as tf
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
# metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

In [12]:
print(type(train_dataset))
print(train_dataset)

<class 'datasets.arrow_dataset.Dataset'>
Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25000
})


In [13]:
ids = train_dataset['input_ids']

In [14]:
print(ids[:1])

[[101, 1000, 3889, 1000, 1006, 3782, 15107, 1007, 29438, 1037, 2177, 2013, 2152, 2082, 2005, 1037, 10301, 2012, 1996, 6644, 3295, 2073, 2010, 5519, 2567, 14008, 2253, 4394, 1012, 2096, 2027, 2024, 2045, 1010, 1037, 16360, 3775, 15204, 6492, 1999, 1996, 4338, 1997, 1037, 2158, 1006, 6966, 2033, 1037, 2843, 1997, 1996, 12267, 2386, 2013, 6492, 2013, 1996, 2304, 15825, 1007, 26751, 2015, 1999, 1996, 9917, 10549, 1996, 2157, 2051, 2000, 4060, 2068, 4237, 2028, 2011, 2028, 1012, 1037, 28988, 1010, 9155, 25231, 1011, 2828, 2051, 2931, 2919, 1011, 4632, 1010, 10645, 1006, 5252, 5972, 2239, 1007, 2038, 2019, 5230, 2510, 15742, 2016, 6078, 7885, 1999, 1010, 2096, 2009, 2036, 4240, 2004, 1037, 2173, 1997, 3808, 2013, 1996, 2518, 2006, 1996, 6703, 29216, 1012, 10645, 4282, 2062, 2084, 2016, 1005, 1055, 4129, 1006, 2016, 2036, 4265, 1996, 3279, 1997, 1037, 2775, 1007, 1010, 2021, 2045, 1005, 1055, 2178, 21160, 9792, 2087, 1997, 1996, 2177, 2031, 2053, 2801, 1997, 1012, 2023, 6492, 2453, 2074, 2022

In [15]:
%%time
import numpy as np

# train_input_ids = np.array([data['input_ids'] for data in train_dataset])
# train_attention_mask = np.array([data['attention_mask'] for data in train_dataset])
# train_token_type_ids = np.array([data['token_type_ids'] for data in train_dataset])
train_input_ids = np.array(train_dataset['input_ids'])
train_attention_mask = np.array(train_dataset['attention_mask'])
train_token_type_ids = np.array(train_dataset['token_type_ids'])
train_labels = np.array(train_dataset['label'])

CPU times: total: 172 ms
Wall time: 10.5 s


In [None]:
model.fit({'input_ids': train_input_ids,
           'attention_mask': train_attention_mask,
           'token_type_ids': train_token_type_ids},
          train_labels,
          batch_size=16, epochs=3)
# ResourceExhaustedError는 주로 메모리가 부족해서 발생합니다.
# 해결 1. 메모리 크기를 늘린다.
# 해결 2. 더 작은 모델을 사용한다.
# 해결 3. 배치크기를 줄인다.

Epoch 1/3
   6/1563 [..............................] - ETA: 7:57:33 - loss: 1.6298 - accuracy: 0.5104

In [None]:
# model.save_weights('imdb-transformer-tf-weight.h5')

In [None]:
# model.load_weights('imdb-transformer-tf-weight.h5')

In [None]:
results = model.evaluate({'input_ids': np.array(test_dataset['input_ids'][:160]),
                          'attention_mask': np.array(test_dataset['attention_mask'][:160]),
                          'token_type_ids': np.array(test_dataset['token_type_ids'][:160])},
                         np.array(test_dataset['label'][:160]))

In [None]:
print(test_dataset['label'])