In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import torch.nn.functional as F
import tensorflow as tf
import pandas as pd

2023-08-30 10:56:04.796015: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
TRAIN_DATA_PATH = tf.keras.utils.get_file(
    'ratings_train.txt',
    'https://raw.github.com/ironmanciti/NLP_lecture/master/data/naver_movie/ratings_train.txt'
)
TEST_DATA_PATH = tf.keras.utils.get_file(
    'ratings_test.txt',
    'https://raw.github.com/ironmanciti/NLP_lecture/master/data/naver_movie/ratings_test.txt'
)

In [3]:
train_data = pd.read_csv(TRAIN_DATA_PATH, delimiter='\t')
train_data

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
...,...,...,...
149995,6222902,인간이 문제지.. 소는 뭔죄인가..,0
149996,8549745,평점이 너무 낮아서...,1
149997,9311800,이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?,0
149998,2376369,청춘 영화의 최고봉.방황과 우울했던 날들의 자화상,1


In [4]:
test_data = pd.read_csv(TEST_DATA_PATH, delimiter='\t')
test_data

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0
...,...,...,...
49995,4608761,오랜만에 평점 로긴했네ㅋㅋ 킹왕짱 쌈뽕한 영화를 만났습니다 강렬하게 육쾌함,1
49996,5308387,의지 박약들이나 하는거다 탈영은 일단 주인공 김대희 닮았고 이등병 찐따 OOOO,0
49997,9072549,그림도 좋고 완성도도 높았지만... 보는 내내 불안하게 만든다,0
49998,5802125,절대 봐서는 안 될 영화.. 재미도 없고 기분만 잡치고.. 한 세트장에서 다 해먹네,0


In [5]:
df_train = train_data.dropna().sample(n=15000, random_state=42)
df_test = train_data.dropna().sample(n=5000, random_state=42)
df_train.shape, df_test.shape

((15000, 3), (5000, 3))

In [6]:
df_train['label'].value_counts()

label
1    7541
0    7459
Name: count, dtype: int64

In [7]:
X_train = df_train['document'].values.tolist()
y_train = df_train['label'].values.tolist()
X_test = df_test['document'].values.tolist()
y_test = df_test['label'].values.tolist()

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenizer

BertTokenizer(name_or_path='bert-base-multilingual-cased', vocab_size=119547, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [9]:
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

In [10]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [11]:
print(train_encodings['input_ids'][10])
print(train_encodings['attention_mask'][10])
print(train_encodings['token_type_ids'][10])

[101, 9652, 19855, 15387, 9568, 52363, 11882, 8896, 105197, 22440, 10028, 90537, 8848, 18778, 48549, 22333, 119, 119, 119, 9414, 21928, 18108, 9353, 11261, 34776, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [12]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.encodings['input_ids'])

In [13]:
train_dataset = ReviewDataset(train_encodings, y_train)
test_dataset = ReviewDataset(test_encodings, y_test)

In [14]:
next(iter(train_dataset))

{'input_ids': tensor([  101, 37370, 10739, 83491,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]),
 'token

In [15]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

In [17]:
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import time

s = time.time()
trainer.train()
f'{(time.time() - s) / 60}'

Step,Training Loss
10,0.6828
20,0.7234
30,0.7165
40,0.7016
50,0.6964
60,0.7028
70,0.6803
80,0.6776
90,0.6665
100,0.6772


'9.66512536207835'

In [20]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.33662667870521545,
 'eval_runtime': 17.3954,
 'eval_samples_per_second': 287.433,
 'eval_steps_per_second': 17.993,
 'epoch': 2.0}

In [21]:
prediction = trainer.predict(test_dataset)

In [22]:
trainer.model.classifier

Linear(in_features=768, out_features=2, bias=True)

In [23]:
y_logit = torch.tensor(prediction[0])
y_logit[:10]

tensor([[-1.3737,  1.6030],
        [-1.2805,  1.4296],
        [-1.2698,  1.4110],
        [-1.3726,  1.5996],
        [ 1.5473, -1.5363],
        [ 0.8916, -0.9589],
        [ 1.4684, -1.4410],
        [ 1.2968, -1.2925],
        [ 1.5170, -1.4848],
        [-1.3379,  1.5320]])

In [24]:
y_pred = F.softmax(y_logit, dim=-1).argmax(axis=-1).numpy()
print(y_pred[:30])
print(y_test[:30])

[1 1 1 1 0 0 0 0 0 1 0 0 1 1 0 1 1 1 1 0 1 0 1 0 0 0 1 1 0 0]
[1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0]


In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score

print(accuracy_score(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
cm

0.8706


array([[2250,  290],
       [ 357, 2103]])

In [26]:
ex = '돈주고 보기에는 아까운 영화 ㅠㅠ...'
tokenized = tokenizer([ex], truncation=True, padding=True)
pred = trainer.predict(ReviewDataset(tokenized))
logit = torch.tensor(pred[0])
result = F.softmax(logit, dim=-1).argmax(-1).numpy()
'positive' if result == 1 else 'negative'

'negative'