In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [3]:
# Location of the garment-review CSV splits, relative to this notebook.
DATA_DIR = "../../data"
train_data_path = f"{DATA_DIR}/garments_train.csv"
test_data_path = f"{DATA_DIR}/garments_test.csv"

In [4]:
# Column holding the review text fed to the model.
X_col = "SentimentText"
# Column holding the aspect label the classifier is trained to predict.
y_col = "Aspect"

In [5]:
# Load the training split, keeping only the text and label columns.
# Rows with a missing text or label are dropped up front: pandas reads empty
# CSV fields as NaN (a float), which the tokenizer and label encoder used
# later in this notebook cannot consume as strings.
train_df = (
    pd.read_csv(train_data_path)
    .loc[:, [X_col, y_col]]
    .dropna(subset=[X_col, y_col])
    .drop_duplicates()
    .reset_index(drop=True)
)
train_df.head(2)

Unnamed: 0,SentimentText,Aspect
0,사이즈가잘맞네요,사이즈
1,좀크게나온듯,사이즈


In [6]:
# Map each distinct aspect string to an integer class id.
label_encoder = LabelEncoder().fit(train_df[y_col])
enc_data = label_encoder.transform(train_df[y_col])
# Every fitted class occurs in the training labels, so the number of classes
# equals the number of distinct encoded values.
num_labels = len(label_encoder.classes_)

In [7]:
# Show which integer id the encoder assigned to each aspect label.
label_items = label_encoder.classes_
label_numbers = label_encoder.transform(label_items)
{label: code for label, code in zip(label_items, label_numbers)}

{'가격': 0, '기능': 1, '디자인': 2, '사이즈': 3, '품질': 4}

In [8]:
# Training inputs as a plain list of strings; targets are the encoded labels.
X_train = train_df[X_col].tolist()
y_train = enc_data

In [9]:
# Hugging Face Hub id of the pretrained checkpoint used for fine-tuning.
HUGGING_FACE_PATH = "klue/bert-base"

# from_pt=True loads the PyTorch checkpoint into the TensorFlow model class.
# The classification head is sized to the number of aspect labels and starts
# from fresh weights, so the model has to be fine-tuned before use.
model = TFAutoModelForSequenceClassification.from_pretrained(
    HUGGING_FACE_PATH,
    num_labels=num_labels,
    from_pt=True,
)
tokenizer = AutoTokenizer.from_pretrained(HUGGING_FACE_PATH)

Downloading (…)lve/main/config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [10]:
# Tokenize the training texts in one call; sequences are truncated to at most
# 42 tokens and padded so that all rows share a common length.
X_train_encoding = tokenizer(
    X_train,
    padding=True,
    truncation=True,
    max_length=42,
)

In [11]:
SHUFFLE_PARAM = 1000  # lower bound for the shuffle buffer size
SHUFFLE_SEED = 42     # fixed seed so the shuffled order is reproducible

# A buffer at least as large as the dataset gives a uniform shuffle; a smaller
# buffer only mixes elements that are close together in the source order.
shuffle_buffer_size = max(SHUFFLE_PARAM, len(X_train))

# reshuffle_each_iteration=False is essential when this dataset is later split
# with take()/skip(): by default shuffle() produces a new order every time the
# dataset is iterated, so the two halves of such a split would be drawn from
# different permutations and overlap (validation examples leak into training).
# With a fixed order, any take(n)/skip(n) pair is a true disjoint partition.
# NOTE: the order is therefore also fixed across epochs; apply an additional
# shuffle() to the training portion after splitting if per-epoch reshuffling
# is wanted.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_train_encoding),
    y_train
)).shuffle(
    shuffle_buffer_size,
    seed=SHUFFLE_SEED,
    reshuffle_each_iteration=False,
)

In [12]:
# Adam with a small learning rate for fine-tuning the pretrained weights.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# No loss is passed to compile(); presumably the transformers model falls back
# to its built-in task loss — confirm against the transformers documentation.
model.compile(
    optimizer=optimizer,
    metrics=["accuracy"],
)
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  110617344 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  3845      
                                                                 
Total params: 110621189 (421.99 MB)
Trainable params: 110621189 (421.99 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [13]:
BATCH_PARAM = 32

# Hold out the first tenth of train_dataset for validation and train on the
# remainder; both parts are batched with the same batch size.
# NOTE: take()/skip() only form a disjoint partition if train_dataset yields
# its elements in the same order on every iteration.
validation_length = len(X_train) // 10

held_out_part = train_dataset.take(validation_length)
training_part = train_dataset.skip(validation_length)

validation_data = held_out_part.batch(BATCH_PARAM)
train_except_val = training_part.batch(BATCH_PARAM)

In [14]:
# Fine-tune for a single epoch, evaluating on the held-out validation batches.
# batch_size is intentionally not passed: both inputs are tf.data.Dataset
# objects that are already batched, and Keras documents that batch_size must
# not be specified for dataset inputs (the datasets generate the batches).
model.fit(
    train_except_val,
    epochs=1,
    validation_data=validation_data)



<keras.src.callbacks.History at 0x7be73405c400>

In [15]:
# Load the test split, keeping only the text and label columns.
# Rows with a missing text or label are dropped up front: pandas reads empty
# CSV fields as NaN (a float), which the tokenizer and label encoder applied
# to this frame cannot consume as strings.
test_df = (
    pd.read_csv(test_data_path)
    .loc[:, [X_col, y_col]]
    .dropna(subset=[X_col, y_col])
    .drop_duplicates()
    .reset_index(drop=True)
)
test_df.head(2)

Unnamed: 0,SentimentText,Aspect
0,싸구려 느낌이 팍팍.,품질
1,털빠짐이 심함.,품질


In [16]:
# Test inputs as a plain list of strings.
X_test = test_df[X_col].tolist()

# Encode the test labels with the encoder fitted on the training labels so the
# integer ids line up with the model's output columns.
test_labels = test_df[y_col].tolist()
y_test = label_encoder.transform(test_labels)

In [17]:
# Tokenize the test texts with the same settings used for the training texts
# (truncate to at most 42 tokens, pad to a common length).
X_test_encoding = tokenizer(
    X_test,
    padding=True,
    truncation=True,
    max_length=42,
)

In [18]:
# Features only (no labels) and no shuffling, so predictions come back in the
# same order as the rows of X_test.
test_features = dict(X_test_encoding)
test_dataset = tf.data.Dataset.from_tensor_slices(test_features).batch(BATCH_PARAM)

In [19]:
# Run inference over the batched test set and display the raw class scores
# (unnormalized logits, one column per aspect label).
predictions = model.predict(test_dataset)
test_logits = predictions.logits
test_logits



array([[-1.470973  , -2.5759964 , -1.339758  , -1.1375242 ,  5.8944383 ],
       [-1.1386077 , -1.6226466 , -1.9521725 , -1.7040904 ,  5.9142222 ],
       [-0.24855399,  4.9348793 , -1.6661097 , -2.5324929 ,  0.5471002 ],
       ...,
       [-1.3306005 , -1.8002255 , -1.7160224 , -1.4829869 ,  6.0505404 ],
       [-1.5957936 , -2.6438289 , -0.08154138, -1.8314389 ,  5.4980683 ],
       [-1.4890118 , -1.7754257 , -1.7580992 , -1.394166  ,  5.7850785 ]],
      dtype=float32)

In [20]:
# Predicted class id for each row = index of the largest logit in that row.
y_pred = predictions.logits.argmax(axis=1)
y_pred

array([4, 4, 1, ..., 4, 4, 4])

In [21]:
# Fraction of test rows whose predicted aspect id matches the true one.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.958480416876786