In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [3]:
# Relative locations of the training and test CSV files read further down.
train_data_path, test_data_path = (
    "../../data/garments_train.csv",
    "../../data/garments_test.csv",
)

In [4]:
# Load the training CSV, keep only the text and polarity columns, drop exact
# duplicate rows and renumber the index from zero.
train_df = (
    pd.read_csv(train_data_path)[["RawText", "GeneralPolarity"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
train_df.head(2)

Unnamed: 0,RawText,GeneralPolarity
0,이번에구매한데님은사이즈가잘맞네요 색상구성도괜찮고맘에든답니다 잘입겠습니다,1.0
1,바지는 너무 편하고 좋은데 좀크게나온듯 그리고 허리고리 하나가 안달려서 밑단수선하면...,0.0


In [5]:
# Map the raw polarity values to consecutive integer class ids.
label_encoder = LabelEncoder()
enc_data = label_encoder.fit_transform(train_df.loc[:, "GeneralPolarity"])

# One class per distinct encoded value; the encoder was fitted on exactly this
# column, so every entry of `classes_` occurs in `enc_data`.
num_labels = len(label_encoder.classes_)

In [6]:
# Show which integer id the encoder assigned to each original polarity value.
label_items = label_encoder.classes_
label_numbers = label_encoder.transform(label_items)
{item: number for item, number in zip(label_items, label_numbers)}

{-1.0: 0, 0.0: 1, 1.0: 2}

In [7]:
# Training inputs are the raw review strings; targets are the encoded labels.
X_train = train_df["RawText"].tolist()
y_train = enc_data

In [8]:
# Hub identifier shared by the model and its tokenizer.
HUGGING_FACE_PATH = "klue/bert-base"

# `from_pt=True` loads the PyTorch checkpoint into the TensorFlow model class;
# the classification head is sized to the number of polarity classes.
model = TFAutoModelForSequenceClassification.from_pretrained(
    HUGGING_FACE_PATH,
    num_labels=num_labels,
    from_pt=True,
)
tokenizer = AutoTokenizer.from_pretrained(HUGGING_FACE_PATH)

Downloading (…)lve/main/config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [9]:
# Tokenize every training text, cutting sequences off at 42 tokens and padding
# the shorter ones so all rows share one length.
X_train_encoding = tokenizer(
    X_train,
    max_length=42,
    truncation=True,
    padding=True,
)

In [10]:
# Size of the shuffle buffer (number of examples mixed at a time).
SHUFFLE_PARAM = 1000
# Fixed seed so the shuffled order is the same on every run of the notebook.
SHUFFLE_SEED = 42

# Pair each tokenized example with its encoded label and shuffle once.
#
# `reshuffle_each_iteration=False` is essential here: this dataset is later
# split with `skip()` / `take()`, and each of those pipelines iterates the
# shuffled dataset independently. With the default (reshuffle on every
# iteration) the two pipelines would see different orders, so the training and
# validation subsets would overlap and validation metrics would be inflated.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_train_encoding),
    y_train
)).shuffle(
    SHUFFLE_PARAM,
    seed=SHUFFLE_SEED,
    reshuffle_each_iteration=False,
)

In [11]:
# Adam with a small learning rate for fine-tuning the whole network.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# No `loss` is passed to compile().
# NOTE(review): presumably this relies on the transformers model supplying its
# own loss — confirm against the installed transformers version.
model.compile(
    optimizer=optimizer,
    metrics=["accuracy"],
)
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  110617344 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
Total params: 110619651 (421.98 MB)
Trainable params: 110619651 (421.98 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
# Number of examples per batch for training, validation and prediction.
BATCH_PARAM = 32

# Hold out one tenth of the training examples for validation: the first
# `validation_length` elements go to validation, the remainder to training.
# NOTE(review): the two subsets are only disjoint if `train_dataset` yields its
# elements in the same order on every iteration — confirm how it is shuffled.
validation_length = len(X_train) // 10
validation_data = train_dataset.take(validation_length).batch(BATCH_PARAM)
train_except_val = train_dataset.skip(validation_length).batch(BATCH_PARAM)

In [13]:
# Fine-tune for a single epoch, evaluating on the held-out validation batches.
# `batch_size` is intentionally not passed: both datasets are already batched
# with `.batch(BATCH_PARAM)`, and Keras expects `batch_size` to be left unset
# when the input is a `tf.data.Dataset`.
history = model.fit(
    train_except_val,
    epochs=1,
    validation_data=validation_data,
)
# Keep the History object around (per-epoch loss/metrics) and display it.
history



<keras.src.callbacks.History at 0x7b15c8321a20>

In [14]:
# Load the test CSV with the same column selection and de-duplication that is
# applied to the training data.
test_df = (
    pd.read_csv(test_data_path)[["RawText", "GeneralPolarity"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
test_df.head(2)

Unnamed: 0,RawText,GeneralPolarity
0,바늘질 마감처리 불량. 싸구려 느낌이 팍팍. 털빠짐이 없다해서 구매했는데 털빠짐이 ...,-1.0
1,가격대비 퀄리티 좋습니다. 두께도 적당하고 자켓안에 있기에도 좋네요~~ 키 178 ...,1.0


In [42]:
# Test inputs are the raw review strings.
X_test = test_df["RawText"].tolist()
# Encode the test labels with the encoder fitted on the training labels so the
# class ids match the model's output indices.
y_test = label_encoder.transform(test_df["GeneralPolarity"].tolist())

In [23]:
# Tokenize the test texts with the same truncation limit (42 tokens) and
# padding settings used for the training texts.
X_test_encoding = tokenizer(
    X_test,
    max_length=42,
    truncation=True,
    padding=True,
)

In [34]:
# Features-only dataset for inference; it is not shuffled, so predictions come
# back in the same order as `X_test`.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(dict(X_test_encoding))
    .batch(BATCH_PARAM)
)

In [35]:
# Run the fine-tuned model over the whole test set and show the raw,
# un-normalized class scores (one row per example, one column per class).
predictions = model.predict(x=test_dataset)
predictions.logits



array([[ 3.3802183 , -0.67968434, -3.0090199 ],
       [-2.879693  , -1.2783443 ,  4.2939796 ],
       [-2.9588764 , -1.8239427 ,  4.816224  ],
       ...,
       [ 3.2928529 , -1.4270289 , -2.4327533 ],
       [ 3.0505805 , -0.61588275, -2.5736423 ],
       [ 2.3770638 , -0.5721623 , -1.755083  ]], dtype=float32)

In [40]:
# The predicted class for each example is the column with the highest logit.
y_pred = predictions.logits.argmax(axis=1)
y_pred

array([0, 2, 2, ..., 0, 0, 0])

In [43]:
# Fraction of test examples whose predicted class id equals the encoded label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8265233988406616