In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [3]:
train_data_path = "../../data/garments_train.csv"
test_data_path = "../../data/garments_test.csv"

In [4]:
train_df = pd.read_csv(train_data_path).loc[:, ["SentimentText", "SentimentPolarity"]].drop_duplicates().reset_index(drop=True)
train_df.head(2)

Unnamed: 0,SentimentText,SentimentPolarity
0,사이즈가잘맞네요,1
1,좀크게나온듯,-1


In [5]:
label_encoder = LabelEncoder()
enc_data = label_encoder.fit_transform(train_df["SentimentPolarity"])
num_labels = len(set(enc_data))

In [6]:
label_items = label_encoder.classes_
label_numbers = label_encoder.transform(label_items)
dict(zip(label_items, label_numbers))

{-1: 0, 0: 1, 1: 2}

In [7]:
X_train, y_train = train_df.loc[:, "SentimentText"].to_list(), enc_data

In [8]:
HUGGING_FACE_PATH = "klue/bert-base"
model = TFAutoModelForSequenceClassification.from_pretrained(HUGGING_FACE_PATH, num_labels=num_labels, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained(HUGGING_FACE_PATH)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
X_train_encoding = tokenizer(X_train, padding=True, truncation=True, max_length=42)

In [10]:
SHUFFLE_PARAM = 1000

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_train_encoding),
    y_train
)).shuffle(SHUFFLE_PARAM)

In [11]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, metrics=["accuracy"])
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  110617344 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
Total params: 110619651 (421.98 MB)
Trainable params: 110619651 (421.98 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
BATCH_PARAM = 32

validation_length = len(X_train) // 10
train_except_val = train_dataset.skip(validation_length).batch(BATCH_PARAM)
validation_data = train_dataset.take(validation_length).batch(BATCH_PARAM)

In [13]:
model.fit(
    train_except_val,
    epochs=1,
    batch_size=BATCH_PARAM,
    validation_data=validation_data)



<keras.src.callbacks.History at 0x7d66e013f1c0>

In [14]:
test_df = pd.read_csv(test_data_path).loc[:, ["SentimentText", "SentimentPolarity"]].drop_duplicates().reset_index(drop=True)
test_df.head(2)

Unnamed: 0,SentimentText,SentimentPolarity
0,싸구려 느낌이 팍팍.,-1
1,털빠짐이 심함.,-1


In [15]:
X_test = test_df.loc[:, "SentimentText"].to_list()
y_test = label_encoder.transform(test_df.loc[:, "SentimentPolarity"].to_list())

In [16]:
X_test_encoding = tokenizer(X_test, padding=True, truncation=True, max_length=42)

In [17]:
test_dataset = tf.data.Dataset.from_tensor_slices(
    dict(X_test_encoding)
).batch(BATCH_PARAM)

In [18]:
predictions = model.predict(test_dataset)
predictions.logits



array([[ 4.428238  , -0.72446096, -1.8984014 ],
       [ 4.6360784 , -1.1507659 , -1.9903612 ],
       [ 4.0729814 , -0.27796435, -2.122097  ],
       ...,
       [ 3.94736   , -0.5733663 , -2.082191  ],
       [ 1.7949303 , -0.36935195, -0.8457958 ],
       [ 4.80551   , -0.90055424, -2.1433537 ]], dtype=float32)

In [19]:
y_pred = np.argmax(predictions.logits, axis=1)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
accuracy_score(y_test, y_pred)

0.9071056610112548