# DistilBERT: aspect-level sentiment inference (RawText + Aspect → SentimentPolarity)

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
Colle

In [2]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.metrics import accuracy_score

In [3]:
# Placeholder configuration: replace both strings with real paths before running.
# The first literal reads "enter the location of the test data"; the value is
# later handed to pd.read_csv.
target_data_path = "test 데이터 위치를 입력합니다."
# The second literal reads "enter the location where the model will be saved";
# the value is later used as the directory the fine-tuned model and tokenizer
# are loaded from.
model_save_path = "모델이 저장될 위치를 입력합니다."

In [4]:
# Use saved model - load the fine-tuned classifier and its tokenizer from local
# storage only (local_files_only=True prevents any download from the Hub).
NUM_LABELS = 3  # size of the classification head requested at load time
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_save_path,
    local_files_only=True,
    num_labels=NUM_LABELS,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_save_path,
    local_files_only=True,
)

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/zb-2nd/distilbert-aspect-sent.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Read the evaluation CSV and discard every row that has a missing value in
# any column, then preview the first two rows.
test_base_df = pd.read_csv(filepath_or_buffer=target_data_path).dropna(
    axis=0,
    how="any",
)
test_base_df.head(2)

Unnamed: 0,Index,RawText,Source,Domain,MainCategory,ProductName,ReviewScore,Syllable,Word,RDate,GeneralPolarity,Aspect,SentimentText,SentimentWord,SentimentPolarity
0,112814,바늘질 마감처리 불량. 싸구려 느낌이 팍팍. 털빠짐이 없다해서 구매했는데 털빠짐이 ...,쇼핑몰,패션,남성의류,OO 프** 경량 다운 2종,20,128,29,20181215,-1.0,품질,싸구려 느낌이 팍팍.,3,-1
1,112814,바늘질 마감처리 불량. 싸구려 느낌이 팍팍. 털빠짐이 없다해서 구매했는데 털빠짐이 ...,쇼핑몰,패션,남성의류,OO 프** 경량 다운 2종,20,128,29,20181215,-1.0,품질,털빠짐이 심함.,2,-1


In [6]:
# Keep only the model inputs (review text, aspect) and the gold polarity label.
# Exact duplicate (text, aspect, polarity) rows are collapsed, and the index is
# rebuilt as 0..n-1 so it can be aligned positionally with the predictions later.
test_df = (
    test_base_df[["RawText", "Aspect", "SentimentPolarity"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
test_df.head(3)

Unnamed: 0,RawText,Aspect,SentimentPolarity
0,바늘질 마감처리 불량. 싸구려 느낌이 팍팍. 털빠짐이 없다해서 구매했는데 털빠짐이 ...,품질,-1
1,바늘질 마감처리 불량. 싸구려 느낌이 팍팍. 털빠짐이 없다해서 구매했는데 털빠짐이 ...,기능,-1
2,가격대비 퀄리티 좋습니다. 두께도 적당하고 자켓안에 있기에도 좋네요~~ 키 178 ...,품질,1


In [7]:
# Encode every (review text, aspect) pair as a two-segment input.
# padding=True pads to the longest sequence in this call; truncation=True cuts
# anything longer than max_length.
# NOTE(review): max_length=42 is a magic number — presumably it mirrors the
# value used when the model was fine-tuned; confirm against the training notebook.
X_test_encoding = tokenizer(
    text=test_df["RawText"].tolist(),
    text_pair=test_df["Aspect"].tolist(),
    max_length=42,
    truncation=True,
    padding=True,
)

In [8]:
# Wrap the tokenizer output (a dict of equally padded lists) in a tf.data
# pipeline, one element per test example, and group the elements into batches.
#
# Keras `Model.predict` consumes each dataset element as one batch. Without
# `.batch(...)` every step receives a single rank-1 sequence with no batch
# dimension, which forces one forward pass per example and relies on the model
# tolerating the missing axis. Batching keeps the example order and yields one
# logits row per input row.
PREDICT_BATCH_SIZE = 32  # inference-only; tune to the available memory
test_dataset = tf.data.Dataset.from_tensor_slices(
    dict(X_test_encoding)
).batch(PREDICT_BATCH_SIZE)

In [9]:
# Run inference over the whole test dataset. The returned object is a
# transformers output container (see the type displayed below), not a bare array.
predictions = model.predict(test_dataset)
type(predictions)



transformers.modeling_tf_outputs.TFSequenceClassifierOutput

In [10]:
# Inspect the raw, un-normalized class scores: one row per test example and
# one column per class.
predictions.logits

array([[ 2.748322  , -1.6687123 , -1.1458524 ],
       [ 0.9370347 , -1.0011877 ,  0.16751605],
       [ 0.14698297, -2.2247612 ,  1.6456915 ],
       ...,
       [ 1.4818718 , -1.4365282 , -0.05779631],
       [ 0.5141412 , -1.0212477 ,  0.57856756],
       [ 2.644352  , -1.7032433 , -1.0153561 ]], dtype=float32)

In [11]:
# Pick the highest-scoring class per row and shift the class index by -1 so the
# result is expressed on the same -1 / 0 / 1 scale as the SentimentPolarity column.
# NOTE(review): this presumes labels were encoded as polarity + 1 during
# fine-tuning — confirm against the training notebook.
pred_series = pd.Series(
    data=predictions.logits.argmax(axis=1) - 1,
    name="SentimentPredict",
)
pred_series.head(3)

0   -1
1   -1
2    1
Name: SentimentPredict, dtype: int64

In [12]:
# Attach the predictions as a new column next to the gold labels. The two
# objects are aligned on their index labels, which are both 0..n-1 here.
result_df = pd.concat(
    objs=[test_df, pred_series],
    axis="columns",
)
result_df.head()

Unnamed: 0,RawText,Aspect,SentimentPolarity,SentimentPredict
0,바늘질 마감처리 불량. 싸구려 느낌이 팍팍. 털빠짐이 없다해서 구매했는데 털빠짐이 ...,품질,-1,-1
1,바늘질 마감처리 불량. 싸구려 느낌이 팍팍. 털빠짐이 없다해서 구매했는데 털빠짐이 ...,기능,-1,-1
2,가격대비 퀄리티 좋습니다. 두께도 적당하고 자켓안에 있기에도 좋네요~~ 키 178 ...,품질,1,1
3,가격대비 퀄리티 좋습니다. 두께도 적당하고 자켓안에 있기에도 좋네요~~ 키 178 ...,사이즈,1,1
4,가벼워서 봄 가을이나 추운날 속에 바쳐입기 아주 딱입니다 가격도 좋고 옷감 재질...,가격,1,1


In [13]:
# Distribution of the gold polarity labels in the evaluation set.
result_df["SentimentPolarity"].value_counts()

 1    7416
-1    3302
 0     501
Name: SentimentPolarity, dtype: int64

In [14]:
# Distribution of the predicted polarity labels, for comparison with the gold
# distribution.
# NOTE(review): in the recorded run the model never predicts the 0 class,
# so accuracy alone hides a complete miss on that class.
result_df["SentimentPredict"].value_counts()

 1    7196
-1    4023
Name: SentimentPredict, dtype: int64

In [15]:
# Fraction of rows whose predicted polarity equals the gold polarity.
accuracy_score(
    y_true=result_df["SentimentPolarity"],
    y_pred=result_df["SentimentPredict"],
)

0.8222657990908281