In [6]:
!pip install "accelerate>=0.26.0 "



In [7]:
!pip install --upgrade pyarrow



In [8]:
!pip install --upgrade datasets



In [9]:
!pip install datasets



In [10]:
!pip install transformers



In [11]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
from collections import Counter
from konlpy.tag import Mecab
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
# Download the Naver shopping review corpus to a local file named ratings_total.txt.
urllib.request.urlretrieve(
    url="https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/naver_shopping.txt",
    filename="ratings_total.txt",
)

('ratings_total.txt', <http.client.HTTPMessage at 0x106a00dc0>)

In [13]:
# Read the tab-separated corpus; the file has no header row, so name the columns here.
total_data = pd.read_csv(
    'ratings_total.txt',
    sep='\t',
    names=['ratings', 'reviews'],
)
# Print the total number of reviews.
print('전체 리뷰 개수 :', len(total_data))


전체 리뷰 개수 : 200000


In [14]:
# Binary label: 1 when the rating is greater than 3, otherwise 0.
total_data['label'] = np.where(total_data['ratings'] > 3, 1, 0)
# Preview the first five rows.
total_data.head(5)


Unnamed: 0,ratings,reviews,label
0,5,배공빠르고 굿,1
1,2,택배가 엉망이네용 저희집 밑에층에 말도없이 놔두고가고,0
2,5,아주좋아요 바지 정말 좋아서2개 더 구매했어요 이가격에 대박입니다. 바느질이 조금 ...,1
3,2,선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다. 전...,0
4,5,민트색상 예뻐요. 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ,1


In [15]:
# Number of distinct values in the ratings, reviews and label columns (in that order).
tuple(total_data[column].nunique() for column in ('ratings', 'reviews', 'label'))


(4, 199908, 2)

In [16]:
# Drop rows whose review text duplicates an earlier row (the first occurrence is kept).
total_data = total_data.drop_duplicates(subset=['reviews'])
# Print the number of remaining samples.
print('총 샘플의 수 :', len(total_data))

총 샘플의 수 : 199908


In [17]:
# Prints True if any cell of the frame is missing, False otherwise.
print(total_data.isna().to_numpy().any())

False


In [18]:
# train_data, test_data = train_test_split(total_data, test_size = 0.25, random_state = 42)
# print('훈련용 리뷰의 개수 :', len(train_data))
# print('테스트용 리뷰의 개수 :', len(test_data))

In [19]:
# train_data['label'].value_counts().plot(kind = 'bar')

# plt.title('Distribution of Label in Train Data')
# plt.xlabel('Label')
# plt.ylabel('Number of Reviews')
# plt.show()

In [20]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# Checkpoint shared by the tokenizer and the classification model.
model_name = "google-bert/bert-base-multilingual-uncased"

# Load the tokenizer first, then the model with a sequence-classification head.
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertForSequenceClassification.from_pretrained(model_name),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from datasets import Dataset

# Wrap the cleaned DataFrame as a Hugging Face Dataset.
# The DataFrame index comes along as an extra `__index_level_0__` column.
dataset_raw = Dataset.from_pandas(df=total_data)

In [23]:
# Display the Dataset summary (feature names and number of rows).
dataset_raw

Dataset({
    features: ['ratings', 'reviews', 'label', '__index_level_0__'],
    num_rows: 199908
})

In [24]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
dataset = dataset_raw.train_test_split(seed=42, test_size=0.2)

# Convenience handles for the two splits.
train_dataset, test_dataset = dataset['train'], dataset['test']

print(f"Training set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Training set size: 159926
Test set size: 39982


In [25]:
def preprocess_data(example, tokenizer):
    """Tokenize the review text of an example (or batch of examples).

    Truncation is enabled so that tokenized text longer than the model's
    maximum length is cut off.

    Args:
        example: Mapping with a 'reviews' entry holding the text to tokenize.
        tokenizer: Callable tokenizer applied to the review text.

    Returns:
        Whatever `tokenizer` returns for the review text.
    """
    reviews = example['reviews']
    encoded = tokenizer(reviews, truncation=True)
    return encoded

In [26]:
# Show the column names of each split before tokenization.
print(dataset.column_names)

{'train': ['ratings', 'reviews', 'label', '__index_level_0__'], 'test': ['ratings', 'reviews', 'label', '__index_level_0__']}


In [27]:
# Tokenize both splits in batches, drop the columns the model does not consume,
# and rename the target column from `label` to `labels`.
processed_dataset = (
    dataset
    .map(
        preprocess_data,
        fn_kwargs={'tokenizer': tokenizer},
        batched=True,
        remove_columns=['ratings', '__index_level_0__', 'reviews'],
    )
    .rename_column('label', 'labels')
)

Map: 100%|██████████| 159926/159926 [00:40<00:00, 3967.49 examples/s]
Map: 100%|██████████| 39982/39982 [00:10<00:00, 3641.04 examples/s]


In [28]:
# Print the tokenized splits, then the feature schema of the training split.
print(processed_dataset, processed_dataset['train'].features, sep='\n')

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 159926
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 39982
    })
})
{'labels': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [29]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

In [30]:
# Strategy 1: pad every example to the model's maximum length (or another fixed length).
max_length_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="max_length",
)
max_length_dataloader = DataLoader(
    processed_dataset['train'],
    batch_size=4,
    shuffle=False,
    collate_fn=max_length_collator,
)

In [31]:
# Pull the first batch and report the padded tensor sizes.
max_length_iterator = iter(max_length_dataloader)
max_length_batch = next(max_length_iterator)
print(
    "max_length 패딩 입력 id shape:",
    max_length_batch['input_ids'].size(),
)
print(
    "max_length 패딩 어텐션 마스크 shape:",
    max_length_batch['attention_mask'].size(),
)

max_length 패딩 입력 id shape: torch.Size([4, 512])
max_length 패딩 어텐션 마스크 shape: torch.Size([4, 512])


In [32]:
# Strategy 2: pad each batch only up to the longest example it contains.
longest_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")
longest_dataloader = DataLoader(
    dataset=processed_dataset['train'],
    collate_fn=longest_collator,
    batch_size=4,
    shuffle=False,
)

In [35]:
# Pull the first batch and report the tensor shapes under "longest" padding.
longest_iterator = iter(longest_dataloader)
longest_batch = next(longest_iterator)
# Label typo fixed: "입렵" -> "입력" (input).
print("longest 패딩 입력 id shape", longest_batch['input_ids'].shape)
print("longest 패딩 어텐션 마스크 shape", longest_batch['attention_mask'].shape)

longest 패딩 입렵 id shape torch.Size([4, 70])
longest 패딩 어텐션 마스크 shape torch.Size([4, 70])


In [36]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for a short fine-tuning run.
training_args = TrainingArguments(
    output_dir="../Data/sample_data/text_classification",
    per_device_train_batch_size=8,   # training batch size
    per_device_eval_batch_size=16,   # evaluation batch size
    learning_rate=5e-5,              # learning rate
    num_train_epochs=1,              # number of training epochs
    # eval_steps only takes effect when the evaluation strategy is "steps";
    # the default strategy is "no", so evaluation would otherwise never run.
    # (This argument was named `evaluation_strategy` before transformers 4.41.)
    eval_strategy="steps",
    eval_steps=200,                  # evaluate every 200 steps
    logging_steps=200,               # log every 200 steps
    seed=42,
)

# Train on the first 10,000 training examples and evaluate on the first 100 test examples.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"].select(range(10000)),
    eval_dataset=processed_dataset["test"].select(range(100)),
    # Dynamic padding: pad each batch to its own longest sequence instead of
    # always padding to 512 tokens, which wastes compute on padding tokens.
    data_collator=longest_collator,
)

# Training is launched by the `trainer.train()` call in the next cell; calling it
# here as well would fine-tune the model twice on a full top-to-bottom run.

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
import torch

# Pick the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Switch to evaluation mode and move the model to the chosen device.
model.eval()
model.to(device)

text = "진짜 재미있었어요. 또 보러 갈거에요"
inputs = tokenizer(text, return_tensors="pt").to(device)

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# Raw class scores, followed by the index of the highest-scoring class.
print(outputs.logits)
print(torch.argmax(outputs.logits))

In [None]:
# Second example sentence, scored with the same model and device as the cell above.
text = "연기력이 별로 에요"
inputs = tokenizer(text, return_tensors="pt").to(device)

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# Raw class scores, followed by the index of the highest-scoring class.
print(outputs.logits)
print(torch.argmax(outputs.logits))