In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
import numpy as np
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

# 1. 데이터 불러오기
df = pd.read_csv('merged_product_data.csv', encoding='cp949')

# 2. NaN 방지 + 포맷 통일
def format_text_with_tags(row):
    text = row.get('text', '')
    if pd.isna(text):  # NaN이면 빈 문자열 처리
        return "[TITLE] 없음 [INFO] 없음"

    fields = text.split(' / ')
    title = fields[0].replace("제품명:", "").strip()
    info = ' / '.join(fields[1:]).strip() if len(fields) > 1 else ""
    return f"[TITLE] {title} [INFO] {info}"

df['text_full'] = df.apply(format_text_with_tags, axis=1)

# 3. 제목만 사용하는 버전
df['text_title_only'] = df['text_full'].apply(lambda x: x.split("[INFO]")[0] + "[INFO]")


In [3]:
# 3. 두 개 버전의 데이터를 합치기 (증강)
df_full = df[['text_full', 'label']].rename(columns={"text_full": "text"})
df_title_only = df[['text_title_only', 'label']].rename(columns={"text_title_only": "text"})
df_augmented = pd.concat([df_full, df_title_only], ignore_index=True)

# 4. 라벨 인코딩
label_encoder = LabelEncoder()
df_augmented['label_id'] = label_encoder.fit_transform(df_augmented['label'])

In [4]:
# 5. Train/Val/Test 분할
train_val_df, test_df = train_test_split(df_augmented, test_size=0.1, stratify=df_augmented['label_id'], random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, stratify=train_val_df['label_id'], random_state=42)

# 6. HuggingFace Dataset 변환
train_dataset = Dataset.from_pandas(train_df[['text', 'label_id']])
val_dataset = Dataset.from_pandas(val_df[['text', 'label_id']])
test_dataset = Dataset.from_pandas(test_df[['text', 'label_id']])

In [5]:
# 7. Tokenizer
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

def tokenize_fn(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

train_dataset = train_dataset.map(tokenize_fn, batched=True).map(lambda x: {'labels': x['label_id']}, batched=True)
val_dataset = val_dataset.map(tokenize_fn, batched=True).map(lambda x: {'labels': x['label_id']}, batched=True)
test_dataset = test_dataset.map(tokenize_fn, batched=True).map(lambda x: {'labels': x['label_id']}, batched=True)


Map: 100%|██████████| 68256/68256 [00:09<00:00, 7354.76 examples/s]
Map: 100%|██████████| 68256/68256 [00:00<00:00, 272108.99 examples/s]
Map: 100%|██████████| 7585/7585 [00:00<00:00, 7941.66 examples/s]
Map: 100%|██████████| 7585/7585 [00:00<00:00, 337941.32 examples/s]
Map: 100%|██████████| 8427/8427 [00:01<00:00, 8106.00 examples/s]
Map: 100%|██████████| 8427/8427 [00:00<00:00, 336558.75 examples/s]


In [6]:
# 8. 모델 정의
model = AutoModelForSequenceClassification.from_pretrained(
    "klue/bert-base",
    num_labels=len(label_encoder.classes_)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# 5. Metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

In [8]:
import transformers

print(transformers.__version__)

4.52.4


In [9]:
import accelerate
print(accelerate.__version__)


1.7.0


In [10]:
import sys
print(sys.executable)

/home/gaon/anaconda3/envs/bert-env/bin/python


In [11]:
# 6. Training arguments
training_args = TrainingArguments(
    output_dir="./results3",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=100,
    weight_decay=0.001,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True,
)

In [12]:
# 7. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [13]:
# 8. Train
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3962,0.340869,0.891496
2,0.2343,0.261978,0.91971
3,0.1454,0.238243,0.929598
4,0.0949,0.214598,0.940804
5,0.0656,0.236389,0.941991
6,0.0474,0.238068,0.944628


TrainOutput(global_step=6402, training_loss=0.18790558716983433, metrics={'train_runtime': 59989.3429, 'train_samples_per_second': 113.78, 'train_steps_per_second': 1.779, 'total_flos': 1.0776118896584294e+17, 'train_loss': 0.18790558716983433, 'epoch': 6.0})

In [14]:
model.save_pretrained("./results4")
tokenizer.save_pretrained("./results4")

('./results4/tokenizer_config.json',
 './results4/special_tokens_map.json',
 './results4/vocab.txt',
 './results4/added_tokens.json',
 './results4/tokenizer.json')

In [15]:
import pickle
with open("./results4/label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

In [16]:
# 10. Evaluate on test set
test_result = trainer.evaluate(test_dataset)
print(f"✅ Test Accuracy: {test_result['eval_accuracy']:.4f}")

✅ Test Accuracy: 0.9356


In [17]:
test_result

{'eval_loss': 0.23773278295993805,
 'eval_accuracy': 0.935564257742969,
 'eval_runtime': 110.5774,
 'eval_samples_per_second': 76.209,
 'eval_steps_per_second': 1.194,
 'epoch': 6.0}

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 모델 & 토크나이저 로드
model_path = "./results2"  # 저장된 디렉토리
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

model.eval()  # 추론 모드

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [14]:
import torch

def predict(text):
    # 토크나이즈
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=1).item()
    return predicted_class_id


In [21]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pickle

# 1. 저장된 모델과 토크나이저 불러오기
model_path = "./results4"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model.eval()  # 추론 모드로 전환

# 2. 저장된 label_encoder 불러오기
with open("./results4/label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

# 3. 예측 함수 정의
def predict(text):
    # 텍스트 토큰화
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=1).item()

    # 인코딩된 숫자를 다시 라벨(텍스트)로 변환
    predicted_label = label_encoder.inverse_transform([predicted_class_id])[0]
    return predicted_label

In [22]:
text = "제품명: 프리미엄 제주 감귤 3kg / 가격: 12,900원 / 브랜드: 감귤나라"
result = predict(text)
print("예측된 카테고리:", result)

예측된 카테고리: 식품


In [23]:
text = "제품명: 남성 방한화 겨울 털신발 남자 발목부츠 E55 / 가격: 24,400원 / {'상품번호': '7744368727', '상품상태': '신상품', '제조사': '다된데', '브랜드': '다된데', '모델명': 'E55', '원산지': '기타국가', '제품의 주소재': '상품상세설명 참조', '색상': '상품상세설명 참조', '치수': '상품정보 제공고시\n발길이 상품상세설명 참조\n굽높이 상품상세설명 참조', '발길이': '상품상세설명 참조', '굽높이': '상품상세설명 참조', '제조자(사)': '상품상세설명 참조', '제조국': '기타국가', '재화등의 A/S 관련 전화번호': '상세정보 확인'}"
result = predict(text)
print("예측된 카테고리:", result)

예측된 카테고리: 패션잡화
