In [1]:
!pip install transformers datasets torch tqdm pandas kss scikit-learn

Collecting kss
  Downloading kss-6.0.5-py3-none-any.whl.metadata (162 kB)
Collecting emoji==1.2.0 (from kss)
  Downloading emoji-1.2.0-py3-none-any.whl.metadata (4.3 kB)
Collecting pecab (from kss)
  Downloading pecab-1.0.8.tar.gz (26.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.4/26.4 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting jamo (from kss)
  Downloading jamo-0.4.1-py3-none-any.whl.metadata (2.3 kB)
Collecting hangul-jamo (from kss)
  Downloading hangul_jamo-1.0.1-py3-none-any.whl.metadata (899 bytes)
Collecting tossi (from kss)
  Downloading tossi-0.3.1.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting distance (from kss)
  Downloading Distance-0.1.3.tar.gz (180 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.1 kB

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

import pandas as pd
df = pd.read_csv("crawling/naver_blog_reviews_filtered.csv")

MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)


# 감성 분석 함수 정의
def analyze_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = F.softmax(outputs.logits, dim=1)
    pred = torch.argmax(probs, dim=1).item()

    if pred == 0:
        return "부정"
    elif pred == 1:
        return "긍정"
    else:
        return "중립"
    
# 데이터에 감성 태그 붙이기
from tqdm import tqdm
sentiments = []

for title in tqdm(df['제목'].dropna()):
    sentiment = analyze_sentiment(title)
    sentiments.append(sentiment)

df['감성'] = sentiments
print(df[['제목', '감성']].head())

# 감성 분포 시각화
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['font.family'] = 'Malgun Gothic'

plt.figure(figsize=(6, 4))
sns.countplot(x='감성', data=df, palette='Set2')
plt.title("감성 분류 결과")
plt.show()

# 감성 분석 결과 저장
df.to_csv("인공지능_블로그_감성분석결과.csv", index=False, encoding='utf-8-sig')

tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyError: '제목'

model.safetensors:   0%|          | 0.00/452M [00:00<?, ?B/s]

In [None]:
# # 모델 로드
# model = AutoModelForSequenceClassification.from_pretrained(
#     model_name,
#     num_labels=3  # 감정 카테고리 수
# )

In [None]:
# # 학습 설정 & Trainer 객체 생성
# from transformers import TrainingArguments, Trainer

# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=5e-5,
#     per_device_train_batch_size=8,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_steps=10
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=dataset,
#     eval_dataset=dataset
# )

# trainer.train()

In [None]:
# text = "이 식당 진짜 맛있어요"
# inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
# with torch.no_grad():
#     outputs = model(**inputs)
#     logits = outputs.logits
#     pred = torch.argmax(logits, dim=-1).item()

# print("감정 라벨:", pred)  # 0=부정, 1=중립, 2=긍정