In [4]:
!pip install --upgrade pip
!pip install --upgrade transformers
!pip install --upgrade datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
Installing collected packages: fsspec, datasets
[2K  Attempting uninstall: fsspec
[2K    Found existing installation: fsspec 2025.3.2
[2K    Uninstalling fsspec-2025.3.2:
[2K      Successfully uninstalled fsspec-2025.3.2
[2K  Attempting uninstall: datasets
[2K    Found existing installation: datasets 2.14.4
[2K    Uninstalling datasets-2.14.4:
[2K      Successfully uninstalled datasets-2.14.4
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [datasets]
[1A[2K[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following depen

In [17]:
# 1. 설치
!pip install transformers konlpy scikit-learn pandas tqdm --quiet

# 2. 파일 불러오기
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

goodoc = pd.read_csv('goodoc_reviews.csv')
modoodoc = pd.read_csv('modoodoc_reviews.csv')

# 3. 리뷰 데이터 통합 (긍정/부정 컬럼 상관없이 모두)
all_data = []
def add_review_rows(df, hospital_col, pos_col, neg_col):
    for _, row in df.iterrows():
        hosp = row[hospital_col]
        for col in [pos_col, neg_col]:
            if pd.notnull(row.get(col, None)):
                for r in str(row[col]).split('\n'):
                    if r.strip():
                        all_data.append({'hospital': hosp, 'review': r.strip()})

add_review_rows(goodoc, 'hospital_name', 'positive', 'negative')
add_review_rows(modoodoc, '병원명', '긍정리뷰', '부정리뷰')
df = pd.DataFrame(all_data).drop_duplicates(subset=['hospital', 'review'])

# 4. 모델 불러와서 자동 감성 분류(긍정:1, 부정:0)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_name = "beomi/KcELECTRA-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred = logits.argmax(-1).item()
    return pred

df['label'] = df['review'].progress_apply(predict_sentiment)

# 5. 의미 키워드(불용어/동의어) 세팅
import re
from collections import Counter

STOPWORDS = set(['병원','의사','진료','환자','간호사','선생님','수술','약','내원','진단','치료','처방','이용','상담','원장','방문','인증','영수증','센터','한의원','의원','치과'])

MEANING_KEYWORDS = {
    "친절": ["친절", "친근", "상냥", "잘해주", "배려", "따뜻"],
    "불친절": ["불친절", "불쾌", "차가움", "쌀쌀"],
    "자세한 설명": ["설명", "상세", "이해시", "알려", "자세히"],
    "대기시간": ["대기", "대기시간", "기다림", "기다렸", "줄서", "오래", "한참"],
    "과잉진료": ["과잉", "과잉진료", "불필요", "쓸데없", "과하게", "돈만"],
    "비쌈": ["비쌈", "비싸", "가격", "진료비", "돈", "비용"],
    "저렴": ["저렴", "싸"],
    "청결": ["청결", "깨끗", "위생"],
    "시설": ["시설", "인테리어", "환경"],
    "추천": ["추천", "추천함", "강추"],
    "신속": ["신속", "빠르", "빨랐"],
}

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    return ' '.join([w for w in text.split() if w not in STOPWORDS])

def meaning_tokens(text):
    hits = []
    for mean, arr in MEANING_KEYWORDS.items():
        for kw in arr:
            if kw in text:
                hits.append(mean)
    return hits

df['review_clean'] = df['review'].apply(clean_text)
df['meaning_kw'] = df['review_clean'].apply(meaning_tokens)

# 6. 병원별 top3 키워드, 점수화(긍정: +1, 부정: -1, 인증/방문/영수증 언급시 가중치 1.5배)
result = []
for hosp, g in df.groupby('hospital'):
    tokens = []
    score = 0
    for _, row in g.iterrows():
        tokens += row['meaning_kw']
        # 가중치
        w = 1.0
        if any(word in row['review'] for word in ['영수증', '인증', '방문']):
            w = 1.5
        if row['label'] == 1:
            score += w
        else:
            score -= w
    top3 = [w for w, _ in Counter(tokens).most_common(3)]
    result.append({
        'hospital': hosp,
        'score': score,
        'top_keywords': ','.join(top3),
        'n_review': len(g)
    })

df_rank = pd.DataFrame(result).sort_values('score', ascending=False)
print(df_rank.head(10))
df_rank.to_csv('병원별_자동감성_top3키워드_점수화.csv', index=False)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 3945/3945 [12:33<00:00,  5.24it/s]


          hospital  score    top_keywords  n_review
380       사계절에스한의원   19.5      친절,추천,대기시간        38
1122      힐링산부인과의원   19.0  친절,자세한 설명,대기시간        53
151        노원 명한의원   15.5    친절,자세한 설명,추천        46
649         여진주한의원   13.5    친절,추천,자세한 설명        44
235    디딤정신건강의학과의원   13.5  친절,자세한 설명,과잉진료        22
859      인애한의원 노원점   13.0    친절,추천,자세한 설명        33
782   유앤영피부과의원 노원점   12.0      대기시간,친절,추천        71
1070        한의원혜민서   11.0    친절,추천,자세한 설명        35
403     상계바론정형외과의원   11.0  자세한 설명,비쌈,대기시간        30
740      올리브산부인과의원    8.0  자세한 설명,친절,과잉진료        62
