<a href="https://colab.research.google.com/github/ddoyungyung/Final-Team5/blob/main/%EB%A9%8B%EC%9F%81%EC%9D%B4_%EC%82%AC%EC%9E%90%EC%B2%98%EB%9F%BC_final_%ED%94%84%EB%A1%9C%EC%A0%9D%ED%8A%B8_%EC%86%8C%EC%9C%A44(%EC%B9%B4%EC%9D%B4%EC%A0%9C%EA%B3%B1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from collections import Counter
from scipy import stats

# --- Final keyword lists (hand-picked, then augmented with BERT) ---
# Each list holds the lower-case words that map an adjective or noun to one
# review category. Matching downstream is by exact string equality.
# NOTE(review): 'tidy' is listed under both service and roomsize, so a pair
# containing it is labelled with both categories — confirm this is intended.
# NOTE(review): 'soundproofng' looks like a misspelling of 'soundproofing';
# confirm whether it deliberately mirrors a token found in the data.
attraction_keywords = [
    'museum', 'park', 'view', 'beautiful', 'amazing',
]
eat_keywords = [
    'restaurant', 'bar',
]
transport_keywords = [
    'great', 'train', 'subway', 'metro', 'bus', 'station', 'metre',
    'underground', 'transport', 'access', 'location', 'centre', 'position',
    'tube',
]
service_keywords = [
    'lovely', 'friendly', 'staff', 'helpful', 'balcony', 'room', 'breakfast',
    'tidy', 'polite', 'reception', 'front', 'desk', 'value', 'quality',
    'check', 'response', 'food', 'drink', 'coffee', 'egg', 'selection',
]
roomsize_keywords = [
    'tidy', 'narrows', 'narrow', 'compact', 'small', 'tiny',
]
noisy_keywords = [
    'noise', 'noisy', 'soundproofng', 'soundproof', 'sound', 'sounds',
]
dirty_keywords = [
    'dirty', 'uncleaned', 'cleaned', 'toilet', 'bathroom', 'pillow', 'bug',
    'smell',
]
expensive_keywords = [
    'expensive', 'price', 'pricy', 'high', 'prices', 'payment',
]
facility_amenity_keywords = [
    'water', 'hot', 'cold', 'condition', 'wifi', 'bed', 'furniture',
    'building', 'carpet', 'window', 'mattress', 'elevator', 'lift', 'fridge',
    'lighting', 'tub', 'shower', 'air', 'level', 'audio', 'recording',
]

# Category name -> keyword list; insertion order fixes the order in which
# categories are checked and reported.
keyword_categories = {
    'attraction': attraction_keywords,
    'eat': eat_keywords,
    'transport': transport_keywords,
    'service': service_keywords,
    'roomsize': roomsize_keywords,
    'noisy': noisy_keywords,
    'dirty': dirty_keywords,
    'expensive': expensive_keywords,
    'facility_amenity': facility_amenity_keywords,
}

In [None]:
# 2. Category labelling function
def assign_category(adj, noun):
    """Return every keyword category matched by an (adjective, noun) pair.

    Both arguments are converted with ``str`` and lower-cased, then tested
    for exact membership in each keyword list of the module-level
    ``keyword_categories`` mapping. A category matches when either word is
    in its list, so one pair may receive several categories.

    Args:
        adj: Adjective of the pair (converted with ``str``).
        noun: Noun of the pair (converted with ``str``).

    Returns:
        List of matching category names in ``keyword_categories`` order, or
        ``['other']`` when no category matches.
    """
    words = (str(adj).lower(), str(noun).lower())
    matched = [
        name
        for name, vocabulary in keyword_categories.items()
        if any(word in vocabulary for word in words)
    ]
    # An empty match list falls back to the catch-all label.
    return matched or ['other']

# 3. Load the data and label it (labelling is applied in the following step)
# Load the pairs extracted from positive reviews
df_pos = pd.read_csv("/content/noun_adj_pairs_raw(pos).csv")
# Load the pairs extracted from negative reviews
# NOTE(review): the positive file is named "raw" while the negative file is
# named "freq" — confirm both hold one row per pair occurrence. If the
# negative file is pre-aggregated, counting rows downstream would
# under-count the negative side.
df_neg = pd.read_csv("/content/noun_adj_pairs_freq(neg).csv")

In [None]:
# 4. Apply category labelling to each DataFrame
# The 'adj' and 'noun' columns are zipped and iterated directly rather than
# going through DataFrame.apply(axis=1): apply builds a Series object for
# every row, which is slow on large frames. The resulting list is assigned
# positionally, one list of category names per row, and the comprehension
# is also well-defined (an empty list) when a frame has no rows.
df_pos['categories'] = [
    assign_category(adj, noun)
    for adj, noun in zip(df_pos['adj'], df_pos['noun'])
]
df_neg['categories'] = [
    assign_category(adj, noun)
    for adj, noun in zip(df_neg['adj'], df_neg['noun'])
]

# 5. Count occurrences per category
def aggregate_category_counts(df, review_type):
    """Count how many times each category label occurs in ``df['categories']``.

    Args:
        df: DataFrame whose ``'categories'`` column holds an iterable of
            category labels per row.
        review_type: Name given to the single count column of the result
            (e.g. ``'positive'``).

    Returns:
        DataFrame indexed by ``'category'`` with one column named
        ``review_type`` holding the occurrence counts. Rows appear in order
        of first occurrence of each label.
    """
    # Every label of every row contributes one count.
    tally = Counter(label for labels in df['categories'] for label in labels)

    # Convert to a DataFrame
    table = pd.DataFrame(tally.items(), columns=['category', 'count'])
    return table.set_index('category').rename(columns={'count': review_type})

# Tally category occurrences separately for the positive and negative pairs;
# the second argument becomes the name of the count column in each result.
pos_counts = aggregate_category_counts(df_pos, 'positive')
neg_counts = aggregate_category_counts(df_neg, 'negative')

In [None]:
# 6. Build the contingency table (merge)
# Outer-join the two count tables on their category index so that a category
# seen on only one side is kept; its missing count is filled with 0 before
# the counts are cast back to integers.
contingency_table = (
    pos_counts
    .merge(neg_counts, how='outer', left_index=True, right_index=True)
    .fillna(0)
    .astype(int)
)
print("긍정 vs 부정 리뷰 카테고리별 빈도 교차표")
print(contingency_table.to_string())

긍정 vs 부정 리뷰 카테고리별 빈도 교차표
                  positive  negative
category                            
attraction           27361       487
dirty                 7834      1491
eat                  12529       521
expensive             6726      1196
facility_amenity     38234      3994
noisy                  835       963
other               229249     54024
roomsize              5826      1780
service             228724      5399
transport           131376      1360


In [None]:
### **2. Run the chi-square test**

# Run the chi-square test of independence on the contingency table (the
# DataFrame is passed to SciPy as-is).
# NOTE(review): assign_category can return several categories for one pair,
# so a single pair may be counted in more than one row of the table; the
# chi-square test assumes each observation is counted once — confirm this
# is acceptable for the analysis.
# NOTE(review): with the .10f format any p-value below 5e-11 is printed as
# 0.0000000000; it should not be reported as exactly zero.
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)

print(
    " 카이제곱 검정 결과: 긍정 vs 부정 리뷰 카테고리",
    f"Chi-square Statistic (카이제곱 통계량): {chi2:.2f}",
    f"p-value (p-값): {p_value:.10f}",
    f"Degrees of Freedom (자유도): {dof}",
    sep="\n",
)

 카이제곱 검정 결과: 긍정 vs 부정 리뷰 카테고리
Chi-square Statistic (카이제곱 통계량): 65063.38
p-value (p-값): 0.0000000000
Degrees of Freedom (자유도): 9
