# In [24]:
# !pip install folium

# In [23]:
import pandas as pd
import ast
import folium
from IPython.display import display
from collections import defaultdict

# 1) Token -> canonical country name.
# Aliases are listed per country. The order of the groups, and of the aliases
# inside each group, determines the insertion order of ``token_to_country``.
_COUNTRY_ALIASES = [
    ('United Arab Emirates', ['United Arab Emirates']),
    ('South Korea', [
        # NOTE(review): '한' is a single syllable that also occurs inside
        # other tokens in this table (e.g. '북한', '한국').
        '한반도', '대한', '한국', '한',
        '인천', '부산', '서울', '울산',
        '국군', '우리 군', '당국', '국내',
    ]),
    ('North Korea', ['북한']),
    ('United Kingdom', ['영국']),
    ('Russia', ['소련', '러시아']),
    ('Germany', ['서독', '동독', '독일']),
    ('Japan', ['일제', '일본']),
    ('China', ['중국']),
    ('Iran', ['이란']),
    ('United States', [
        '미 공군', '미공군',
        '미 해병대', '미해병대',
        '미 육군', '미육군',
        '미 해군', '미해군',
        '미군', '미 대통령', '미국',
    ]),
    ('France', ['프랑스']),
    ('Italy', ['이탈리아']),
    ('Netherlands', ['네덜란드']),
    ('Belgium', ['벨기에']),
    ('Turkey', ['터키']),
    ('Vietnam', ['베트남']),
    ('Australia', ['오스트레일리아', '호주']),
    ('Taiwan', ['대만']),
    ('Afghanistan', ['아프가니스탄']),
    ('Libya', ['리비아']),
    ('Europe', ['유럽']),
    ('Iraq', ['이라크']),
    ('Indonesia', ['인도네시아']),
    ('Philippines', ['필리핀']),
    ('Thailand', ['태국']),
    ('Peru', ['페루']),
    ('Canada', ['캐나다']),
    ('Brazil', ['브라질']),
    ('Sweden', ['스웨덴']),
    ('Spain', ['스페인']),
    ('Israel', ['이스라엘']),
    ('United Arab Emirates', ['UAE']),
    ('Saudi Arabia', ['사우디']),
    ('Poland', ['폴란드']),
    ('Malaysia', ['말레이시아']),
]
token_to_country = {
    alias: country
    for country, aliases in _COUNTRY_ALIASES
    for alias in aliases
}

# Tokens ordered from longest to shortest (ties keep insertion order) so that
# longer tokens come first when matching.
tokens_sorted = sorted(token_to_country, key=len, reverse=True)

# 2) Canonical country name -> (latitude, longitude) of its capital.
# Each row is (country, latitude, longitude); the trailing comment names the
# city the coordinates belong to.
_CAPITAL_ROWS = (
    ('South Korea', 37.5665, 126.9780),          # Seoul
    ('North Korea', 39.0392, 125.7625),          # Pyongyang
    ('United Kingdom', 51.5074, -0.1278),        # London
    ('Russia', 55.7558, 37.6173),                # Moscow
    ('Germany', 52.5200, 13.4050),               # Berlin
    ('Japan', 35.6895, 139.6917),                # Tokyo
    ('China', 39.9042, 116.4074),                # Beijing
    ('Iran', 35.6892, 51.3890),                  # Tehran
    ('United States', 38.9072, -77.0369),        # Washington, D.C.
    ('France', 48.8566, 2.3522),                 # Paris
    ('Italy', 41.9028, 12.4964),                 # Rome
    ('Netherlands', 52.3676, 4.9041),            # Amsterdam
    ('Belgium', 50.8503, 4.3517),                # Brussels
    ('Turkey', 39.9334, 32.8597),                # Ankara
    ('Vietnam', 21.0278, 105.8342),              # Hanoi
    ('Australia', -35.2809, 149.1300),           # Canberra
    ('Taiwan', 25.03297, 121.56542),             # Taipei
    ('Afghanistan', 34.5553, 69.2075),           # Kabul
    ('Libya', 32.8872, 13.1913),                 # Tripoli
    # NOTE(review): 'Europe' uses exactly the same point as 'Belgium', so
    # anything drawn at both locations will coincide.
    ('Europe', 50.8503, 4.3517),                 # Brussels (EU)
    ('Iraq', 33.3152, 44.3661),                  # Baghdad
    ('Indonesia', -6.2088, 106.8456),            # Jakarta
    ('Philippines', 14.5995, 120.9842),          # Manila
    ('Thailand', 13.7563, 100.5018),             # Bangkok
    ('Peru', -12.0464, -77.0428),                # Lima
    ('Canada', 45.4215, -75.6972),               # Ottawa
    ('Brazil', -15.7939, -47.8828),              # Brasília
    ('Sweden', 59.3293, 18.0686),                # Stockholm
    ('Spain', 40.4168, -3.7038),                 # Madrid
    ('Israel', 31.7683, 35.2137),                # Jerusalem
    ('United Arab Emirates', 24.4539, 54.3773),  # Abu Dhabi
    ('Saudi Arabia', 24.7136, 46.6753),          # Riyadh
    ('Poland', 52.2297, 21.0122),                # Warsaw
    ('Malaysia', 3.1390, 101.6869),              # Kuala Lumpur
)
coords = {name: (lat, lon) for name, lat, lon in _CAPITAL_ROWS}

# 3) Load the CSV and parse the '용례리스트' column.
def _parse_examples(cell):
    """Evaluate one raw '용례리스트' cell as a Python literal; nulls become ``[]``."""
    if pd.isnull(cell):
        return []
    return ast.literal_eval(cell)


df = pd.read_csv('통합.csv')
df['용례리스트'] = df['용례리스트'].apply(_parse_examples)

# 4) Per-country aggregation: each row counts at most once per country, and the
#    row's '세부항목' / '용어' values are tallied for every country it mentions.
def _countries_in_sentence(sentence):
    """Return the set of countries whose tokens occur in *sentence*.

    Tokens are tried in ``tokens_sorted`` order (longest first). An occurrence
    that lies entirely inside an already accepted occurrence of a longer token
    is ignored, so the '한' inside '북한' no longer counts as South Korea.
    Occurrences that only partially overlap a longer match, or that appear on
    their own elsewhere in the sentence, still count.
    """
    accepted_spans = []  # (start, end) of every occurrence accepted so far
    found = set()
    for token in tokens_sorted:
        start = sentence.find(token)
        while start != -1:
            end = start + len(token)
            nested = any(s <= start and end <= e for s, e in accepted_spans)
            if not nested:
                accepted_spans.append((start, end))
                found.add(token_to_country[token])
            # Step by one so overlapping occurrences of the token are seen too.
            start = sentence.find(token, start + 1)
    return found


country_data = {}
for _, row in df.iterrows():
    # Collect into a set first so a row mentioning a country in several
    # sentences is still counted once for that country.
    countries_in_row = set()
    for sentence in row['용례리스트']:
        countries_in_row |= _countries_in_sentence(sentence)
    for country in countries_in_row:
        rec = country_data.get(country)
        if rec is None:
            rec = country_data[country] = {
                'count': 0,
                'item_freq': defaultdict(int),
                'words': set(),
            }
        rec['count'] += 1
        rec['item_freq'][row['세부항목']] += 1
        rec['words'].add(row['용어'])

# 5) Build the Folium map and add one marker per country.
m = folium.Map(location=[20, 0], zoom_start=2)
for country, rec in country_data.items():
    lat, lon = coords[country]
    # One "item: count" line per '세부항목' value.
    item_lines = [f"{item}: {cnt}" for item, cnt in rec['item_freq'].items()]
    # Comma-separated terms. They are sorted because set iteration order is
    # not stable between runs; missing values are dropped and the rest coerced
    # to str so that join cannot fail on a non-string entry.
    terms = sorted(str(word) for word in rec['words'] if pd.notnull(word))
    term_list = ", ".join(terms)
    popup = (
        f"<b>{country}</b><br>"
        f"등장 행 수: {rec['count']}건<br>"
        f"세부항목별 빈도:<br>{'<br>'.join(item_lines)}<br>"
        f"용어:<br>{term_list}"
    )
    folium.Marker(
        [lat, lon],
        popup=folium.Popup(popup, max_width=300)
    ).add_to(m)

# 6) Show the map inline in the Jupyter notebook.
display(m)
