# 필요한 패키지 import
- kiwi 형태소 분석기

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# !pip install kiwi
# !pip install kiwipiepy
# !pip install pandas scikit-learn matplotlib wordcloud konlpy
# !pip install kiwi-python
# !pip install kiwi-kr

In [4]:
import json
from kiwipiepy import Kiwi

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# 수행과정
- 구축해놓은 국방 전용 말뭉치 사전을 통해 kiwi 형태소 분석기를 학습
- 학습된 Kiwi로 '용례'에 대해 형태소 분석
- TF-IDF를 통해 국방-무기체계 관련 키워드 추출

# KIWI 학습
- 국방전용 말뭉치 사전의 '용어' 칼럼을 add_user_word()로 학습

In [5]:
# 1. 말뭉치.json 파일 로드
with open('/content/drive/MyDrive/크롤링프로젝트_정리본/말뭉치사전/말뭉치사전.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# 2. Kiwi 초기화
kiwi = Kiwi()

# 3. 각 용어에 대해 형태소 분석을 실행하고, 사용자 사전에 추가하기
for item in data:
    term = item['용어']
    category = item['분야']
    sub_category = item['세부항목']
    definition = item['설명']
    source = item['출처']
    examples = item['용례리스트']
    example_sources = item['용례출처리스트']

    # 4. 용어를 사용자 사전에 하나씩 추가
    kiwi.add_user_word(term, tag='NNP')  # 고유명사로 등록

    # 5. 형태소 분석 실행
    result = kiwi.analyze(term)

    # 형태소 분석 결과 출력
    print("형태소 분석 결과:")
    for word, tag in result:
        print(f"({word}, {tag})")

    print("="*50)  # 구분선


형태소 분석 결과:
([Token(form='45형 데어링급 구축함', tag='NNP', start=0, len=12)], -12.459455490112305)
형태소 분석 결과:
([Token(form='5세대 전투기', tag='NNP', start=0, len=7)], -12.459455490112305)
형태소 분석 결과:
([Token(form='5호 전차 판터', tag='NNP', start=0, len=8)], -12.459455490112305)
형태소 분석 결과:
([Token(form='99식 소총', tag='NNP', start=0, len=6)], -12.459455490112305)
형태소 분석 결과:
([Token(form='A-10', tag='NNP', start=0, len=4)], -12.459455490112305)
형태소 분석 결과:
([Token(form='A-4 스카이호크 공격기', tag='NNP', start=0, len=13)], -12.459455490112305)
형태소 분석 결과:
([Token(form='A-6 인트루더', tag='NNP', start=0, len=8)], -12.459455490112305)
형태소 분석 결과:
([Token(form='AC-130 건십', tag='NNP', start=0, len=9)], -12.459455490112305)
형태소 분석 결과:
([Token(form='AGM-86', tag='NNP', start=0, len=6)], -12.459455490112305)
형태소 분석 결과:
([Token(form='AH-1Z 바이퍼', tag='NNP', start=0, len=9)], -12.459455490112305)
형태소 분석 결과:
([Token(form='AIR-2 지니', tag='NNP', start=0, len=8)], -12.459455490112305)
형태소 분석 결과:
([Token(form='AK', tag='SL', start=0, l

## 결과
: 띄어쓰기가 있는 국방 용어를 하나의 단어로 인식 가능하게함
- 기존 kiwi : 국방 용어인 '현무 미사일' => 현무, 미사일 로 토큰화
- 학습된 kiwi : 국방 용어인 '현무 미사일' => [현무 미사일] 한 단어로 인식

- 기존 kiwi 형태소 분석기

In [6]:
# 1. Kiwi 초기화
kiwi = Kiwi()

sentence = "대한민국의 흑표 전차는 T-50 전차와 함께 우수한 열차포의 성능을 가지고, 현무 미사일도 있습니다."
result = kiwi.analyze(sentence)
print(result)

[([Token(form='대한민국', tag='NNP', start=0, len=4), Token(form='의', tag='JKG', start=4, len=1), Token(form='흑표', tag='NNP', start=6, len=2), Token(form='전차', tag='NNG', start=9, len=2), Token(form='는', tag='JX', start=11, len=1), Token(form='T', tag='SL', start=13, len=1), Token(form='-', tag='SO', start=14, len=1), Token(form='50', tag='SN', start=15, len=2), Token(form='전차', tag='NNG', start=18, len=2), Token(form='와', tag='JKB', start=20, len=1), Token(form='함께', tag='MAG', start=22, len=2), Token(form='우수', tag='NNG', start=25, len=2), Token(form='하', tag='XSA', start=27, len=1), Token(form='ᆫ', tag='ETM', start=27, len=1), Token(form='열차포', tag='NNP', start=29, len=3), Token(form='의', tag='JKG', start=32, len=1), Token(form='성능', tag='NNG', start=34, len=2), Token(form='을', tag='JKO', start=36, len=1), Token(form='가지', tag='VV', start=38, len=2), Token(form='고', tag='EC', start=40, len=1), Token(form=',', tag='SP', start=41, len=1), Token(form='현무', tag='NNG', start=43, len=2), Toke

- 학습시킨 kiwi

In [7]:
with open('/content/drive/MyDrive/크롤링프로젝트_정리본/말뭉치사전/말뭉치사전.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

custom_words = [(item["용어"], "NNP") for item in data]

for word, tag in custom_words:
    try:
        kiwi.add_user_word(word, tag)
    except Exception as e:
        print(f"단어 추가 실패: {word} -> {e}")

- 결과
: '현무 미사일' , '5세대 전투기', '알레이버크급 구축함' 모두 하나의 단어로 인식

In [8]:
sentence = "대한민국의 흑표 전차는 T-50 전차와 함께 우수한 열차포의 성능을 가지고, 현무 미사일과 5세대 전투기와 알레이버크급 구축함도 있습니다."
result = kiwi.analyze(sentence)

print("형태소 분석 결과:")
for token in result[0][0]:
    print(f"{token.form} ({token.tag})")

형태소 분석 결과:
대한민국 (NNP)
의 (JKG)
흑표 (NNP)
전차 (NNG)
는 (JX)
T-50 (NNP)
전 (MM)
차 (NNG)
와 (JKB)
함께 (MAG)
우수 (NNG)
하 (XSA)
ᆫ (ETM)
열차포 (NNP)
의 (JKG)
성능 (NNG)
을 (JKO)
가지 (VV)
고 (EC)
, (SP)
현무 미사일 (NNP)
과 (JC)
5세대 전투기 (NNP)
와 (JC)
알레이버크급 구축함 (NNP)
도 (JX)
있 (VV)
습니다 (EF)
. (SF)


# 말뭉치 TF-IDF
TF-IDF란?
- TF-IDF를 통해 특정 문서에서 자주 쓰이면서, 전체적으로는 흔하지 않아 “문서 특징”을 잘 나타내는 키워드를 추출


## 용례 벡터화
- kiwi_tokenizer: 학습된 kiwi 토크나이저를 기반으로 용례 토큰화 후, 고유명사만 추출
- TfidfVectorizer: 각 용례의 토큰별 TF-IDF 계산

In [15]:
example = pd.read_csv('/content/drive/MyDrive/크롤링프로젝트_정리본/용례테이블/용례테이블.csv')

In [16]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# kiwi 기반 토큰 함수 정의

def kiwi_tokenizer(text):
    result = kiwi.analyze(text)[0][0]
    return [token.form for token in result
            if token.tag in ['NNP']]

# 각 용례에 kiwi 토크나이저 적용
vectorizer = TfidfVectorizer(tokenizer=kiwi_tokenizer, lowercase=False)
tfidf_matrix = vectorizer.fit_transform(example['용례'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),
                        columns=vectorizer.get_feature_names_out()) # 행렬: 용례 index X 토큰 개수



- 결과 [tfidf_df]
: 문서 × 토큰 형태의 DataFrame 완성

- 예: tfidf_df.loc[17, '현무 미사일'] 은 17번째 문서에서 “현무 미사일”의 TF-IDF 점수

- 학습한 kiwi토크나이저로 인해 특수문자, 띄어쓰기가 있는 용어 'A-10'이나 '45형 데어링급 구축함'등이 하나의 토큰으로 분리됨을 확인

In [17]:
tfidf_df

Unnamed: 0,007 죽느냐 사느냐,1D,2차대전,45형 데어링급 구축함,5세대 전투기,5호 전차 판터,97식,99식,99식 소총,A-10,...,히긴스,히데요시,히로시마,히말라야,히브리어,히스토리,히트,히틀러,힐트,힝클리
0,0.0,0.0,0.0,0.0,0.399093,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.782347,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.473462,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.392777,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11010,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11011,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11012,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11013,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 말뭉치 TF-IDF 결과
- 전체 문서 평균 TF-IDF 계산
- 가장 ‘차별화된’ 단어들 상위 20개 추출

In [20]:
custom_stopwords = ['개발', '사용', '운용', '가능']
# 전체 용례 평균 TF-IDF 계산
global_mean = tfidf_df.mean(axis=0) # 모든 용례에 거친 TF-IDF값 평균

# 불용어 제외
filtered = global_mean[~global_mean.index.isin(custom_stopwords)]

# 상위 20개 키워드 추출
top20 = filtered.sort_values(ascending=False).head(20)

# 결과 출력
print("도메인 핵심 키워드 (상위 20개):")
for term, score in top20.items():
    print(f"{term}: {score:.4f}")

도메인 핵심 키워드 (상위 20개):
기관단총: 0.0417
장갑차: 0.0346
어뢰: 0.0335
공격기: 0.0240
미: 0.0236
미국: 0.0215
함포: 0.0192
기뢰: 0.0185
리볼버: 0.0160
정찰기: 0.0147
소련: 0.0144
지뢰: 0.0138
돌격소총: 0.0138
경기관총: 0.0124
AK-47: 0.0124
독일: 0.0123
박격포: 0.0121
대함미사일: 0.0114
한국: 0.0111
영국: 0.0102


In [24]:
df_top20 = top20.reset_index()
df_top20.columns = ['term', 'tfidf_score']
df_top20.to_csv('/content/drive/MyDrive/크롤링프로젝트_정리본/TF_IDF_시각화.csv', index=False, encoding = 'utf-8-sig')