In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt                         


df = pd.read_csv("https://bit.ly/seoul-120-text-csv")
df.shape


(2645, 5)

In [2]:
df.head(3)

Unnamed: 0,번호,분류,제목,내용,내용번호
0,2645,복지,아빠 육아휴직 장려금,아빠 육아휴직 장려금 업무개요 남성근로자의 육아휴직을 장려하고 양육에 따른 경...,23522464
1,2644,경제,[서울산업진흥원] 서울메이드란?,서울산업진흥원 서울메이드란 서울의 감성을 담은 다양하고 새로운 경험을 제공하기 위해...,23194045
2,2643,환경,(강북구) 정비중,강북구 정비중 업무개요 투명 폐트병을 교환보상하므로 수거율을 높이고 폐기물을 감...,23032485


In [3]:
df = df.dropna()
df.shape

(2645, 5)

In [29]:
df.dtypes

번호        int64
분류       object
제목       object
내용       object
내용번호      int64
문서       object
유사도     float64
dtype: object

In [4]:
df.isnull().sum()

번호      0
분류      0
제목      0
내용      0
내용번호    0
dtype: int64

In [5]:
df["문서"] = df["제목"] + " " + df["내용"]

In [6]:
# 단어들의 출현 빈도(frequency)로 여러 문서들을 벡터화하기 위해 CountVectorizer 사용
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words=["돋움", "경우", "또는"])

In [7]:
print(cv)

CountVectorizer(stop_words=['돋움', '경우', '또는'])


In [8]:
# fit_transform을 사용하여 문장에서 노출되는 feature(특징이 될만한 단어) 수를 합한 변수 Document Term Matrix(이하 dtm)를 생성
dtm_cv = cv.fit_transform(df["문서"])

In [9]:
# cv.vocabulary_ 를 봅니다.
cv.vocabulary_

{'아빠': 30166,
 '육아휴직': 35794,
 '장려금': 40098,
 '업무개요': 31494,
 '남성근로자의': 9780,
 '육아휴직을': 35798,
 '장려하고': 40101,
 '양육에': 31079,
 '따른': 14184,
 '경제적': 3650,
 '부담을': 20458,
 '완화함으로써': 33605,
 '일과': 37802,
 '가정생활의': 694,
 '양립': 31009,
 '가족친화적인': 784,
 '사회환경': 22911,
 '조성': 43603,
 '지원대상': 46358,
 '신청일': 29324,
 '기준': 8969,
 '이상': 36691,
 '계속하여': 3781,
 '서초구에': 24773,
 '주민등록': 44417,
 '되어': 13584,
 '있는': 38959,
 '육아휴직자': 35799,
 '대상자녀': 11880,
 '신청기간': 29232,
 '시작일': 28426,
 '이후': 37282,
 '개월부터': 1767,
 '종료일': 44002,
 '개월': 1753,
 '이내': 36439,
 '신청방법': 29260,
 '온라인': 33390,
 '서초구청': 24776,
 '홈페이지': 55328,
 '경로': 3421,
 '분야별정보': 21117,
 '복지': 20129,
 '영유아복지': 32834,
 '아빠육아휴직장려금': 30170,
 '신청': 29215,
 '바로가기': 17408,
 '방문': 18118,
 '동주민센터': 13431,
 '여성보육과': 31951,
 '구비서류': 6902,
 '고용센터': 4208,
 '발행': 18048,
 '육아휴직급여': 35795,
 '지급결정': 45803,
 '통지서': 51047,
 '주민등록등본': 44419,
 '부세대원': 20662,
 '이름과': 36609,
 '전입일자': 41564,
 '포함': 52225,
 '모든': 15657,
 '구성원': 6951,
 '주민번호': 44460,
 '뒷자리': 13756,
 '

In [11]:
cv_cols = cv.get_feature_names_out()
print(cv_cols)

['03월' '08년' '10' ... '힘쓴다' '힘을' '힘이']


In [14]:
dtm_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [28]:
# 각 row에서 전체 단어가방에 있는 어휘에서 등장하는 단어에 대한 one-hot-vector를 확인
# toarray()로 희소 행렬(sparse matrix, 행렬의 값이 대부분 '0'인 행렬)을 NumPy array 배열로 변환하여 값을 확인

pd.DataFrame(dtm_cv.toarray(), columns=cv_cols).sum().sort_values()

03월           1
우리들           1
우리동네육아지원      1
우리동네보육반장      1
우리동           1
           ... 
대한          296
어떻게         442
서울시         448
있습니다        496
있는          536
Length: 43075, dtype: int64

In [15]:
# 정답인 '분류'의 유일한 값을 확인하여 주제 수를 확인
df["분류"].value_counts()

분류
행정        1098
경제         823
복지         217
환경         124
주택도시계획     110
문화관광        96
교통          90
안전          51
건강          23
여성가족        13
Name: count, dtype: int64

In [16]:
#from keras.preprocessing.text import Tokenizer
# 주어진 문서에 대하여 각 문서에 어떤 주제들이 존재하는지를 확인하는 잠재 디리클레 분석(LDA)을 불러옴
# n_components에 넣을 하이퍼파라미터 NUM_TOPICS로 주제수를 설정(기본값=10)
# max_iter는 훈련 데이터(epoch라고도 함)에 대한 최대 패스 수(기본값=10)

from sklearn.decomposition import LatentDirichletAllocation

NUM_TOPICS = 10
LDA_model = LatentDirichletAllocation(n_components=NUM_TOPICS, random_state=42)

In [17]:
# LDA_model 에 dtm_cv 를 넣어 학습
LDA_model.fit(dtm_cv)

In [20]:
import pyLDAvis.lda_model

pyLDAvis.enable_notebook()
pyLDAvis.lda_model.prepare(LDA_model, dtm_cv, cv, mds='tsne')

In [21]:
# TF-IDF 방식으로 단어의 가중치를 조정한 BOW 인코딩하여 벡터화하기 위해 TfidfVectorizer를 사용

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words=["돋움", "경우", "또는", "있습니다", "있는", "합니다"])
tfidf

In [22]:
# 문장에서 노출되는 feature(특징이 될만한 단어) 수를 합한 변수 Document Term Matrix(이하 dtm)를 생성
dtm_tfidf = tfidf.fit_transform(df["문서"])

In [23]:
# tfidf.vocabulary_
cols_tfidf = tfidf.get_feature_names_out()

In [24]:
# dtm_tf를 axis=0(수직 방향으로) 기준으로 합계를 낸 dist 변수를 생성
# dist 변수를 vocabulary_ 순으로 정렬하여 비율을 확인
dist = np.sum(dtm_tfidf, axis=0)
pd.DataFrame(dist, columns=cols_tfidf).T.sort_values(by=0).tail(10)

Unnamed: 0,0
의한,15.02184
무엇입니까,15.270257
이상,15.577954
관한,16.593598
무엇인가요,16.650743
따라,16.652594
대한,18.866037
있나요,19.707343
서울시,22.586695
어떻게,37.924574


In [25]:
# 각 row에서 전체 단어가방에 있는 어휘에서 등장하는 단어에 대한 가중치를 적용한 vector를 확인
# toarray()로 희소 행렬(sparse matrix, 행렬의 값이 대부분 '0'인 행렬)을 NumPy array 배열로 변환하여 값을 확인
pd.DataFrame(dtm_tfidf.toarray(), columns=cols_tfidf)

Unnamed: 0,03월,08년,10,100명이상인,100세가,10만원,10만원상당,10명이고,10인승,10인의,...,힐링프로그램을,힐링하는,힐스테이트,힘들,힘들경우,힘들고,힘쓰고있습니다,힘쓴다,힘을,힘이
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2640,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_simple_pair = cosine_similarity(dtm_tfidf[1] , dtm_tfidf)
result_list = similarity_simple_pair.tolist()[0]

In [27]:
df["유사도"] = result_list
df[["분류", "제목", "유사도"]].sort_values(by="유사도", ascending=False).head(10)

Unnamed: 0,분류,제목,유사도
1,경제,[서울산업진흥원] 서울메이드란?,1.0
84,경제,[서울산업진흥원] 아이마켓서울유 매장입점 지원사업 (오프라인판로지원),0.120297
39,경제,[서울산업진흥원] 창업지원센터,0.088462
117,경제,[서울산업진흥원] 쇼핑몰 구축 지원사업 서울샵,0.074788
605,경제,서울디자인맵은 무엇인가요?,0.073206
2181,행정,서울의 특산물은 무엇인가요?,0.064946
160,행정,동대문구 상징 캐릭터 꿈동이,0.055611
608,경제,서울의 디자인 자산 컨텐츠는 어떤 것들이 있을까요?,0.054889
312,주택도시계획,(남산) 서울 중심점,0.053904
612,경제,서울의 디자인 자산중 근현대 건축물은 어떤 것들이 있나요?,0.049457


In [None]:
import tensorflow as tf
print(tf.__version__)
