In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt                         


df = pd.read_csv("https://bit.ly/seoul-120-text-csv")
df.shape


(2645, 5)

In [2]:
df.head(3)

Unnamed: 0,번호,분류,제목,내용,내용번호
0,2645,복지,아빠 육아휴직 장려금,아빠 육아휴직 장려금 업무개요 남성근로자의 육아휴직을 장려하고 양육에 따른 경...,23522464
1,2644,경제,[서울산업진흥원] 서울메이드란?,서울산업진흥원 서울메이드란 서울의 감성을 담은 다양하고 새로운 경험을 제공하기 위해...,23194045
2,2643,환경,(강북구) 정비중,강북구 정비중 업무개요 투명 폐트병을 교환보상하므로 수거율을 높이고 폐기물을 감...,23032485


In [3]:
df = df.dropna()
df.shape

(2645, 5)

In [4]:
df.isnull().sum()

번호      0
분류      0
제목      0
내용      0
내용번호    0
dtype: int64

In [9]:
df["문서"] = df["제목"] + " " + df["내용"]

In [10]:
df["분류"].value_counts()

분류
행정        1098
경제         823
복지         217
환경         124
주택도시계획     110
문화관광        96
교통          90
안전          51
건강          23
여성가족        13
Name: count, dtype: int64

In [11]:
df = df[df["분류"].isin(["행정", "경제", "복지"])]

In [12]:
print(df.shape)
df.head(3)

(2138, 6)


Unnamed: 0,번호,분류,제목,내용,내용번호,문서
0,2645,복지,아빠 육아휴직 장려금,아빠 육아휴직 장려금 업무개요 남성근로자의 육아휴직을 장려하고 양육에 따른 경...,23522464,아빠 육아휴직 장려금 아빠 육아휴직 장려금 업무개요 남성근로자의 육아휴직을 장...
1,2644,경제,[서울산업진흥원] 서울메이드란?,서울산업진흥원 서울메이드란 서울의 감성을 담은 다양하고 새로운 경험을 제공하기 위해...,23194045,[서울산업진흥원] 서울메이드란? 서울산업진흥원 서울메이드란 서울의 감성을 담은 다양...
3,2642,복지,"광진맘택시 운영(임산부,영아 양육가정 전용 택시)",광진맘택시 운영임산부영아 양육가정 전용 택시 업무개요 교통약자인 임산부와 영아가정...,22904492,"광진맘택시 운영(임산부,영아 양육가정 전용 택시) 광진맘택시 운영임산부영아 양육가정..."


In [13]:
label_name = "분류"


In [14]:
X = df["문서"]
y = df[label_name]

In [15]:
y_onehot = pd.get_dummies(y)

In [16]:
y_onehot

Unnamed: 0,경제,복지,행정
0,False,True,False
1,True,False,False
3,False,True,False
4,False,True,False
5,False,False,True
...,...,...,...
2633,True,False,False
2637,False,False,True
2638,False,False,True
2639,False,False,True


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=0.2, random_state=42, stratify=y_onehot)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1710,), (428,), (1710, 3), (428, 3))

In [18]:
y_train

Unnamed: 0,경제,복지,행정
2052,False,False,True
2594,True,False,False
1061,False,False,True
78,False,False,True
70,False,False,True
...,...,...,...
1571,True,False,False
1533,True,False,False
671,True,False,False
550,True,False,False


In [19]:
display(y_train.mean())
display(y_test.mean())

경제    0.384795
복지    0.101754
행정    0.513450
dtype: float64

경제    0.385514
복지    0.100467
행정    0.514019
dtype: float64

In [20]:
#import tensorflow as tf


In [21]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# vocab_size = 10000
# oov_tok = "<oov>"
# tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

In [22]:
#tokenizer.fit_on_texts(X_train)

In [23]:
# 단어들의 출현 빈도(frequency)로 여러 문서들을 벡터화하기 위해 CountVectorizer 사용
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words=["돋움", "경우", "또는"])

In [24]:
print(cv)

CountVectorizer(stop_words=['돋움', '경우', '또는'])


In [25]:
# fit_transform을 사용하여 문장에서 노출되는 feature(특징이 될만한 단어) 수를 합한 변수 Document Term Matrix(이하 dtm)를 생성
dtm_cv = cv.fit_transform(df["문서"])

In [26]:
# cv.vocabulary_ 를 봅니다.
cv.vocabulary_

{'아빠': 22915,
 '육아휴직': 27099,
 '장려금': 30511,
 '업무개요': 23893,
 '남성근로자의': 7496,
 '육아휴직을': 27103,
 '장려하고': 30514,
 '양육에': 23556,
 '따른': 10819,
 '경제적': 2832,
 '부담을': 15552,
 '완화함으로써': 25479,
 '일과': 28653,
 '가정생활의': 556,
 '양립': 23506,
 '가족친화적인': 637,
 '사회환경': 17471,
 '조성': 33216,
 '지원대상': 35372,
 '신청일': 22238,
 '기준': 6955,
 '이상': 27802,
 '계속하여': 2936,
 '서초구에': 18853,
 '주민등록': 33859,
 '되어': 10357,
 '있는': 29585,
 '육아휴직자': 27104,
 '대상자녀': 9043,
 '신청기간': 22159,
 '시작일': 21520,
 '이후': 28222,
 '개월부터': 1390,
 '종료일': 33530,
 '개월': 1379,
 '이내': 27597,
 '신청방법': 22182,
 '온라인': 25324,
 '서초구청': 18856,
 '홈페이지': 42107,
 '경로': 2664,
 '분야별정보': 16063,
 '복지': 15305,
 '영유아복지': 24921,
 '아빠육아휴직장려금': 22919,
 '신청': 22144,
 '바로가기': 13222,
 '방문': 13765,
 '동주민센터': 10229,
 '여성보육과': 24253,
 '구비서류': 5289,
 '고용센터': 3255,
 '발행': 13718,
 '육아휴직급여': 27100,
 '지급결정': 34904,
 '통지서': 38898,
 '주민등록등본': 33860,
 '부세대원': 15723,
 '이름과': 27731,
 '전입일자': 31616,
 '포함': 39784,
 '모든': 11917,
 '구성원': 5326,
 '주민번호': 33896,
 '뒷자리': 10481,
 '미

In [27]:
cv_cols = cv.get_feature_names_out()

In [28]:
# 각 row에서 전체 단어가방에 있는 어휘에서 등장하는 단어에 대한 one-hot-vector를 확인
# toarray()로 희소 행렬(sparse matrix, 행렬의 값이 대부분 '0'인 행렬)을 NumPy array 배열로 변환하여 값을 확인

pd.DataFrame(dtm_cv.toarray(), columns=cv_cols).sum().sort_values()

03월           1
우리들           1
우리동네육아지원      1
우리동네보육반장      1
우리동           1
           ... 
대한          296
어떻게         442
서울시         448
있습니다        496
있는          536
Length: 43075, dtype: int64

In [29]:
# 정답인 '분류'의 유일한 값을 확인하여 주제 수를 확인
df["분류"].value_counts()

분류
행정    1098
경제     823
복지     217
Name: count, dtype: int64

In [30]:
#from keras.preprocessing.text import Tokenizer
# 주어진 문서에 대하여 각 문서에 어떤 주제들이 존재하는지를 확인하는 잠재 디리클레 분석(LDA)을 불러옴
# n_components에 넣을 하이퍼파라미터 NUM_TOPICS로 주제수를 설정(기본값=10)
# max_iter는 훈련 데이터(epoch라고도 함)에 대한 최대 패스 수(기본값=10)

from sklearn.decomposition import LatentDirichletAllocation

NUM_TOPICS = 10
LDA_model = LatentDirichletAllocation(n_components=NUM_TOPICS, random_state=42)

In [31]:
# LDA_model 에 dtm_cv 를 넣어 학습
LDA_model.fit(dtm_cv)

In [32]:
# TF-IDF 방식으로 단어의 가중치를 조정한 BOW 인코딩하여 벡터화하기 위해 TfidfVectorizer를 사용

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words=["돋움", "경우", "또는", "있습니다", "있는", "합니다"])
tfidf

In [33]:
# 문장에서 노출되는 feature(특징이 될만한 단어) 수를 합한 변수 Document Term Matrix(이하 dtm)를 생성
dtm_tfidf = tfidf.fit_transform(df["문서"])

In [36]:
# tfidf.vocabulary_
cols_tfidf = tfidf.get_feature_names_out()

In [37]:
# dtm_tf를 axis=0(수직 방향으로) 기준으로 합계를 낸 dist 변수를 생성
# dist 변수를 vocabulary_ 순으로 정렬하여 비율을 확인
dist = np.sum(dtm_tfidf, axis=0)
pd.DataFrame(dist, columns=cols_tfidf).T.sort_values(by=0).tail(10)

Unnamed: 0,0
하는,12.752024
무엇인가요,13.002227
의한,13.374883
무엇입니까,13.454213
따라,13.69311
관한,14.06136
대한,14.939892
있나요,16.167323
서울시,18.142777
어떻게,30.962001


In [38]:
# 각 row에서 전체 단어가방에 있는 어휘에서 등장하는 단어에 대한 가중치를 적용한 vector를 확인
# toarray()로 희소 행렬(sparse matrix, 행렬의 값이 대부분 '0'인 행렬)을 NumPy array 배열로 변환하여 값을 확인
pd.DataFrame(dtm_tfidf.toarray(), columns=cols_tfidf)

Unnamed: 0,03월,08년,10,100명이상인,100세가,10만원,10만원상당,10인의,10일이내에,120,...,힐링체험농장서울시,힐링프로그램을,힐링하는,힐스테이트,힘들,힘들경우,힘쓰고있습니다,힘쓴다,힘을,힘이
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_simple_pair = cosine_similarity(dtm_tfidf[0] , dtm_tfidf)
result_list = similarity_simple_pair.tolist()[0]

In [40]:
df["유사도"] = result_list
df[["분류", "제목", "유사도"]].sort_values(by="유사도", ascending=False).head(10)

Unnamed: 0,분류,제목,유사도
0,복지,아빠 육아휴직 장려금,1.0
1772,경제,도시계획시설부지 재결신청 이후 진행단계는 어떤 과정을 거칩니까?,0.060642
850,경제,주민대표회의 구성원 몇명입니까?,0.058009
539,행정,행려자도 아니고 시설수용자도 아닌 사람이 살고 있던 비닐하우스에서 화상을 입었습니다...,0.051756
3,복지,"광진맘택시 운영(임산부,영아 양육가정 전용 택시)",0.04933
155,경제,[농업기술센터] 후계농업경영인 선정 및 청년창업형 후계농업경영인 신청 안내,0.048589
35,행정,[시ㆍ구정외 타기관 관련 상담] 고용노동부 [일자리 안정자금],0.048087
141,경제,[농업기술센터] 도시농업전문가양성교육 신청,0.046866
233,경제,[농업기술센터] 귀농창업 평일반 교육 신청,0.04403
174,행정,찾아가는 아버지교실,0.042454


: 

In [None]:
import tensorflow as tf
print(tf.__version__)
