In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
from tqdm import tqdm
from collections import Counter
import ast

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sb

from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import scipy.stats as stats

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()

%matplotlib inline

import xgboost
from xgboost import XGBClassifier
from xgboost import plot_tree
from sklearn.model_selection import train_test_split

## 여행지 키워드 전처리

In [2]:
tour = pd.read_csv("./final_data/전체_final_data_edit.csv", encoding = 'utf-8-sig')


def _keywords_to_text(raw):
    """Convert one stringified keyword list into a space-separated keyword string.

    The generic tag '관광지' is removed, list punctuation (brackets, quotes,
    spaces) is stripped, and the remaining comma-separated keywords are joined
    with single spaces.

    Parameters
    ----------
    raw : object
        Cell value of the ``keyword`` column, e.g. "['역사', '관광지', '문화재']".

    Returns
    -------
    str or float
        The cleaned keyword string, or ``np.nan`` when the value is not a
        string or no keyword is left after cleaning.
    """
    if not isinstance(raw, str):
        # Already-missing values stay missing instead of raising on .replace().
        return np.nan
    cleaned = raw.replace("'관광지'", "")
    for junk in ("[", "]", "'", " "):
        cleaned = cleaned.replace(junk, "")
    # Dropping '관광지' leaves empty slots between commas; skipping them avoids
    # stray spaces and makes "nothing left" detectable as an empty token list.
    tokens = [token for token in cleaned.split(",") if token]
    return " ".join(tokens) if tokens else np.nan


# Clean every row in one pass; this does not depend on the index labels,
# so no index reset is needed beforehand.
tour['keyword'] = tour['keyword'].map(_keywords_to_text)

tour.head()

Unnamed: 0,name,region,tel,keyword,photo,address,lat,lon
0,(구)강경노동조합,충남 논산시,041-746-5412,건축여행 근대건축물 등록문화재,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,충청남도 논산시 강경읍 옥녀봉로27번길 30-5,36.160849,127.014602
1,(구)인천일본제58은행지점,인천 중구,032-760-6474,교과서속여행 문화재 역사,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,인천광역시 중구 신포로23번길 77,37.472859,126.621154
2,(구)태백등기소,강원 태백시,166,,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,강원도 태백시 하장성 1길 14,37.102615,129.020972
3,(구)한일은행 강경지점,충남 논산시,041-746-5412,건축여행 근대건축물 역사 역사공부,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,충청남도 논산시 강경읍 계백로167번길 50,36.16213,127.015154
4,(전)광주 성거사지 오층석탑,광주 남구,062-607-2333,탑 보물 역사공부 역사관광지,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,광주광역시 남구 천변좌로338번길 7,35.147999,126.90871


In [3]:
# Inspect dtypes and non-null counts after the keyword cleaning step
# (rows whose keyword string ended up empty were set to NaN there).
tour.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14060 entries, 0 to 14059
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   name     14060 non-null  object 
 1   region   14060 non-null  object 
 2   tel      13904 non-null  object 
 3   keyword  13782 non-null  object 
 4   photo    14060 non-null  object 
 5   address  14060 non-null  object 
 6   lat      14060 non-null  float64
 7   lon      14060 non-null  float64
dtypes: float64(2), object(6)
memory usage: 878.9+ KB


In [4]:
# Fill missing phone numbers with 0 so that a later blanket tour.dropna()
# removes only the rows whose keyword is missing, not rows that merely lack
# a phone number.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the selection tour['tel'] is a chained in-place update, which pandas warns
# about and which does not modify `tour` when Copy-on-Write is enabled.
tour['tel'] = tour['tel'].fillna(value=0)

In [5]:
# Remove every row that still contains a missing value.
# Original intent: places whose keywords are all absent from the vocabulary
# have no usable topic values, so they are dropped.
tour.dropna(axis=0, how='any', inplace=True)

In [6]:
# Re-check the frame after dropna(): every column should now report the same non-null count.
tour.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13782 entries, 0 to 14059
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   name     13782 non-null  object 
 1   region   13782 non-null  object 
 2   tel      13782 non-null  object 
 3   keyword  13782 non-null  object 
 4   photo    13782 non-null  object 
 5   address  13782 non-null  object 
 6   lat      13782 non-null  float64
 7   lon      13782 non-null  float64
dtypes: float64(2), object(6)
memory usage: 969.0+ KB


In [7]:
# Renumber the rows 0..n-1 after the drop. `tour` is later combined column-wise
# with the topic table via pd.concat(axis=1), which pairs rows by index label,
# so the labels have to be contiguous again.
tour.reset_index(drop=True, inplace=True)

## gensim

In [None]:
from gensim import corpora
import gensim
import pyLDAvis.gensim

# One token list per place: the cleaned keyword string is split on whitespace.
# Iterating the column keeps row order, so documents[i] belongs to row i of `tour`.
documents = [keyword_text.split() for keyword_text in tour['keyword']]

# Map every distinct keyword to an integer id and encode each document as a
# bag-of-words list of (token_id, count) pairs.
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(text) for text in documents]
print(dictionary)
# Print only a small sample; dumping the whole corpus emits one line per
# document and buries the rest of the notebook.
for bow in corpus[:5]:
    print(bow)

In [9]:
# Number of latent topics to fit.
NUM_TOPICS = 7

# Fit the LDA model on the bag-of-words corpus; random_state is fixed so that
# repeated runs use the same seed.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=77,
)

# Show the four highest-weighted keywords of every topic, one topic per line.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.161*"자연" + 0.124*"레포츠" + 0.024*"자연좋은곳" + 0.023*"생태관광지"')
(1, '0.164*"역사" + 0.160*"역사공부" + 0.145*"역사관광지" + 0.135*"역사를품은곳"')
(2, '0.112*"불교" + 0.106*"역사" + 0.105*"사찰" + 0.105*"불교문화"')
(3, '0.095*"가족과함께" + 0.055*"아이와함께" + 0.052*"경치좋은곳" + 0.040*"남녀노소"')
(4, '0.098*"쇼핑" + 0.033*"사진찍기좋은곳" + 0.031*"친구와함께" + 0.025*"사진찍기"')
(5, '0.156*"가족과함께" + 0.074*"나들이" + 0.061*"아이와함께" + 0.046*"쇼핑"')
(6, '0.148*"문화시설" + 0.090*"아이와함께" + 0.067*"가족과함께" + 0.040*"교과서속여행"')


In [10]:
# Render the interactive pyLDAvis view of the fitted topics inside the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the model, the corpus and the dictionary.
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

In [11]:
def make_topictable_per_doc(ldamodel, corpus):
    """Build a table with the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel : object
        Model that is indexed with ``corpus`` to obtain one topic distribution
        per document and that exposes a ``per_word_topics`` attribute (the
        notebook passes a gensim ``LdaModel``).
    corpus : iterable
        Documents to score (the notebook passes bag-of-words lists).

    Returns
    -------
    pandas.DataFrame
        One row per document, in corpus order, with integer column labels:

        * ``0`` - id of the highest-weighted topic (float64)
        * ``1`` - weight of that topic, rounded to 4 decimals (float64)
        * ``2`` - the raw entry yielded by ``ldamodel[corpus]`` for the document

        A document for which the model yields no topics gets NaN in columns 0
        and 1, so that row i always describes document i. An empty corpus
        yields an empty frame that still has the three columns.
    """
    dominant_topics = []
    dominant_props = []
    raw_topic_lists = []

    for topic_list in ldamodel[corpus]:
        # With per_word_topics enabled each entry is a tuple whose first
        # element is the document-level topic distribution.
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list

        if len(doc) > 0:
            # max() returns the first pair with the highest weight, which is the
            # same pair a stable descending sort would place first.
            topic_num, prop_topic = max(doc, key=lambda pair: pair[1])
            dominant_topics.append(int(topic_num))
            dominant_props.append(round(prop_topic, 4))
        else:
            # Keep a placeholder row so the table stays aligned with the corpus.
            dominant_topics.append(np.nan)
            dominant_props.append(np.nan)
        raw_topic_lists.append(topic_list)

    # Build the frame once instead of appending row by row: DataFrame.append
    # was removed in pandas 2.0 and copied the whole table on every call.
    # The topic id is kept as float64 to match the table this function
    # produced before (and to allow the NaN placeholder).
    return pd.DataFrame({
        0: pd.Series(dominant_topics, dtype='float64'),
        1: pd.Series(dominant_props, dtype='float64'),
        2: pd.Series(raw_topic_lists, dtype='object'),
    })

In [12]:
# Dominant topic of every place in the corpus.
topictable = make_topictable_per_doc(ldamodel, corpus)

# Replace the positional column labels with readable names.
topictable.columns = ['topic', 'topic_rate', 'topic_list']
topictable

Unnamed: 0,topic,topic_rate,topic_list
0,4.0,0.5358,"[(0, 0.03572094), (1, 0.28555897), (2, 0.03572..."
1,1.0,0.7855,"[(0, 0.03571455), (1, 0.785476), (2, 0.0357532..."
2,1.0,0.4285,"[(0, 0.028573046), (1, 0.42854363), (2, 0.0286..."
3,1.0,0.6289,"[(0, 0.028584184), (1, 0.62892115), (2, 0.2281..."
4,6.0,0.7679,"[(0, 0.017858913), (1, 0.017858911), (2, 0.017..."
...,...,...,...
13777,6.0,0.8928,"[(0, 0.01785726), (1, 0.01785726), (2, 0.01785..."
13778,5.0,0.3496,"[(0, 0.017866297), (1, 0.017863857), (2, 0.267..."
13779,3.0,0.8283,"[(0, 0.028834809), (1, 0.02857842), (2, 0.0285..."
13780,3.0,0.8282,"[(0, 0.028666003), (1, 0.028578162), (2, 0.028..."


In [13]:
# Number of places assigned to each dominant topic.
topictable['topic'].value_counts()

0.0    3139
1.0    2665
3.0    2360
6.0    2256
5.0    1576
2.0    1125
4.0     661
Name: topic, dtype: int64

In [14]:
# pd.concat(axis=1) pairs rows by index label, not by position. If the two
# frames were indexed differently the result would silently contain NaN rows,
# so the alignment is checked before the place attributes and topic columns
# are combined.
assert tour.index.equals(topictable.index), "tour and topictable must share the same index"
tour_topic = pd.concat([tour, topictable], axis=1)

In [15]:
# Preview the combined frame: place attributes followed by the topic columns.
tour_topic.head()

Unnamed: 0,name,region,tel,keyword,photo,address,lat,lon,topic,topic_rate,topic_list
0,(구)강경노동조합,충남 논산시,041-746-5412,건축여행 근대건축물 등록문화재,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,충청남도 논산시 강경읍 옥녀봉로27번길 30-5,36.160849,127.014602,4.0,0.5358,"[(0, 0.03572094), (1, 0.28555897), (2, 0.03572..."
1,(구)인천일본제58은행지점,인천 중구,032-760-6474,교과서속여행 문화재 역사,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,인천광역시 중구 신포로23번길 77,37.472859,126.621154,1.0,0.7855,"[(0, 0.03571455), (1, 0.785476), (2, 0.0357532..."
2,(구)한일은행 강경지점,충남 논산시,041-746-5412,건축여행 근대건축물 역사 역사공부,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,충청남도 논산시 강경읍 계백로167번길 50,36.16213,127.015154,1.0,0.4285,"[(0, 0.028573046), (1, 0.42854363), (2, 0.0286..."
3,(전)광주 성거사지 오층석탑,광주 남구,062-607-2333,탑 보물 역사공부 역사관광지,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,광주광역시 남구 천변좌로338번길 7,35.147999,126.90871,1.0,0.6289,"[(0, 0.028584184), (1, 0.62892115), (2, 0.2281..."
4,(주) 교보문고,서울 종로구,1544-1900,서점 도서관 도심여행 문화시설 문화행사 사계절 아이와함께,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,서울특별시 종로구 종로 1,37.573054,127.01659,6.0,0.7679,"[(0, 0.017858913), (1, 0.017858911), (2, 0.017..."


In [16]:
# Check the combined frame; missing values in the topic columns would indicate
# that the two concatenated frames were not aligned on the same index.
tour_topic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13782 entries, 0 to 13781
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        13782 non-null  object 
 1   region      13782 non-null  object 
 2   tel         13782 non-null  object 
 3   keyword     13782 non-null  object 
 4   photo       13782 non-null  object 
 5   address     13782 non-null  object 
 6   lat         13782 non-null  float64
 7   lon         13782 non-null  float64
 8   topic       13782 non-null  float64
 9   topic_rate  13782 non-null  float64
 10  topic_list  13782 non-null  object 
dtypes: float64(4), object(7)
memory usage: 1.2+ MB


In [17]:
# Read the query: keywords typed by the user, separated by whitespace.
documents_input = []
keyword_input = input().split()

# Expand the query to dictionary tokens: a token is taken whenever an input
# keyword occurs inside it (substring match), once per matching input keyword.
# Tokens are visited in id order and each is followed by a single space.
# NOTE(review): when nothing matches, the query document below is empty.
keyword_str = "".join(
    dictionary[token_id] + " "
    for token_id in range(len(dictionary))
    for query in keyword_input
    if query in dictionary[token_id]
)

# Score the expanded query as a one-document corpus with the fitted model.
keyword_input = keyword_str.split()
documents_input.append(keyword_input)
corpus_input = [dictionary.doc2bow(text) for text in documents_input]
topictable_input = make_topictable_per_doc(ldamodel, corpus_input)
topictable_input.columns = ['topic', 'topic_rate', 'topic_list']
topictable_input

바다


Unnamed: 0,topic,topic_rate,topic_list
0,3.0,0.5222,"[(0, 0.2587787), (3, 0.5222136), (5, 0.1700086..."


In [18]:
# Dominant topic predicted for the query (the query table has a single row, label 0).
topic_result = topictable_input.loc[0, 'topic']

# Keep only the places whose dominant topic equals the query's topic.
is_same_topic = tour_topic['topic'] == topic_result
tour_topic_keyword = tour_topic[is_same_topic]

In [19]:
# Use the place name as the row label. The check makes the cell safe to
# re-run: once 'name' has become the index it is no longer a column, and a
# second set_index('name') call would fail.
if tour_topic.index.name != 'name':
    tour_topic.set_index('name', inplace = True)

In [20]:
# Preview the frame now that the place name is the index.
tour_topic.head()

Unnamed: 0_level_0,region,tel,keyword,photo,address,lat,lon,topic,topic_rate,topic_list
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
(구)강경노동조합,충남 논산시,041-746-5412,건축여행 근대건축물 등록문화재,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,충청남도 논산시 강경읍 옥녀봉로27번길 30-5,36.160849,127.014602,4.0,0.5358,"[(0, 0.03572094), (1, 0.28555897), (2, 0.03572..."
(구)인천일본제58은행지점,인천 중구,032-760-6474,교과서속여행 문화재 역사,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,인천광역시 중구 신포로23번길 77,37.472859,126.621154,1.0,0.7855,"[(0, 0.03571455), (1, 0.785476), (2, 0.0357532..."
(구)한일은행 강경지점,충남 논산시,041-746-5412,건축여행 근대건축물 역사 역사공부,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,충청남도 논산시 강경읍 계백로167번길 50,36.16213,127.015154,1.0,0.4285,"[(0, 0.028573046), (1, 0.42854363), (2, 0.0286..."
(전)광주 성거사지 오층석탑,광주 남구,062-607-2333,탑 보물 역사공부 역사관광지,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,광주광역시 남구 천변좌로338번길 7,35.147999,126.90871,1.0,0.6289,"[(0, 0.028584184), (1, 0.62892115), (2, 0.2281..."
(주) 교보문고,서울 종로구,1544-1900,서점 도서관 도심여행 문화시설 문화행사 사계절 아이와함께,https://cdn.visitkorea.or.kr/img/call?cmd=VIEW...,서울특별시 종로구 종로 1,37.573054,127.01659,6.0,0.7679,"[(0, 0.017858913), (1, 0.017858911), (2, 0.017..."


In [21]:
# Final check of columns and non-null counts before the frame is written to disk.
tour_topic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13782 entries, (구)강경노동조합 to 힐튼양복점
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   region      13782 non-null  object 
 1   tel         13782 non-null  object 
 2   keyword     13782 non-null  object 
 3   photo       13782 non-null  object 
 4   address     13782 non-null  object 
 5   lat         13782 non-null  float64
 6   lon         13782 non-null  float64
 7   topic       13782 non-null  float64
 8   topic_rate  13782 non-null  float64
 9   topic_list  13782 non-null  object 
dtypes: float64(4), object(6)
memory usage: 1.2+ MB


In [22]:
# Write the places with their topic assignment to CSV, overwriting any
# existing file; the index (place name) is written as the first column.
tour_topic.to_csv(
    "./final_data/LDA_gensim_topic.csv",
    mode='w',
    encoding='utf-8-sig',
)