In [1]:
import ast
import json
import re

import pandas as pd
from gensim.models.word2vec import Word2Vec

In [2]:
# Source table for the whole notebook; the path is relative to the working directory.
ARTICLES_CSV = "classification_articles.csv"
df = pd.read_csv(ARTICLES_CSV)

# Data preprocessing

In [3]:
# https://stackoverflow.com/questions/17796446/convert-a-list-to-a-string-and-back
# These columns arrive from the CSV as the text form of Python literals
# ("sentences" and "locations" are iterated as lists of strings further down).
# ast.literal_eval only accepts literals, so -- unlike eval() -- a crafted cell
# in the CSV cannot run arbitrary code in the kernel.
for list_column in ("sentences", "keyword_top10", "locations", "entity_address"):
    df[list_column] = df[list_column].apply(ast.literal_eval)

In [4]:
# Keep only the rows that already carry a hand-assigned category.
has_category = df["self_defined_category"].notna()
df_classified = df.loc[has_category]

In [5]:
# Row count of the full table.
total_rows, _ = df.shape
print(f"# of total rows: {total_rows}")

# of total rows: 872


In [6]:
# Row count of the labelled subset.
classified_rows, _ = df_classified.shape
print(f"# of rows with pre-defined category: {classified_rows}")

# of rows with pre-defined category: 200


# Explore categories and keywords

In [7]:
# Class balance of the labelled subset.
category_counts = df_classified["self_defined_category"].value_counts()
category_counts

其它      79
自然景觀    51
人文藝術    38
娛樂購物    32
Name: self_defined_category, dtype: int64


In [8]:
# Flatten the per-row token lists of the "其它" category into one flat list.
is_other = df_classified["self_defined_category"] == "其它"
other_sentences = [
    token
    for tokens in df_classified.loc[is_other, "sentences"]
    for token in tokens
]

In [9]:
# Ten most frequent tokens of the "其它" category.
series_other = pd.Series(other_sentences)
other_counts = series_other.value_counts()
other_counts.iloc[:10]

飯店    14
房間    12
空間    11
推薦    10
景點    10
住宿     9
美食     8
台中     8
設施     7
早餐     7
dtype: int64

In [10]:
# series_other.value_counts()[series_other.value_counts() > 1]

In [11]:
# Flatten the per-row token lists of the "自然景觀" category into one flat list.
is_nature = df_classified["self_defined_category"] == "自然景觀"
nature_sentences = [
    token
    for tokens in df_classified.loc[is_nature, "sentences"]
    for token in tokens
]

In [12]:
# Ten most frequent tokens of the "自然景觀" category.
series_nature = pd.Series(nature_sentences)
nature_counts = series_nature.value_counts()
nature_counts.iloc[:10]

景點        16
櫻花        10
2019       9
js         9
露營         8
步道         8
push       7
window     7
花況         6
賞花         5
dtype: int64

In [13]:
# series_nature.value_counts()[series_nature.value_counts() > 1]

In [14]:
# Flatten the per-row token lists of the "人文藝術" category into one flat list.
is_art = df_classified["self_defined_category"] == "人文藝術"
art_sentences = [
    token
    for tokens in df_classified.loc[is_art, "sentences"]
    for token in tokens
]

In [15]:
# Ten most frequent tokens of the "人文藝術" category.
series_art = pd.Series(art_sentences)
art_counts = series_art.value_counts()
art_counts.iloc[:10]

景點     11
園區      5
宜蘭      5
台灣      5
建築      4
ig      4
空間      4
體驗      4
打卡      4
小朋友     4
dtype: int64

In [16]:
# series_art.value_counts()[series_art.value_counts() > 1]

In [17]:
# Flatten the per-row token lists of the "娛樂購物" category into one flat list.
is_shop = df_classified["self_defined_category"] == "娛樂購物"
shop_sentences = [
    token
    for tokens in df_classified.loc[is_shop, "sentences"]
    for token in tokens
]

In [18]:
# Ten most frequent tokens of the "娛樂購物" category.
series_shop = pd.Series(shop_sentences)
shop_counts = series_shop.value_counts()
shop_counts.iloc[:10]

小朋友    6
溜滑梯    5
公園     4
js     4
diy    3
台中     3
桃園     3
台南     3
宜蘭     3
特賣會    3
dtype: int64

In [19]:
# series_shop.value_counts()[series_shop.value_counts() > 1]

In [20]:
# Pool the ten most frequent tokens of every category. A token that shows up
# more than once in the pool is common across categories
# (to be excluded in following predictions).
top10_per_category = [
    tokens.value_counts().head(10).index
    for tokens in (series_other, series_art, series_nature, series_shop)
]
# Index.append accepts a list of indexes and concatenates them in order.
series_common = top10_per_category[0].append(top10_per_category[1:])
series_common.value_counts()

景點        3
空間        2
小朋友       2
宜蘭        2
台中        2
js        2
早餐        1
房間        1
台南        1
美食        1
體驗        1
台灣        1
push      1
賞花        1
溜滑梯       1
飯店        1
建築        1
露營        1
公園        1
特賣會       1
ig        1
步道        1
2019      1
打卡        1
diy       1
住宿        1
設施        1
櫻花        1
window    1
花況        1
推薦        1
桃園        1
園區        1
dtype: int64

In [21]:
# Tokens that sit in the top 10 of more than one category.
common_counts = series_common.value_counts()
list_excluding_keywords = common_counts[common_counts > 1].index.tolist()
list_excluding_keywords

['景點', '空間', '小朋友', '宜蘭', '台中', 'js']

In [22]:
# Keywords can't be locations either: collect every distinct location listed
# for the labelled rows (the order of the resulting list is arbitrary).
list_excluding_locations = list(
    {location for locations in df_classified["locations"] for location in locations}
)
list_excluding_locations

['情人橋',
 '達邦',
 '嘉義',
 '芭蕾城市渡假旅店',
 '亞太飯店',
 '大安區',
 '森林步道',
 '東海岸',
 '秘密花園',
 '水舞行館',
 '復興區',
 '員林',
 '三峽',
 '大雅',
 '谷關',
 '日本',
 '南方莊園渡假飯店',
 '墾丁',
 '歐洲',
 '竹北',
 '七股',
 '清水',
 '新北市',
 '糖廠',
 '冬山鄉',
 '新社',
 '太原',
 '拉拉山恩愛農場',
 '大溪',
 '美術館',
 '頭城',
 '總爺藝文中心',
 '台灣滷味博物館',
 '太平洋',
 '龍潭',
 '捷運忠孝復興站',
 '禾風新棧度假飯店',
 '巴黎',
 '海南',
 '樹谷農場',
 '曼谷',
 '客家文物館',
 '太平山森林遊樂區',
 '室內',
 '遠雄海洋公園',
 '台東',
 '國內',
 '西屯',
 '紅樓',
 '新竹縣',
 '帳篷內',
 '菱潭街興創基地',
 '赤崁璽樓民宿',
 '花蓮港',
 '南投',
 '探索私旅',
 '八卦山',
 '月眉糖廠',
 '童話世界',
 '自由天地民宿',
 '印度',
 '新竹',
 '青森',
 '瑞春醬油',
 '台中市',
 '休閒農業區',
 '桃園',
 '西螺',
 '台北',
 '高雄',
 '銅鑼',
 '中壢',
 '三重',
 '綠園道',
 '台中',
 '世界',
 '峇里島',
 '后里',
 '六龜區',
 '湖口',
 '信義區',
 '草屯',
 '功維敘隧道',
 '保安車站',
 '板橋',
 '台灣',
 '苗栗',
 '亞洲',
 '雲林',
 '屏東',
 '紅樹林站',
 '展覽館',
 '箱根',
 '澎湖跨海大橋',
 '鹿場',
 '青埔',
 '陽明山天籟渡假酒店',
 '南印度',
 '眺望',
 '文化村',
 '北投',
 '武陵農場',
 '后里區',
 '澎湖',
 '士林',
 '花蓮',
 '東港',
 '台南',
 '天元宮',
 '南門',
 '綠川',
 '松山區',
 '大湖',
 '西區',
 '彩虹',
 '宜蘭伯朗大道',
 '運動公園',
 '冬山',
 '白河',
 '二崁聚落',
 '新竹火車站',
 '華山',
 '青草湖',


In [23]:
# Tokens to drop before training: keywords shared by several categories plus
# known location names. A set gives constant-time membership checks instead of
# scanning both lists once per token.
excluded_tokens = set(list_excluding_keywords) | set(list_excluding_locations)
# Tokens consisting of digits only are dropped as well (pattern unchanged).
digits_only = re.compile(r"\d+$")

df["processed_sentences"] = df["sentences"].apply(
    lambda tokens: [
        token for token in tokens
        if token not in excluded_tokens and not digits_only.match(token)
    ]
)

# Word2Vec

In [24]:
def most_similar(w2v_model, words, topn=10):
    """Tabulate the nearest neighbours of several query words side by side.

    Parameters
    ----------
    w2v_model
        Object exposing ``wv.most_similar(word, topn=...)`` that returns
        ``(neighbour, score)`` pairs, e.g. a trained gensim ``Word2Vec``.
    words : iterable of str
        Query words. A word whose lookup raises ``KeyError`` is reported on
        stdout and skipped.
    topn : int, default 10
        Number of neighbours requested per query word.

    Returns
    -------
    pandas.DataFrame
        Two columns per word that was found, in input order: one named after
        the query word (its neighbours) followed by one named ``'cos'`` (the
        matching scores). An empty DataFrame when no word was found.
    """
    frames = []
    for word in words:
        # Only the vocabulary lookup is guarded, so a KeyError raised anywhere
        # else is not mistaken for a missing word.
        try:
            neighbours = w2v_model.wv.most_similar(word, topn=topn)
        except KeyError:
            print(word, "not found in Word2Vec model!")
            continue
        frames.append(pd.DataFrame(neighbours, columns=[word, 'cos']))

    # pd.concat rejects an empty list, so keep the original empty-frame result.
    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying a growing frame on every iteration.
    return pd.concat(frames, axis=1)

In [25]:
# Train embeddings on the filtered token lists.
# NOTE(review): `size` and `iter` look like the gensim 3.x keyword names; later
# gensim releases reportedly use `vector_size` and `epochs` -- confirm against
# the installed version before upgrading.
model = Word2Vec(
    df["processed_sentences"],
    size=250,
    iter=10,
    min_count=2,
    sg=1,
    window=1,
)

In [26]:
# Spot-check the embeddings with a handful of probe words.
probe_words = ["櫻花", "溜滑梯", "門票", "飯店", "民宿", "步道", "芒果"]
most_similar(model, probe_words)

Unnamed: 0,櫻花,cos,溜滑梯,cos.1,門票,cos.2,飯店,cos.3,民宿,cos.4,步道,cos.5,芒果,cos.6
0,步道,0.618693,免費,0.706825,體驗,0.656593,打卡,0.745094,打卡,0.74678,美食,0.710871,飯店,0.489679
1,美食,0.608576,飯店,0.686636,美食,0.655374,民宿,0.743819,飯店,0.743818,園區,0.70003,喜歡,0.479636
2,主題,0.606184,推薦,0.675625,打卡,0.649058,主題,0.712123,喜歡,0.717999,喜歡,0.699814,活動,0.477327
3,飯店,0.575091,體驗,0.674279,免費,0.626249,喜歡,0.711856,適合,0.705853,民宿,0.697728,美食,0.474001
4,設計,0.569717,打卡,0.664555,步道,0.622614,活動,0.708912,步道,0.697728,飯店,0.693598,園區,0.470462
5,喜歡,0.569102,感覺,0.664494,園區,0.622223,公園,0.702633,設施,0.69333,主題,0.688553,打卡,0.454905
6,打卡,0.560988,設施,0.662848,參觀,0.621438,設施,0.698961,設計,0.689648,打卡,0.687769,步道,0.454352
7,可愛,0.558152,活動,0.661363,房間,0.619572,免費,0.697114,免費,0.689298,適合,0.683312,民宿,0.44407
8,id,0.550712,民宿,0.657386,喜歡,0.619081,步道,0.693598,活動,0.68462,入住,0.682987,window,0.44135
9,民宿,0.545034,小孩,0.655929,民宿,0.618751,感覺,0.688922,推薦,0.679703,推薦,0.682492,主題,0.440389


In [27]:
# Swap missing cells for None before converting the whole table to a nested dict.
dict_df_classified = df.where(df.notnull(), None).to_dict()
# ensure_ascii=False writes the CJK text as raw characters, so the file must be
# opened as UTF-8 explicitly; the platform default encoding may be unable to
# encode them or would make the output locale-dependent.
with open("preprocessed_data.json", "w", encoding="utf-8") as f:
    json.dump(dict_df_classified, f, indent=2, ensure_ascii=False)

In [28]:
# Persist the trained model next to the notebook.
WORD2VEC_MODEL_PATH = "word2vec.model"
model.save(WORD2VEC_MODEL_PATH)