In [1]:
import ast
import json
import re

import pandas as pd
from gensim.models.word2vec import Word2Vec

In [2]:
# Load the article dataset; the relative path is resolved against the current working directory.
df = pd.read_csv("classification_articles.csv")

# Data preprocessing

In [3]:
# https://stackoverflow.com/questions/17796446/convert-a-list-to-a-string-and-back
# These columns are stored in the CSV as stringified Python literals, so they are
# parsed back into Python objects here. ast.literal_eval replaces the previous
# eval() call: it accepts literals only, whereas eval() would execute any
# expression that happened to be embedded in the file.
for column in ("sentences", "keyword_top10", "locations", "entity_address"):
    df[column] = df[column].apply(ast.literal_eval)

In [4]:
# Keep only the rows that carry a value in "self_defined_category".
df_classified = df.loc[df["self_defined_category"].notna()]

In [5]:
# Report the size of the full dataset.
print(f"# of total rows: {len(df)}")

# of total rows: 859


In [6]:
# Report how many rows have a category assigned.
print(f"# of rows with pre-defined category: {len(df_classified)}")

# of rows with pre-defined category: 194


# Explore categories and keywords

In [7]:
# Number of classified rows per category, most frequent first.
df_classified["self_defined_category"].value_counts()

其它      78
自然景觀    47
人文藝術    37
娛樂購物    32
Name: self_defined_category, dtype: int64

In [8]:
# Flatten the "sentences" lists of every row labelled "其它" into one flat list.
other_sentences = [
    entry
    for entries in df_classified.loc[df_classified["self_defined_category"] == "其它", "sentences"]
    for entry in entries
]

In [9]:
# Wrap the flattened entries in a Series and show the ten most frequent values.
series_other = pd.Series(other_sentences)
series_other.value_counts().head(10)

飯店    13
房間    12
空間    11
推薦    10
景點    10
住宿     9
台中     8
美食     8
設施     7
早餐     7
dtype: int64

In [10]:
# series_other.value_counts()[series_other.value_counts() > 1]

In [11]:
# Flatten the "sentences" lists of every row labelled "自然景觀" into one flat list.
nature_sentences = [
    entry
    for entries in df_classified.loc[df_classified["self_defined_category"] == "自然景觀", "sentences"]
    for entry in entries
]

In [12]:
# Wrap the flattened entries in a Series and show the ten most frequent values.
series_nature = pd.Series(nature_sentences)
series_nature.value_counts().head(10)

景點        15
櫻花        10
2019       9
露營         8
步道         7
花況         6
js         6
push       5
window     5
賞花         5
dtype: int64

In [13]:
# series_nature.value_counts()[series_nature.value_counts() > 1]

In [14]:
# Flatten the "sentences" lists of every row labelled "人文藝術" into one flat list.
art_sentences = [
    entry
    for entries in df_classified.loc[df_classified["self_defined_category"] == "人文藝術", "sentences"]
    for entry in entries
]

In [15]:
# Wrap the flattened entries in a Series and show the ten most frequent values.
series_art = pd.Series(art_sentences)
series_art.value_counts().head(10)

景點     11
宜蘭      5
台灣      5
園區      5
體驗      4
ig      4
小朋友     4
建築      4
空間      4
打卡      4
dtype: int64

In [16]:
# series_art.value_counts()[series_art.value_counts() > 1]

In [17]:
# Flatten the "sentences" lists of every row labelled "娛樂購物" into one flat list.
shop_sentences = [
    entry
    for entries in df_classified.loc[df_classified["self_defined_category"] == "娛樂購物", "sentences"]
    for entry in entries
]

In [18]:
# Wrap the flattened entries in a Series and show the ten most frequent values.
series_shop = pd.Series(shop_sentences)
series_shop.value_counts().head(10)

小朋友    6
溜滑梯    5
公園     4
js     4
中壢     3
diy    3
台中     3
小米     3
台南     3
打卡     3
dtype: int64

In [19]:
# series_shop.value_counts()[series_shop.value_counts() > 1]

In [20]:
# Find the keywords that appear in the top-10 list of more than one category
# (they are excluded in the following predictions).
# The four top-10 indexes are concatenated, duplicates included, so counting the
# values afterwards tells how many categories share each keyword.
# Note: despite its name, series_common is a pandas Index, not a Series.
series_common = series_other.value_counts().head(10).index.append(
    [
        series_art.value_counts().head(10).index,
        series_nature.value_counts().head(10).index,
        series_shop.value_counts().head(10).index,
    ]
)
series_common.value_counts()

景點        3
台中        2
js        2
空間        2
打卡        2
小朋友       2
公園        1
設施        1
美食        1
步道        1
建築        1
push      1
賞花        1
園區        1
ig        1
推薦        1
diy       1
早餐        1
飯店        1
台南        1
2019      1
露營        1
小米        1
房間        1
中壢        1
宜蘭        1
花況        1
櫻花        1
體驗        1
住宿        1
溜滑梯       1
台灣        1
window    1
dtype: int64

In [21]:
# Keywords that occur in the top-10 list of more than one category.
list_excluding_keywords = (
    series_common.value_counts()
    .loc[lambda counts: counts > 1]
    .index.tolist()
)
list_excluding_keywords

['景點', '台中', 'js', '空間', '打卡', '小朋友']

In [22]:
# Keywords can't be locations either: gather every distinct location that is
# attached to a classified row.
list_excluding_locations = list(
    {
        location
        for locations in df_classified["locations"]
        for location in locations
    }
)
list_excluding_locations

['苗栗市',
 '冬山',
 '台北車站',
 '青埔',
 '彩虹',
 '苗栗',
 '宜蘭伯朗大道',
 '文物館',
 '自由天地民宿',
 '樹谷農場',
 '拉拉山恩愛農場',
 '屏東',
 '青森',
 '大雅',
 '武陵農場',
 '冬山鄉',
 '東大門夜市',
 '泰安',
 '南京三民站',
 '童話世界',
 '中壢',
 '花蓮',
 '來了',
 '東海岸',
 '總爺藝文中心',
 '新竹火車站',
 '綠園道',
 '勤美綠園道',
 '西區',
 '亞洲',
 '青草湖',
 '卓蘭',
 '大甲溪',
 '紅樹林站',
 '礁溪',
 '芭蕾城市渡假旅店',
 '竹北',
 '東勢',
 '松山區',
 '太平洋',
 '環保公園',
 '澎湖跨海大橋',
 '頭城',
 '松山',
 '壽豐',
 '白河',
 '西屯',
 '曼谷',
 '楠西',
 '台中',
 '月亮',
 '八卦山',
 '小北家',
 '黃金瀑布',
 '梅花',
 '南寮',
 '后里',
 '綠舞國際觀光飯店',
 '后里區',
 '亞太飯店',
 '新竹',
 '歐洲',
 '宜蘭',
 '村却國際溫泉酒店',
 '望龍埤',
 '大甲',
 '六龜區',
 '露營區',
 '玉井',
 '運動公園',
 '集集火車站',
 '板橋',
 '雲林',
 '大溪',
 '淡水',
 '東旅',
 '車路墘教會',
 '西螺',
 '逢甲夜市',
 '小琉球',
 '達邦',
 '勤美術館',
 '台中市',
 '臺中市眷村文物館',
 '秘密花園',
 '龍潭',
 '南屯',
 '功維敘隧道',
 '桃園',
 '眺望',
 '易家仙人掌冰',
 '東港',
 '美術館',
 '峇里島',
 '高雄',
 '花蓮理想<ModifierP>大地</ModifierP>渡假飯店',
 '在水一方',
 '華山文創園區',
 '七股',
 '美國',
 '赤崁璽樓民宿',
 '台灣穀堡',
 '復興區',
 '準園休閒農場',
 '三重',
 '谷關',
 '太平山森林遊樂區',
 '台南',
 '報告班長',
 '三峽',
 '東豐自行車綠廊',
 '禾風新棧度假飯店',
 '印度',
 '桃園國際棒球場',
 '南門',
 '馬來西亞',
 

In [23]:
# Build the filtered keyword lists used for training.
# The two exclusion lists are merged into a single set so that each membership
# test is a hash lookup instead of a linear scan over several hundred entries.
excluded_tokens = set(list_excluding_keywords) | set(list_excluding_locations)
# Same pattern as before, compiled once: matches entries made of digits only.
pure_digits_pattern = re.compile(r"\d+$")

df["processed_sentences"] = df["sentences"].apply(
    lambda entries: [
        entry
        for entry in entries
        # Drop cross-category keywords and locations, then pure-digit entries.
        if entry not in excluded_tokens and not pure_digits_pattern.match(entry)
    ]
)

# Word2Vec

In [24]:
def most_similar(w2v_model, words, topn=10):
    """Tabulate the nearest neighbours of several words side by side.

    Parameters
    ----------
    w2v_model
        Object exposing ``wv.most_similar(word, topn=...)`` that returns a
        sequence of ``(neighbour, score)`` pairs.
    words
        Iterable of query words.
    topn : int, default 10
        Number of neighbours requested for each query word.

    Returns
    -------
    pandas.DataFrame
        Two columns per word that was found: one named after the query word
        holding its neighbours, followed by a ``cos`` column holding the
        scores. Words for which the lookup raises ``KeyError`` are reported on
        stdout and skipped. An empty frame is returned when nothing was found.
    """
    per_word_frames = []
    for word in words:
        # Only the model lookup is guarded, so unrelated KeyErrors are not hidden.
        try:
            neighbours = w2v_model.wv.most_similar(word, topn=topn)
        except KeyError:
            print(word, "not found in Word2Vec model!")
            continue
        per_word_frames.append(pd.DataFrame(neighbours, columns=[word, 'cos']))

    # pd.concat rejects an empty list, so keep the original "empty frame" result.
    if not per_word_frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the growing frame on every iteration.
    return pd.concat(per_word_frames, axis=1)

In [25]:
# Train a skip-gram (sg=1) Word2Vec model on the filtered keyword lists.
# The seed is stated explicitly and training is limited to one worker thread so
# that repeated runs are not perturbed by thread-scheduling order. gensim also
# needs a fixed PYTHONHASHSEED for identical results across interpreter launches.
# NOTE(review): `size` and `iter` are the gensim 3.x parameter names; gensim 4
# renamed them to `vector_size` and `epochs`. Confirm the pinned gensim version.
model = Word2Vec(
    df["processed_sentences"],
    size=250,
    iter=10,
    min_count=2,
    sg=1,
    window=1,
    seed=1,
    workers=1,
)

In [26]:
# Spot-check the trained embeddings with a handful of query words.
most_similar(
    model,
    words=["櫻花", "溜滑梯", "門票", "飯店", "民宿", "步道", "芒果"],
    topn=10,
)

Unnamed: 0,櫻花,cos,溜滑梯,cos.1,門票,cos.2,飯店,cos.3,民宿,cos.4,步道,cos.5,芒果,cos.6
0,飯店,0.592804,體驗,0.56803,步道,0.552969,時間,0.646032,步道,0.533831,免費,0.659057,步道,0.415202
1,時間,0.528756,免費,0.566124,時間,0.54641,免費,0.622134,時間,0.533541,推薦,0.652054,飯店,0.396064
2,大廳,0.527727,園區,0.565376,欣賞,0.536126,體驗,0.599348,飯店,0.524914,時間,0.645037,參觀,0.393054
3,diy,0.526735,推薦,0.555808,園區,0.523721,推薦,0.59778,美食,0.523322,園區,0.627523,免費,0.380129
4,園區,0.526129,停車,0.543292,親子,0.515512,園區,0.593591,園區,0.512779,設施,0.626321,新鮮,0.377385
5,步道,0.525356,時間,0.532321,溜滑梯,0.515334,櫻花,0.592804,親子,0.509526,美食,0.620592,體驗,0.370449
6,推薦,0.51496,親子,0.522433,設計,0.514792,設計,0.592302,賞花,0.508686,參觀,0.605918,賞櫻,0.368937
7,喜歡,0.513004,建築,0.516311,體驗,0.513835,入住,0.591087,推薦,0.492652,diy,0.595345,美食,0.366169
8,設施,0.504499,門票,0.515334,參觀,0.506796,步道,0.579177,住宿,0.492622,設計,0.590093,推薦,0.365036
9,參觀,0.502718,喜歡,0.515169,id,0.505257,露營,0.576477,diy,0.485238,親子,0.590083,cafe,0.364047


In [27]:
# Serialise the whole frame (all rows of df, despite the variable name) to JSON.
# The frame is cast to object first so that missing values really become None
# (JSON null). On numeric columns where(..., None) can otherwise leave NaN in
# place, which json.dump writes as the non-standard token NaN.
dict_df_classified = df.astype(object).where(df.notnull(), None).to_dict()
# ensure_ascii=False writes the non-ASCII text verbatim, so the file encoding is
# fixed to UTF-8 instead of depending on the platform default.
with open("preprocessed_data.json", "w", encoding="utf-8") as f:
    json.dump(dict_df_classified, f, indent=2, ensure_ascii=False)

In [28]:
# Persist the trained model under the relative path "word2vec.model".
model.save("word2vec.model")