In [1]:
import pickle

import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec

In [2]:
# Load the preprocessed article table, the Word2Vec embedding model, and the
# two pickled models (file names suggest XGBoost and random forest) whose
# predict_proba outputs are combined further down.
df = pd.read_json("preprocessed_data.json")
wv_model = Word2Vec.load("word2vec.model")
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
with open("xgboost.pickle", "rb") as f:
    xgbc = pickle.load(f)
with open("rf.pickle", "rb") as f:
    rfc = pickle.load(f)

In [3]:
# Drop rows whose entity_address is exactly {'address': []}, i.e. rows that
# carry no address entries.
# TODO: this filtering should really be dealt with in get_materials instead.
df = df.loc[df["entity_address"] != {'address': []}]

In [4]:
# Collect one entry per (article, keyword) pair: the keyword's embedding, its
# TF-IDF weight, and the URL of the article it came from. The three lists are
# appended to in lockstep so that position i refers to the same pair in each.
word_vectors = []
tf_idfs = []
urls = []
for row in df.itertuples(index=False):
    for keyword in row.keyword_top10:
        word = keyword["word"]
        tf_idf = keyword["tfidf"]
        try:
            # Look the word up through the model's KeyedVectors (`.wv`);
            # indexing the Word2Vec model object directly is deprecated in
            # gensim 3.x and no longer supported in gensim 4.x.
            vector = wv_model.wv[word]
        except KeyError:
            # Word is not in the embedding vocabulary: skip the whole entry
            # so the three lists stay aligned.
            continue
        word_vectors.append(vector)
        tf_idfs.append(tf_idf)
        urls.append(row.url)


In [5]:
# Turn the three parallel lists into DataFrames with named columns.
# The embedding columns are named wv_d1 .. wv_d250, so the stacked vectors
# must have exactly 250 dimensions or the constructor raises.
df_word_vectors = pd.DataFrame(
    np.vstack(word_vectors),
    columns=[f"wv_d{i}" for i in range(1, 251)],
)
df_tf_idf = pd.DataFrame(tf_idfs, columns=["tf_idf"])
df_url = pd.DataFrame(urls, columns=["url"])

In [6]:
# Glue the URL, embedding, and TF-IDF frames side by side. All three carry
# the default positional index, so rows line up by position.
data = pd.concat(
    [df_url, df_word_vectors, df_tf_idf],
    axis="columns",
)

In [7]:
def predict_with_xgboost_and_rf(data):
    """Pick a single category for a group of keyword rows.

    Every column of ``data`` except ``"url"`` is passed as features to the
    module-level ``xgbc`` and ``rfc`` models. The per-row probability vectors
    from both models are pooled, averaged per label, and the label with the
    highest mean probability is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature rows; may contain a ``"url"`` column, which is ignored.

    Returns
    -------
    str
        The label with the largest mean predicted probability.

    Notes
    -----
    Probabilities are paired with the sorted label list by position; this
    assumes both models emit their classes in that same order -- TODO confirm
    against the models' ``classes_``.
    """
    labels = sorted(["人文藝術", "其它", "娛樂購物", "自然景觀"])
    feature_columns = [c for c in data.columns if c != "url"]
    features = data[feature_columns]

    # One {label: probability} record per (model, row); both models
    # contribute the same number of rows, so the mean weights them equally.
    records = []
    for model in (xgbc, rfc):
        for prob in model.predict_proba(features):
            records.append(dict(zip(labels, prob)))

    return pd.DataFrame(records).mean().idxmax()

In [8]:
# Predict one category per article: gather all keyword rows sharing a URL
# and let the two-model ensemble vote on them together.
predicted_categories = [
    {
        "url": url,
        "predicted_category": predict_with_xgboost_and_rf(
            data.loc[data["url"] == url]
        ),
    }
    for url in data["url"].unique()
]

In [9]:
# Attach the predicted category to the article rows. The right join keeps
# every URL that received a prediction; "locations" is renamed for output.
df_predicted = pd.merge(
    df,
    pd.DataFrame(predicted_categories),
    on="url",
    how="right",
).rename(columns={"locations": "segmented_locations"})

In [10]:
# Preview the first rows of the merged frame to compare predicted_category
# against the other columns.
df_predicted.head()

Unnamed: 0,url,title,segmented_locations,entity_address,keyword_top10,sentences,self_defined_category,predicted_category
0,http://eeooa0314.pixnet.net/blog/post/467928992,宜蘭親子玩水景點 ▶ 武荖坑風景區 ▶ 炎炎夏日消暑妙方 超清澈超沁涼溪水 全家出遊、親子...,"[宜蘭, 武荖坑風景區]","[{'location': {'lon': '121.812772', 'lat': '24...","[{'tfidf': 0.49494000000000005, 'word': '武荖坑風景...","[戲水, 玩水, 烤肉, 宜蘭親子, 出遊, 門票, 露營]",自然景觀,自然景觀
1,http://eeooa0314.pixnet.net/blog/post/467899943,2019台北最新親子公園 ▶ 華山大草原遊戲場 ▶ 極限滑索、煙囪遊戲塔、飛天鞦韆、水沙世界...,"[台北, 華山, 世界]","[{'location': {'lon': 121.526707, 'lat': 25.04...","[{'tfidf': 0.37110000000000004, 'word': '華山'},...","[公園, 大草原, 遊戲場, 煙囪, 遊戲, 暑假, 滑索, 鞦韆, 水沙]",人文藝術,人文藝術
2,http://vreranda.pixnet.net/blog/post/227769134,宜蘭景點/親子同遊/煮蛋『鳩之澤』宜蘭旅遊/泡湯/太平山森林遊樂區,"[宜蘭, 太平山森林遊樂區]","[{'location': {'lon': 121.5081831, 'lat': 24.5...","[{'tfidf': 0.45949000000000007, 'word': '煮蛋'},...","[煮蛋, 沙拉, 鳩之澤, 溫泉, 泡湯, 太平山, 牛牛, 宜蘭旅遊]",自然景觀,自然景觀
3,http://tadli.pixnet.net/blog/post/227821865,2019/06/22<親子露營第30露>宜蘭-武林秘密基地露營區：報告班長，小小兵集合囉~,"[宜蘭, 露營區, 報告班長]","[{'location': {'lon': 121.6838752, 'lat': 24.6...","[{'tfidf': 0.18113, 'word': '活動'}, {'tfidf': 0...","[活動, 營區, 小孩, 民宿, 開心, 泡泡, 雨棚, 任務, 秘密基地]",自然景觀,自然景觀
4,http://abby0318.pixnet.net/blog/post/467921939,【桃園景點美食】桃園龍潭一日遊~13個景點美食景觀餐廳懒人包！,"[桃園, 龍潭]","[{'location': {'lon': '121.2432972', 'lat': '2...","[{'word': 'abby', 'tfidf': 0.41703}, {'tfidf':...","[abby, 閱讀, 環境, 點選, 連結, 詳細, 電話, 營業時間]",其它,其它
