In [1]:
import pickle

import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec

In [2]:
# Load the preprocessed articles, the word-embedding model and both classifiers.
def _unpickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


df = pd.read_json("preprocessed_data.json")
wv_model = Word2Vec.load("word2vec.model")
xgbc = _unpickle("xgboost.pickle")
rfc = _unpickle("rf.pickle")

In [3]:
# Draw a single article at random from those whose category is still missing.
is_unlabelled = df["self_defined_category"].isna()
df_sample = df[is_unlabelled].sample(1)
df_sample

Unnamed: 0,url,title,locations,entity_address,keyword_top10,sentences,self_defined_category
489,http://su327396.pixnet.net/blog/post/119331709,蓮臺山妙音淨苑 八重櫻盛開 清靜美地好視野｜苗栗獅潭賞櫻秘境景點,"[苗栗, 獅潭]","[{'location': {'lat': 24.4656981, 'lon': 120.9...","[{'tfidf': 0.28859, 'word': '妙音'}, {'tfidf': 0...","[妙音, 淨苑, 蓮臺山, 八重, 盛開, 獅潭, 櫻花, 賞櫻, 停車場, 秘境]",


In [9]:
# Gather, for every top-10 keyword of the sampled article, its embedding,
# its TF-IDF weight and the keyword itself. The three lists are appended to
# together, so position i in each list refers to the same keyword.
word_vectors = []
tf_idfs = []
words = []
for row in df_sample.itertuples(index=False):
    for keyword in row.keyword_top10:
        word = keyword["word"]
        tf_idf = keyword["tfidf"]
        try:
            # Look the vector up through `.wv`: indexing the Word2Vec model
            # object directly is deprecated in gensim 3.x and removed in 4.0.
            vector = wv_model.wv[word]
        except KeyError:
            # The keyword is not in the embedding vocabulary; skip it entirely
            # so the three lists stay aligned.
            continue
        word_vectors.append(vector)
        tf_idfs.append(tf_idf)
        words.append(word)

  if __name__ == '__main__':


In [10]:
# One frame per feature group: the stacked embeddings (columns wv_d1..wv_d250),
# the TF-IDF weights, and the keywords themselves.
df_word_vectors = pd.DataFrame(
    np.vstack(word_vectors),
    columns=[f"wv_d{i}" for i in range(1, 251)],
)
df_tf_idf = pd.DataFrame({"tf_idf": tf_idfs})
df_word = pd.DataFrame({"word": words})

In [11]:
# Place the three feature groups side by side: embeddings, TF-IDF, keyword.
feature_frames = [df_word_vectors, df_tf_idf, df_word]
data = pd.concat(feature_frames, axis="columns")
data

Unnamed: 0,wv_d1,wv_d2,wv_d3,wv_d4,wv_d5,wv_d6,wv_d7,wv_d8,wv_d9,wv_d10,...,wv_d243,wv_d244,wv_d245,wv_d246,wv_d247,wv_d248,wv_d249,wv_d250,tf_idf,word
0,0.000531,0.000977,0.000266,0.000676,-0.002128,0.000746,-0.002183,0.000572,-0.00024,-4.5e-05,...,-0.001397,0.000634,-0.000653,-0.001703,-0.000185,-0.001823,0.000244,0.000624,0.28859,妙音
1,-0.001024,0.000597,0.0011,-0.000267,0.000326,-0.00152,-0.000745,-0.001384,-0.000473,0.002039,...,0.001166,0.002124,0.001815,-0.002128,0.002427,-0.000469,0.001873,0.001227,0.27569,淨苑
2,0.001351,-0.000104,0.000851,-1.8e-05,0.001618,-0.000593,0.00105,-0.000206,0.00193,0.000595,...,-3e-05,0.001259,-0.001435,0.000802,-0.000582,0.000934,-0.000767,-0.001717,0.25062,蓮臺山
3,0.000736,-0.000106,0.000291,0.000747,0.000446,-0.000361,-0.001416,0.001162,0.001812,-0.000985,...,0.001205,0.001943,-0.000791,-0.002612,-0.000302,0.000372,0.000886,0.002133,0.23191,八重
4,0.000912,0.002333,0.001958,0.001029,-0.002176,-0.000184,-0.002346,0.000203,0.002546,0.001038,...,-0.001508,-0.001732,-0.002615,0.000125,0.002299,-0.002408,-0.002377,0.002379,0.2005,盛開
5,-0.001619,-0.000877,-0.001379,0.000447,0.001007,-0.00163,-0.000161,-0.0006,0.001376,0.002051,...,-0.001292,0.000517,-0.000942,0.00118,-0.000591,-0.001152,-0.001264,0.001346,0.15037,獅潭
6,0.001559,-0.000217,-0.000488,0.002507,-0.000818,-0.000107,-0.001687,-0.002298,-0.000122,0.001895,...,2.8e-05,0.000754,-0.000647,-0.000638,0.004923,-0.00074,-0.001739,0.000155,0.15037,櫻花
7,0.001941,5.5e-05,0.000779,0.001933,0.000533,0.001273,-0.001811,0.000358,0.001972,0.00205,...,-0.000666,0.0008,-0.000842,-0.002486,0.001683,-0.000385,0.001549,-0.00033,0.12531,賞櫻
8,0.000447,0.001091,0.001346,0.000679,0.001038,-0.001254,-0.003495,-0.001214,0.003151,0.001362,...,-0.001156,0.000693,-0.0015,-0.003398,0.001748,-0.001115,-0.000288,0.000714,0.12531,停車場
9,0.0011,0.002365,0.000763,0.002027,-0.003194,0.001906,-0.002002,0.000558,0.000105,-0.00074,...,-0.002576,-0.000508,-0.000953,-0.002175,0.004069,-6.2e-05,2.3e-05,-0.000498,0.10025,秘境


In [18]:
def predict_with_xgboost_and_rf(data):
    """Predict one category for an article from its keyword features.

    Every row of `data` describes one keyword. Both module-level classifiers
    (`xgbc` and `rfc`) produce class probabilities for each row; the
    per-keyword probabilities of both models are printed, then all rows from
    both models are averaged with equal weight and the label with the highest
    mean probability is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame with a "word" column (used only for printing) plus the
        feature columns; every column except "word" is passed to both models.

    Returns
    -------
    str
        The label with the highest mean probability.
    """
    # Assumes both models emit their probability columns in this sorted label
    # order -- TODO confirm against the classes used at training time.
    labels = sorted(["人文藝術", "其它", "娛樂購物", "自然景觀"])

    # Select the feature columns once and reuse them for both models.
    features = data[[c for c in data.columns if c != "word"]]
    probs_xgbc = xgbc.predict_proba(features)
    probs_rfc = rfc.predict_proba(features)

    combined = []
    reports = (("With XGBoost:", probs_xgbc), ("With random forest:", probs_rfc))
    for position, (heading, probs) in enumerate(reports):
        if position:
            print("")  # blank line between the two model reports
        print(heading)
        per_word = [dict(zip(labels, prob)) for prob in probs]
        for word, word_probs in zip(data["word"], per_word):
            print(word, word_probs)
        # Accumulate into a separate list so no per-model list is mutated.
        combined.extend(per_word)

    # Compute the averaged probabilities once; print and reuse the same object.
    mean_probs = pd.DataFrame(combined).mean()
    print("")
    print(mean_probs)
    return mean_probs.idxmax()

In [19]:
# Classify the sampled article from its keyword features; the function prints
# each model's per-keyword probabilities and the cell's value is the returned label.
predict_with_xgboost_and_rf(data)

With XGBoost:
妙音 {'人文藝術': 0.24794231, '其它': 0.1791944, '娛樂購物': 0.25826028, '自然景觀': 0.314603}
淨苑 {'人文藝術': 0.2557175, '其它': 0.18661447, '娛樂購物': 0.14831433, '自然景觀': 0.40935367}
蓮臺山 {'人文藝術': 0.37834173, '其它': 0.22717316, '娛樂購物': 0.19240086, '自然景觀': 0.2020842}
八重 {'人文藝術': 0.09550486, '其它': 0.077333085, '娛樂購物': 0.018026339, '自然景觀': 0.80913574}
盛開 {'人文藝術': 0.09502242, '其它': 0.076111585, '娛樂購物': 0.0714363, '自然景觀': 0.75742966}
獅潭 {'人文藝術': 0.117010675, '其它': 0.34121707, '娛樂購物': 0.28463587, '自然景觀': 0.25713637}
櫻花 {'人文藝術': 0.027699728, '其它': 0.032599263, '娛樂購物': 0.032264937, '自然景觀': 0.90743613}
賞櫻 {'人文藝術': 0.06345774, '其它': 0.05997097, '娛樂購物': 0.03821673, '自然景觀': 0.8383545}
停車場 {'人文藝術': 0.042364974, '其它': 0.18805662, '娛樂購物': 0.35920963, '自然景觀': 0.41036877}
秘境 {'人文藝術': 0.022537077, '其它': 0.5965953, '娛樂購物': 0.029744338, '自然景觀': 0.35112327}

With random forest:
妙音 {'人文藝術': 0.24, '其它': 0.32, '娛樂購物': 0.16, '自然景觀': 0.28}
淨苑 {'人文藝術': 0.25, '其它': 0.34, '娛樂購物': 0.1, '自然景觀': 0.31}
蓮臺山 {'人文藝術': 0.29, '其它': 0

'自然景觀'