In [1]:
import pickle

import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec
from xgboost import XGBClassifier
from sklearn.utils import class_weight
from sklearn.ensemble import RandomForestClassifier

In [2]:
wv_model = Word2Vec.load("word2vec.model")

In [3]:
df = pd.read_json("preprocessed_data.json")
df_classified = df.loc[~df["self_defined_category"].isna()]

# Training

## Data spliting

In [4]:
df_classified["is_train"] = False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [5]:
df_classified.loc[df_classified.sample(frac=0.75, random_state=1314).index, "is_train"] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [6]:
word_vectors = []
self_defined_categories = []
tf_idfs = []
urls = []
is_trains = []
for row in df_classified.itertuples(index=False):
    for keyword in row.keyword_top10:
        word = keyword["word"]
        tf_idf = keyword["tfidf"]
        try:
            word_vectors.append(wv_model[word])
        except KeyError:
            continue
        tf_idfs.append(tf_idf)
        self_defined_categories.append(row.self_defined_category)
        urls.append(row.url)
        is_trains.append(row.is_train)

  # This is added back by InteractiveShellApp.init_path()


In [7]:
df_word_vectors = pd.DataFrame(np.vstack(word_vectors))
df_word_vectors.columns = [f"wv_d{i}" for i in range(1, 251)]
df_category = pd.DataFrame(self_defined_categories).rename(columns={0: "self_defined_category"})
df_tf_idf = pd.DataFrame(tf_idfs).rename(columns={0: "tf_idf"})
df_url = pd.DataFrame(urls).rename(columns={0: "url"})
df_is_train = pd.DataFrame(is_trains).rename(columns={0: "is_train"})

In [8]:
data = pd.concat([df_url, df_is_train, df_word_vectors, df_tf_idf, df_category], axis=1)

In [9]:
data.shape

(1062, 254)

In [10]:
data["is_train"].value_counts(normalize=True)

True     0.748588
False    0.251412
Name: is_train, dtype: float64

In [11]:
data.head()

Unnamed: 0,url,is_train,wv_d1,wv_d2,wv_d3,wv_d4,wv_d5,wv_d6,wv_d7,wv_d8,...,wv_d243,wv_d244,wv_d245,wv_d246,wv_d247,wv_d248,wv_d249,wv_d250,tf_idf,self_defined_category
0,http://eeooa0314.pixnet.net/blog/post/467928992,False,-0.00144,-0.000754,-0.001495,0.000338,0.001114,-0.002328,-0.002061,-0.000683,...,-0.000828,-0.001194,0.002148,-0.001943,0.000335,-0.001493,0.000799,-0.002531,0.45687,自然景觀
1,http://eeooa0314.pixnet.net/blog/post/467928992,False,-1.6e-05,-0.001283,-0.001066,0.001528,-0.001518,-0.001233,0.001043,-0.000697,...,-0.00218,-0.000471,0.001972,0.000367,-0.000487,0.000571,0.000225,-0.00154,0.44848,自然景觀
2,http://eeooa0314.pixnet.net/blog/post/467928992,False,-0.001987,0.001773,-0.001049,-0.002416,-0.002354,0.000235,-0.001835,0.001868,...,-0.002286,-0.001587,0.000977,-0.00241,0.001159,-0.002101,0.000674,0.001321,0.23,自然景觀
3,http://eeooa0314.pixnet.net/blog/post/467928992,False,-0.001868,-0.000627,-0.000278,0.001747,0.000706,-0.001108,0.001229,0.001589,...,-0.001928,0.000958,-0.001023,-0.000883,-0.000978,0.001172,0.001607,-0.000649,0.15229,自然景觀
4,http://eeooa0314.pixnet.net/blog/post/467928992,False,-0.000798,0.002967,0.00069,-0.000339,-0.000521,-0.00273,0.001429,-0.001333,...,-0.002905,-0.001011,0.000246,-0.002608,0.001904,-0.0026,0.001735,-0.002354,0.15229,自然景觀


In [12]:
ref_columns = ["url", "is_train"]
X_train = data.loc[data["is_train"], [c for c in data.columns if c != "self_defined_category" and c not in ref_columns]]
y_train = data.loc[data["is_train"], "self_defined_category"]
X_test = data.loc[~data["is_train"], [c for c in data.columns if c != "self_defined_category" and c not in ref_columns]]
y_test = data.loc[~data["is_train"], "self_defined_category"]

## XGBoost

In [13]:
# https://medium.com/jameslearningnote/資料分析-機器學習-第5-2講-kaggle機器學習競賽神器xgboost介紹-1c8f55cffcc
# https://xgboost.readthedocs.io/en/latest/parameter.html
xgbc = XGBClassifier(objective="multi:softprob", num_class=4)

In [14]:
weights = class_weight.compute_class_weight("balanced", np.unique(y_train), y_train)
weights_dict = {c: w for c, w in zip(np.unique(y_train), weights)}
w_array = [weights_dict[c] for c in y_train]
# Evolved from https://datascience.stackexchange.com/a/60134/58799
xgbc.fit(X_train, y_train, sample_weight=w_array)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, num_class=4, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

In [15]:
print(f"xgbc.score for training set: {xgbc.score(X_train, y_train)}")
print(f"xgbc.score for testing set: {xgbc.score(X_test, y_test)}")

xgbc.score for training set: 0.8754716981132076
xgbc.score for testing set: 0.449438202247191


In [16]:
def predict_given_url_xgboost(url):
    sub_data_index = data.loc[data["url"] == url].index
    probs = xgbc.predict_proba(X_test.loc[sub_data_index])
    # https://docs.scipy.org/doc/numpy/reference/generated/numpy.unique.html
    # np.unique does sorting, while pd.Series.unique doesn't
    labels = np.unique(y_test)
    final_list = [{label: p for label, p in zip(labels, prob)} for prob in probs]
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mean.html
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.idxmax.html
    return pd.DataFrame(final_list).mean().idxmax()

In [17]:
data_test = data.loc[~data["is_train"], ["url", "self_defined_category"]].drop_duplicates()
df_test_predicted = pd.DataFrame(
    [
        {"url": url, "predicted_category": predict_given_url_xgboost(url)}
        for url in data_test["url"]
    ]
)
df_test_predicted = df_test_predicted.merge(data_test, on="url")

In [18]:
print("The subset accuracy of XGBoost to classify articles is {}".format((df_test_predicted["predicted_category"] == df_test_predicted["self_defined_category"]).mean()))

The subset accuracy of XGBoost to classify articles is 0.6458333333333334


## Random forest

In [19]:
rfc = RandomForestClassifier(n_estimators=100, class_weight="balanced")

In [20]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [21]:
print(f"rfc.score for training set: {rfc.score(X_train, y_train)}")
print(f"rfc.score for testing set: {rfc.score(X_test, y_test)}")

rfc.score for training set: 1.0
rfc.score for testing set: 0.4756554307116105


In [22]:
def predict_given_url_rf(url):
    sub_data_index = data.loc[data["url"] == url].index
    probs = rfc.predict_proba(X_test.loc[sub_data_index])
    labels = np.unique(y_test)
    final_list = [{label: p for label, p in zip(labels, prob)} for prob in probs]
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mean.html
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.idxmax.html
    return pd.DataFrame(final_list).mean().idxmax()

In [23]:
data_test = data.loc[~data["is_train"], ["url", "self_defined_category"]].drop_duplicates()
df_test_predicted = pd.DataFrame(
    [
        {"url": url, "predicted_category": predict_given_url_rf(url)}
        for url in data_test["url"]
    ]
)
df_test_predicted = df_test_predicted.merge(data_test, on="url")

In [24]:
print("The subset accuracy of RF to classify articles is {}".format((df_test_predicted["predicted_category"] == df_test_predicted["self_defined_category"]).mean()))

The subset accuracy of RF to classify articles is 0.6041666666666666


## Bucket of XGBoost and RF

In [25]:
def predict_given_url_xgboost_and_rf(url):
    sub_data_index = data.loc[data["url"] == url].index
    probs_xgbc = xgbc.predict_proba(X_test.loc[sub_data_index])
    probs_rfc = rfc.predict_proba(X_test.loc[sub_data_index])
    labels = np.unique(y_test)
    final_list = [{label: p for label, p in zip(labels, prob)} for prob in probs_xgbc]
    final_list.extend([{label: p for label, p in zip(labels, prob)} for prob in probs_rfc])
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mean.html
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.idxmax.html
    return pd.DataFrame(final_list).mean().idxmax()

In [27]:
data_test = data.loc[~data["is_train"], ["url", "self_defined_category"]].drop_duplicates()
df_test_predicted = pd.DataFrame(
    [
        {"url": url, "predicted_category": predict_given_url_xgboost_and_rf(url)}
        for url in data_test["url"]
    ]
)
df_test_predicted = df_test_predicted.merge(data_test, on="url")

In [28]:
print("The subset accuracy of stacked models to classify articles is {}".format((df_test_predicted["predicted_category"] == df_test_predicted["self_defined_category"]).mean()))

The subset accuracy of stacked models to classify articles is 0.6458333333333334


# Saving trained models

In [29]:
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier.save_model
# https://scikit-learn.org/stable/modules/model_persistence.html
# https://morvanzhou.github.io/tutorials/machine-learning/sklearn/3-5-save/
with open("xgboost.pickle", "wb") as f:
    pickle.dump(xgbc, f)
with open("rf.pickle", "wb") as f:
    pickle.dump(rfc, f)