![](https://cdn-images-1.medium.com/max/1600/1*jX6Gwn1rt4da7e-yUj84IQ.png)

### 請先在terminal執行
`pip install --user catboost --no-cache-dir`

`pip install --user lightgbm`

In [None]:
import sys, os, psutil


def cpuStats():
    """Print the current process id and its memory use in GiB between banner lines.

    Credit: RDizzl3 (https://www.kaggle.com/rdizzl3).
    """
    banner = "########## CPU STATS ############"
    print(banner)
    current_pid = os.getpid()
    print(current_pid)
    # First field of psutil's memory_info() result, converted from bytes to GiB.
    mem_gib = psutil.Process(current_pid).memory_info()[0] / 2. ** 30
    print('memory GB:', mem_gib)
    print(banner)

import gc
gc.enable()
import time
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack, vstack

from sklearn.linear_model import SGDRegressor
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.cross_validation import KFold

from sklearn.metrics import mean_squared_error

from multiprocessing import Process, Pool
import functools

import re
import unidecode
import math

import xgboost as xgb
import lightgbm as lgb
import catboost as ctb

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)



In [None]:
# Directory that holds the tab-separated competition files.
PATH = 'data/'
train = pd.read_table(PATH + 'train.tsv', engine='c')
test = pd.read_table(PATH + 'test.tsv', engine='c')

# Keep only rows with a positive price and renumber them from zero.
train = train.loc[train.price > 0].reset_index(drop=True)
# The regression target is log(1 + price); level_2 inverts it with expm1.
y = np.log1p(train["price"].values)
test_id = test["test_id"]

## Helper Function，請執行後忽略。

In [None]:
# Forwarded to HashingVectorizer(binary=...) through get_hashing_features.
Hash_binary = True

def handle_missing_inplace(dataset):
    """Replace missing values in the text columns of ``dataset`` with placeholders.

    ``category_name`` and ``brand_name`` are filled with ``'missing'`` and
    ``item_description`` with ``'No description yet'``.  The frame is
    modified in place and nothing is returned.

    Args:
        dataset: DataFrame containing the three columns above.
    """
    placeholders = {
        'category_name': 'missing',
        'brand_name': 'missing',
        'item_description': 'No description yet',
    }
    for column, placeholder in placeholders.items():
        # Assign the filled column back rather than calling
        # ``dataset[column].fillna(..., inplace=True)``: the chained in-place
        # form acts on a temporary under pandas copy-on-write and can leave
        # ``dataset`` unchanged.
        dataset[column] = dataset[column].fillna(placeholder)

    
def preprocess(text):  # simplified version of the earlier assignment
    """Normalise ``text`` to lowercase ASCII words separated by single spaces.

    The value is converted to ``str``, transliterated with ``unidecode``,
    lower-cased, and every run of characters outside ``[A-Za-z0-9]`` is
    replaced by one space; leading and trailing spaces are removed.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string.
    """
    # Regular-expression primer: http://ccckmit.wikidot.com/regularexpression
    non_alphanums = re.compile(u'[^A-Za-z0-9]+')

    # Convert to str BEFORE transliterating.  The original called str() only
    # after unidecode, which is too late to protect non-string input.
    text = unidecode.unidecode(str(text)).lower()

    # After the substitution there are no consecutive spaces, so the former
    # split(" ") / " ".join(...) round trip (wrapped in two identity list
    # comprehensions) returned exactly this string.
    return non_alphanums.sub(' ', text).strip()

    
### The helpers below use multiprocessing (process pools). Feel free to skip them; parallel execution is outside the scope of this course.

def multi_hash(data=None, hash_vec=None, n_jobs=4):
    """Hash ``data`` in parallel chunks and stack the pieces into one CSR matrix.

    ``data`` is split into ``n_jobs`` parts with ``np.array_split``; each part
    is passed to ``hash_vec.fit_transform`` in a worker process and the
    results are stacked vertically in the original order.

    Args:
        data: Array-like accepted by ``np.array_split``.
        hash_vec: Object exposing a picklable ``fit_transform`` callable that
            returns something ``scipy.sparse.vstack`` can stack.
        n_jobs: Number of worker processes and of chunks.

    Returns:
        The stacked result converted to CSR format.
    """
    p = Pool(n_jobs)
    try:
        csr_parts = p.map(hash_vec.fit_transform, np.array_split(data, n_jobs))
    finally:
        # Shut the workers down even when a chunk fails.
        p.close()
        p.join()
    # ``tocsr`` must be CALLED; the original returned the bound method itself.
    return vstack(csr_parts).tocsr()

def multi_apply(df=None, feat_list=None, func=None, axis=0, raw=True, n_jobs=4):
    """Run ``apply_func`` over ``df[feat_list]`` in ``n_jobs`` parallel chunks.

    The selected columns are split with ``np.array_split``, each piece is
    handled by ``apply_func(func=func, axis=axis, raw=raw)`` in a worker
    process, and the pieces are concatenated with a fresh index.

    Returns:
        The ``.values`` of the concatenated result.
    """
    pool = Pool(n_jobs)
    worker = functools.partial(apply_func, func=func, axis=axis, raw=raw)
    pieces = pool.map(worker, np.array_split(df[feat_list], n_jobs))
    combined = pd.concat(pieces, axis=0, ignore_index=True)
    pool.close()
    pool.join()
    return combined.values

def apply_func_series(data=None, func=None):
    """Return ``data.apply(func)``.

    Kept as a module-level function so it can be wrapped in
    ``functools.partial`` and handed to ``Pool.map``.
    """
    transformed = data.apply(func)
    return transformed

def multi_apply_series(df=None, feature=None, func=None, n_jobs=4):
    """Apply ``func`` to the column ``df[feature]`` using ``n_jobs`` processes.

    The column is split into ``n_jobs`` pieces, each piece goes through
    ``apply_func_series`` in a worker process, and the pieces are
    concatenated back in order with a fresh index.

    Returns:
        The ``.values`` of the concatenated result.
    """
    pool = Pool(n_jobs)
    worker = functools.partial(apply_func_series, func=func)
    pieces = pool.map(worker, np.array_split(df[feature], n_jobs))
    combined = pd.concat(pieces, axis=0, ignore_index=True)
    pool.close()
    pool.join()
    return combined.values
    

def apply_func(data=None, func=None, axis=0, raw=True):
    """Return ``data.apply(func, axis=axis, raw=raw)``.

    Kept as a module-level function so it can be wrapped in
    ``functools.partial`` and handed to ``Pool.map``.
    """
    applied = data.apply(func, axis=axis, raw=raw)
    return applied


def preprocess_text_features(df):
    """Clean the ``item_description`` and ``name`` columns of ``df`` in place.

    Each column is passed through ``preprocess`` using four worker processes
    (via ``multi_apply_series``) and the result is written back to the same
    column.  Nothing is returned.
    """
    for column in ("item_description", "name"):
        df[column] = multi_apply_series(df=df[[column]],
                                        feature=column,
                                        func=preprocess,
                                        n_jobs=4)
    
def get_hashing_features(df, Hash_binary, start_time):
    """Hash the ``name`` and ``item_description`` columns into one sparse matrix.

    Both columns go through the same ``HashingVectorizer`` (unigrams and
    bigrams, ``2 ** 20`` output columns, no normalisation, no alternating
    sign) and the two results are added together.

    Args:
        df: Frame with ``name`` and ``item_description`` text columns.
        Hash_binary: Passed to ``HashingVectorizer`` as ``binary``.
        start_time: Reference timestamp used for the elapsed-time message.

    Returns:
        The sum of the two hashed matrices.
    """
    hash_bits = 20
    vectorizer = HashingVectorizer(
        binary=Hash_binary,
        alternate_sign=False,
        norm=None,
        ngram_range=(1, 2),
        n_features=2 ** hash_bits,
    )

    X_hashed = vectorizer.fit_transform(df['name'])
    cpuStats()
    # Description features land in the same columns as the name features.
    X_hashed += vectorizer.fit_transform(df['item_description'])
    cpuStats()

    elapsed = time.time() - start_time
    print('[{}] Finished hashing'.format(elapsed))
    return X_hashed

## 預處理

In [None]:
start_time = time.time()
# Fill NaNs in BOTH frames.  Previously only `train` was filled, so the rows
# coming from `test` reached text preprocessing and hashing with NaN in
# their text columns.
handle_missing_inplace(train)
handle_missing_inplace(test)

nrows = train.shape[0]  # the first `nrows` rows of `merge` are the training rows
merge = pd.concat([train, test])
del train, test
gc.collect()

preprocess_text_features(df=merge)
merge = get_hashing_features(merge, Hash_binary, start_time)  # hashing trick

print('有 {} 欄位'.format(merge.shape[1]) )

# Split the hashed matrix back into its training and test parts.
csr_train = merge[:nrows]
csr_test = merge[nrows:]
del merge
gc.collect()

########## CPU STATS ############
14854
memory GB: 1.5250625610351562
########## CPU STATS ############
########## CPU STATS ############
14854
memory GB: 1.0345611572265625
########## CPU STATS ############
[203.96181082725525] Finished hashing
有多少欄位: 1048576
########## CPU STATS ############
14854
memory GB: 1.4660377502441406
########## CPU STATS ############


63

### 為避免在 Hub 上執行過久，先使用 L1 Selection 篩選特徵

In [None]:
# Fit an L1-penalised linear model and keep only the hashed columns whose
# coefficient magnitude exceeds 1e-6.
print('[{}] Starting SGD l1 selection'.format(time.time() - start_time))
sgd_l1 = SGDRegressor(
    alpha=1e-6,
    penalty="l1",
    max_iter=30,
    random_state=1,
)
sgd_l1.fit(csr_train, y)

good_feats = np.abs(np.array(sgd_l1.coef_)) > 1e-6
print("Features reduced from %10d to %10d" % (csr_train.shape[1], int(good_feats.sum())))

# Apply the same column mask to both matrices so their columns stay aligned.
csr_train = csr_train[:, good_feats]
csr_test = csr_test[:, good_feats]
gc.collect()

[208.7959930896759] Starting SGD l1 selection
Features reduced from    1048576 to     446994


0

### XGBoost Lightgbm Catboost
- XGBoost [參數](https://github.com/dmlc/xgboost/blob/master/doc/parameter.md)
- LightGBM [參數](http://lightgbm.readthedocs.io/en/latest/Parameters.html)
- CatBoost [參數](https://tech.yandex.com/catboost/doc/dg/concepts/parameter-tuning-docpage/)

In [None]:
class Xgb(object):
    """Wrapper exposing XGBoost through the ``train`` / ``predict`` pair that ``oof`` calls."""

    def __init__(self, seed=2018, params=None):
        """Store the booster parameters.

        Args:
            seed: Value stored under the ``'seed'`` parameter.
            params: Optional dict of XGBoost parameters.  An ``'nrounds'``
                entry, if present, is taken as the number of boosting rounds
                (default 100, kept low to limit run time) and is not
                forwarded to XGBoost.
        """
        # Work on a private copy: the original crashed on the default
        # ``params=None`` and silently modified the caller's dict.
        self.param = dict(params) if params else {}
        self.nrounds = self.param.pop('nrounds', 100)
        self.param['seed'] = seed

    def train(self, xtra, ytra, xte, yte):
        """Fit on ``(xtra, ytra)`` while reporting metrics on ``(xte, yte)`` every 20 rounds."""
        dtrain = xgb.DMatrix(xtra, label=ytra)
        dvalid = xgb.DMatrix(xte, label=yte)
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds,
            watchlist, verbose_eval=20)

    def predict(self, x):
        """Return the fitted booster's predictions for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))

class Lgb(object):
    """Wrapper exposing LightGBM through the ``train`` / ``predict`` pair that ``oof`` calls."""

    def __init__(self, seed=2018, params=None):
        """Store the booster parameters.

        Args:
            seed: Value stored under the ``'seed'`` parameter.
            params: Optional dict of LightGBM parameters.  An ``'nrounds'``
                entry, if present, is taken as the number of boosting rounds
                (default 100, kept low to limit run time) and is not
                forwarded to LightGBM.
        """
        # Work on a private copy: the original crashed on the default
        # ``params=None`` and silently modified the caller's dict.
        self.param = dict(params) if params else {}
        self.nrounds = self.param.pop('nrounds', 100)
        self.param['seed'] = seed

    def train(self, xtra, ytra, xte, yte):
        """Fit on ``(xtra, ytra)``.

        ``xte`` and ``yte`` are accepted so the signature matches the other
        model wrappers but are not used: the original built a validation
        Dataset and a watchlist from them that were never passed to
        ``lgb.train``, so those unused locals have been removed.
        """
        dtrain = lgb.Dataset(xtra, label=ytra)
        self.gbdt = lgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the fitted booster's predictions for ``x``."""
        return self.gbdt.predict(x)

class Cat(object):
    """Wrapper exposing CatBoost through the ``train`` / ``predict`` pair that ``oof`` calls."""

    def __init__(self, seed=2018, params=None):
        """Store the model parameters.

        Args:
            seed: Passed to CatBoost as ``random_seed``.
            params: Optional dict of ``CatBoostRegressor`` keyword arguments.
                An ``'nrounds'`` entry, if present, is taken as the number of
                iterations (default 100, kept low to limit run time), in line
                with the other model wrappers.
        """
        self.seed = seed
        # Private copy so the caller's dict is left untouched.
        self.param = dict(params) if params else {}
        self.nrounds = self.param.pop('nrounds', 100)

    def train(self, xtra, ytra, xte, yte):
        """Fit on ``(xtra, ytra)`` using ``(xte, yte)`` as the evaluation set."""
        # The original stored ``params`` but never used them, so every
        # setting handed to this class was silently ignored.  They are now
        # forwarded; depth=4 remains the fallback when no depth is given.
        settings = {'depth': 4}
        settings.update(self.param)
        settings.update(iterations=self.nrounds, random_seed=self.seed,
                        use_best_model=True)
        self.gbdt = ctb.CatBoostRegressor(**settings)

        # NOTE(review): the notebook passes scipy sparse matrices here;
        # confirm that wrapping them in pd.DataFrame behaves as intended with
        # the installed pandas/CatBoost versions.
        xtra = pd.DataFrame(xtra)
        ytra = pd.DataFrame(ytra)
        xte = pd.DataFrame(xte)
        yte = pd.DataFrame(yte)

        self.gbdt.fit(X=xtra, y=ytra, eval_set=(xte, yte),
                      use_best_model=True)

    def predict(self, x):
        """Return the fitted model's predictions for ``x``."""
        return self.gbdt.predict(x)

## Meta KFold with OOF (Out Of Fold)
### K折交叉驗證

In [None]:
fold = 5 # number of folds, set by hand

# ==== The code below is easier to follow alongside the diagrams in the slides ====

def oof(model, ntrain, ntest, kf, train, labels, test):
    """Produce out-of-fold predictions for stacking.

    For every ``(train_index, test_index)`` pair in ``kf`` the model is fitted
    on the training rows, then used to predict the held-out rows (collected
    as the "meta-train" column) and the whole ``test`` set (averaged over
    folds as the "meta-test" column).

    Args:
        model: Object with ``train(x_tr, y_tr, x_te, y_te)`` and ``predict(x)``.
        ntrain: Number of rows in ``train``.
        ntest: Number of rows in ``test``.
        kf: Iterable of ``(train_index, test_index)`` pairs.
        train: Training features, indexable by an index array.
        labels: Training targets, indexable by an index array.
        test: Test features.

    Returns:
        ``(oof_train, oof_test)`` with shapes ``(ntrain, 1)`` and ``(ntest, 1)``.
    """
    # Materialise the splits so the per-fold buffer is sized from ``kf``
    # itself.  The original sized it from the module-level ``fold`` constant,
    # which left uninitialised rows in the average (or raised IndexError)
    # whenever ``kf`` held a different number of folds.
    splits = list(kf)

    oof_train = np.zeros((ntrain,))
    oof_test_skf = np.empty((len(splits), ntest))  # one row of test predictions per fold

    for i, (train_index, test_index) in enumerate(splits):
        x_tr = train[train_index]
        y_tr = labels[train_index]
        x_te = train[test_index]
        y_te = labels[test_index]

        model.train(x_tr, y_tr, x_te, y_te)  # fit on the other folds

        oof_train[test_index] = model.predict(x_te)  # meta-train: held-out fold
        oof_test_skf[i, :] = model.predict(test)  # meta-test: full test set

    oof_test = oof_test_skf.mean(axis=0)  # average the per-fold test predictions
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

### Level 1

In [None]:
def level_1(train, labels, test):
    """Fit the three base models out-of-fold and build the level-2 inputs.

    XGBoost, LightGBM and CatBoost wrappers are each run through ``oof`` on
    the same shuffled K-fold split.  Their out-of-fold training predictions
    and fold-averaged test predictions are concatenated column-wise (in the
    order CatBoost, XGBoost, LightGBM), saved to ``x_concat_train.npy``,
    ``x_concat_test.npy`` and ``y_labels.npy``, and returned.

    Args:
        train: Training feature matrix.
        labels: Training targets.
        test: Test feature matrix.

    Returns:
        ``(x_train, labels, x_test)``.
    """
    # Imported locally: the ``sklearn.cross_validation`` module and its
    # ``KFold(n, n_folds=...)`` signature were removed in scikit-learn 0.20.
    from sklearn.model_selection import KFold

    ntrain = train.shape[0]
    ntest = test.shape[0]

    # ``split`` returns a one-shot generator, but the same folds are consumed
    # by three ``oof`` calls below, so materialise them once.
    kf = list(KFold(n_splits=fold, shuffle=True, random_state=2018).split(train))

    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 2**5,
        'max_depth': 4,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.95,
        'bagging_freq': 5,
        'learning_rate': 0.3,
    }

    xgb_params = {
        'booster': 'gbtree',
        'objective': 'reg:linear',
        'learning_rate': 0.3,
        'max_depth': 4,
        'subsample': 0.8,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
    }

    cat_params = {
        'learning_rate': 0.3,
        'depth': 3,
        'bagging_temperature': 0.8,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
    }

    cg = Cat(seed=2018, params=cat_params)
    xg = Xgb(seed=2018, params=xgb_params)
    lg = Lgb(seed=2018, params=lgb_params)

    xg_oof_train, xg_oof_test = oof(xg, ntrain, ntest, kf, train, labels, test)
    lg_oof_train, lg_oof_test = oof(lg, ntrain, ntest, kf, train, labels, test)
    cg_oof_train, cg_oof_test = oof(cg, ntrain, ntest, kf, train, labels, test)

    # Mean squared error of each model's out-of-fold predictions.
    print("CG-CV: {}".format(mean_squared_error(labels, cg_oof_train)))
    print("XG-CV: {}".format(mean_squared_error(labels, xg_oof_train)))
    print("LG-CV: {}".format(mean_squared_error(labels, lg_oof_train)))

    x_train = np.concatenate((cg_oof_train, xg_oof_train, lg_oof_train), axis=1)
    x_test = np.concatenate((cg_oof_test, xg_oof_test, lg_oof_test), axis=1)

    # Persist the meta-features; level_2 reloads them from these files.
    np.save(arr=x_train, file='x_concat_train.npy')
    np.save(arr=x_test, file='x_concat_test.npy')
    np.save(arr=labels, file='y_labels.npy')

    return x_train, labels, x_test

### Level 2

In [None]:
def level_2():
    """Train the second-level XGBoost model on the saved level-1 predictions.

    Loads ``x_concat_train.npy``, ``y_labels.npy`` and ``x_concat_test.npy``,
    picks the number of boosting rounds with 5-fold ``xgb.cv`` and early
    stopping, refits on all training rows, and predicts the test set.

    Returns:
        Test-set predictions mapped back to the price scale with ``expm1``.
    """
    train = np.load('x_concat_train.npy')
    labels = np.load('y_labels.npy')
    test = np.load('x_concat_test.npy')

    dtrain = xgb.DMatrix(train, label=labels)
    dtest = xgb.DMatrix(test)

    xgb_params = {
        'objective': 'reg:linear',
        'eta': 0.1,
        'subsample': 0.9,
        'max_depth': 5,
        'eval_metric': 'rmse',
        'min_child_weight': 10,
        'seed': 2018,
    }

    res = xgb.cv(xgb_params, dtrain, num_boost_round=500, nfold=5, seed=2018, stratified=False,
                 early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

    # With early stopping the returned history ends at the best iteration, so
    # its row count IS the number of rounds to train.  The original used
    # ``rows - 1``, which drops the best round (and gives 0 rounds for a
    # one-row history).
    best_nrounds = res.shape[0]
    # Read the held-out score by column name; positional access picked up
    # whichever column happened to come first, and the column order is not
    # the same in every XGBoost release.
    cv_mean = res['test-rmse-mean'].iloc[-1]
    cv_std = res['test-rmse-std'].iloc[-1]

    print('')
    print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))
    bst = xgb.train(xgb_params, dtrain, best_nrounds)

    # The target was transformed with np.log1p up front; invert it here.
    preds = np.expm1(bst.predict(dtest))
    return preds


In [None]:
def main():
    """Run both stacking levels and write the predictions to ``stacking.csv``."""
    # level_1 saves its outputs as .npy files, which level_2 loads again, so
    # its return value is not needed here.
    level_1(csr_train, y, csr_test)
    predictions = level_2()

    submission = pd.DataFrame()
    submission['id'] = test_id
    submission['price'] = predictions
    submission.to_csv('stacking.csv', index=False)


if __name__ == '__main__':
    main()

[0]	train-rmse:1.88501	eval-rmse:1.88423
[1]	train-rmse:1.41393	eval-rmse:1.4132
[2]	train-rmse:1.11129	eval-rmse:1.11056
[3]	train-rmse:0.926055	eval-rmse:0.925453
[4]	train-rmse:0.817426	eval-rmse:0.816743
[5]	train-rmse:0.757215	eval-rmse:0.756551
[6]	train-rmse:0.723132	eval-rmse:0.722545
[7]	train-rmse:0.704646	eval-rmse:0.704118
[8]	train-rmse:0.694136	eval-rmse:0.693649
[9]	train-rmse:0.68765	eval-rmse:0.687339
[10]	train-rmse:0.682583	eval-rmse:0.6824
[11]	train-rmse:0.678968	eval-rmse:0.678809
[12]	train-rmse:0.676046	eval-rmse:0.675904
[13]	train-rmse:0.673466	eval-rmse:0.673319
[14]	train-rmse:0.671143	eval-rmse:0.670928
[15]	train-rmse:0.668533	eval-rmse:0.668357
[16]	train-rmse:0.666427	eval-rmse:0.666331
[17]	train-rmse:0.664565	eval-rmse:0.664521
[18]	train-rmse:0.662856	eval-rmse:0.662915
[19]	train-rmse:0.661313	eval-rmse:0.661537
[20]	train-rmse:0.659742	eval-rmse:0.659943
[21]	train-rmse:0.658045	eval-rmse:0.658273
[22]	train-rmse:0.65655	eval-rmse:0.656831
[23]	trai

- 我們已同意將完整版 solution 開源，如果您有興趣，請參考 [here](https://github.com/goldentom42/kaggle_mercari_2017/blob/master/mercari.py)
    - 完整版因為資源限制，所以跟 Exercise 99% 作法不同，也更 powerful。
- Kudos to teammates Olivier, Mark Peng, Rand Xie, and Yifan Xie