In [31]:
%%time 
from data.pair_feature import *
from data.original import item_pairs_train, item_pairs_test, item_info_train, item_info_test
from data.corpus_based import feature_nodes as corpus_based_feature_nodes
from data.dummy import dummy_features, categoryID_shuffle_features
from data.item import get_item, show_item_pair
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import train_test_split
n_train = 3344613
n_test = 1315205
import pickle

CPU times: user 3.9 s, sys: 3.22 s, total: 7.12 s
Wall time: 20.4 s


In [23]:
def build_mxnet_header(model_names):
    header = ["line_num"]
    for name in model_names:
        header.extend('mxnet_{}_batch_{}_sim'.format(name, x) for x in  ['min', 'max', 'summean', 'mean'])
    return header

image_mxnet_features_train = pd.read_csv('data/data_files/image_mxnet_feature_train_bn.csv', names=build_mxnet_header(['bn']), index_col='line_num').sort_index()
image_mxnet_features_test = pd.read_csv('data/data_files/image_mxnet_feature_test_bn.csv', names=build_mxnet_header(['bn']), index_col='line_num').sort_index()

In [24]:
from feature.image_histogram_feature import header as image_histogram_feature_header
image_histogram_features_train = pd.read_csv('data/data_files/image_histogram_feature_train.csv', names = image_histogram_feature_header, index_col='index').sort_index()
image_histogram_features_test = pd.read_csv('data/data_files/image_histogram_feature_test.csv', names = image_histogram_feature_header, index_col='index').sort_index()

In [25]:
from feature.image_hash_feature import headers as image_hash_feature_headers

image_hash_features_train = pd.read_csv('data/data_files/image_hash_feature_train.csv',
                                       names=image_hash_feature_headers, index_col='index').sort_index()
image_hash_features_test = pd.read_csv('data/data_files/image_hash_feature_test.csv',
                                       names=image_hash_feature_headers, index_col='index').sort_index()

In [26]:
image_features_train = pd.concat((image_histogram_features_train, image_hash_features_train, image_mxnet_features_train), axis=1)
image_features_test = pd.concat((image_histogram_features_test, image_hash_features_test, image_mxnet_features_test), axis=1)

In [27]:
corpus_based_features = []
for node in corpus_based_feature_nodes:
    corpus_based_features.append(node.get_data())
corpus_based_features = pd.concat(corpus_based_features, axis=1)

In [28]:
corpus_based_features.shape

(4035592, 110)

In [29]:
def split_feats(feats):
    n_train = 2991396
    feats_train = feats[:n_train]
    feats_test = feats[n_train:]
    feats_test.index = simple_features_test.index.copy()
    return feats_train, feats_test

In [32]:
corpus_based_features_train, corpus_based_features_test = split_feats(corpus_based_features)
dummy_features_train, dummy_features_test = split_feats(dummy_features)
categoryID_shuffle_features_train, categoryID_shuffle_features_test = split_feats(categoryID_shuffle_features)

In [33]:
print(simple_features_train.shape[1])
print(aggregation_features_train.shape[1])
print(title_features_train.shape[1])
print(description_features_train.shape[1])
print(image_features_train.shape[1])
print(corpus_based_features_train.shape[1])
# print(categoryID_shuffle_features_train.shape[1])

26
20
4
10
44
110


In [34]:
features_train = pd.concat((
        simple_features_train,
        aggregation_features_train,
        title_features_train, description_features_train,
        ncd_features_train,
        image_features_train,
        corpus_based_features_train,
        dummy_features_train,
        categoryID_shuffle_features_train,
    ), axis=1)
features_test = pd.concat((
        simple_features_test,
        aggregation_features_test,
        title_features_test, description_features_test,
        ncd_features_test,
        image_features_test, 
        corpus_based_features_test,
        dummy_features_test,
        categoryID_shuffle_features_test
    ), axis=1)

In [35]:
print(features_train.shape)
print(features_test.shape) 

(2991396, 290)
(1044196, 290)


In [36]:
weight_map = {
    1: 3,
    2: 1,
    3: 1
}

weight = np.empty(item_pairs_train.shape[0])
generationMethod = item_pairs_train.generationMethod.values
for i in [1,2,3]:
    weight[generationMethod==i] = weight_map[i]

In [37]:
%%time
X_train, X_test, y_train, y_test, I_train, I_test =\
train_test_split(features_train.values, item_pairs_train.isDuplicate.values, np.arange(features_train.shape[0]),
                 test_size=0.2, random_state=42)
weight_train = weight[I_train]

CPU times: user 3min 27s, sys: 29.3 s, total: 3min 56s
Wall time: 3min 57s


In [38]:
model = XGBClassifier(learning_rate=0.05, max_depth=10, subsample=1, colsample_bytree=0.8, n_estimators=500,
                      min_child_weight = 1,
                      nthread=32)

In [39]:
model.fit(X_train, y_train, eval_metric='auc', eval_set=[(X_test, y_test)],  early_stopping_rounds=20)

[0]	validation_0-auc:0.932434
Will train until validation_0-auc hasn't improved in 20 rounds.
[1]	validation_0-auc:0.936084
[2]	validation_0-auc:0.939251
[3]	validation_0-auc:0.940383
[4]	validation_0-auc:0.941008
[5]	validation_0-auc:0.94168
[6]	validation_0-auc:0.942284
[7]	validation_0-auc:0.942762
[8]	validation_0-auc:0.943242
[9]	validation_0-auc:0.94366
[10]	validation_0-auc:0.944016
[11]	validation_0-auc:0.944275
[12]	validation_0-auc:0.944648
[13]	validation_0-auc:0.945084
[14]	validation_0-auc:0.945342
[15]	validation_0-auc:0.945872
[16]	validation_0-auc:0.946113
[17]	validation_0-auc:0.946409
[18]	validation_0-auc:0.94667
[19]	validation_0-auc:0.946953
[20]	validation_0-auc:0.94726
[21]	validation_0-auc:0.947525
[22]	validation_0-auc:0.947729
[23]	validation_0-auc:0.94798
[24]	validation_0-auc:0.948206
[25]	validation_0-auc:0.948424
[26]	validation_0-auc:0.948613
[27]	validation_0-auc:0.948881
[28]	validation_0-auc:0.949085
[29]	validation_0-auc:0.949308
[30]	validation_0-auc

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=500, nthread=32,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [40]:
%time pred_test = model.predict_proba(X_test)[:,1]

CPU times: user 1min 43s, sys: 2.16 s, total: 1min 46s
Wall time: 39.6 s


In [41]:
eval_set = item_pairs_train.iloc[I_test]
eval_set['prob_predicted'] = pred_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [42]:
from bokeh.io import output_notebook, show
from bokeh.charts import Histogram
output_notebook()

In [43]:
from ml_metrics import auc

In [44]:
eval_set.groupby('isDuplicate')['prob_predicted'].mean()

isDuplicate
0    0.124484
1    0.827428
Name: prob_predicted, dtype: float32

In [45]:
eval_set.groupby('isDuplicate')['prob_predicted'].std()

isDuplicate
0    0.196732
1    0.246477
Name: prob_predicted, dtype: float32

In [46]:
from data.dummy import mergeLeftInOrder2

In [47]:
eval_set = mergeLeftInOrder2(eval_set, item_info_train[['categoryID']], left_on='itemID_1', right_index=True)

In [48]:
auc_by_category = eval_set.groupby('categoryID')[['isDuplicate', 'prob_predicted']].apply(lambda x: auc(x['isDuplicate'].values, x['prob_predicted'].values))

In [49]:
auc_by_category.order()

  if __name__ == '__main__':


categoryID
112    0.861898
85     0.902402
33     0.914760
111    0.920949
105    0.926524
101    0.929792
31     0.935724
99     0.935863
10     0.941604
23     0.946223
97     0.949035
98     0.950030
19     0.950759
26     0.951343
25     0.952084
32     0.952663
86     0.952899
115    0.955378
34     0.956140
42     0.956171
24     0.958461
21     0.958798
40     0.960437
87     0.960494
39     0.961754
84     0.961862
96     0.964935
29     0.967048
83     0.967535
36     0.968038
102    0.968630
38     0.968755
90     0.969396
27     0.969502
28     0.973903
94     0.975563
20     0.975588
114    0.976178
89     0.978343
9      0.978396
106    0.979690
88     0.982304
93     0.983249
30     0.984044
81     0.984905
82     0.985275
14     0.986669
92     0.987034
91     0.990017
11     0.990300
116    0.993248
dtype: float64

In [50]:
from bokeh.charts import Bar

In [51]:
show(Bar(auc_by_category.order()))

  if __name__ == '__main__':


In [52]:
eval_set.shape[0]

598280

In [53]:
p = Histogram(eval_set.head(5000), values='prob_predicted', color='isDuplicate', bins=30, legend=True, plot_width=1000)
show(p)

In [54]:
feature_importance = pd.DataFrame({'feature': features_train.columns,
              'importance': model.feature_importances_}).sort('importance', ascending=False)

  from ipykernel import kernelapp as app


In [55]:
from bokeh.charts import Bar
from bokeh.charts.attributes import CatAttr

In [56]:
feature_importance.head(50)

Unnamed: 0,feature,importance
180,title_word_1_2gram_dtm_0_predict_log_price__1,0.015352
181,title_word_1_2gram_dtm_0_predict_log_price__2,0.015343
107,mxnet_bn_batch_mean_sim,0.014696
4,description_length_max,0.014154
5,description_length_min,0.013829
11,price_diff,0.013002
184,title_description_dtm_0_predict_log_price__1,0.012954
185,title_description_dtm_0_predict_log_price__2,0.0125
105,mxnet_bn_batch_max_sim,0.011103
14,price_min,0.010932


In [57]:
feature_importance.tail(50)

Unnamed: 0,feature,importance
230,categoryID_28,0.000242
235,categoryID_33,0.00024
251,categoryID_90,0.000208
222,categoryID_19,0.000202
257,categoryID_97,0.000202
267,categoryID_115,0.000171
236,categoryID_34,0.00016
224,categoryID_21,0.000154
218,categoryID_9,0.000123
244,categoryID_83,0.00012


In [58]:
corpus_based_features.columns

Index(['title_word_dtm_0__tf__cosine', 'title_word_dtm_0__binary_tf__cosine',
       'title_word_dtm_0__tfidf__cosine',
       'title_word_dtm_0__binary_tfidf__cosine',
       'title_word_dtm_1__tf__cosine', 'title_word_dtm_1__binary_tf__cosine',
       'title_word_dtm_1__tfidf__cosine',
       'title_word_dtm_1__binary_tfidf__cosine',
       'title_word_dtm_0_1__tf__cosine',
       'title_word_dtm_0_1__binary_tf__cosine',
       ...
       'description_sentence_word_dtm_1__binary_tf__cosine_similarity__0_mean',
       'description_sentence_word_dtm_1__binary_tf__cosine_similarity__1_min',
       'description_sentence_word_dtm_1__binary_tf__cosine_similarity__1_max',
       'description_sentence_word_dtm_1__binary_tf__cosine_similarity__1_mean',
       'description_sentence_word_dtm_1_1__binary_tf__cosine_similarity__0_min',
       'description_sentence_word_dtm_1_1__binary_tf__cosine_similarity__0_max',
       'description_sentence_word_dtm_1_1__binary_tf__cosine_similarity__0_mean',


In [59]:
item_info_train.shape[0] + item_info_test.shape[0]

4659818

In [60]:
bst = model.booster()
bst.feature_names = features_train.columns.tolist()
print(pd.Series(bst.get_score(importance_type='gain')).sort_values(ascending=False).head(50))
print(pd.Series(bst.get_score(importance_type='cover')).sort_values(ascending=False).head(50))

image_phash_hamming_0_min                         2875.218537
mxnet_bn_batch_max_sim                            1869.406305
categoryID_112                                    1714.816033
title_word_dtm_0_1__binary_tfidf__cosine          1053.358729
image_dhash_hamming_0_min                          811.860253
title_word_dtm_0_1__tfidf__cosine                  359.690265
categoryID_9                                       311.754781
categoryID_33                                      275.383379
categoryID_111                                     254.657611
price_diff_ratio                                   254.372805
image_phash_hamming_0_mean                         250.078365
categoryID_84                                      239.853817
title_word_dtm_0_1__max_disjoint_idf_min           190.558471
image_phash_jaccard                                184.227701
categoryID_price_std                               183.384268
categoryID_36                                      179.627319
category

In [61]:
feature_importance.set_index('feature').loc[corpus_based_features.columns].sort('importance', ascending=False)

  if __name__ == '__main__':


Unnamed: 0,importance
title_word_1_2gram_dtm_0_predict_log_price__1,0.015352
title_word_1_2gram_dtm_0_predict_log_price__2,0.015343
title_description_dtm_0_predict_log_price__1,0.012954
title_description_dtm_0_predict_log_price__2,0.012500
description_word_dtm_0__max_disjoint_idf_min,0.008754
title_word_2gram_dtm_0__max_disjoint_idf_min,0.006669
description_word_dtm_1__max_disjoint_idf_min,0.006635
description_word_nmf_0_1_1,0.006330
description_word_lsa_0_1_1,0.006239
title_word_lsa_0_1,0.006059


In [62]:
show(Bar(feature_importance.head(50), label=CatAttr(columns=['feature'], sort=False), values='importance',
        plot_width=1000))

In [63]:
model_name = 'xgboost.39.weighted'

In [64]:
%%time
model.fit(features_train, item_pairs_train.isDuplicate, sample_weight=weight)
prob_test = model.predict_proba(features_test)[:,1]

pickle.dump(model, open(model_name+'.pickle', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
pd.DataFrame({'id': item_pairs_test['id'], 'probability': prob_test}).to_csv(model_name+'.csv', index=False)

CPU times: user 1d 17h 9min 6s, sys: 2min 1s, total: 1d 17h 11min 8s
Wall time: 1h 35min 21s


In [65]:
%%time
model_name = 'xgboost.39.tuning.1'
model = XGBClassifier(learning_rate=0.05, max_depth=10, subsample=0.8, colsample_bytree=0.8, n_estimators=1000,
                      min_child_weight = 1,
                      nthread=32)
model.fit(features_train, item_pairs_train.isDuplicate, sample_weight=weight)
prob_test = model.predict_proba(features_test)[:,1]

pickle.dump(model, open(model_name+'.pickle', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
pd.DataFrame({'id': item_pairs_test['id'], 'probability': prob_test}).to_csv(model_name+'.csv', index=False)

CPU times: user 3d 10h 19min 40s, sys: 2min 51s, total: 3d 10h 22min 32s
Wall time: 3h 4min 33s


In [66]:
%%time
model_name = 'xgboost.39.tuning.2'
model = XGBClassifier(learning_rate=0.1, max_depth=10, subsample=0.8, colsample_bytree=0.8, n_estimators=500,
                      min_child_weight = 1,
                      nthread=32)
model.fit(features_train, item_pairs_train.isDuplicate, sample_weight=weight)
prob_test = model.predict_proba(features_test)[:,1]

pickle.dump(model, open(model_name+'.pickle', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
pd.DataFrame({'id': item_pairs_test['id'], 'probability': prob_test}).to_csv(model_name+'.csv', index=False)

CPU times: user 1d 16h 49min 8s, sys: 1min 41s, total: 1d 16h 50min 49s
Wall time: 1h 31min 34s
