In [4]:
import os
import sys
# Absolute path of the parent of the current working directory; it is put on
# sys.path so the project-local imports further down this cell can resolve.
module_path = os.path.abspath(os.path.join('..'))
# Guard the append so re-running this cell does not keep adding duplicates.
if module_path not in sys.path:
    sys.path.append(module_path)

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling for the notebook. Seaborn's 'whitegrid' style is set here, and
# matplotlib's 'ggplot' style is applied afterwards (last line of this block).
# NOTE(review): confirm which of the two looks is actually intended.
sns.set_style('whitegrid')
# Palette returned by seaborn when called with no arguments.
# NOTE(review): `color` is not referenced in the cells visible here.
color = sns.color_palette()
%matplotlib inline
matplotlib.style.use('ggplot')

import time
import numpy as np
import pandas as pd
from IPython.display import display

# Silence all warnings for the rest of the session.
# NOTE(review): the filter is installed without a category or module, so it
# also hides warnings raised by the libraries used below; consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve
from itertools import product

from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# my module
from conf.configure import Configure
from utils import data_utils, dataframe_util
from utils.common_utils import common_num_range

import model.get_datasets as gd

In [2]:
# Load the train / test feature tables through the project's dataset helper.
train, test = gd.load_datasets()

# Split the label off the training frame: `pop` removes the 'orderType' column
# from `train` and returns it, so `train` no longer contains the label.
y_train = train.pop('orderType')

df_columns = train.columns.values
# Share of positive labels, computed with the vectorised Series.mean() rather
# than a Python-level sum()/len(); an empty frame yields NaN, not an exception.
positive_share = y_train.mean()
print('train: {}, test: {}, feature count: {}, orderType 1:0 = {:.5f}'.format(
    train.shape[0], test.shape[0], len(df_columns), positive_share))

load baseline features
merge advance_action_features1
merge action_order_features2
merge action_order_features1
merge other_features1
merge advance_order_history_features
train: 40307, test: 10076, feature count: 552, orderType 1:0 = 0.16436


In [10]:
# Boruta is a randomised procedure; pin the seed so that a fresh
# "Restart & Run All" reproduces the same confirmed / rejected feature sets.
BORUTA_SEED = 42

# Shallow, class-weighted random forest used as Boruta's importance estimator.
# NOTE(review): with n_estimators='auto' the recorded output shows the forest
# ending up with a different tree count (477), so the 200 given here is
# presumably only a starting value - confirm against the BorutaPy docs.
rfc = RandomForestClassifier(n_estimators=200, n_jobs=-1, class_weight='balanced',
                             max_depth=6, random_state=BORUTA_SEED)
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2,
                           random_state=BORUTA_SEED)
# Missing values are replaced by the sentinel -1 and the data is handed over
# as plain arrays (`.values`) rather than pandas objects.
boruta_selector.fit(train.fillna(-1).values, y_train.values)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	552
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	552
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	552
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	552
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	552
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	552
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	552
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	334
Tentative: 	137
Rejected: 	81
Iteration: 	9 / 100
Confirmed: 	334
Tentative: 	137
Rejected: 	81
Iteration: 	10 / 100
Confirmed: 	334
Tentative: 	137
Rejected: 	81
Iteration: 	11 / 100
Confirmed: 	334
Tentative: 	137
Rejected: 	81
Iteration: 	12 / 100
Confirmed: 	347
Tentative: 	124
Rejected: 	81
Iteration: 	13 / 100
Confirmed: 	347
Tentative: 	124
Rejected: 	81
Iteration: 	14 / 100
Confirmed: 	347
Tentative: 	113
Rejected: 	92
Iteration: 	15 / 100
Confirmed: 	347
Tentative: 	113
Rejected: 	92
Iteration: 	16 / 100
Conf

BorutaPy(alpha=0.05,
     estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=6, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=477, n_jobs=-1, oob_score=False,
            random_state=<mtrand.RandomState object at 0x7f0f9c072c80>,
            verbose=0, warm_start=False),
     max_iter=100, n_estimators='auto', perc=100,
     random_state=<mtrand.RandomState object at 0x7f0f9c072c80>,
     two_step=True, verbose=2)

In [12]:
# Report how many features the fitted Boruta selector kept.
print('Number of selected features:')
selected_count = boruta_selector.n_features_
print(selected_count)

Number of selected features:
369


In [15]:
# One-column frame listing the training feature names in column order.
feature_df = pd.DataFrame({'features': train.columns.tolist()})

In [16]:
# Attach each feature's Boruta rank by *name* instead of by row position.
# `ranking_` follows the column order of `train`, while `feature_df` gets
# re-sorted in a later cell; a positional assignment would therefore silently
# mis-assign ranks if this cell were re-run after that sort.
# Assumes the column names in `train` are unique.
rank_by_feature = dict(zip(train.columns, boruta_selector.ranking_))
feature_df['rank'] = feature_df['features'].map(rank_by_feature)

In [17]:
# Order the features from best (lowest) rank to worst and renumber the rows.
ranked_features = feature_df.sort_values(by='rank')
feature_df = ranked_features.reset_index(drop=True)

In [19]:
# Show the feature table as the cell's output.
feature_df

Unnamed: 0,features,rank
0,last_pay_money_now_browse_product3_ratio,1
1,order_vs_actiontype_1_ratio,1
2,operate_num_after_last_order,1
3,open_num_after_last_order,1
4,open_app_std_delta,1
5,open_app_ratio,1
6,open_app_pay_money_ratio,1
7,open_app_mean_delta,1
8,open_app_max_delta,1
9,multi_order_has_good_order,1


In [20]:
# Print the first `n_features_` rows of the feature table, i.e. as many rows
# as the selector reports as selected.
n_selected = boruta_selector.n_features_
print('Top %d features:' % n_selected)
top_features = feature_df.head(n_selected)
print(top_features)

Top 369 features:
                                          features  rank
0         last_pay_money_now_browse_product3_ratio     1
1                      order_vs_actiontype_1_ratio     1
2                     operate_num_after_last_order     1
3                        open_num_after_last_order     1
4                               open_app_std_delta     1
5                                   open_app_ratio     1
6                         open_app_pay_money_ratio     1
7                              open_app_mean_delta     1
8                               open_app_max_delta     1
9                       multi_order_has_good_order     1
10                    most_free_month_action_count     1
11                     order_vs_actiontype_4_ratio     1
12                                 most_free_month     1
13                                  month_11_count     1
14                                  month_10_count     1
15                     last_time_x_actiontypr_mean     1
16           

In [24]:
# Number of True entries in the selector's support mask.
boruta_selector.support_.sum()

369

In [26]:
# Sanity check: the ranking array should have one entry per input feature.
print('\n Feature ranking:')
ranking_shape = boruta_selector.ranking_.shape
print(ranking_shape)


 Feature ranking:
(552,)


In [34]:
# List the names of all features whose Boruta rank is greater than 2.
# NOTE(review): in BorutaPy rank 1 is presumably "confirmed" and rank 2
# "tentative", which would make these the rejected features - confirm.
# The call form of print is used so the cell runs on Python 2 and Python 3
# alike, matching the other cells in this notebook.
low_ranked_features = feature_df.loc[feature_df['rank'] > 2, 'features'].values.tolist()
print(low_ranked_features)

['raing3_ratio', 'action_last_5', 'days_ratio_since_last_order', 'last_submit_order_action_count', 'nearestDiff_max9', 'order_vs_actiontype_8_ratio', u'userid', 'action_type_68_time_delta_std', 'raing1_ratio', '234_5_diff_time', 'nearestDiff_min9', 'avg_rating', 'action56_rate', 'two_gram_89_time_max', 'has_comment_flag', 'three_gram_action_456_ratio', 'action_2_num_after_last_order', 'action_last_18', 'last_time_order_now_action_submit_order_count', 'last_time_order_now_action_browse_product2_count', 'two_gram_89_last_time', 'raing2_ratio', 'nearestTime3', 'raing4_ratio', 'submit_order_pay_money_ratio', 'action_last_19', 'action_last_20', 'month_4_count', '234_9_diff_time', 'actiontype_seq_kalman_real_1', 'avg_rating_type0', 'two_gram_12_time_min', 'timespan_last_8', 'actiontypeproplast20_2', 'open_app_min_delta', 'last_time_order_now_has_submited_order', 'action_last_17', 'month_6_count', 'two_gram_12_time_max', 'actiontypeproplast20_mean', 'two_gram_12_last_time', 'action_last_15', 