In [1]:
%matplotlib inline
import os, sys, time
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from __future__ import print_function

In [2]:
# Read the raw training and test tables
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

# Report the dimensions of both tables, then preview the first training rows
shape_report = '\n'.join([
    'Shape of the training dataset: {}'.format(df_train.shape),
    'Shape of the test dataset: {}'.format(df_test.shape),
])
print(shape_report)
df_train.head()

Shape of the training dataset: (595212, 59)
Shape of the test dataset: (892816, 58)


Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [3]:
# Is there class imbalance? Count the samples in each `target` class.
# The recorded output shows the positive class (1) is a small minority, so yes, very much.
df_train['target'].value_counts()

0    573518
1     21694
Name: target, dtype: int64

In [4]:
# Feature inventory: every column except 'id' and 'target' (59 - 2 = 57 features)
all_features = df_train.columns.tolist()
for non_feature in ('target', 'id'):
    all_features.remove(non_feature)

# Group the feature columns by the last '_'-separated token of their name:
# 'bin' -> binary, 'cat' -> categorical, anything else -> other.
feature_groups = {'bin': [], 'cat': []}
other_features = []
for col in df_train.columns[2:]:  # positions 0 and 1 are 'id' and 'target'
    suffix = col.split('_')[-1]
    feature_groups.get(suffix, other_features).append(col)
bin_features = feature_groups['bin']
cat_features = feature_groups['cat']

In [5]:
# number of binary features (column names whose last token is 'bin')
len(bin_features)

17

In [6]:
# number of categorical features (column names whose last token is 'cat')
len(cat_features)

14

In [7]:
# how many missing values are we looking at?  (missing entries are encoded as -1)
# NOTE: this cell counts -1 markers, so it must run before they are converted to NaN.
is_missing = df_train[all_features].eq(-1)
# percentage by classes: the mean of a boolean column is its fraction of True entries.
# (This avoids np.sum on a DataFrame, whose axis=None reduction is deprecated in recent pandas.)
df_tmp = is_missing.groupby(df_train['target']).mean().T * 100
# percentage by all classes; the assignment aligns on the feature-name index
df_tmp['total'] = is_missing.mean() * 100
del is_missing
# most frequently missing features first
df_tmp = df_tmp.sort_values(by='total', ascending=False)
df_tmp.head()

Unnamed: 0,0,1,total
ps_car_03_cat,69.358067,61.998709,69.089837
ps_car_05_cat,45.002772,38.960081,44.782531
ps_reg_03,18.25784,14.105283,18.10649
ps_car_14,7.130901,7.942288,7.160474
ps_car_07_cat,1.846673,4.139393,1.930237


## Dealing with missing values
Before we dive deep into the model, we need to deal with the missing values first. The simplest method is to ignore every sample that has **any** missing value. However, by doing so, we reduce the total sample size from ~600k to less than 100k.

Therefore, we need a better way to impute the missing values. Jingfei's current methodology is to treat the missing value as a new category for binary and categorical features. (How about continuous features?)

Another possibility is to guess the missing values intelligently. To start, we can fill the missing binary and categorical values with the most frequent entries, and fill the missing continuous values with their medians.

Much more can be done with regard to data imputation: for example, we could train regression models that predict the missing variable from the other variables. However, we need to keep in mind how to carry the regression over to test data or new data that also have missing entries.

We have to pay special attention to those features with many missing values, namely, `ps_car_03_cat`, `ps_car_05_cat`, `ps_reg_03`, etc.

### Metric

When comparing different imputation methods, we need to have a unified metric to evaluate their success. The simplest metric can be the testing error - the same way we evaluate different models - but in this case the model should stay the same. The default choice will be xgboost.

In [8]:
# replace -1 with nan for easier handling
# (np.nan instead of np.float('NaN'): the np.float alias was deprecated in NumPy 1.20 and removed in 1.24)
# NOTE: this mutates df_train in place, so cells that count the -1 markers must run before this one.
df_train.replace({-1: np.nan}, inplace=True)

# helper function to get the best and last training, valid gini coefficients
def get_gini_stats(eval_results):
    '''
    Input
    =======
    eval_results: <iterable of dicts>
        one entry per fold; entry[split]['gini'] (split is 'train' or
        'valid') is the sequence of gini values recorded for that fold
    Return
    =======
    last_training, last_valid, best_training, best_valid: <lists>
        per fold, the final recorded value and the maximum value of the
        train / valid gini sequences
    '''
    # gather the per-fold gini sequences once, then reduce them
    train_curves, valid_curves = [], []
    for fold in eval_results:
        train_curves.append(fold['train']['gini'])
        valid_curves.append(fold['valid']['gini'])

    last_training = [curve[-1] for curve in train_curves]
    last_valid = [curve[-1] for curve in valid_curves]
    best_training = [max(curve) for curve in train_curves]
    best_valid = [max(curve) for curve in valid_curves]
    return last_training, last_valid, best_training, best_valid

### Method 1: mode and median
In the simplest case, let's fill the missing binary / categorical values with their mode, and continuous variables with their median.

In [9]:
# can also be done with sklearn
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html
feature_cols = df_train.columns[2:]  # skip 'id' and 'target'
# Per-column fill value: mode for binary / categorical columns, median otherwise.
# Kept as a dict so the same values can later be applied to the test data.
fills = {
    name: df_train[name].mode()[0] if name.endswith(('bin', 'cat')) else df_train[name].median()
    for name in feature_cols
}
df_train_impute_1 = df_train.fillna(fills)
# df_test_impute_1 = df_test.fillna(fills)

In [10]:
%run '../py/models_ccy.py'
clf = my_xgb(df_train_impute_1[all_features], 
             df_train_impute_1['target'].astype(int, inplace=True))
clf.input_scaling(ratio=0.05)

Splitted training/test, and applied standard scaler.
Total number of training samples: 565451


In [11]:
# clf.one_hot_encoding()
# Fit on the Method 1 (mode / median) data without one-hot encoding.
# Judging from how the result is consumed by get_gini_stats, the second return value holds one
# evaluation history per fold, indexed as [fold]['train' | 'valid']['gini'] -- TODO confirm in my_xgb.fit.
models_impute_1, eval_results_impute_1 = clf.fit(one_hot=False)

Performing 5 fold CV




Fold 1 of 5
class weight is: 26
[0]	train-gini:0.225807	valid-gini:0.211856
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.388041	valid-gini:0.276855
[200]	train-gini:0.462206	valid-gini:0.274221
Stopping. Best iteration:
[108]	train-gini:0.394901	valid-gini:0.278067

Fold 2 of 5
class weight is: 26
[0]	train-gini:0.226864	valid-gini:0.193245
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.388274	valid-gini:0.268205
[200]	train-gini:0.464047	valid-gini:0.267004
Stopping. Best iteration:
[125]	train-gini:0.408428	valid-gini:0.269496

Fold 3 of 5
class weight is: 26
[0]	train-gini:0.227315	valid-gini:0.195788
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100

In [12]:
# Method 1 summary: per-fold gini statistics from the CV evaluation histories
last_training, last_valid, best_training, best_valid = get_gini_stats(eval_results_impute_1)
# mean first, then standard deviation, of each fold's last recorded validation gini
for reducer in (np.mean, np.std):
    print(reducer(last_valid))

0.2689892
0.00348325740651


### Method 2: random imputation
The idea here is that, for the missing values, we want to fill them with values drawn randomly from their underlying distribution. We will use the existing values to infer the underlying distribution. This can be easily done for binary and categorical features. (How to do this for continuous features?)

In [13]:
# get distributions for a categorical feature
def fill_w_distribution(s, random_state=None):
    '''
    Fill the missing entries of a binary / categorical feature with values
    drawn at random from the empirical distribution of its observed entries.

    Input
    =======
    s: <pd series>
        with NaN as missing values. Binary or categorical feature
    random_state: <int or None>
        seed for a private random generator, which makes the draws
        repeatable. With the default (None) the global np.random state
        is used, exactly as before.
    Return
    =======
    filled_s: <pd series>
        copy of s without NaNs; observed entries are left untouched and
        s itself is not modified
    Raise
    =======
    ValueError
        if s has missing entries but no observed value to sample from
    '''
    missing = s.isnull()
    num_to_fill = int(missing.sum())
    filled_s = s.copy()
    if num_to_fill == 0:
        # nothing to impute: hand back an untouched copy
        return filled_s

    # empirical distribution of the observed categories (value_counts skips NaN)
    counts = s.value_counts()
    if counts.empty:
        raise ValueError('cannot impute: the series has no observed values to sample from')
    Xs = np.asarray(counts.index)
    p = counts.values / float(counts.sum())

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    fills = rng.choice(Xs, size=num_to_fill, p=p)
    # fill it! Assign by mask rather than adding to a zero-filled series,
    # so non-numeric categories work as well.
    filled_s.loc[missing] = fills

    return filled_s

In [14]:
# fill_w_distribution draws through NumPy's global random generator, so seed it
# here to make the imputed values repeatable on a fresh Restart & Run All.
IMPUTE_SEED = 0
np.random.seed(IMPUTE_SEED)

df_train_impute_2 = df_train.copy()
for c in df_train_impute_2.columns[2:]:  # skip 'id' and 'target'
    if c.endswith(('bin', 'cat')):
        df_train_impute_2[c] = fill_w_distribution(df_train_impute_2[c])
# fill the continuous NaN with median
df_train_impute_2 = df_train_impute_2.fillna(df_train_impute_2.median())

In [15]:
# Build the model wrapper on the randomly imputed features.
# `Series.astype` returns a new Series and has no `inplace` option (pandas >= 1.0
# rejects the unknown keyword), so the target is simply cast with `.astype(int)`.
clf = my_xgb(df_train_impute_2[all_features],
             df_train_impute_2['target'].astype(int))
# Per the recorded output this splits off a held-out part and applies a standard scaler;
# `ratio` looks like the held-out fraction -- TODO confirm in models_ccy.py.
clf.input_scaling(ratio=0.05)

Splitted training/test, and applied standard scaler.
Total number of training samples: 565451


In [16]:
# clf.one_hot_encoding()
# Fit on the Method 2 (random imputation) data without one-hot encoding.
# Judging from how the result is consumed by get_gini_stats, the second return value holds one
# evaluation history per fold, indexed as [fold]['train' | 'valid']['gini'] -- TODO confirm in my_xgb.fit.
models_impute_2, eval_results_impute_2 = clf.fit(one_hot=False)

Performing 5 fold CV
Fold 1 of 5
class weight is: 26
[0]	train-gini:0.227758	valid-gini:0.207258
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.389612	valid-gini:0.27981
[200]	train-gini:0.460674	valid-gini:0.278787
Stopping. Best iteration:
[138]	train-gini:0.418906	valid-gini:0.281073

Fold 2 of 5
class weight is: 26
[0]	train-gini:0.231998	valid-gini:0.198477
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.393472	valid-gini:0.265036
[200]	train-gini:0.463581	valid-gini:0.26509
Stopping. Best iteration:
[134]	train-gini:0.420819	valid-gini:0.2665

Fold 3 of 5
class weight is: 26
[0]	train-gini:0.226431	valid-gini:0.213402
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in

In [17]:
# Method 2 summary: per-fold gini statistics from the CV evaluation histories
last_training, last_valid, best_training, best_valid = get_gini_stats(eval_results_impute_2)
# mean first, then standard deviation, of each fold's last recorded validation gini
for reducer in (np.mean, np.std):
    print(reducer(last_valid))

0.274151
0.0122478211777


### Method 3: imputation with joint distribution
The idea is that we want to guess the missing values of one feature using the joint distribution between this feature and the other features. In other words, we want to train a simple model to predict the missing values. A possible model is Naive Bayes with a Gaussian distribution.

In [20]:
# number of missing (NaN) entries per column; the -1 markers were converted to NaN earlier
df_train.isnull().sum()

id                     0
target                 0
ps_ind_01              0
ps_ind_02_cat        216
ps_ind_03              0
ps_ind_04_cat         83
ps_ind_05_cat       5809
ps_ind_06_bin          0
ps_ind_07_bin          0
ps_ind_08_bin          0
ps_ind_09_bin          0
ps_ind_10_bin          0
ps_ind_11_bin          0
ps_ind_12_bin          0
ps_ind_13_bin          0
ps_ind_14              0
ps_ind_15              0
ps_ind_16_bin          0
ps_ind_17_bin          0
ps_ind_18_bin          0
ps_reg_01              0
ps_reg_02              0
ps_reg_03         107772
ps_car_01_cat        107
ps_car_02_cat          5
ps_car_03_cat     411231
ps_car_04_cat          0
ps_car_05_cat     266551
ps_car_06_cat          0
ps_car_07_cat      11489
ps_car_08_cat          0
ps_car_09_cat        569
ps_car_10_cat          0
ps_car_11_cat          0
ps_car_11              5
ps_car_12              1
ps_car_13              0
ps_car_14          42620
ps_car_15              0
ps_calc_01             0


## Feature engineering

In [19]:
# TODO