In [34]:
%matplotlib inline
import os, sys, time
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from __future__ import print_function

In [35]:
# Read the raw training set from disk, report its dimensions and preview
# the first rows.
train_csv_path = '../data/train.csv'
df_train = pd.read_csv(train_csv_path)
print('Shape of the dataset: {}'.format(df_train.shape))
df_train.head()

Shape of the dataset: (595212, 59)


Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [36]:
# Is there class imbalance? Count the rows per value of 'target'.
# The counts below show a heavy imbalance: 573518 rows with target 0
# versus 21694 rows with target 1.
df_train['target'].value_counts()

0    573518
1     21694
Name: target, dtype: int64

In [37]:
# What features are we looking at?
# Every column except 'id' and 'target' is a feature (57 of the 59 columns).
# The columns are selected by name rather than by position so that the
# result does not depend on 'id' and 'target' being the first two columns.
all_features = [col for col in df_train.columns if col not in ('id', 'target')]

# Bucket the features by the last '_'-separated token of their name:
#   '..._bin' -> bin_features, '..._cat' -> cat_features, anything else ->
#   other_features. Column order is preserved inside each bucket.
bin_features = []
cat_features = []
other_features = []
for col in all_features:
    suffix = col.split('_')[-1]
    if suffix == 'bin':
        bin_features.append(col)
    elif suffix == 'cat':
        cat_features.append(col)
    else:
        other_features.append(col)

In [38]:
# number of features whose name ends with the 'bin' token
len(bin_features)

17

In [39]:
# number of features whose name ends with the 'cat' token
len(cat_features)

14

In [65]:
# How many missing values are we looking at? Missing entries are encoded
# as -1, so count the -1 cells of every feature and express them in percent.

# Percentage by class: one row per feature, one column per 'target' value.
# The column-wise sum is requested explicitly (axis=0) instead of going
# through np.sum(DataFrame), and float constants keep the division a true
# division regardless of the Python version.
df_tmp = (
    df_train[['target'] + all_features]
    .groupby('target')[all_features]
    .apply(lambda grp: 100.0 * (grp == -1).sum(axis=0) / len(grp))
    .T
)

# Percentage over all rows, as a one-column frame named 'total'.
df_tmp2 = (
    (df_train[all_features] == -1).sum(axis=0) / float(df_train.shape[0]) * 100
).to_frame(name='total')

# Merge on the feature name (the index of both frames) and list the
# features with the largest overall share of missing values first.
df_tmp = (
    df_tmp
    .merge(df_tmp2, left_index=True, right_index=True)
    .sort_values(by='total', ascending=False)
)

In [66]:
# show the missing-value percentages per target class and in total,
# sorted by the 'total' column in descending order
df_tmp

Unnamed: 0,0,1,total
ps_car_03_cat,69.358067,61.998709,69.089837
ps_car_05_cat,45.002772,38.960081,44.782531
ps_reg_03,18.25784,14.105283,18.10649
ps_car_14,7.130901,7.942288,7.160474
ps_car_07_cat,1.846673,4.139393,1.930237
ps_ind_05_cat,0.92848,2.231032,0.975955
ps_car_09_cat,0.090145,0.239698,0.095596
ps_ind_02_cat,0.030688,0.184383,0.03629
ps_car_01_cat,0.012728,0.156725,0.017977
ps_ind_04_cat,0.008718,0.152116,0.013945


In [11]:
# Replace the -1 missing-value marker with NaN for easier handling.
# np.nan is used instead of np.float('NaN'): the np.float alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
# NOTE: this mutates df_train in place, so any cell that counts -1 values
# must run before this one.
df_train.replace({-1: np.nan}, inplace=True)
# TODO: get the median for the non-binary / cat features (not done in this cell)
# Preview the first rows only instead of rendering the whole frame.
df_train.head()

In [33]:
# Print the frequency table of every feature listed in other_features.
for feature_name in other_features:
    value_frequencies = df_train[feature_name].value_counts()
    print(value_frequencies)

0    187594
1    143984
2     82468
5     61464
3     51193
4     33524
6     17682
7     17303
Name: ps_ind_01, dtype: int64
2     96110
3     81973
1     67994
4     67213
5     62803
6     60406
7     52364
8     39667
9     25436
0     15514
10    14484
11    11248
Name: ps_ind_03, dtype: int64
0    588832
1      5495
2       744
3       136
4         5
Name: ps_ind_14, dtype: int64
7     65336
8     59600
6     58408
10    54341
11    53215
9     45497
12    44851
5     42553
4     41770
13    35256
3     32267
0     31826
2     18675
1     11617
Name: ps_ind_15, dtype: int64
0.9    194608
0.7     67897
0.8     60277
0.6     56243
0.4     53569
0.1     43192
0.3     40290
0.5     32341
0.2     25113
0.0     21682
Name: ps_reg_01, dtype: int64
0.2    114886
0.3     95033
0.0     89297
0.4     59746
0.5     36528
0.6     33474
0.1     31816
0.7     21210
0.8     19907
0.9     16788
1.2     15843
1.0     12570
1.3     11761
1.1     11321
1.4      6877
1.5      5845
1.6      5095
1.8 