# Santander customer satisfaction: feature engineering

Improving data by transforming the features.

In [None]:
# starting up a console attached to this kernel
%matplotlib inline
%qtconsole
import os

# importing base code
os.chdir('/home/guilherme/Documents/Kaggle/santander-satisfaction/code')
from base import *

# changing to competition dir
os.chdir('/home/guilherme/Documents/Kaggle/santander-satisfaction')

# target variable
target = pd.read_csv('data/target.csv')

## Zero-sum feature

Counting, for each row, how many columns are equal to 0.

In [None]:
# real-valued train and test sets
train = pd.read_csv('data/no-duplicates/train.csv')
test = pd.read_csv('data/no-duplicates/test.csv')

# per-row count of cells that are exactly zero
for frame in (train, test):
    frame['zero-sum'] = frame.eq(0).sum(axis=1)

# TARGET is attached only so it can be used as the hue of the plot
train['TARGET'] = target

# density of the zero count, one curve per TARGET value
grid = sns.FacetGrid(train, hue="TARGET", size=10)
grid.map(sns.kdeplot, "zero-sum")
grid.add_legend()

# detach TARGET again so it does not stay in the feature frame
train.pop('TARGET')
#train.to_csv('data/no-duplicates/train.csv', index=False)
#test.to_csv('data/no-duplicates/test.csv',index=False)

Let us split the zero-sum feature into bins.

In [None]:
# bin edges: 0, then 200, 210, ..., 310
bins = np.concatenate(([0], np.arange(200, 320, 10)))

# index of the bin each zero count falls into
for frame in (train, test):
    frame['zero-sum-bins'] = np.digitize(frame['zero-sum'], bins)

# TARGET is needed as the hue of the violin plot
train['TARGET'] = target

plt.figure(figsize=[18,10])
sns.violinplot(x="zero-sum-bins", y="var15", hue="TARGET", data=train, split=True);

One-hot encoding.

In [None]:
# loading categorical data
cat_train = load_obj('data/categorical/train')
cat_test = load_obj('data/categorical/test')

# One-hot encode the bins. Train and test are encoded independently, so a bin
# that is absent from one of them would produce a different number (or order)
# of columns and silently shift every feature stacked after it. The test
# dummies are therefore aligned to the train layout: bins missing from test
# become all-zero columns and bins unseen in train are dropped.
zsb_train = pd.get_dummies(train['zero-sum-bins'], prefix='zsb', prefix_sep='-')
zsb_test = pd.get_dummies(test['zero-sum-bins'], prefix='zsb', prefix_sep='-')
zsb_test = zsb_test.reindex(columns=zsb_train.columns, fill_value=0)

dummies_zs_train = csr_matrix(zsb_train)
dummies_zs_test = csr_matrix(zsb_test)

# append the new dummy columns to the existing categorical matrices
cat_train = hstack([cat_train, dummies_zs_train])
cat_test = hstack([cat_test, dummies_zs_test])

## var38

var38 has a lot of examples with the same value. This indicates a missing value treatment by the people at Santander. Let us fix it.

In [None]:
# var38 is replaced by two features in both train and test:
#   var38-peak : 1 where var38 equals the over-represented value, else 0
#   var38-log  : log(var38), forced to 0 on the peak rows
# The peak rows are selected with a boolean mask. Indexing `.loc` with the
# 0/1 integer column would be interpreted as row *labels* 0 and 1, zeroing the
# first two rows and leaving NaN on the actual peak rows.
for frame in (train, test):
    is_peak = np.isclose(frame['var38'], 117310.979016)
    frame['var38-log'] = np.log(frame['var38'])
    frame.loc[is_peak, 'var38-log'] = 0
    frame['var38-peak'] = is_peak.astype(int)
    frame.pop('var38')

# plot: density of log(var38) per TARGET value
# (train still carries the TARGET column added in an earlier cell)
sns.FacetGrid(train, hue="TARGET", size=10) \
   .map(sns.kdeplot, "var38-log") \
   .add_legend()

# plot: var15 distribution for peak / non-peak rows, split by TARGET
plt.figure(figsize=[18,10])
sns.violinplot(x="var38-peak", y="var15", hue="TARGET", data=train, split=True);

## var3

var3 is suspected to be the country of the account holder. Let us explore it.

In [None]:
# frequency of every distinct var3 value, most frequent first
c_train = train['var3'].value_counts()
c_test = test['var3'].value_counts()

In [None]:
# var3 frequencies in train
print(c_train)

In [None]:
# var3 frequencies in test
print(c_test)

In [None]:
print 'number of "countries" in train:', len(c_train),'test:', len(c_test)

Let us group the uncommon countries under the same categorical variable. So we will end up with 3 categories: 2 (most common), 1 (uncommon) and 0 (error code).

In [None]:
# value used in var3 as an error code
error_code = -999999

# values seen fewer than 500 times (the error code excluded) are "uncommon"
uncommon_train = [i for i in c_train.index if (c_train[i] < 500) and not (i == error_code)]
uncommon_test = [i for i in c_test.index if (c_test[i] < 500) and not (i == error_code)]

# collapse every uncommon value into category 1; `.loc` with a boolean mask
# writes into the frame itself, unlike chained `df['var3'][mask] = ...`
train.loc[train['var3'].isin(uncommon_train), 'var3'] = 1
# test also folds values that are uncommon in train, keeping both sets consistent
test.loc[test['var3'].isin(uncommon_train + uncommon_test), 'var3'] = 1

# replace error codes with category 0 in BOTH sets; the test set must be
# recoded too, otherwise it keeps -999999 and its one-hot columns stop
# matching the train ones
train.loc[train['var3'] == error_code, 'var3'] = 0
test.loc[test['var3'] == error_code, 'var3'] = 0

plt.figure(figsize=[15,8])
sns.violinplot(x="var3", y="var15", hue="TARGET", data=train, split=True);

In [None]:
# One-hot encode the grouped var3 categories. The two sets are encoded
# independently, so the test dummies are aligned to the train columns:
# categories missing from test become all-zero columns and categories that
# never occur in train are dropped. This keeps column j meaning the same
# category in both matrices.
var3_train = pd.get_dummies(train['var3'], prefix='var3', prefix_sep='-')
var3_test = pd.get_dummies(test['var3'], prefix='var3', prefix_sep='-')
var3_test = var3_test.reindex(columns=var3_train.columns, fill_value=0)

dummies_var3_train = csr_matrix(var3_train)
dummies_var3_test = csr_matrix(var3_test)

# append the new dummy columns to the categorical matrices
cat_train = hstack([cat_train, dummies_var3_train])
cat_test = hstack([cat_test, dummies_var3_test])

## Saving

In [None]:
# the target must not be written out together with the features
del train['TARGET']

# train set: real-valued frame as CSV, categorical matrix through save_obj
train.to_csv('data/engineered-real/train.csv', index=False)
save_obj(cat_train, 'data/engineered-cat/train')

# test set: same two artefacts
test.to_csv('data/engineered-real/test.csv', index=False)
save_obj(cat_test, 'data/engineered-cat/test')

## Non-linear Age (var15)

Age data is non-linear. Putting it into bins could improve the results.

In [None]:
# reload the engineered datasets saved above

# train: real-valued frame and categorical matrix
train = pd.read_csv('data/engineered-real/train.csv')
train_cat = load_obj('data/engineered-cat/train')

# test: real-valued frame and categorical matrix
test = pd.read_csv('data/engineered-real/test.csv')
test_cat = load_obj('data/engineered-cat/test')

In [None]:
# Add categorical variables to discretize var15 (age)
# splitting into bins
bins = np.array([23, 30, 40, 50, 60, 70])
train['var15-bins'] = np.digitize(train['var15'], bins)
test['var15-bins'] = np.digitize(test['var15'], bins)

plt.figure(figsize=[18,10])
# TARGET is needed as the hue of both plots below
train['TARGET'] = target

sns.violinplot(x="var15-bins", y="zero-sum", hue="TARGET", data=train, split=True);

# plot: var15 distribution per TARGET value
sns.FacetGrid(train, hue="TARGET", size=10) \
   .map(sns.distplot, "var15") \
   .add_legend();

# One-hot encode the age bins. Train and test are encoded independently, so
# the test dummies are aligned to the train columns: a bin missing from test
# becomes an all-zero column and a bin unseen in train is dropped. Without
# this, an absent bin would shift every column stacked after it.
v15_train = pd.get_dummies(train["var15-bins"], prefix='v15b', prefix_sep='-')
v15_test = pd.get_dummies(test["var15-bins"], prefix='v15b', prefix_sep='-')
v15_test = v15_test.reindex(columns=v15_train.columns, fill_value=0)

dummies_v15_train = csr_matrix(v15_train)
dummies_v15_test = csr_matrix(v15_test)

# append the new dummy columns to the categorical matrices
train_cat = hstack([train_cat, dummies_v15_train])
test_cat = hstack([test_cat, dummies_v15_test])

## Similar variables

Some variables are expressed as time series. Let us analyse them.

In [None]:
def _with_prefix(names, prefix):
    """Return the entries of `names` that start with `prefix`, in order."""
    return [name for name in names if name.startswith(prefix)]


# every column whose name starts with 'imp'
imp_all = _with_prefix(train.columns, 'imp')

# subsets of the imp columns, one per sub-prefix
imp_op = _with_prefix(imp_all, 'imp_op')
imp_aport = _with_prefix(imp_all, 'imp_aport')
imp_trasp = _with_prefix(imp_all, 'imp_trasp')
imp_reemb = _with_prefix(imp_all, 'imp_reemb')
imp_compra = _with_prefix(imp_all, 'imp_compra')

# imp_all keeps only the columns that fell into none of the subsets
_imp_grouped = set(imp_op + imp_aport + imp_trasp + imp_reemb + imp_compra)
imp_all = [name for name in imp_all if name not in _imp_grouped]

# saldo columns, with the '_medio_' infix collapsed to '_' in the stored names
saldo = [name.replace('_medio_', '_') for name in _with_prefix(train.columns, 'saldo')]

# delta columns
delta = _with_prefix(train.columns, 'delta')

## Interactions of same variable

The suffixes `hace` and `ult` indicate time series. Let us explore them.

In [None]:
# Group the saldo columns by the variable they refer to; new features will be
# generated from each group.

# variable token taken from each (normalised) saldo name: the text after the
# 6-character 'saldo_' prefix, cut at the position given by find_nth(s, '_', 2)
# NOTE(review): find_nth comes from `base`; its return value when there is no
# second '_' is not visible here - confirm the slice is right in that case.
var_names = list({s[6:min(len(s), find_nth(s, '_', 2))] for s in saldo})

# the original (non-normalised) saldo column names; does not depend on the loop
saldo_columns = [name for name in train.columns if name.startswith('saldo')]

saldo_dict = {}
for v in var_names:
    # NOTE(review): membership is a substring test, so a token such as 'var1'
    # would also pick up 'var13...' columns - verify this is intended.
    members = [train[name] for name in saldo_columns if v in name]
    # target first, then every matching saldo column, side by side
    saldo_dict[v] = pd.concat([target] + members, axis=1)

In [None]:
# FeatureExpansion is a project helper made available by `from base import *`
feat_exp = FeatureExpansion()

for var, group in saldo_dict.items():
    # the expansion works on the features only, so TARGET is dropped first
    df = group.drop('TARGET', axis=1)
    # six successive rounds, each one fed with the frame returned by the last
    # NOTE(review): the meaning of the second argument (10) is defined by
    # FeatureExpansion and is not visible here.
    for r in range(6):
        df, op_log = feat_exp.fit_transform(df, 10)

## Stability selection

Stability selection is too expensive to run inside cross-validation, so it is run once here and the result is stored as a new dataset.

In [None]:
# loading the engineered real-valued data
train = pd.read_csv('data/engineered-real/train.csv')
test = pd.read_csv('data/engineered-real/test.csv')

# names of the real-valued columns, kept for the report further down
cols = train.columns

# joining: real-valued columns first, categorical columns after them
train = hstack([csr_matrix(train), load_obj('data/engineered-cat/train')]).tocsr()
test = hstack([csr_matrix(test), load_obj('data/engineered-cat/test')]).tocsr()

# project preprocessing step, configured to fill missing values with the mean
p = preprocessing({'na_input': {'strategy': 'mean'}})

train = p.fit_transform(train, target)
test = p.transform(test)

# feature selection algo
# NOTE(review): no random seed is passed, so the resampling (and therefore the
# selected set) can differ between runs - consider fixing one.
sel = RandomizedLogisticRegression(sample_fraction=0.50, n_resampling=500,
                                   selection_threshold=0.0)
# transforming
train = sel.fit_transform(train.todense(), np.array(target['TARGET']))
test = sel.transform(test.todense())

# Scores of the real-valued columns only. The slice is taken first so that the
# boolean mask and the array it indexes have the same length; indexing the
# full score vector with the shorter mask is an error.
# NOTE(review): assumes the preprocessing step keeps the number and order of
# columns, so that the first len(cols) scores belong to `cols` - confirm.
real_scores = sel.scores_[0:len(cols)]
selected = real_scores > 0
sel_cols = cols[selected]
sel_scores = real_scores[selected]

# selected real-valued columns, highest score first
ranked = sorted(zip(sel_scores, sel_cols), key=lambda pair: pair[0], reverse=True)
print([name for (score, name) in ranked])

train = csr_matrix(train)
test = csr_matrix(test)

# Persist the selected matrices as the new dataset. The previous load_obj
# calls discarded their result, so nothing computed in this cell was stored.
save_obj(train, 'data/selected/st-train')
save_obj(test, 'data/selected/st-test')

In [None]:
# open a Qt console attached to this kernel for interactive inspection
%qtconsole