# Santander Value Prediction Challenge
---

In [125]:
import preprocessing as pp
import ensemble as em
import pandas as pd

import numpy as np

from sklearn.preprocessing import RobustScaler, FunctionTransformer, Binarizer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression
from sklearn.feature_selection import SelectPercentile, SelectFromModel, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge, Lasso, Ridge
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split

import lightgbm as lgb

from xgboost import XGBRegressor

from sklearn.base import BaseEstimator, TransformerMixin

In [83]:
class BinarizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns = None):
        self.columns = columns

    def transform(self, X, y=None):
        output = X.copy()
        print(str(len(self.columns)) + " to binarize!")
        output.loc[:,self.columns] = Binarizer().fit_transform(output.loc[:,self.columns])
        return output

    def fit(self, X, y=None):
        return self
    
    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [2]:
train, test = pp.read_train_test(train_file = 'train.csv', test_file = 'test.csv')

In [3]:
ids = list(test.ID)

In [4]:
train_X = train.drop(['ID','target'], axis=1)
train_y = (np.log1p(train.target)).values

test_X = test.drop(['ID'], axis=1)

In [5]:
scaler = RobustScaler()

In [6]:
threshold = .98 * (1 - .98)
variance = VarianceThreshold(threshold)

In [60]:
int64_columns = list(train_X.select_dtypes(include=['int64']).columns)
float64_columns = list(train_X.select_dtypes(include=['float64']).columns)

In [84]:
binarizer = BinarizerTransformer(columns=int64_columns)

In [85]:
binarizer

BinarizerTransformer(columns=['0deb4b6a8', 'a8cb14b00', '2f0771a37', '30347e683', 'd08d1fbe3', '6ee66e115', '77c9823f2', '8d6c2a0b2', '4681de4fd', 'adf119b9a', '96f83a237', '6c7a4567c', '4fcfd2b4d', '71cebf11c', 'd966ac62c', 'c88d108c9', 'ff7b471cd', 'd5308d8bc', 'bc3f77679', '0eff5bf95', 'c330f1a67', '2cb4d123e', 'eeac...5fd5508', '3a13ed79a', 'f677d4d13', '71b203550', '137efaa80', 'fb36b89d9', '7e293fbaf', '9fc776466'])

In [7]:
anova_filter = SelectKBest(f_regression, k=4000)

In [8]:
model_rforest = RandomForestRegressor(n_estimators = 50, 
                                      max_features = 0.7,
                                      random_state=2018,
                                      max_depth=20,
                                      min_samples_leaf = 4,
                                      min_samples_split = 10)

In [107]:
model_lgb = lgb.LGBMRegressor(objective='regression',
                              metric="rmse",
                              n_estimators = 500,
                              num_leaves = 30,
                              learning_rate = 0.01,
                              bagging_fraction = 0.7,
                              feature_fraction = 0.7,
                              bagging_frequency = 5,
                              bagging_seed = 2018,
                              verbosity = -1)

In [108]:
model_lgb

LGBMRegressor(bagging_fraction=0.7, bagging_frequency=5, bagging_seed=2018,
       boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       feature_fraction=0.7, learning_rate=0.01, max_depth=-1,
       metric='rmse', min_child_samples=20, min_child_weight=0.001,
       min_split_gain=0.0, n_estimators=500, n_jobs=-1, num_leaves=30,
       objective='regression', random_state=None, reg_alpha=0.0,
       reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1, verbosity=-1)

In [10]:
model_xgb = XGBRegressor(n_estimators = 100, 
                         colsample_bytree = 0.7,
                         colsample_bylevel = 0.7,
                         learning_rate=0.1)

In [20]:
model_byr = BayesianRidge()

percentile = SelectPercentile(mutual_info_regression, percentile=35)

from_model_lasso = SelectFromModel(Lasso())
from_model_extra_tree = SelectFromModel(ExtraTreesRegressor(n_estimators=200, max_depth=20, 
                                                            max_features=0.5, n_jobs=-1, random_state=0))
from_model_lgb = SelectFromModel(model_lgb)

In [21]:
log_transformer = FunctionTransformer(np.log1p)

In [216]:
feature_selection = []
#feature_selection.append(('percentile', percentile))
#feature_selection.append(('from_model_lasso', from_model_lasso))
#feature_selection.append(('from_model_extra_tree', from_model_extra_tree))
feature_selection.append(('from_model_lgb', from_model_lgb))
feature_selection_union = FeatureUnion(feature_selection)

In [111]:
estimators = []
#estimators.append(('binarizer', binarizer))
estimators.append(('low_variance', variance))
#estimators.append(('scaler', scaler))
#estimators.append(('anova', anova_filter))
#estimators.append(('log_transform', log_transformer))
#estimators.append(('percentile', percentile))
#estimators.append(('from_model', from_model))
#estimators.append(('feature_selection', feature_selection_union))

In [112]:
pipe = Pipeline(estimators)
pipe.fit(train_X, train_y)

Pipeline(memory=None,
     steps=[('low_variance', VarianceThreshold(threshold=0.019600000000000017))])

In [113]:
train_X_reduced = pipe.transform(train_X)
test_X_reduced = pipe.transform(test_X)

In [114]:
type(train_X_reduced)

numpy.ndarray

In [115]:
print(train_X_reduced.shape)
print(test_X_reduced.shape)

(4459, 4735)
(49342, 4735)


In [116]:
train_set_X, test_set_X, train_set_y, test_set_y = train_test_split(train_X_reduced, train_y, test_size=0.1)

In [117]:
train_set_X.shape

(4013, 4735)

In [118]:
tree_models = []
tree_models.append(("lgb", model_lgb))
#tree_models.append(("rf", model_rforest))
#tree_models.append(("xgb", model_xgb))

In [119]:
%%time
cross_val_table = pp.get_validation_scores(tree_models, train_set_X, train_set_y, 10)
print(cross_val_table)

   Cross Validation (Mean)                          Cross Validation (Scores)  \
0                 1.427358  [1.28994783841, 1.45459540163, 1.42740622591, ...   

   Cross Validation (Std) Model  
0                0.071635   lgb  
Wall time: 10min 31s


In [137]:
pd.set_option("display.precision",2)
print(cross_val_table)
print(cross_val_table.loc[:,["Cross Validation (Scores)"]].values)

   Cross Validation (Mean)                          Cross Validation (Scores)  \
0                     1.43  [1.28994783841, 1.45459540163, 1.42740622591, ...   

   Cross Validation (Std) Model  
0                    0.07   lgb  
[[ array([ 1.28994784,  1.4545954 ,  1.42740623,  1.40929093,  1.37168061,
        1.48151798,  1.44677035,  1.57800285,  1.38027808,  1.43409188])]]


In [37]:
averaged_models = em.AveragingModels(models = [model_lgb, model_rforest])

In [39]:
ensemble_models = []
ensemble_models.append(("averaged", averaged_models))

In [40]:
cross_val_table_avg = pp.get_validation_scores(ensemble_models, train_X_reduced, train_y, 5)
print(cross_val_table_avg)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


   Cross Validation (Mean)                          Cross Validation (Scores)  \
0                 1.436595  [1.46571933907, 1.3990139641, 1.44531166378, 1...   

   Cross Validation (Std)     Model  
0                0.021752  averaged  


In [223]:
pp.make_submission(model_lgb, train_X_reduced, train_y, test_X_reduced, ids, filename = 'submission.csv')

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


In [64]:
print(cross_val_table["Cross Validation (Scores)"][0])

[ 1.36466376  1.44970895  1.54590952  1.43400053  1.3840896   1.4852663
  1.40555309  1.40547909  1.37186718  1.36653694]


In [48]:
train_X.columns.groupby(train_X.dtypes)

{dtype('int64'): Index(['0deb4b6a8', 'a8cb14b00', '2f0771a37', '30347e683', 'd08d1fbe3',
        '6ee66e115', '77c9823f2', '8d6c2a0b2', '4681de4fd', 'adf119b9a',
        ...
        '9437d8b64', '2e84e09c5', 'd45fd5508', '3a13ed79a', 'f677d4d13',
        '71b203550', '137efaa80', 'fb36b89d9', '7e293fbaf', '9fc776466'],
       dtype='object', length=3147),
 dtype('float64'): Index(['48df886f9', '34b15f335', '20aa07010', 'dc5a8f1d8', '11d86fa6a',
        'cff75dd09', 'b8a716ebf', 'f3b9c0b95', '68b647452', '0d866c3d7',
        ...
        '2a879b4f7', '6b119d8ce', '7ad6b38bd', '85dcc913d', '8d8bffbae',
        '5831f4c76', 'a165f5761', '3ecc09859', '9281abeea', '8675bec0b'],
       dtype='object', length=1844)}

In [91]:
train_X_reduced.loc[:,"fb36b89d9"].value_counts()

0    4425
1      34
Name: fb36b89d9, dtype: int64

In [206]:
train_X.loc[:,"f1c272f04"].value_counts()

0             4259
4234000         16
1000000         11
200000          11
12000000        10
2000000          6
4000000          5
100000           5
8000             5
32600000         5
800000           4
100000000        3
600000           3
7000000          3
5000000          3
50000000         3
1200000          3
20000000         3
16000            3
460000           2
1600000          2
390000           2
8000000          2
1400000          2
10000000         2
400000           2
3800000          2
140000           2
1336000          2
6800000          2
              ... 
1900000          1
158400000        1
10600000         1
2400000          1
180000           1
84600000         1
16980000         1
19200000         1
3600000          1
2243000000       1
520000           1
258000           1
14000000         1
163960000        1
668000           1
2200000          1
32830000         1
92648000         1
190600000        1
300000           1
80000000         1
17020000    

In [50]:
int64_columns = list(train_X.select_dtypes(include=['int64']).columns)
float64_columns = list(train_X.select_dtypes(include=['float64']).columns)

In [51]:
print(len(int64_columns))
print(len(float64_columns))

3147
1844


In [151]:
threshold = .98 * (1 - .98)
variance = VarianceThreshold(threshold)
variance.fit(train_X[int64_columns])

VarianceThreshold(threshold=0.019600000000000017)

In [152]:
tXint = variance.transform(train_X[int64_columns])

In [153]:
tXint.shape

(4459, 2891)

In [136]:
variance.fit(train_X[float64_columns])
tXfloat = variance.transform(train_X[float64_columns])

In [137]:
tXfloat.shape

(4459, 1844)

In [52]:
only_int = train.loc[:,int64_columns + ["target", "ID"]]

In [54]:
only_int.to_csv("only_int.csv")

In [139]:
variance.variances_

array([  1.51543707e+11,   3.24787455e+11,   9.23349065e+13, ...,
         1.65452181e+13,   1.96106628e+13,   2.03648194e+13])

In [154]:
features_variance = list(train_X[int64_columns].loc[:, variance.get_support()==False].columns)

In [155]:
features_variance

['d5308d8bc',
 'c330f1a67',
 'eeac16933',
 '7df8788e8',
 '5b91580ee',
 '6f29fbbc7',
 '46dafc868',
 'ae41a98b6',
 'f416800e9',
 '6d07828ca',
 '7ac332a1d',
 '70ee7950a',
 '833b35a7c',
 '2f9969eab',
 '8b1372217',
 '68322788b',
 '2288ac1a6',
 'dc7f76962',
 '467044c26',
 '39ebfbfd9',
 '9a5ff8c23',
 'f6fac27c8',
 '664e2800e',
 'ae28689a2',
 'd87dcac58',
 '4065efbb6',
 'f944d9d43',
 'c2c4491d5',
 'a4346e2e2',
 '1af366d4f',
 'cfff5b7c8',
 'da215e99e',
 '5acd26139',
 '9be9c6cef',
 '1210d0271',
 '21b0a54cb',
 'da35e792b',
 '754c502dd',
 '0b346adbd',
 '0f196b049',
 'b603ed95d',
 '2a50e001c',
 '1e81432e7',
 '10350ea43',
 '3c7c7e24c',
 '7585fce2a',
 '64d036163',
 'f25d9935c',
 'd98484125',
 '95c85e227',
 '9a5273600',
 '746cdb817',
 '6377a6293',
 '7d944fb0c',
 '87eb21c50',
 '5ea313a8c',
 '0987a65a1',
 '2fb7c2443',
 'f5dde409b',
 '1ae50d4c3',
 '2b21cd7d8',
 '0db8a9272',
 '804d8b55b',
 '76f135fa6',
 '7d7182143',
 'f88e61ae6',
 '378ed28e0',
 'ca4ba131e',
 '1352ddae5',
 '2b601ad67',
 '6e42ff7c7',
 '2219

In [156]:
len(features_variance)

256

In [150]:
len(variance.get_support())

1844

In [159]:
len(variance.variances_)

3147

In [179]:
import pandas as pd
varis = pd.DataFrame({'col': int64_columns, 'var': list(variance.variances_)})
varis.sort_values("var", ascending=False, inplace = True)

In [180]:
varis_dif0 = varis[varis.var >= 0]

TypeError: '>=' not supported between instances of 'method' and 'int'

In [205]:
varis.iloc[:10,:]

Unnamed: 0,col,var
240,f1c272f04,1379889000000000.0
1812,b77c707ef,1343034000000000.0
2198,1029d9146,1205822000000000.0
2004,ff3b49c1d,1187637000000000.0
1755,56cb93fd8,1185721000000000.0
1208,4c835bd02,1181086000000000.0
420,c059f2574,1177803000000000.0
542,33e4f9a0e,1176518000000000.0
1274,6488c8200,563830600000000.0
2637,453128993,554293600000000.0


In [186]:
varis

Unnamed: 0,col,var
240,f1c272f04,1.379889e+15
1812,b77c707ef,1.343034e+15
2198,1029d9146,1.205822e+15
2004,ff3b49c1d,1.187637e+15
1755,56cb93fd8,1.185721e+15
1208,4c835bd02,1.181086e+15
420,c059f2574,1.177803e+15
542,33e4f9a0e,1.176518e+15
1274,6488c8200,5.638306e+14
2637,453128993,5.542936e+14
