In [1]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib
from sklearn import cross_validation
from sklearn import grid_search

from sklearn import linear_model
from sklearn import ensemble

from sklearn import metrics
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)

In [2]:
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
model_name = 'stack_gb'

In [3]:
!ls -lrt submissions

total 22524
-rw-r--r--. 1 root root 1074581 Jan  7 21:09 submission_rfc-2017-01-07-21-07-44.csv
-rw-r--r--. 1 root root 1074757 Jan  7 21:19 submission-rfc-2017-01-07-21-17-53.csv
-rw-r--r--. 1 root root 1186344 Jan  7 22:26 submission-rfc-tuned-2017-01-07-21-25-04.csv
-rw-r--r--. 1 root root 1284088 Jan  7 23:34 submission-rfc-tuned-2017-01-07-22-54-35.csv
-rw-r--r--. 1 root root 1074327 Jan  7 23:54 submission-bagging-2017-01-07-23-46-27.csv
-rw-r--r--. 1 root root 2216945 Jan  8 01:11 submission-gb-tuned-2017-01-08-00-18-20.csv
-rw-r--r--. 1 root root 2110632 Jan  8 03:41 submission-vote-ensemble-2017-01-08-02-49-14.csv
-rw-r--r--. 1 root root 2110653 Jan  8 16:08 submission-ada-tuned-2017-01-08-16-05-11.csv
-rw-r--r--. 1 root root 2173040 Jan  8 16:20 submission-vote-ensemble-2017-01-08-16-11-39.csv
-rw-r--r--. 1 root root 2124339 Jan  8 16:52 submission-linear-tuned-2017-01-08-16-16-12.csv
-rw-r--r--. 1 root root 2124339 Jan  8 16:55 submission-linear-tuned-2017-01-08-1

In [4]:
training_features = pd.read_csv('data/training_features.csv')
test_features = pd.read_csv('data/test_features.csv')

In [5]:
X_full = training_features.ix[:,2:]
y_full = training_features.SeriousDlqin2yrs

X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X_full, y_full, test_size=0.2, stratify=y_full, random_state=42)

In [6]:
'''
submission-rfc-tuned-2017-01-07-22-54-35.csv
submission-gb-tuned-2017-01-08-00-18-20.csv
submission-ada-tuned-2017-01-08-16-05-11.csv
submission-vote-ensemble-2017-01-08-16-11-39.csv
'''

rfc_mdl = joblib.load('models/rfc-tuned-2017-01-07-22-54-35.pkl')
gb_mdl = joblib.load('models/gb-tuned-2017-01-08-00-18-20.pkl')
ada_mdl = joblib.load('models/ada-tuned-2017-01-08-16-05-11.pkl')
ens_mdl = joblib.load('models/vote-ensemble-2017-01-08-16-11-39.pkl')

In [7]:
rfc_pred = rfc_mdl.predict(X_full)
gb_pred = gb_mdl.predict(X_full)
ada_pred = ada_mdl.predict(X_full) 
ens_pred = ens_mdl.predict(X_full) 

rfc_pred_proba = rfc_mdl.predict_proba(X_full)
gb_pred_proba = gb_mdl.predict_proba(X_full)
ada_pred_proba = ada_mdl.predict_proba(X_full)
ens_pred_proba = ens_mdl.predict_proba(X_full)

[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.8s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    1.7s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    3.1s
[Parallel(n_jobs=16)]: Done 1218 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Done 1768 tasks      | elapsed:    7.0s
[Parallel(n_jobs=16)]: Done 2418 tasks      | elapsed:    9.6s
[Parallel(n_jobs=16)]: Done 3168 tasks      | elapsed:   12.6s
[Parallel(n_jobs=16)]: Done 4018 tasks      | elapsed:   15.9s
[Parallel(n_jobs=16)]: Done 4968 tasks      | elapsed:   19.7s
[Parallel(n_jobs=16)]: Done 5000 out of 5000 | elapsed:   19.8s finished
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.7s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    1.7s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    3.1s
[Parallel(n_jobs=16)]: Done 1218 tasks      | elapsed

In [8]:
'''X_stack = pd.DataFrame(
  np.hstack([rfc_pred_proba, gb_pred_proba, ada_pred_proba, ens_pred_proba]), 
  columns=['rfc_0', 'rfc_1', 'gb_0', 'gb_1', 'ada_0', 'ada_1', 'ens_0', 'ens_1'])
X_stack.head()'''

X_stack = pd.DataFrame(
  np.hstack([rfc_pred.reshape(-1, 1), gb_pred.reshape(-1, 1), ada_pred.reshape(-1, 1), ens_pred.reshape(-1, 1), 
             rfc_pred_proba, gb_pred_proba, ada_pred_proba, ens_pred_proba]), 
  columns=['rfc', 'gb', 'ada', 'ens', 'rfc_0', 'rfc_1', 'gb_0', 'gb_1', 'ada_0', 'ada_1', 'ens_0', 'ens_1'])
X_stack.head()

Unnamed: 0,rfc,gb,ada,ens,rfc_0,rfc_1,gb_0,gb_1,ada_0,ada_1,ens_0,ens_1
0,0.0,0.0,0.0,0.0,0.598,0.402,0.620199,0.379801,0.500112,0.499888,0.599872,0.400128
1,0.0,0.0,0.0,0.0,0.9716,0.0284,0.932363,0.067637,0.506503,0.493497,0.877131,0.122869
2,0.0,0.0,0.0,0.0,0.9304,0.0696,0.648939,0.351061,0.501832,0.498168,0.668132,0.331868
3,0.0,0.0,0.0,0.0,0.9964,0.0036,0.976785,0.023215,0.509138,0.490862,0.91278,0.08722
4,0.0,0.0,0.0,0.0,0.903,0.097,0.901542,0.098458,0.506518,0.493482,0.845318,0.154682


In [9]:
y_stack = y_full

In [10]:
feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(X_stack)

clf = tf.contrib.learn.LinearClassifier(feature_columns=feature_columns)

clf.fit(X_stack, y_stack, steps=200)

Explicitly set `enable_centered_bias` to 'True' if you want to keep existing behaviour.
INFO:tensorflow:Using config: {'task': 0, 'save_summary_steps': 100, 'keep_checkpoint_max': 5, '_is_chief': True, 'save_checkpoints_secs': 600, 'evaluation_master': '', 'tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, 'master': '', 'keep_checkpoint_every_n_hours': 10000, '_job_name': None, 'cluster_spec': None, 'tf_random_seed': None, 'num_ps_replicas': 0}
INFO:tensorflow:Setting feature info to TensorSignature(dtype=tf.float64, shape=TensorShape([Dimension(None), Dimension(12)]), is_sparse=False)
INFO:tensorflow:Setting targets info to TensorSignature(dtype=tf.float64, shape=TensorShape([Dimension(None)]), is_sparse=False)
INFO:tensorflow:Transforming feature_column _RealValuedColumn(column_name='', dimension=12, default_value=None, dtype=tf.float64, normalizer=None)
INFO:tensorflow:Create CheckpointSaverHook
INFO:tensorflow:loss = 0.693323, step = 1
INFO:tensorflow:Saving checkp

Estimator(params={'enable_centered_bias': True, 'weight_column_name': None, 'optimizer': <tensorflow.python.training.ftrl.FtrlOptimizer object at 0x7ff44db8cfd0>, 'feature_columns': [_RealValuedColumn(column_name='', dimension=12, default_value=None, dtype=tf.float64, normalizer=None)], 'n_classes': 2, 'joint_weights': False, 'gradient_clip_norm': None, 'num_ps_replicas': 0})

In [11]:
clf.evaluate(X_stack, y_stack)

INFO:tensorflow:Transforming feature_column _RealValuedColumn(column_name='', dimension=12, default_value=None, dtype=tf.float64, normalizer=None)
INFO:tensorflow:Restored model from /tmp/tmpoUq70g
INFO:tensorflow:Eval steps [0,inf) for training step 200.
INFO:tensorflow:Input iterator is exhausted.
INFO:tensorflow:Saving evaluation summary for 200 step: loss = 0.0744768, auc = 0.973357, accuracy/threshold_0.500000_mean = 0.986927, labels/actual_target_mean = 0.06684, recall/positive_threshold_0.500000_mean = 0.835029, labels/prediction_mean = 0.064395, accuracy/baseline_target_mean = 0.06684, precision/positive_threshold_0.500000_mean = 0.964627, accuracy = 0.986927


{'accuracy': 0.98692667,
 'accuracy/baseline_target_mean': 0.06684,
 'accuracy/threshold_0.500000_mean': 0.98692667,
 'auc': 0.97335696,
 'global_step': 200,
 'labels/actual_target_mean': 0.06684,
 'labels/prediction_mean': 0.064395018,
 'loss': 0.074476801,
 'precision/positive_threshold_0.500000_mean': 0.96462727,
 'recall/positive_threshold_0.500000_mean': 0.83502895}

In [14]:
X_stack.head()

Unnamed: 0,rfc,gb,ada,ens,rfc_0,rfc_1,gb_0,gb_1,ada_0,ada_1,ens_0,ens_1
0,0.0,0.0,0.0,0.0,0.598,0.402,0.620199,0.379801,0.500112,0.499888,0.599872,0.400128
1,0.0,0.0,0.0,0.0,0.9716,0.0284,0.932363,0.067637,0.506503,0.493497,0.877131,0.122869
2,0.0,0.0,0.0,0.0,0.9304,0.0696,0.648939,0.351061,0.501832,0.498168,0.668132,0.331868
3,0.0,0.0,0.0,0.0,0.9964,0.0036,0.976785,0.023215,0.509138,0.490862,0.91278,0.08722
4,0.0,0.0,0.0,0.0,0.903,0.097,0.901542,0.098458,0.506518,0.493482,0.845318,0.154682


In [114]:
'''flename = 'models/%s-%s.pkl' % (model_name, timestamp)

joblib.dump(clf, flename)

log = '%s,%s\n' % (flename, clf)

fle = open('model_log.csv', 'a')
fle.write(log)
fle.flush()
fle.close()

print flename'''

models/stack_gb-2017-01-08-17-01-50.pkl


In [13]:
X_submission = test_features.ix[:,2:]
y_submission = clf.predict_proba(X_submission.astype(np.float32))

Instructions for updating:
The default behavior of predict() is changing. The default value for
as_iterable will change to True, and then the flag will be removed
altogether. The behavior of this flag is described below.
INFO:tensorflow:Transforming feature_column _RealValuedColumn(column_name='', dimension=12, default_value=None, dtype=tf.float64, normalizer=None)
INFO:tensorflow:Loading model from checkpoint: /tmp/tmpoUq70g/model.ckpt-200-?????-of-00001.


InvalidArgumentError: Input to reshape is a tensor with 8018737 values, but the requested shape requires a multiple of 12
	 [[Node: linear/linear/reshape = Reshape[T=DT_FLOAT, Tshape=DT_INT32, _device="/job:localhost/replica:0/task:0/cpu:0"](linear/linear/ToFloat, linear/linear/reshape/shape)]]

Caused by op u'linear/linear/reshape', defined at:
  File "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python2.7/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python2.7/dist-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/zmqshell.py", line 498, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-5722d2643b9a>", line 2, in <module>
    y_submission = clf.predict_proba(X_submission.astype(np.float32))
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/estimators/linear.py", line 545, in predict_proba
    as_iterable=as_iterable)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/framework/python/framework/deprecation.py", line 272, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 439, in predict
    as_iterable=as_iterable)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 793, in _infer_model
    predictions = self._get_predict_ops(features)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 1011, in _get_predict_ops
    predictions, _, _ = self._call_model_fn(features, targets, ModeKeys.INFER)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 944, in _call_model_fn
    return self._model_fn(features, targets, mode=mode, params=self.params)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/estimators/linear.py", line 213, in _linear_classifier_model_fn
    scope=scope))
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/layers/python/layers/feature_column_ops.py", line 358, in weighted_sum_from_feature_columns
    transformed_tensor = transformer.transform(column)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/layers/python/layers/feature_column_ops.py", line 583, in transform
    feature_column.insert_transformed_feature(self._columns_to_tensors)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/layers/python/layers/feature_column.py", line 1226, in insert_transformed_feature
    math_ops.to_float(input_tensor), flattened_shape, name="reshape")
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 1977, in reshape
    name=name)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 749, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2380, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1298, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): Input to reshape is a tensor with 8018737 values, but the requested shape requires a multiple of 12
	 [[Node: linear/linear/reshape = Reshape[T=DT_FLOAT, Tshape=DT_INT32, _device="/job:localhost/replica:0/task:0/cpu:0"](linear/linear/ToFloat, linear/linear/reshape/shape)]]


In [15]:
X_submission.head()

Unnamed: 0,NumberRealEstateLoansOrLines,UnknownNumberOfDependents,NoDependents,UnknownMonthlyIncome,NoIncome,ZeroDebtRatio,UnknownIncomeDebtRatio,WeirdRevolvingUtilization,ZeroRevolvingUtilization,LogDebt,...,LogDebtPerDelinquency,LogDebtPer90DaysLate,LogUnknownIncomeDebtRatio,LogUnknownIncomeDebtRatioPerPerson,LogUnknownIncomeDebtRatioPerLine,LogUnknownIncomeDebtRatioPerRealEstateLine,LogUnknownIncomeDebtRatioPerDelinquency,LogUnknownIncomeDebtRatioPer90DaysLate,LowAge,LogAge
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.919508,...,6.919508,6.919508,0.0,0.0,-1.609438,0.0,0.0,0.0,0.0,3.258096
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.48042,...,8.48042,8.48042,0.0,-1.098612,-2.772589,-1.609438,0.0,0.0,0.0,3.688879
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.159178,...,8.159178,8.159178,0.0,-1.098612,-2.564949,-0.693147,0.0,0.0,0.0,3.73767
3,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.993982,...,7.300835,7.993982,0.0,0.0,-2.079442,-1.098612,-0.693147,0.0,0.0,3.044523
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.343547,...,4.343547,4.343547,0.0,-0.693147,-1.609438,0.0,0.0,0.0,0.0,2.302585


In [116]:
submission = pd.DataFrame()
submission['Id'] = test_features.ID.astype(np.int)
submission['Probability'] = y_submission[:,1]
submission.head()

Unnamed: 0,Id,Probability
0,1,0.055928
1,2,0.043133
2,3,0.01376
3,4,0.066048
4,5,0.071679


In [117]:
flename = 'submissions/submission-%s-%s.csv' % (model_name, timestamp)
submission.to_csv(flename, index=False)
print flename

submissions/submission-stack_gb-2017-01-08-17-01-50.csv
