In [28]:
# import modules
#
# Standard library first, then third-party packages; project paths are set at
# the bottom of this cell.

import os

import numpy as np
import pandas as pd

try:
    # scikit-learn >= 0.18 exposes the CV helpers in model_selection
    # (the sklearn.cross_validation module was removed in 0.20).
    from sklearn.model_selection import cross_val_score, cross_val_predict
except ImportError:
    # Fallback for older scikit-learn releases that only ship the legacy module.
    from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

# project_dir is the parent of the current working directory; input CSVs are
# read from <project_dir>/data and submissions are written to <project_dir>/subs.
project_dir = os.path.abspath('..')
data_dir = os.path.join(project_dir, 'data')
subs_dir = os.path.join(project_dir, 'subs')

In [6]:
# load train-test
# Both CSVs live in data_dir; read them in one pass, train first, then test.
train, test = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

## Inspect data

In [7]:
# Preview the first rows of the training frame (head() shows 5 rows by default).
train.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,39205.17,0
1,3,2,34,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,49278.03,0
2,4,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,67333.77,0
3,8,2,37,0,195,195,0,0,0,0,...,0,0,0,0,0,0,0,0,64007.97,0
4,10,2,39,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,117310.979016,0


In [8]:
# Non-null count per training column: a quick check for missing values.
train.count()

ID                               76020
var3                             76020
var15                            76020
imp_ent_var16_ult1               76020
imp_op_var39_comer_ult1          76020
imp_op_var39_comer_ult3          76020
imp_op_var40_comer_ult1          76020
imp_op_var40_comer_ult3          76020
imp_op_var40_efect_ult1          76020
imp_op_var40_efect_ult3          76020
imp_op_var40_ult1                76020
imp_op_var41_comer_ult1          76020
imp_op_var41_comer_ult3          76020
imp_op_var41_efect_ult1          76020
imp_op_var41_efect_ult3          76020
imp_op_var41_ult1                76020
imp_op_var39_efect_ult1          76020
imp_op_var39_efect_ult3          76020
imp_op_var39_ult1                76020
imp_sal_var16_ult1               76020
ind_var1_0                       76020
ind_var1                         76020
ind_var2_0                       76020
ind_var2                         76020
ind_var5_0                       76020
ind_var5                 

In [9]:
# Summary statistics for the training columns.
# NOTE(review): in the recorded output var3 has a minimum of -999999, which looks
# like a missing-value sentinel (confirm), and the TARGET mean is ~0.04, i.e. the
# positive class is rare.
train.describe()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,75964.050723,-1523.199277,33.212865,86.208265,72.363067,119.529632,3.55913,6.472698,0.412946,0.567352,...,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,76.026165,56.614351,117235.80943,0.039569
std,43781.947379,39033.462364,12.956486,1614.757313,339.315831,546.266294,93.155749,153.737066,30.604864,36.513513,...,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,4040.337842,2852.579397,182664.598503,0.194945
min,1.0,-999999.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75,0.0
25%,38104.75,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.6125,0.0
50%,76043.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.16,0.0
75%,113748.75,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.2525,0.0
max,151838.0,238.0,105.0,210000.0,12888.03,21024.81,8237.82,11073.57,6600.0,6600.0,...,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034738.76,1.0


In [12]:
# Column dtypes of the training frame.
train.dtypes

ID                                 int64
var3                               int64
var15                              int64
imp_ent_var16_ult1               float64
imp_op_var39_comer_ult1          float64
imp_op_var39_comer_ult3          float64
imp_op_var40_comer_ult1          float64
imp_op_var40_comer_ult3          float64
imp_op_var40_efect_ult1          float64
imp_op_var40_efect_ult3          float64
imp_op_var40_ult1                float64
imp_op_var41_comer_ult1          float64
imp_op_var41_comer_ult3          float64
imp_op_var41_efect_ult1          float64
imp_op_var41_efect_ult3          float64
imp_op_var41_ult1                float64
imp_op_var39_efect_ult1          float64
imp_op_var39_efect_ult3          float64
imp_op_var39_ult1                float64
imp_sal_var16_ult1               float64
ind_var1_0                         int64
ind_var1                           int64
ind_var2_0                         int64
ind_var2                           int64
ind_var5_0      

In [13]:
# Non-null count per test column, for comparison with the training frame.
test.count()

ID                               75818
var3                             75818
var15                            75818
imp_ent_var16_ult1               75818
imp_op_var39_comer_ult1          75818
imp_op_var39_comer_ult3          75818
imp_op_var40_comer_ult1          75818
imp_op_var40_comer_ult3          75818
imp_op_var40_efect_ult1          75818
imp_op_var40_efect_ult3          75818
imp_op_var40_ult1                75818
imp_op_var41_comer_ult1          75818
imp_op_var41_comer_ult3          75818
imp_op_var41_efect_ult1          75818
imp_op_var41_efect_ult3          75818
imp_op_var41_ult1                75818
imp_op_var39_efect_ult1          75818
imp_op_var39_efect_ult3          75818
imp_op_var39_ult1                75818
imp_sal_var16_ult1               75818
ind_var1_0                       75818
ind_var1                         75818
ind_var2_0                       75818
ind_var2                         75818
ind_var5_0                       75818
ind_var5                 

In [15]:
# set data for train-test
# Feature matrix: every training column except the label (TARGET) and the row id (ID).
# .loc with a boolean column mask replaces DataFrame.ix, which is deprecated and
# no longer exists in pandas >= 1.0.
data_x = train.loc[:, (train.columns != 'TARGET') & (train.columns != 'ID')].values
data_y = train['TARGET'].values

# Test features: every test column except the row id.
data_x_test = test.loc[:, test.columns != 'ID'].values

# The model is fitted on data_x and applied to data_x_test, so both must have
# the same number of feature columns.
assert data_x.shape[1] == data_x_test.shape[1], \
    'train/test feature count mismatch: %d vs %d' % (data_x.shape[1], data_x_test.shape[1])

In [29]:
# set model
# Fixed seed so repeated runs (and kernel restarts) build the same forest.
SEED = 42

# 100-tree random forest, trained on 3 parallel workers.
clf1 = RandomForestClassifier(n_jobs=3, n_estimators=100, random_state=SEED)

# Wrap the forest in isotonic probability calibration with 5-fold internal CV.
cccv1 = CalibratedClassifierCV(clf1, method='isotonic', cv=5)

In [30]:
# rmf_cv with 100 trees
# 5-fold cross-validation of the calibrated forest using the estimator's default
# score (accuracy for classifiers).
# NOTE(review): with ~4% positives in TARGET, accuracy is close to the
# majority-class rate; consider passing scoring='roc_auc' for a more
# informative estimate.
cross_val_score(estimator=cccv1, X=data_x, y=data_y, cv=5)

array([ 0.96040776,  0.96040776,  0.96040516,  0.96046833,  0.96046833])

In [31]:
# train and predict
# Fit on the full training set (fit returns the estimator itself), then get
# per-class probabilities for the test rows.
pred = cccv1.fit(data_x, data_y).predict_proba(data_x_test)

In [34]:
# make submission
# Use the sample submission as the template and fill TARGET with the second
# column of predict_proba (the probability of the positive class).
sub = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
# Bracket assignment always writes a column; attribute assignment would silently
# set a plain attribute if the template had no TARGET column.
sub['TARGET'] = pred[:, 1]

# Create the output directory on first use so to_csv does not fail on a fresh checkout.
if not os.path.isdir(subs_dir):
    os.makedirs(subs_dir)
sub.to_csv(os.path.join(subs_dir, 'sub01.csv'), index=False)