In [1]:
import keras
import pandas
import sklearn
import scipy
import dask.dataframe as dd

Using TensorFlow backend.


In [78]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier, BaggingClassifier

In [None]:
# use this code to transfer between csv to parquet
df.to_parquet("train_data.parquet")

In [80]:
training_data = pandas.read_csv("train_data.csv")

In [81]:
training_sample = training_data.sample(frac=0.5)

In [None]:
training_sample.shape

(116759, 247)

In [82]:
training_sample.to_csv("train_data_sample.csv", index=False)

# read sample if already available

In [83]:
training_data = pandas.read_csv("train_data_sample.csv")

In [84]:
training_data.columns

Index(['Unnamed: 0', 'id', 'UF_1', 'UF_2', 'UF_3', 'UF_4', 'UF_5', 'UF_6',
       'UF_7', 'IDADE',
       ...
       'CEP4_7', 'CEP4_8', 'CEP4_9', 'CEP4_10', 'CEP4_11', 'CEP4_12',
       'CEP4_13', 'CEP4_14', 'IND_BOM_1_1', 'IND_BOM_1_2'],
      dtype='object', length=247)

In [85]:
# keep ids and their index on database for further reference
ids = training_data["id"]

In [86]:
features = training_data.drop(["IND_BOM_1_1", "IND_BOM_1_2", "id"], axis=1)

In [87]:
features.head(10)

Unnamed: 0.1,Unnamed: 0,UF_1,UF_2,UF_3,UF_4,UF_5,UF_6,UF_7,IDADE,SEXO_1,...,CEP4_5,CEP4_6,CEP4_7,CEP4_8,CEP4_9,CEP4_10,CEP4_11,CEP4_12,CEP4_13,CEP4_14
0,302318,1,0,1,1,0,0,0,0.533846,0,...,1,0,0,0,0,1,1,0,1,0
1,202786,1,1,0,0,0,1,0,0.604895,1,...,1,0,0,1,1,1,1,0,0,0
2,187801,1,0,1,1,0,0,0,0.571103,1,...,1,1,0,1,0,0,1,0,1,0
3,15837,1,0,1,0,0,0,1,0.625795,0,...,0,1,0,1,0,1,0,0,1,0
4,316106,0,1,1,0,0,1,0,0.272198,0,...,0,0,0,0,0,1,1,0,0,1
5,200617,1,1,0,0,0,0,1,0.107837,1,...,1,0,0,0,0,0,0,1,0,1
6,252077,1,0,0,1,1,0,0,0.566673,1,...,1,1,1,1,1,1,0,0,0,0
7,57332,1,1,0,1,0,0,0,0.174853,1,...,1,1,1,1,1,0,0,0,0,0
8,336640,1,1,0,0,0,1,0,0.394475,0,...,1,0,1,1,0,1,0,0,1,0
9,101615,0,1,1,0,1,0,0,0.31059,0,...,1,0,1,0,0,0,0,0,0,1


In [7]:
labels = training_data["IND_BOM_1_1"]

Vou tentar reduzir a dimensionalidade do dataframe utilizando LDA para poder analisar 
melhor um subconjunto de variáveis e mensurar a acurácia

## Data Cleaning

In [10]:
training_data.dtypes.value_counts()

float64    144
int64      102
dtype: int64

Pandas leu muito dos valores de forma errada, gerando até problemas para o uso de memória, existem formas
melhores de representar estas variáveis.

In [11]:
training_data.columns

Index(['id', 'UF_1', 'UF_2', 'UF_3', 'UF_4', 'UF_5', 'UF_6', 'UF_7', 'IDADE',
       'SEXO_1',
       ...
       'CEP4_7', 'CEP4_8', 'CEP4_9', 'CEP4_10', 'CEP4_11', 'CEP4_12',
       'CEP4_13', 'CEP4_14', 'IND_BOM_1_1', 'IND_BOM_1_2'],
      dtype='object', length=246)

In [12]:
training_data.dtypes

id                                  int64
UF_1                                int64
UF_2                                int64
UF_3                                int64
UF_4                                int64
UF_5                                int64
UF_6                                int64
UF_7                                int64
IDADE                             float64
SEXO_1                              int64
NIVEL_RELACIONAMENTO_CREDITO01    float64
NIVEL_RELACIONAMENTO_CREDITO02    float64
BANCO_REST_IRPF_ULTIMA_1            int64
BANCO_REST_IRPF_ULTIMA_2            int64
BANCO_REST_IRPF_ULTIMA_3            int64
BANCO_REST_IRPF_ULTIMA_4            int64
BANCO_REST_IRPF_ULTIMA_5            int64
BANCO_REST_IRPF_ULTIMA_6            int64
BANCO_REST_IRPF_ULTIMA_7            int64
ATIVIDADE_EMAIL                   float64
EXPOSICAO_ENDERECO                float64
EXPOSICAO_EMAIL                   float64
EXPOSICAO_TELEFONE                float64
ATIVIDADE_ENDERECO                

In [88]:
category_columns = ["UF_1", "UF_2", "UF_3", "UF_4", "UF_5", "UF_6", "UF_7",
                   "BANCO_REST_IRPF_ULTIMA_1", "BANCO_REST_IRPF_ULTIMA_2", "BANCO_REST_IRPF_ULTIMA_3",
                   "BANCO_REST_IRPF_ULTIMA_4", "BANCO_REST_IRPF_ULTIMA_5", "BANCO_REST_IRPF_ULTIMA_6",
                   "BANCO_REST_IRPF_ULTIMA_7", "FLAG_BOLSA_FAMILIA_1", "SIGLA_PARTIDO_FILIADO_1",
                   "SIGLA_PARTIDO_FILIADO_2", "SIGLA_PARTIDO_FILIADO_3", "SIGLA_PARTIDO_FILIADO_4",
                   "SIGLA_PARTIDO_FILIADO_5", "SIGLA_PARTIDO_FILIADO_6", "SIGLA_PARTIDO_FILIADO_7",
                   "FLAG_FILIADO_PARTIDO_POLITICO_1", "FLAG_PROUNI_1", "RENDA_VIZINHANCA_1", 
                   "RENDA_VIZINHANCA_2", "RENDA_VIZINHANCA_3", "RENDA_VIZINHANCA_4", 
                    "COMPARATIVO_RENDA_CEP_1", "COMPARATIVO_RENDA_CEP_2", "COMPARATIVO_RENDA_CEP_3",
                   "COMPARATIVO_RENDA_CEP_4", "COMPARATIVO_RENDA_CEP_5", "CLASSE_SOCIAL_CONSUMIDOR_1",
                   "CLASSE_SOCIAL_CONSUMIDOR_2", "CLASSE_SOCIAL_CONSUMIDOR_3", "CLASSE_SOCIAL_CONSUMIDOR_4",
                   "FLAG_REDE_SOCIAL_1", "FLAG_REDE_SOCIAL_2", "FLAG_REDE_SOCIAL_3",
                   "CEP1_1", "CEP1_2", "CEP1_3", "CEP1_4", "CEP1_5", "CEP2_1", "CEP2_2", "CEP2_3", "CEP2_4",
                   "CEP2_5", "CEP2_6", "CEP2_7", "CEP2_8", "CEP2_9", "CEP3_1", "CEP3_2", "CEP3_3", "CEP3_4",
                   "CEP3_5", "CEP3_6", "CEP3_7", "CEP3_8", "CEP3_9", "CEP3_10", "CEP3_11", "CEP3_12",
                   "CEP4_1", "CEP4_2", "CEP4_3", "CEP4_4", "CEP4_5", "CEP4_6", "CEP4_7", "CEP4_8", "CEP4_9",
                   "CEP4_10", "CEP4_11", "CEP4_12", "CEP4_13", "CEP4_14"]

ordered_category_columns = ["NIVEL_RELACIONAMENTO_CREDITO02", "EXPOSICAO_CONSUMIDOR_EMAILS", 
                            "EXPOSICAO_CONSUMIDOR_TELEFONES"]

In [89]:
for column in training_data.columns:
    print(column)
    values = training_data[column].value_counts()
    if type(values) == list:
        print(values[:10])
    else:
        print(values.head(10))
    print(len(values))
    

Unnamed: 0
2047      1
66954     1
340615    1
118163    1
116114    1
384401    1
284684    1
337294    1
81293     1
341388    1
Name: Unnamed: 0, dtype: int64
194598
id
2047      1
66954     1
340615    1
118163    1
116114    1
384401    1
284684    1
337294    1
81293     1
341388    1
Name: id, dtype: int64
194598
UF_1
1    173076
0     21522
Name: UF_1, dtype: int64
2
UF_2
1    134541
0     60057
Name: UF_2, dtype: int64
2
UF_3
0    101940
1     92658
Name: UF_3, dtype: int64
2
UF_4
0    136851
1     57747
Name: UF_4, dtype: int64
2
UF_5
0    147454
1     47144
Name: UF_5, dtype: int64
2
UF_6
0    152286
1     42312
Name: UF_6, dtype: int64
2
UF_7
0    158282
1     36316
Name: UF_7, dtype: int64
2
IDADE
5.506237e-16    2054
1.000000e+00    1909
3.078642e-01      50
3.008786e-01      46
2.940066e-01      46
3.130324e-01      40
5.574156e-01      37
3.833430e-01      30
3.026960e-01      30
3.440986e-01      30
Name: IDADE, dtype: int64
17157
SEXO_1
1    101673
0     92925
Name: S

In [8]:
for column in category_columns:
    training_data[column] = training_data[column].astype('category')

NameError: name 'category_columns' is not defined

In [None]:
training_data.dtypes.value_counts()

In [17]:
confunsion_matrix = pandas.crosstab(training_data["SEXO_1"], training_data["IND_BOM_1_1"])
confunsion_matrix

IND_BOM_1_1,0,1
SEXO_1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,19330,36823
1,20837,39769


In [18]:
from scipy.stats import chisquare
chisquare(confunsion_matrix)

Power_divergenceResult(statistic=array([  56.54016979,  113.31360978]), pvalue=array([  5.50619646e-14,   1.84208521e-26]))

# Feature Selection

In [27]:
features.corr().abs()

Unnamed: 0,UF_1,UF_2,UF_3,UF_4,UF_5,UF_6,UF_7,IDADE,SEXO_1,NIVEL_RELACIONAMENTO_CREDITO01,...,CEP4_5,CEP4_6,CEP4_7,CEP4_8,CEP4_9,CEP4_10,CEP4_11,CEP4_12,CEP4_13,CEP4_14
UF_1,1.000000,0.209093,0.048255,0.146628,0.118488,0.100570,0.087990,0.019386,0.007811,0.006435,...,0.025403,0.017901,0.002279,0.000400,0.041249,0.004074,0.004246,0.009868,0.022239,0.035315
UF_2,0.209093,1.000000,0.123264,0.247423,0.187327,0.171868,0.180898,0.012217,0.013103,0.014506,...,0.040997,0.006801,0.002159,0.035769,0.002821,0.023900,0.012583,0.021424,0.007369,0.029016
UF_3,0.048255,0.123264,1.000000,0.275167,0.269899,0.246315,0.216494,0.031256,0.003892,0.013546,...,0.011452,0.008139,0.009605,0.062716,0.011476,0.038354,0.017366,0.002350,0.018033,0.011395
UF_4,0.146628,0.247423,0.275167,1.000000,0.126074,0.126333,0.135131,0.020921,0.004924,0.041647,...,0.001024,0.005757,0.009747,0.021085,0.012738,0.019959,0.028406,0.010240,0.023383,0.047764
UF_5,0.118488,0.187327,0.269899,0.126074,1.000000,0.156790,0.121600,0.003239,0.006073,0.018759,...,0.000297,0.002524,0.060533,0.009569,0.000885,0.000713,0.001331,0.023599,0.026219,0.026264
UF_6,0.100570,0.171868,0.246315,0.126333,0.156790,1.000000,0.137971,0.026368,0.017433,0.028672,...,0.002480,0.023013,0.067999,0.022139,0.029033,0.065947,0.025372,0.000670,0.015740,0.003706
UF_7,0.087990,0.180898,0.216494,0.135131,0.121600,0.137971,1.000000,0.002974,0.022839,0.000691,...,0.044504,0.024435,0.005311,0.000662,0.006608,0.035075,0.001038,0.017101,0.034493,0.003431
IDADE,0.019386,0.012217,0.031256,0.020921,0.003239,0.026368,0.002974,1.000000,0.017405,0.029988,...,0.001120,0.004402,0.013037,0.001559,0.008037,0.004025,0.000544,0.002066,0.000616,0.006353
SEXO_1,0.007811,0.013103,0.003892,0.004924,0.006073,0.017433,0.022839,0.017405,1.000000,0.020646,...,0.000789,0.008388,0.000917,0.002623,0.002129,0.002683,0.001580,0.004715,0.002303,0.003675
NIVEL_RELACIONAMENTO_CREDITO01,0.006435,0.014506,0.013546,0.041647,0.018759,0.028672,0.000691,0.029988,0.020646,1.000000,...,0.002201,0.004603,0.001246,0.002234,0.003535,0.001846,0.000003,0.000372,0.004002,0.005085


In [28]:
training_data.head(10)

Unnamed: 0,id,UF_1,UF_2,UF_3,UF_4,UF_5,UF_6,UF_7,IDADE,SEXO_1,...,CEP4_7,CEP4_8,CEP4_9,CEP4_10,CEP4_11,CEP4_12,CEP4_13,CEP4_14,IND_BOM_1_1,IND_BOM_1_2
0,33220,1,1,1,0,0,0,0,0.217846,0,...,0,0,1,1,0,0,0,0,0,1
1,164123,0,0,1,1,0,1,0,0.7504,0,...,0,1,0,1,1,0,0,0,0,1
2,340086,1,0,0,0,1,1,0,0.074953,0,...,0,0,0,0,0,1,1,1,1,0
3,237182,1,1,1,0,0,0,0,0.355855,0,...,0,1,1,1,0,0,0,1,1,0
4,335250,1,0,1,0,0,1,0,0.930834,1,...,0,1,0,0,0,1,0,0,1,0
5,149584,1,1,1,0,0,0,0,0.678045,0,...,1,0,0,1,0,0,1,0,1,0
6,71560,1,1,0,0,0,0,1,0.485231,1,...,1,0,1,0,1,0,0,0,1,0
7,118664,0,1,1,0,1,0,0,0.654419,1,...,0,0,0,0,0,1,1,1,1,0
8,19053,1,1,0,1,0,0,0,0.358808,1,...,0,0,0,0,1,1,0,1,1,0
9,368546,1,1,1,0,0,0,0,0.132485,1,...,1,1,1,1,0,0,0,1,1,0


In [29]:
corr_matrix = features.corr().abs()

In [30]:
values = corr_matrix[corr_matrix > 0.95].count()

In [31]:
values[values > 1]

FLAG_BOLSA_FAMILIA_1        2
RENDA_VIZINHANCA_1          2
RENDA_VIZINHANCA_4          2
FLAG_PROGRAMAS_SOCIAIS_1    2
dtype: int64

In [32]:
corr_matrix[corr_matrix["RENDA_VIZINHANCA_1"] > 0.90]

Unnamed: 0,UF_1,UF_2,UF_3,UF_4,UF_5,UF_6,UF_7,IDADE,SEXO_1,NIVEL_RELACIONAMENTO_CREDITO01,...,CEP4_5,CEP4_6,CEP4_7,CEP4_8,CEP4_9,CEP4_10,CEP4_11,CEP4_12,CEP4_13,CEP4_14
RENDA_VIZINHANCA_1,0.039497,0.126106,0.095179,0.120255,0.040225,0.021913,0.094908,0.056833,0.011343,0.004249,...,0.014022,0.030272,0.002249,0.002028,0.003034,0.005432,0.003514,0.008236,0.007406,0.022686
RENDA_VIZINHANCA_4,0.042773,0.123006,0.096118,0.121031,0.041934,0.020677,0.093601,0.053994,0.009825,0.003914,...,0.014699,0.02747,0.003278,0.00088,0.00388,0.005072,0.002067,0.00698,0.006323,0.020939


RENDA_VIZINHANCA_1 e  RENDA_VIZINHANCA_4 possuem alta correlação e FLAG_BOLSA_FAMILIA_1 e FLAG_PROGRAMAS_SOCIAIS_1
também. Logo, vou ficar com somente duas das 4.
Contudo, este teste é para apenas para variáveis correlacionadas linearmente, existem testes melhores para as variáveis categóricas.

In [None]:
features = features.drop(["RENDA_VIZINHANCA_1", "FLAG_BOLSA_FAMILIA_1"], axis="columns")

In [91]:
features.shape

(194598, 244)

In [92]:
features.drop_duplicates(inplace=True)
features.shape

(194598, 244)

In [None]:
training_data_model = pandas.concat([features, labels], axis=1)

In [None]:
training_data_model.head(10)

In [38]:
labels.value_counts()

1    76592
0    40167
Name: IND_BOM_1_1, dtype: int64

# Feature Engineering

In this module we're trying to build the feature that we see will be more useful in order to learn about
the class we need to predict.

# Model Tranining

In [93]:
features = training_data.drop(["IND_BOM_1_1", "IND_BOM_1_2", "id"], axis=1)

In [94]:
features.head(10)

Unnamed: 0.1,Unnamed: 0,UF_1,UF_2,UF_3,UF_4,UF_5,UF_6,UF_7,IDADE,SEXO_1,...,CEP4_5,CEP4_6,CEP4_7,CEP4_8,CEP4_9,CEP4_10,CEP4_11,CEP4_12,CEP4_13,CEP4_14
0,302318,1,0,1,1,0,0,0,0.533846,0,...,1,0,0,0,0,1,1,0,1,0
1,202786,1,1,0,0,0,1,0,0.604895,1,...,1,0,0,1,1,1,1,0,0,0
2,187801,1,0,1,1,0,0,0,0.571103,1,...,1,1,0,1,0,0,1,0,1,0
3,15837,1,0,1,0,0,0,1,0.625795,0,...,0,1,0,1,0,1,0,0,1,0
4,316106,0,1,1,0,0,1,0,0.272198,0,...,0,0,0,0,0,1,1,0,0,1
5,200617,1,1,0,0,0,0,1,0.107837,1,...,1,0,0,0,0,0,0,1,0,1
6,252077,1,0,0,1,1,0,0,0.566673,1,...,1,1,1,1,1,1,0,0,0,0
7,57332,1,1,0,1,0,0,0,0.174853,1,...,1,1,1,1,1,0,0,0,0,0
8,336640,1,1,0,0,0,1,0,0.394475,0,...,1,0,1,1,0,1,0,0,1,0
9,101615,0,1,1,0,1,0,0,0.31059,0,...,1,0,1,0,0,0,0,0,0,1


In [95]:
labels = training_data["IND_BOM_1_1"]

In [97]:
labels.shape

(194598,)

In [96]:
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense

from sklearn.preprocessing import StandardScaler

In [98]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=1/4, 
                                                    random_state=42, stratify=labels)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1/3, 
                                                  random_state=42, stratify=y_train)

In [99]:
input_dimension = X_train.shape[1]
input_dimension

244

# Neural Networks

In [36]:
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=20,
                              verbose=0, mode='auto')

In [37]:
classifier_1 = Sequential()
classifier_1.add(Dense(16, activation='tanh', input_dim=input_dimension))
classifier_1.add(Dense(16, activation='relu', input_dim=input_dimension))
classifier_1.add(Dense(1, activation='sigmoid'))

classifier_1.compile(optimizer='adam', loss='mean_squared_error', metrics=["accuracy"])

In [38]:
classifier_2 = Sequential()
classifier_2.add(Dense(16, activation='tanh', input_dim=input_dimension))
classifier_2.add(Dense(16, activation='tanh', input_dim=input_dimension))
classifier_2.add(Dense(8, activation='relu', input_dim=input_dimension/2))
classifier_2.add(Dense(1, activation='sigmoid'))

classifier_2.compile(optimizer='adam', loss='mean_squared_error', metrics=["accuracy"])

In [39]:
classifier_3 = Sequential()
classifier_3.add(Dense(16, activation='relu', input_dim=input_dimension))
classifier_3.add(Dense(8, activation='relu', input_dim=input_dimension))
classifier_3.add(Dense(1, activation='sigmoid'))

classifier_3.compile(optimizer='adam', loss='mean_squared_error', metrics=["accuracy"])

I use as_matrix because Keras expects a Numpy array instead of a dataframe.

In [40]:
classifier_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 16)                3904      
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 4,193
Trainable params: 4,193
Non-trainable params: 0
_________________________________________________________________


In [41]:
classifier_2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 16)                3904      
_________________________________________________________________
dense_5 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_6 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 9         
Total params: 4,321
Trainable params: 4,321
Non-trainable params: 0
_________________________________________________________________


In [42]:
classifier_3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 16)                3904      
_________________________________________________________________
dense_9 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 9         
Total params: 4,049
Trainable params: 4,049
Non-trainable params: 0
_________________________________________________________________


In [43]:
model = classifier_1.fit(X_train.as_matrix(), y_train.as_matrix(),epochs=500, callbacks=[early_stopping], validation_split=0.15)

  """Entry point for launching an IPython kernel.


Train on 49622 samples, validate on 8757 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500


In [45]:
model = classifier_3.fit(X_train.as_matrix(), y_train.as_matrix(),epochs=500, callbacks=[early_stopping], validation_split=0.15)

  """Entry point for launching an IPython kernel.


Train on 49622 samples, validate on 8757 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500


In [44]:
model = classifier_2.fit(X_train.as_matrix(), y_train.as_matrix(),epochs=500, callbacks=[early_stopping], validation_split=0.15)

  """Entry point for launching an IPython kernel.


Train on 49622 samples, validate on 8757 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500


In [46]:
from keras.layers import concatenate
from keras.models import Model
from keras.layers import Input
from keras.layers.core import Dense
from keras.layers.merge import concatenate

In [47]:
inputs = Input(shape=(243,))

x1 = Dense(16, activation="tanh")(inputs)
x1 = Dense(16, activation="relu")(x1)

x2 = Dense(16, activation="tanh")(inputs)
x2 = Dense(8, activation="relu")(x2)

x3 = Dense(16, activation="tanh")(inputs)
x3 = Dense(16, activation="tanh")(x3)
x3 = Dense(8, activation="relu")(x3)

x4 = concatenate([x1,x2,x3])

prediction = Dense(1, activation="sigmoid")(x4)

voting_classifier = Model(inputs=inputs, outputs= prediction)
voting_classifier.compile(optimizer='adam', loss='mean_squared_error', metrics=["accuracy"])

In [53]:
early_stopping_voting = keras.callbacks.EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=10,
                              verbose=0, mode='auto')

In [54]:
voting_classifier.fit(X_train.as_matrix(), y_train.as_matrix(),epochs=500, callbacks=[early_stopping_voting], validation_split=0.15)

  """Entry point for launching an IPython kernel.


Train on 49622 samples, validate on 8757 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500


<keras.callbacks.History at 0x2a449f3c9e8>

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier

In [None]:
voting_classifier = VotingClassifier([('classifier_1', classifier_1), ('classifier_2', classifier_2), ('classifier_3',classifier_3)], voting='soft')

In [None]:
voting_classifier.fit(X_train, y_train)

# Random Forest

In [100]:
# parameters to choose
number_estimators = [60, 80]
max_features = ["sqrt", "log2", 1]
max_depth = [20, 30]

In [101]:
rf_classifier = GridSearchCV(estimator=RandomForestClassifier(), 
                          param_grid=dict(
                              n_estimators=number_estimators,
                              max_features=max_features,
                              max_depth=max_depth), n_jobs=-1)

In [102]:
rf_classifier.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [60, 80], 'max_features': ['sqrt', 'log2', 1], 'max_depth': [20, 30], 'class_weight': ['balanced', {0: 1, 1: 1}]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [103]:
rf_classifier

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [60, 80], 'max_features': ['sqrt', 'log2', 1], 'max_depth': [20, 30], 'class_weight': ['balanced', {0: 1, 1: 1}]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [104]:
rf_classifier.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 1},
            criterion='gini', max_depth=20, max_features='sqrt',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=80, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [105]:
rf_classifier.best_params_

{'class_weight': {0: 1, 1: 1},
 'max_depth': 20,
 'max_features': 'sqrt',
 'n_estimators': 80}

### The score of the rf_classifier alone

In [110]:
classifier = rf_classifier.best_estimator_

In [111]:
y_train_pred = classifier.predict(X_train.as_matrix()).ravel()
y_test_pred = classifier.predict(X_test.as_matrix()).ravel()

  """Entry point for launching an IPython kernel.
  


In [113]:
# This returns an array for the probabilities of being each class, so it depends what the focus will be
# in our case, the focus is at class 1
y_test_pred_prob = classifier.predict_proba(X_test.as_matrix())[:, 1]

  This is separate from the ipykernel package so we can avoid doing imports until


In [114]:
print('\nPerformance no conjunto de teste:')
accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics(y_test, y_test_pred.round(), y_test_pred_prob)
print_metrics_summary(accuracy, recall, precision, f1, auroc, aupr)


Performance no conjunto de teste:

Accuracy:         0.6718
Recall:           0.8998
Precision:        0.6922
F1:               0.7825
AUROC:            0.6745
AUPR:             0.7921


# XGBoost

## Parameter Selection

In [None]:
# We will use grid search to improve our search for parameters using Cross Validation
number_estimators = [100, 120, 140]
loss_function = ["deviance", "exponential"]
min_samples_leaf = [1, 0.05, 0.5]
sub_samples = [0.85, 0.8, 0.7]
max_features = ["sqrt"]
xgboost_classifier = GridSearchCV(estimator=GradientBoostingClassifier(), 
                          param_grid=dict(
                              n_estimators=number_estimators,
                              max_features=max_features,
                              subsample=sub_samples,
                          min_samples_leaf=min_samples_leaf,
                          loss=loss_function), n_jobs=-1)

In [None]:
xgboost_classifier.fit(X_train, y_train)

In [None]:
# as I said in the documentation, GridSeach uses a stratified 3-fold cross validation because a Classifier was passed
# instead of a recgressor

xgboost_classifier.best_params_

### Score of the single XGBoost

In [None]:
classifier = xgboost_classifier.best_estimator_

In [None]:
y_train_pred = classifier.predict(X_train.as_matrix()).ravel()
y_test_pred = classifier.predict(X_test.as_matrix()).ravel()

In [None]:
# This returns an array for the probabilities of being each class, so it depends what the focus will be
# in our case, the focus is at class 1
y_test_pred_prob = classifier.predict_proba(X_test.as_matrix())[:, 1]

In [None]:
print('\nPerformance no conjunto de teste:')
accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics(y_test, y_test_pred.round(), y_test_pred_prob)
print_metrics_summary(accuracy, recall, precision, f1, auroc, aupr)

# Ensemble classifiers (Voting)

## Random Forest and XGBoost together

In [66]:
# I get the best estimator of a previous chosen xgboost classifier
classifier = VotingClassifier([('xgboost', xgboost_classifier.best_estimator_), ('randomforest', rf_classifier.best_estimator_)], voting='soft')

In [67]:
classifier.fit(X_train, y_train)

VotingClassifier(estimators=[('xgboost', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_l...imators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [68]:
classifier

VotingClassifier(estimators=[('xgboost', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_l...imators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

# Evaluation

In [107]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score, average_precision_score

In [69]:
y_train_pred = classifier.predict(X_train.as_matrix()).ravel()
y_test_pred = classifier.predict(X_test.as_matrix()).ravel()

  """Entry point for launching an IPython kernel.
  


é bom prestar atenção se os valores estão próximo, caso contrário, existe uma boa indicação de que houve
overfitting e o modelo não consegue generalizar tão bem.

In [70]:
print("Mean Square error in train: {:0.1f}".format(mse(y_train, y_train_pred)))
print("Mean Square error in test: {:0.1f}".format(mse(y_test, y_test_pred)))

Mean Square error in train: 0.1
Mean Square error in test: 0.3


In [71]:
def compute_performance_metrics(y, y_pred_class, y_pred_scores=None):
    accuracy = accuracy_score(y, y_pred_class)
    recall = recall_score(y, y_pred_class)
    precision = precision_score(y, y_pred_class)
    f1 = f1_score(y, y_pred_class)
    performance_metrics = (accuracy, recall, precision, f1)
    if y_pred_scores is not None:
        auroc = roc_auc_score(y, y_pred_scores)
        aupr = average_precision_score(y, y_pred_scores)
        performance_metrics = performance_metrics + (auroc, aupr)
    return performance_metrics

In [72]:
def print_metrics_summary(accuracy, recall, precision, f1, auroc=None, aupr=None):
    print()
    print("{metric:<18}{value:.4f}".format(metric="Accuracy:", value=accuracy))
    print("{metric:<18}{value:.4f}".format(metric="Recall:", value=recall))
    print("{metric:<18}{value:.4f}".format(metric="Precision:", value=precision))
    print("{metric:<18}{value:.4f}".format(metric="F1:", value=f1))
    if auroc is not None:
        print("{metric:<18}{value:.4f}".format(metric="AUROC:", value=auroc))
    if aupr is not None:
        print("{metric:<18}{value:.4f}".format(metric="AUPR:", value=aupr))

In [73]:
# This returns an array for the probabilities of being each class, so it depends what the focus will be
# in our case, the focus is at class 1
y_test_pred_prob = classifier.predict_proba(X_test.as_matrix())[:, 1]

  This is separate from the ipykernel package so we can avoid doing imports until


In [74]:
y_test_pred.shape

(29190,)

In [75]:
y_test.shape

(29190,)

In [76]:
y_test_pred_prob.shape

(29190,)

In [77]:
print('\nPerformance no conjunto de teste:')
accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics(y_test, y_test_pred.round(), y_test_pred_prob)
print_metrics_summary(accuracy, recall, precision, f1, auroc, aupr)


Performance no conjunto de teste:

Accuracy:         0.6844
Recall:           0.9165
Precision:        0.6974
F1:               0.7921
AUROC:            0.7012
AUPR:             0.8113


# Evaluate for Kaggle

In [None]:
kaggle_test_data = pandas.read_csv("real_test_set.csv")

In [None]:
kaggle_test_data.shape
features_kaggle = kaggle_test_data.drop(["id"], axis=1)
features_kaggle.head(10)

In [None]:
rf_pred_test_class = rf_clf.predict(features_kaggle)
rf_pred_test_scores = rf_clf.predict_proba(features_kaggle)[:, 1]

In [None]:
rf_pred_test_class.size

In [None]:
rf_pred_test_class

Se ligar que na hora que cria o csv, na primeira linha (a linha do header), ele coloca ",0", tem que substituir para "id,IND_BOM_1_1"

In [None]:
df = pandas.DataFrame(data=rf_pred_test_class)
df.to_csv('test.csv', mode='a', index=True)

In [None]:
# For in ensemble classifiers
classifier = xgboost_classifier

# RESULTS LOG

## XGBoost - 120 estimators

In [89]:

print('\nPerformance no conjunto de teste:')
accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics(y_test, y_test_pred.round(), y_test_pred_prob)
print_metrics_summary(accuracy, recall, precision, f1, auroc, aupr)


Performance no conjunto de teste:

Accuracy:         0.6607
Recall:           0.9790
Precision:        0.6636
F1:               0.7910
AUROC:            0.6295
AUPR:             0.7507


In [91]:

print('\nPerformance no conjunto de teste:')
accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics(y_test, y_test_pred.round(), y_test_pred_prob)
print_metrics_summary(accuracy, recall, precision, f1, auroc, aupr)


Performance no conjunto de teste:

Accuracy:         0.6607
Recall:           0.9790
Precision:        0.6636
F1:               0.7910
AUROC:            0.6295
AUPR:             0.7507


## XGBoost - 2nd configuration

In [None]:
# parameters
number_estimators = [30, 60, 120, 200]
loss_function = ["deviance", "exponential"]
min_samples_leaf = [1, 0.05]
sub_samples = [1.0, 0.8, 0.6]
max_features = ["log2", "sqrt", "auto"]

In [100]:
classifier.best_params_

{'loss': 'deviance',
 'max_features': 'auto',
 'min_samples_leaf': 0.05,
 'n_estimators': 200,
 'subsample': 0.8}

In [105]:

print('\nPerformance no conjunto de teste:')
accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics(y_test, y_test_pred.round(), y_test_pred_prob)
print_metrics_summary(accuracy, recall, precision, f1, auroc, aupr)


Performance no conjunto de teste:

Accuracy:         0.6878
Recall:           0.9007
Precision:        0.7052
F1:               0.7910
AUROC:            0.7065
AUPR:             0.8131


# Ensemble Of XBGoost and RandomForest

Without Tuning of the random forest parameters

In [51]:
classifier

VotingClassifier(estimators=[('xgboost', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_l...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [52]:
print('\nPerformance no conjunto de teste:')
accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics(y_test, y_test_pred.round(), y_test_pred_prob)
print_metrics_summary(accuracy, recall, precision, f1, auroc, aupr)


Performance no conjunto de teste:

Accuracy:         0.6766
Recall:           0.8842
Precision:        0.7010
F1:               0.7820
AUROC:            0.6773
AUPR:             0.7933


## Random forest optimized

In [27]:
# as I said in the documentation, GridSeach uses a stratified 3-fold cross validation because a Classifier was passed
# instead of a recgressor

xgboost_classifier.best_params_

{'loss': 'deviance',
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'n_estimators': 120,
 'subsample': 0.8}

In [41]:
rf_classifier.best_params_

{'class_weight': {0: 1, 1: 1},
 'max_depth': 10,
 'max_features': 'sqrt',
 'n_estimators': 20}

In [40]:
print('\nPerformance no conjunto de teste:')
accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics(y_test, y_test_pred.round(), y_test_pred_prob)
print_metrics_summary(accuracy, recall, precision, f1, auroc, aupr)


Performance no conjunto de teste:

Accuracy:         0.6829
Recall:           0.9383
Precision:        0.6899
F1:               0.7952
AUROC:            0.7052
AUPR:             0.8132


# MLPs and Ensemble MLPS

In [65]:
y_train_pred = classifier_1.predict(X_train.as_matrix()).ravel()
y_test_pred = classifier_1.predict(X_test.as_matrix()).ravel()

# This returns an array for the probabilities of being each class, so it depends what the focus will be
# in our case, the focus is at class 1
y_test_pred_prob = classifier_1.predict_proba(X_test.as_matrix())[:, 0]

print('\nPerformance no conjunto de teste:')
accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics(y_test, y_test_pred.round(), y_test_pred_prob)
print_metrics_summary(accuracy, recall, precision, f1, auroc, aupr)

  """Entry point for launching an IPython kernel.
  
  



Performance no conjunto de teste:

Accuracy:         0.6664
Recall:           0.8560
Precision:        0.7013
F1:               0.7710
AUROC:            0.6650
AUPR:             0.7853


In [66]:
y_train_pred = classifier_2.predict(X_train.as_matrix()).ravel()
y_test_pred = classifier_2.predict(X_test.as_matrix()).ravel()

# This returns an array for the probabilities of being each class, so it depends what the focus will be
# in our case, the focus is at class 1
y_test_pred_prob = classifier_2.predict_proba(X_test.as_matrix())[:, 0]

print('\nPerformance no conjunto de teste:')
accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics(y_test, y_test_pred.round(), y_test_pred_prob)
print_metrics_summary(accuracy, recall, precision, f1, auroc, aupr)

  """Entry point for launching an IPython kernel.
  
  



Performance no conjunto de teste:

Accuracy:         0.6660
Recall:           0.8400
Precision:        0.7064
F1:               0.7674
AUROC:            0.6698
AUPR:             0.7878


In [67]:
y_train_pred = classifier_3.predict(X_train.as_matrix()).ravel()
y_test_pred = classifier_3.predict(X_test.as_matrix()).ravel()

# This returns an array for the probabilities of being each class, so it depends what the focus will be
# in our case, the focus is at class 1
y_test_pred_prob = classifier_3.predict_proba(X_test.as_matrix())[:, 0]

print('\nPerformance no conjunto de teste:')
accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics(y_test, y_test_pred.round(), y_test_pred_prob)
print_metrics_summary(accuracy, recall, precision, f1, auroc, aupr)

  """Entry point for launching an IPython kernel.
  
  



Performance no conjunto de teste:

Accuracy:         0.6684
Recall:           0.8315
Precision:        0.7116
F1:               0.7669
AUROC:            0.6739
AUPR:             0.7916


In [68]:
y_train_pred = voting_classifier.predict(X_train.as_matrix()).ravel()
y_test_pred = voting_classifier.predict(X_test.as_matrix()).ravel()

# This returns an array for the probabilities of being each class, so it depends what the focus will be
# in our case, the focus is at class 1
y_test_pred_prob = classifier_1.predict_proba(X_test.as_matrix())[:, 0]

print('\nPerformance no conjunto de teste:')
accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics(y_test, y_test_pred.round(), y_test_pred_prob)
print_metrics_summary(accuracy, recall, precision, f1, auroc, aupr)

  """Entry point for launching an IPython kernel.
  
  



Performance no conjunto de teste:

Accuracy:         0.6493
Recall:           0.7997
Precision:        0.7051
F1:               0.7495
AUROC:            0.6650
AUPR:             0.7853
