In [1]:
import numpy as np
import pandas as pd
import itertools

# machine learning models
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
import xgboost as xgb
import lightgbm as lgb

#feature scaling
from sklearn.preprocessing import StandardScaler, RobustScaler

# Metrics
from sklearn.metrics import roc_curve,auc

# Utils
from sklearn.model_selection import train_test_split



In [2]:
train = pd.read_csv('train.gz')
test = pd.read_csv('test.gz')

In [3]:
len(train.loc[train["target"]==1])/ len(train)

0.036447517859182946

In [4]:
train = train.replace(-1, np.NaN)
d_median = train.median(axis=0)
d_mean = train.mean(axis=0)
train = train.fillna(-1)
one_hot = {c: list(train[c].unique()) for c in train.columns if c not in ['id','target']}

In [5]:
def transform_df(df):
    df = pd.DataFrame(df)
    dcol = [c for c in df.columns if c not in ['id','target']]
    df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
    df['negative_one_vals'] = np.sum((df[dcol]==-1).values, axis=1)
    for c in dcol:
        if '_bin' not in c: #standard arithmetic
            df[c+str('_median_range')] = (df[c].values > d_median[c]).astype(np.int)
            df[c+str('_mean_range')] = (df[c].values > d_mean[c]).astype(np.int)
            #df[c+str('_sq')] = np.power(df[c].values,2).astype(np.float32)
            #df[c+str('_sqr')] = np.square(df[c].values).astype(np.float32)
            #df[c+str('_log')] = np.log(np.abs(df[c].values) + 1)
            #df[c+str('_exp')] = np.exp(df[c].values) - 1
    for c in one_hot:
        if len(one_hot[c])>2 and len(one_hot[c]) < 7:
            for val in one_hot[c]:
                df[c+'_oh_' + str(val)] = (df[c].values == val).astype(np.int)
    return df

In [6]:
#def multi_transform(df):
#    print('Init Shape: ', df.shape)
#    p = Pool(cpu_count())
#    df = p.map(transform_df, np.array_split(df, cpu_count()))
#    df = pd.concat(df, axis=0, ignore_index=True).reset_index(drop=True)
#    p.close(); p.join()
#    print('After Shape: ', df.shape)
#    return df

In [7]:
def gini(y, pred):
    fpr, tpr, thr = roc_curve(y, pred, pos_label=1)
    g = 2 * auc(fpr, tpr) -1
    return g

In [8]:
#def gini_xgb(pred, y):
#    y = y.get_label()
#    return 'gini', gini(y, pred)

In [9]:
#LightGBM
def gini_lgb(preds, dtrain):
#    y = list(dtrain.get_label())
    y = dtrain
    score = gini(y, preds) / gini(y, y)
#    return 'gini', score, True
    return score

In [10]:
#params = {'eta': 0.02, 'max_depth': 4, 'objective': 'binary:logistic', 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 0.77, 'scale_pos_weight': 1.6, 'gamma': 10, 'reg_alpha': 8, 'reg_lambda': 1.3, 'eval_metric': 'auc', 'seed': 99, 'silent': True}
x1, x2, y1, y2 = train_test_split(train, train['target'], test_size=0.25, random_state=99)

In [11]:
#x1 = multi_transform(x1)
#x2 = multi_transform(x2)
#test = multi_transform(test)

x1 = transform_df(x1)
x2 = transform_df(x2)
test = transform_df(test)

In [12]:
print(test.shape)

(892816, 188)


In [13]:
col = [c for c in x1.columns if c not in ['id','target']]
col = [c for c in col if not c.startswith('ps_calc_')]
print(x1.values.shape, x2.values.shape)

(446409, 189) (148803, 189)


In [14]:
#remove duplicates just in case
#tdups = multi_transform(train)
tdups = transform_df(train)
dups = tdups[tdups.duplicated(subset=col, keep=False)]

x1 = x1[~(x1['id'].isin(dups['id'].values))]
x2 = x2[~(x2['id'].isin(dups['id'].values))]
print(x1.values.shape, x2.values.shape)

(446409, 189) (148803, 189)


In [15]:
y1 = x1['target']
y2 = x2['target']
x1 = x1[col]
x2 = x2[col]

In [19]:
# Baseline logistic regression
logreg = LogisticRegression(class_weight='balanced')
%time logreg.fit(x1, y1)
%time prediction = logreg.predict_proba(test[col])
prediction_lg=prediction[:,1]

%time score = logreg.score(x2, y2)
print("Exactitud en el dataset de validación: %.4f" % score)

Wall time: 55.5 s
Wall time: 974 ms
Wall time: 130 ms
Exactitud en el dataset de validación: 0.6144


In [20]:
%time gini_pred = logreg.predict_proba(x2)
gini_pred = gini_pred[:,1]
#gini_pred = (np.exp(gini_pred) - 1.0).clip(0,1)
gini_score = gini(y2, gini_pred)
print("Gini en el dataset de validación: %.4f" % gini_score)

Wall time: 125 ms
Gini en el dataset de validación: 0.2572


In [21]:
prediction_lg = (np.exp(prediction_lg) - 1.0).clip(0,1)

In [22]:
# limit tensorflow GPU memory usage

import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)

In [23]:
import keras.backend as K
import tensorflow as tf

# Lambdarank loss function
def pair_loss(y_true, y_pred):
    y_true = tf.cast(y_true, tf.int32)
    parts = tf.dynamic_partition(y_pred, y_true, 2)
    y_pos = parts[1]
    y_neg = parts[0]
    y_pos = tf.expand_dims(y_pos, 0)
    y_neg = tf.expand_dims(y_neg, -1)
    out = K.sigmoid(y_neg - y_pos)
    return K.mean(out)

Using TensorFlow backend.


In [24]:
class WeightedBinaryCrossEntropy(object):

    def __init__(self, pos_ratio):
        neg_ratio = 1. - pos_ratio
        self.pos_ratio = tf.constant(pos_ratio, tf.float32)
        self.weights = tf.constant(neg_ratio / pos_ratio, tf.float32)
        self.__name__ = "weighted_binary_crossentropy({0})".format(pos_ratio)

    def __call__(self, y_true, y_pred):
        return self.weighted_binary_crossentropy(y_true, y_pred)

    def weighted_binary_crossentropy(self, y_true, y_pred):
            # Transform to logits
            epsilon = tf.convert_to_tensor(K.common._EPSILON, y_pred.dtype.base_dtype)
            y_pred = tf.clip_by_value(y_pred, epsilon, 1 - epsilon)
            y_pred = tf.log(y_pred / (1 - y_pred))

            cost = tf.nn.weighted_cross_entropy_with_logits(y_true, y_pred, self.weights)
            return K.mean(cost * self.pos_ratio, axis=-1)

In [55]:
inputs=tf.keras.layers.Input(shape=(133,))
#x=tf.keras.layers.Dense(37, kernel_initializer='he_normal', bias_initializer='he_normal')(inputs)
#x=tf.keras.layers.ELU()(x)
#x=tf.keras.layers.BatchNormalization()(x)
#x=tf.keras.layers.Dropout(0.5)(x)
#x=tf.keras.layers.add([inputs,x])
x=tf.keras.layers.Dense(8000, kernel_initializer='he_normal', bias_initializer='he_normal')(inputs)
x=tf.keras.layers.ELU()(x)
x=tf.keras.layers.BatchNormalization()(x)
x=tf.keras.layers.Dropout(0.5)(x)
x=tf.keras.layers.Dense(1000, kernel_initializer='he_normal', bias_initializer='he_normal')(x)
x=tf.keras.layers.ELU()(x)
x=tf.keras.layers.BatchNormalization()(x)
x=tf.keras.layers.Dropout(0.5)(x)
x=tf.keras.layers.Dense(500, kernel_initializer='he_normal', bias_initializer='he_normal')(x)
x=tf.keras.layers.ELU()(x)
x=tf.keras.layers.BatchNormalization()(x)
x=tf.keras.layers.Dropout(0.5)(x)
predictions=tf.keras.layers.Dense(1, activation='sigmoid')(x)
model=tf.keras.models.Model(inputs=inputs, outputs=predictions)
loss=WeightedBinaryCrossEntropy(0.036)
model.compile(optimizer=tf.keras.optimizers.Nadam(),  
              loss=loss,
              #loss=pair_loss,
              #optimizer=tf.keras.optimizers.SGD(lr=0.03, momentum=0.9, nesterov=True),
              #metrics=['binary_accuracy']
             )


In [56]:
print (model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 133)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 8000)              1072000   
_________________________________________________________________
elu_13 (ELU)                 (None, 8000)              0         
_________________________________________________________________
batch_normalization_13 (Batc (None, 8000)              32000     
_________________________________________________________________
dropout_13 (Dropout)         (None, 8000)              0         
_________________________________________________________________
dense_18 (Dense)             (None, 1000)              8001000   
_________________________________________________________________
elu_14 (ELU)                 (None, 1000)              0         
__________

In [57]:
from time import time

tensorboard=tf.keras.callbacks.TensorBoard(log_dir="logs/{}".format(time()))

In [58]:
# Normalize the data
scaler=StandardScaler()

x1_scaled = scaler.fit_transform(x1)
x2_scaled = scaler.fit_transform(x2)
test_scaled = scaler.fit_transform(test[col])

In [59]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    %time temp=model.fit (x1_scaled, y1, validation_data=(x2_scaled,y2), callbacks=[tensorboard], epochs=10, batch_size=100, verbose=0)
    %time prediction=model.predict(test_scaled)
    %time val_prediction=model.predict(x2_scaled)
    %time train_prediction=model.predict(x1_scaled)
    scores=model.evaluate(x2_scaled,y2,verbose=1)
    
    #save model
    model_json = model.to_json()
    with open("nn_model.json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights("nn_model.h5")



Wall time: 27min 37s
Wall time: 2min 17s
Wall time: 22.3 s
Wall time: 1min 8s

In [60]:
gini_score_t = gini(y1, train_prediction)    
gini_score_v = gini(y2, val_prediction)    
print(scores, gini_score_t, gini_score_v)

0.0466486885741 0.314909130667 0.269516218957


In [61]:
prediction_nn=prediction[:,0]

In [62]:
submission = pd.DataFrame({
        "id": test["id"],
        "target": prediction_nn
    })
submission.to_csv('submission_nn_w_bce.csv', index=False)

In [None]:
# Blending results

In [None]:
#df1 = pd.read_csv('xgb_submission.csv')
#df2 = pd.read_csv('lgb_submission.csv')

In [None]:
#df2.columns = [x+'_' if x not in ['id'] else x for x in df2.columns]
#blend = pd.merge(df1, df2, how='left', on='id')
#for c in df1.columns:
#    if c != 'id':
#        blend[c] = (blend[c] * 0.5)  + (blend[c+'_'] * 0.5)
#blend = blend[df1.columns]
#blend['target'] = (np.exp(blend['target'].values) - 1.0).clip(0,1)

In [None]:
#blend.to_csv('blend1.csv', index=False, float_format='%.5f')