In [96]:
import math
import os
import gc
import logging
import time

from numba import jit

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, roc_auc_score, roc_curve, accuracy_score, auc
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import power_transform
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import svm
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.base import clone
from sklearn.pipeline import Pipeline
import sklearn

from keras.layers import Activation, Dropout, Flatten, Dense, GlobalMaxPooling2D, BatchNormalization, Input, Conv2D
from keras import callbacks
from keras import metrics
from keras.optimizers import Adam
from keras import backend as K
import keras
from keras.models import Model, Sequential
from keras.models import model_from_json
from keras import regularizers
from keras.losses import binary_crossentropy
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.advanced_activations import PReLU, LeakyReLU

import tensorflow as tf

In [2]:
sklearn.__version__

'0.20.3'

In [3]:
%matplotlib inline

In [4]:
start_time = time.time()

In [5]:
train_df = pd.read_csv('../input/train.csv')

In [6]:
test_df = pd.read_csv('../input/test.csv')

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 202 entries, ID_code to var_199
dtypes: float64(200), int64(1), object(1)
memory usage: 308.2+ MB


In [8]:
train_df.describe()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,...,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.10049,10.679914,-1.627622,10.715192,6.796529,11.078333,-5.065317,5.408949,16.54585,0.284162,...,3.23444,7.438408,1.927839,3.331774,17.993784,-0.142088,2.303335,8.908158,15.87072,-3.326537
std,0.300653,3.040051,4.050044,2.640894,2.043319,1.62315,7.863267,0.866607,3.418076,3.332634,...,4.559922,3.023272,1.478423,3.99203,3.135162,1.429372,5.454369,0.921625,3.010945,10.438015
min,0.0,0.4084,-15.0434,2.1171,-0.0402,5.0748,-32.5626,2.3473,5.3497,-10.5055,...,-14.0933,-2.6917,-3.8145,-11.7834,8.6944,-5.261,-14.2096,5.9606,6.2993,-38.8528
25%,0.0,8.45385,-4.740025,8.722475,5.254075,9.883175,-11.20035,4.7677,13.9438,-2.3178,...,-0.058825,5.1574,0.889775,0.5846,15.6298,-1.1707,-1.946925,8.2528,13.8297,-11.208475
50%,0.0,10.52475,-1.60805,10.58,6.825,11.10825,-4.83315,5.3851,16.4568,0.3937,...,3.2036,7.34775,1.9013,3.39635,17.95795,-0.1727,2.4089,8.8882,15.93405,-2.81955
75%,0.0,12.7582,1.358625,12.5167,8.3241,12.261125,0.9248,6.003,19.1029,2.9379,...,6.4062,9.512525,2.9495,6.2058,20.396525,0.8296,6.556725,9.5933,18.064725,4.8368
max,1.0,20.315,10.3768,19.353,13.1883,16.6714,17.2516,8.4477,27.6918,10.1513,...,18.4409,16.7165,8.4024,18.2818,27.9288,4.2729,18.3215,12.0004,26.0791,28.5007


In [9]:
train_features = train_df.drop(['target', 'ID_code'], axis=1)
train_targets = train_df['target']

In [10]:
features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
for feature in features:
    train_features['mean_'+feature] = (train_features[feature].mean()-train_features[feature])
    train_features['z_'+feature] = (train_features[feature] - train_features[feature].mean()) / train_features[feature].std(ddof=0)
    train_features['sq_'+feature] = (train_features[feature])**2
    train_features['sqrt_'+feature] = (train_features['sq_'+feature])**(1/4)
    train_features['log_'+feature] = np.log(train_features['sq_'+feature]+10)/2

In [11]:
train_features.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,mean_var_198,z_var_198,sq_var_198,sqrt_var_198,log_var_198,mean_var_199,z_var_199,sq_var_199,sqrt_var_199,log_var_199
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,3.09042,-1.026398,163.336068,3.574955,2.577616,-2.235137,0.214135,1.191154,1.044701,1.207562
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,-2.48528,0.825417,336.942736,4.28439,2.92458,-5.278337,0.505685,3.809523,1.397068,1.312679
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,1.14852,-0.381449,216.743173,3.836952,2.711909,-3.723037,0.356681,0.157212,0.629682,1.159092
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,-2.09898,0.697118,322.910118,4.239068,2.903936,5.673063,-0.543502,80.9928,2.999933,2.25539
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,-2.12668,0.706318,323.906407,4.242334,2.90543,5.483863,-0.525375,77.623148,2.968232,2.236523


In [12]:
sc = StandardScaler()
train_features = sc.fit_transform(train_features)

In [13]:
gc.collect()

291

In [14]:
input_dim = train_features.shape[1]
input_dim

1200

In [15]:
class printAUC(callbacks.Callback):
    def __init__(self, X_train, y_train):
        super(printAUC, self).__init__()
        self.bestAUC = 0
        self.X_train = X_train
        self.y_train = y_train
        
    def on_epoch_end(self, epoch, logs={}):
        pred = self.model.predict(np.array(self.X_train))
        auc = roc_auc_score(self.y_train, pred)
        print("Train AUC: " + str(auc))
        #pred = self.model.predict(self.validation_data[0])
        #auc = roc_auc_score(self.validation_data[1], pred)
        #print ("Validation AUC: " + str(auc))
        if (self.bestAUC < auc) :
            self.bestAUC = auc
            self.model.save("bestNet.h5", overwrite=True)
        return

In [94]:
def step_decay(epoch):
    initial_lrate = 0.1
    drop = 0.5
    epochs_drop = 10.0
    lrate = initial_lrate * math.pow(drop, math.floor((1 + epoch) / epochs_drop))
    return lrate

In [17]:
class LossHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.losses = []
        self.lr = []
 
    def on_epoch_end(self, batch, logs={}):
        self.losses.append(logs.get('loss'))
        self.lr.append(step_decay(len(self.losses)))

In [18]:
@jit
def augment(x, y, t=2):
    xs, xn = [], []
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:, c] = x1[ids][:, c]
        xs.append(x1)

    for i in range(t // 2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:, c] = x1[ids][:, c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x, xs, xn])
    y = np.concatenate([y, ys, yn])
    return x, y

In [19]:
min_max_scaler = MinMaxScaler()

In [20]:
standart_scaler = StandardScaler()

In [21]:
robust_scaler = RobustScaler()

In [22]:
#scaled_train_values = min_max_scaler.fit_transform(train_df[train_df.columns.drop(['ID_code', 'target'])])
scaled_train_values = standart_scaler.fit_transform(train_df[train_df.columns.drop(['ID_code', 'target'])])
#scaled_train_values = robust_scaler.fit_transform(train_df[train_df.columns.drop(['ID_code', 'target'])])

In [23]:
target_values = train_df['target'].values

In [24]:
train_values, holdout_test_values, train_target_values, holdout_test_target_values = train_test_split(
    #scaled_train_values,
    train_df[train_df.columns.drop(['ID_code', 'target'])].values,
    target_values,
    test_size=0.2,
    random_state=0
)

In [25]:
print(train_values.shape)
print(train_target_values.shape)
print(holdout_test_values.shape)
print(holdout_test_target_values.shape)

(160000, 200)
(160000,)
(40000, 200)
(40000,)


In [26]:
#test_df.head()

In [27]:
ID_code = test_df['ID_code'].values

In [28]:
print(ID_code[:10])

['test_0' 'test_1' 'test_2' 'test_3' 'test_4' 'test_5' 'test_6' 'test_7'
 'test_8' 'test_9']


In [29]:
#tf.py_func

In [30]:
batch_size = 100

In [31]:
def auc(y_true, y_pred):
#def auc(y_pred, y_true):
    #print(y_true[:5])
    #print(y_pred[:5])  
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)
    #return tf.py_func(roc_auc_score, (y_true, y_pred), tf.int8)

Провести mean_shift кластеризацию на множестве переменных (features)?

In [32]:
#submission_predicts = sequential_nn_model.predict(test_df[test_df.columns.drop('ID_code')].values)

In [33]:
#submission_predicts.shape

In [34]:
#print(submission_predicts[:5])
#submission_predicts_values = submission_predicts[:, 0]

In [35]:
#submission_predicts_values.shape

In [36]:
#np.all(submission_predicts_values == 0)

In [37]:
#submission_df = pd.DataFrame({'ID_code': ID_code, 'target': submission_predicts_values.astype('float32')})

In [38]:
#np.any(submission_predicts_values == 1)

In [39]:
#np.all(submission_predicts_values > 0)

In [40]:
#np.all(submission_predicts_values < 0)

In [41]:
#np.any(submission_predicts_values > 0)

In [42]:
#np.any(submission_predicts_values < 0)

In [43]:
#submission_predicts_values[submission_predicts_values > 0].shape

In [44]:
#submission_predicts_values.max()

In [45]:
#submission_predicts_values.min()

In [46]:
#submission_predicts_values[submission_predicts_values < 0.245].shape

In [47]:
#submission_predicts_values[submission_predicts_values >= 0.245].shape

In [48]:
#submission_predicts_values[submission_predicts_values < 0.245] = 0
#submission_predicts_values[submission_predicts_values >= 0.245] = 1

In [49]:
#submission_predicts_values[submission_predicts_values == 1].shape

In [50]:
#submission_df = pd.DataFrame({'ID_code': ID_code, 'target': submission_predicts_values.astype('float32')})

In [51]:
#submission_df.to_csv('submission_mlp_1.csv', index=False)

In [52]:
def detect_threshold(classes_ratio, step_size, predicted_values):
    threshold = predicted_values.min()
    values_below_threshold = (predicted_values[predicted_values < threshold]).shape[0]
    values_above_threshold = (predicted_values[predicted_values >= threshold]).shape[0]
    #threshold_classes_ratio =  values_above_threshold / values_below_threshold
    threshold_classes_ratio = 1
    
    while(threshold_classes_ratio > classes_ratio):
        threshold += step_size        
        values_below_threshold = (predicted_values[predicted_values < threshold]).shape[0]
        values_above_threshold = (predicted_values[predicted_values >= threshold]).shape[0]
        threshold_classes_ratio =  values_above_threshold / values_below_threshold
    predicted_values[predicted_values < threshold] = 0
    predicted_values[predicted_values >= threshold] = 1
    return threshold, predicted_values

In [53]:
sequential_nn_model = None
#del sequential_nn_model
if sequential_nn_model:
    del sequential_nn_model
sequential_nn_model = Sequential()
sequential_nn_model.add(Dense(batch_size, input_dim=200, kernel_initializer='normal', activation='relu'))
sequential_nn_model.add(Dropout(0.1))
sequential_nn_model.add(BatchNormalization())
sequential_nn_model.add(Dense(batch_size, input_dim=100, kernel_initializer='normal', activation='sigmoid'))
sequential_nn_model.add(Dropout(0.1))
sequential_nn_model.add(BatchNormalization())
sequential_nn_model.add(Dense(batch_size, input_dim=50, kernel_initializer='normal', activation='relu'))
sequential_nn_model.add(Dropout(0.1))
sequential_nn_model.add(BatchNormalization())
sequential_nn_model.add(Dense(batch_size, input_dim=50, kernel_initializer='normal', activation='sigmoid'))
#sequential_nn_model.add(Dense(batch_size, input_shape=(100, 200), kernel_initializer='normal', activation='sigmoid'))
#sequential_nn_model.add(Dropout(0.76))
#sequential_nn_model.add(Dropout(0.24))
sequential_nn_model.add(Dropout(0.1))
sequential_nn_model.add(BatchNormalization())
sequential_nn_model.add(Dense(batch_size, input_dim=10, kernel_initializer='normal', activation='relu'))
sequential_nn_model.add(Dropout(0.1))
sequential_nn_model.add(BatchNormalization())
sequential_nn_model.add(Dense(batch_size, input_dim=10, kernel_initializer='normal', activation='sigmoid'))
sequential_nn_model.add(Dropout(0.1))
sequential_nn_model.add(BatchNormalization())
sequential_nn_model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

In [54]:
#sequential_nn_model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0005), metrics=['accuracy', auc])
sequential_nn_model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy', auc])

In [55]:
start_time = time.time()
sequential_nn_model.fit(train_values, train_target_values, batch_size=100, epochs=40, verbose=1, validation_split=0.2)
print("Run time {} min".format((time.time() - start_time) / 60))

Train on 128000 samples, validate on 32000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40

InvalidArgumentError: ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.
Traceback (most recent call last):

  File "/home/uldo/miniconda3/envs/DS-New/lib/python3.6/site-packages/tensorflow/python/ops/script_ops.py", line 206, in __call__
    ret = func(*args)

  File "/home/uldo/miniconda3/envs/DS-New/lib/python3.6/site-packages/sklearn/metrics/ranking.py", line 356, in roc_auc_score
    sample_weight=sample_weight)

  File "/home/uldo/miniconda3/envs/DS-New/lib/python3.6/site-packages/sklearn/metrics/base.py", line 77, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)

  File "/home/uldo/miniconda3/envs/DS-New/lib/python3.6/site-packages/sklearn/metrics/ranking.py", line 324, in _binary_roc_auc_score
    raise ValueError("Only one class present in y_true. ROC AUC score "

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.


	 [[Node: metrics/auc/PyFunc = PyFunc[Tin=[DT_FLOAT, DT_FLOAT], Tout=[DT_DOUBLE], token="pyfunc_0", _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_dense_7_target_0_1, dense_7/Sigmoid/_389)]]

In [None]:
loss_and_metrics = sequential_nn_model.evaluate(holdout_test_values, holdout_test_target_values, batch_size=100)

In [None]:
loss_and_metrics
# [0.25161404045298696, 0.9094000032544136, 0.8320361980522218] [0.2493363002128899, 0.9096750013530255, 0.8369347876192563]
# [0.24582509476691483, 0.9101250021159649, 0.8396344886385845] [0.24802235754206778, 0.9094000029563903, 0.8385308386618173]
# [0.24494799628853797, 0.9102000007033348, 0.839084512352511]

In [74]:
model_json = sequential_nn_model.to_json()
with open("sequential_nn_model_relu_droput024_lr001_sigmoid_batchnorm_40_epochs_2019-03-31.json", "w") as json_file:
    json_file.write(model_json)

In [75]:
sequential_nn_model.save("sequential_nn_model_relu_dropout024_lr001_sigmoid_batchnorm_40_epochs_2019-03-31.h5")

In [76]:
sequential_nn_model.save_weights("sequential_nn_model_weights_relu_dropout024_lr001_sigmoid_batchnorm_40_epochs_2019-03-31.h5")

In [77]:
submission_predict_values = sequential_nn_model.predict(test_df[test_df.columns.drop('ID_code')].values)[:, 0]

In [78]:
classes_ratio = train_df[train_df['target'] == 1].shape[0] / train_df[train_df['target'] == 0].shape[0]

In [79]:
classes_ratio

0.1117163789174106

In [80]:
threshold, submission_predicts = detect_threshold(classes_ratio, 0.01, submission_predict_values)

In [81]:
threshold

0.38138799776323157

In [82]:
submission_predicts[submission_predicts == 0].shape

(180180,)

In [83]:
submission_predicts[submission_predicts == 1].shape

(19820,)

In [84]:
submission_df = pd.DataFrame({'ID_code': ID_code, 'target': submission_predicts.astype('float32')})

In [85]:
submission_df.to_csv('submission_mlp_4.csv', index=False)

In [98]:
train_features.shape

(200000, 1200)

In [87]:
kernel_regularizer=regularizers.l2(0.01)
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(train_features.shape[1],)))
#model.add(PreLU(alpha=.001))
model.add(Dropout(0.6))
model.add(BatchNormalization())
model.add(Dense(64, activation='relu'))
#model.add(PreLU(alpha=.001))
model.add(Dropout(0.6))
model.add(BatchNormalization())
model.add(Dense(32, activation='relu'))
#model.add(PreLU(alpha=.001))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))

annealer = LearningRateScheduler(lambda x: 1e-2 * 0.95 ** x)

In [88]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', auc])

In [97]:
loss_history = LossHistory()
lrate = LearningRateScheduler(step_decay)
callbacks_list = [EarlyStopping(monitor='val_auc', patience=5, mode='max'), loss_history, annealer]
sss = StratifiedShuffleSplit(n_splits=10)
start_time = time.time()
for train_index, test_index in sss.split(train_features, train_targets):
    X_train, X_val = train_features[train_index], train_features[test_index]
    Y_train, Y_val = train_targets[train_index], train_targets[test_index]
    #X_tr, y_tr = augment(X_train.values, y_train.values)
    #print("{} iteration".format(i+1))
    history= model.fit(X_train, Y_train, batch_size=512, epochs=50, callbacks=callbacks_list, verbose=1, validation_data=(X_val,Y_val))
    del X_train, X_val, Y_train, Y_val
    gc.collect()
print("Run time {} min".format((time.time() - start_time) / 60))

Train on 180000 samples, validate on 20000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Train on 180000 samples, validate on 20000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Train on 180000 samples, validate on 20000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Train on 180000 samples, validate on 20000 samples
Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Train on 180000 samples, validate on 20000 samples
Epoch 1/50

InvalidArgumentError: ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.
Traceback (most recent call last):

  File "/home/uldo/miniconda3/envs/DS-New/lib/python3.6/site-packages/tensorflow/python/ops/script_ops.py", line 206, in __call__
    ret = func(*args)

  File "/home/uldo/miniconda3/envs/DS-New/lib/python3.6/site-packages/sklearn/metrics/ranking.py", line 356, in roc_auc_score
    sample_weight=sample_weight)

  File "/home/uldo/miniconda3/envs/DS-New/lib/python3.6/site-packages/sklearn/metrics/base.py", line 77, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)

  File "/home/uldo/miniconda3/envs/DS-New/lib/python3.6/site-packages/sklearn/metrics/ranking.py", line 324, in _binary_roc_auc_score
    raise ValueError("Only one class present in y_true. ROC AUC score "

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.


	 [[Node: metrics_2/auc/PyFunc = PyFunc[Tin=[DT_FLOAT, DT_FLOAT], Tout=[DT_DOUBLE], token="pyfunc_2", _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_dense_18_target_0_1, dense_18/Sigmoid/_1357)]]
	 [[Node: metrics_2/auc/PyFunc/_1403 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_235_metrics_2/auc/PyFunc", tensor_type=DT_DOUBLE, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]

In [None]:
train_features.shape

In [None]:
del train, train_features
gc.collect()

In [None]:
test = pd.read_csv('../input/test.csv')
test_features = test.drop(['ID_code'], axis=1)

In [None]:
for feature in features:
    test_features['mean_'+feature] = (test_features[feature].mean()-test_features[feature])
    test_features['z_'+feature] = (test_features[feature] - test_features[feature].mean())/test_features[feature].std(ddof=0)
    test_features['sq_'+feature] = (test_features[feature])**2
    test_features['sqrt_'+feature] = (test_features['sq_'+feature])**(1/4)
    test_features['log_'+feature] = np.log(test_features['sq_'+feature]+10)/2

In [None]:
test_features = sc.transform(test_features)

In [None]:
id_code_test = test['ID_code']
# Make predicitions
pred = model.predict(test_features)
pred_ = pred[:,0]

In [None]:
print(train['target'].mean())
pred.mean()

In [None]:
my_submission = pd.DataFrame({"ID_code" : id_code_test, "target" : pred_})

In [None]:
my_submission.to_csv('submission.csv', index = False, header = True)