In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.callbacks import EarlyStopping

Using Theano backend.


In [2]:
np.random.seed(2017)

In [3]:
train = pd.read_csv('train.csv')

In [4]:
df_train = train.sample(n=100000)

y = np.log( df_train['loss'].values )
sparse_data = []

### Categorical Variables

In [5]:
feat_cats = [f for f in df_train.columns if 'cat' in f]
for feat in feat_cats:
    dummy = pd.get_dummies(df_train[feat].astype('category'))
    tmp = csr_matrix(dummy)
    sparse_data.append(tmp)

### Continuous Variables

In [6]:
f_num = [f for f in df_train.columns if 'cont' in f]
scaler = StandardScaler()
tmp = csr_matrix(scaler.fit_transform(df_train[f_num]))
sparse_data.append(tmp)

In [7]:
X = hstack(sparse_data, format = 'csr')
X

<100000x1093 sparse matrix of type '<type 'numpy.float64'>'
	with 13000000 stored elements in Compressed Sparse Row format>

In [13]:
input_dim

NameError: name 'input_dim' is not defined

In [8]:
def nn_model(input_dim):
    model = Sequential()

    model.add(Dense(400, input_dim = input_dim, init = 'he_normal'))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.45))

    model.add(Dense(200, init = 'he_normal'))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

    model.add(Dense(50, init = 'he_normal'))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

    model.add(Dense(1, init = 'he_normal'))
    model.compile(loss = 'mae', optimizer = 'adadelta')
    return(model)

In [9]:
def batch_generator(X, y, batch_size, shuffle):
    #chenglong code for fiting from generator (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])

    if shuffle:
        np.random.shuffle(sample_index)
        
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch

        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0
            
def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0

In [14]:
nepochs = 4
nfolds = 3
folds = KFold(len(y), n_folds=nfolds, shuffle = True, random_state = 2017)



for num_iter, (train_index, test_index) in enumerate(folds):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test   = X[test_index], y[test_index]
        
    model = nn_model(X_train.shape[1])
    callbacks=[EarlyStopping(patience=8)]

    model.fit_generator(generator = batch_generator(X_train, y_train, 128, True),
                                  nb_epoch = nepochs,
                                  samples_per_epoch = y_train.shape[0],
                                  validation_data=(X_test.todense(), y_test),
                                  verbose = 2, callbacks=callbacks) 
    
    y_pred = np.exp(model.predict_generator(generator = batch_generatorp(X_test, 128, False), val_samples = X_test.shape[0])[:,0])

    score = mean_absolute_error(np.exp(y_test), y_pred)
    print("Fold{0}, score={1}".format(num_iter+1, score))

Epoch 1/4
14s - loss: 4.0804 - val_loss: 0.7600
Epoch 2/4
21s - loss: 1.0442 - val_loss: 0.4936
Epoch 3/4
39s - loss: 0.8669 - val_loss: 0.4824
Epoch 4/4
41s - loss: 0.7869 - val_loss: 0.4675
Fold1, score=1290.45410169
Epoch 1/4
25s - loss: 4.2003 - val_loss: 0.6841
Epoch 2/4
15s - loss: 1.1220 - val_loss: 0.5431
Epoch 3/4
16s - loss: 0.9196 - val_loss: 0.4622
Epoch 4/4
16s - loss: 0.8166 - val_loss: 0.4566
Fold2, score=1249.59718595
Epoch 1/4
15s - loss: 4.2140 - val_loss: 0.7820
Epoch 2/4
17s - loss: 1.0618 - val_loss: 0.4949
Epoch 3/4
17s - loss: 0.8871 - val_loss: 0.4652
Epoch 4/4
16s - loss: 0.8004 - val_loss: 0.4635
Fold3, score=1285.78028202


## Task

Play aroud with NN architecture. First version is here:

- input
- hidden1: 400
- drouput + bn
- hidden2: 200
- drouput + bn
- hidden3: 50
- output


try to change something (remove layer, add a new one, change attribute in dropout and so on)
