In [1]:
# Seed NumPy's global random generator right after importing it, before any
# other library gets a chance to draw random numbers.
import numpy as np
np.random.seed(42)
# Seed TensorFlow too; note this happens before the Keras imports below.
import tensorflow as tf
tf.set_random_seed(42)

import pandas as pd
from sklearn import preprocessing
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.callbacks import EarlyStopping
from keras import backend as K
import matplotlib.pyplot as plt
# Select the "ggplot" matplotlib style sheet for any figures drawn later.
plt.style.use("ggplot")
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Read the three input tables in one pass; each has a "timestamp" column
# that is parsed into datetimes while loading.
train, test, macro = [
    pd.read_csv(csv_path, parse_dates=["timestamp"])
    for csv_path in ("data/train.csv", "data/test.csv", "data/macro.csv")
]

In [3]:
# Regression target: the "price_doc" column of the training table.
y_train = train.loc[:, "price_doc"]
# Features: every remaining training column except the timestamp and the target.
x_train = train.drop(labels=["timestamp", "price_doc"], axis="columns")

In [4]:
# Transform non-numerical variables: every object-dtype column is replaced
# by integer codes produced by a LabelEncoder fitted on that column.
for c in x_train.columns:
    if x_train[c].dtype == "object":
        x_train[c] = preprocessing.LabelEncoder().fit_transform(list(x_train[c].values))

# Replace missing values with the column mean.
# The result is assigned back instead of calling `fillna(..., inplace=True)`
# on a column selection: that chained in-place form mutates a temporary
# object and is not guaranteed to update `x_train` itself on newer pandas.
x_train = x_train.fillna(x_train.mean(numeric_only=True))

In [5]:
x_test = test.drop(["timestamp"], axis=1)

# Transform non-numerical variables.
# The encoder is fitted on the *training* values of each column (the raw
# `train` frame is still unmodified), so a given label maps to the same
# integer code the model saw during training. Fitting a fresh encoder on the
# test values, as before, can assign different codes to the same label
# whenever the two sets do not contain exactly the same labels.
for c in x_test.columns:
    if x_test[c].dtype == "object":
        # Same list -> array conversion the encoder applies to its input.
        test_labels = np.asarray(list(x_test[c].values))
        if c in train.columns and train[c].dtype == "object":
            reference_labels = list(train[c].values)
        else:
            # No comparable training column: fall back to the test labels.
            reference_labels = list(test_labels)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(reference_labels)
        code_of = {label: code for code, label in enumerate(lbl.classes_)}
        # Labels never seen in training get the sentinel code -1 instead of
        # raising inside `transform`.
        x_test[c] = [code_of.get(label, -1) for label in test_labels]

# Replace missing values with mean values.
# Training means are preferred so a missing entry is imputed with the same
# value the model saw for that column during training; columns without a
# training mean fall back to the test mean. The result is assigned back
# rather than using chained `fillna(..., inplace=True)`.
train_means = x_train.mean(numeric_only=True).reindex(x_test.columns)
fill_values = train_means.fillna(x_test.mean(numeric_only=True))
x_test = x_test.fillna(fill_values)

In [7]:
def rmse(y_true, y_pred):
    """Root of the mean squared error, reduced over the last axis.

    Args:
        y_true: backend tensor of target values.
        y_pred: backend tensor of predictions, same shape as ``y_true``.

    Returns:
        Tensor with the last axis reduced away: the square root of the mean
        squared difference along that axis.

    ``K.sqrt`` replaces ``** (1/2)``: under Python 2 integer division makes
    ``1/2`` equal to 0, which would turn the loss into a constant.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error, axis=-1))

def rmsle(y_true, y_pred):
    """Root of the mean squared logarithmic error, reduced over the last axis.

    Args:
        y_true: backend tensor of target values.
        y_pred: backend tensor of predictions, same shape as ``y_true``.

    Returns:
        Tensor with the last axis reduced away: the square root of the mean
        squared difference between ``log(y_pred + 1)`` and ``log(y_true + 1)``.

    Both inputs are clipped to at least ``K.epsilon()`` before the logarithm.
    The model below ends in a linear activation, so predictions can be
    negative; without the clip ``log(y_pred + 1)`` becomes NaN for values
    <= -1 and the whole loss turns into NaN.
    ``K.sqrt`` replaces ``** (1/2)``, whose exponent is 0 under Python 2
    integer division.
    """
    log_pred = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
    log_true = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
    return K.sqrt(K.mean(K.square(log_pred - log_true), axis=-1))

In [9]:
# Fully connected regressor: four sigmoid hidden layers (1024-512-256-128),
# dropout on the last hidden layer, and a single linear output unit.
model = Sequential()

# The first layer declares the input width: one unit per feature column.
model.add(Dense(1024, input_dim=x_train.shape[1]))
model.add(Activation("sigmoid"))

# Remaining hidden layers share the same Dense -> sigmoid pattern.
for hidden_units in (512, 256, 128):
    model.add(Dense(hidden_units))
    model.add(Activation("sigmoid"))

model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation("linear"))

# Train with RMSprop against the custom `rmsle` loss defined above.
model.compile(optimizer="rmsprop", loss=rmsle)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 1024)              297984    
_________________________________________________________________
activation_5 (Activation)    (None, 1024)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 512)               524800    
_________________________________________________________________
activation_6 (Activation)    (None, 512)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 256)               131328    
_________________________________________________________________
activation_7 (Activation)    (None, 256)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 128)               32896     
__________

In [10]:
# Stop training once the validation loss has failed to improve for 3 epochs
# in a row. The patience must be smaller than the number of epochs for the
# callback to have any effect: with patience=10 and epochs=10 the run always
# ended before early stopping could trigger.
early_stop = EarlyStopping(monitor="val_loss", patience=3)
# 1% of the training rows are held out to compute `val_loss`.
model.fit(x_train.values, y_train.values, epochs=10, validation_split=0.01, callbacks=[early_stop], verbose=1)

Train on 30166 samples, validate on 305 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x105e90cc0>

In [17]:
# Preview the test-set predictions flattened into a 1-D array.
model.predict(x_test.values).reshape(-1)

array([ 1182.96508789,  1182.96508789,  1182.96508789, ...,  1182.96508789,
        1182.96508789,  1182.96508789], dtype=float32)

In [18]:
# Flatten the model output to one predicted price per test row.
predictions = np.reshape(model.predict(x_test.values), -1)
# Label rows with the dataset's own "id" column when the test table has one;
# the positional index (0..n-1) is not a record identifier. The index is
# kept as a fallback so the cell still works for tables without "id".
row_ids = test["id"].values if "id" in test.columns else x_test.index
output = pd.DataFrame({"id": row_ids, "price_doc": predictions})

In [20]:
# Write the submission table to disk without the DataFrame index column.
output.to_csv(path_or_buf="submissions_nn.csv", index=False)