In [145]:
import numpy as np
np.random.seed(42)
import tensorflow as tf
tf.set_random_seed(42)

import pandas as pd
from sklearn import preprocessing
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Merge
from keras.callbacks import EarlyStopping
from keras import backend as K
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline

In [146]:
train = pd.read_csv("data/train.csv", parse_dates=["timestamp"])
test = pd.read_csv("data/test.csv", parse_dates=["timestamp"])
macro = pd.read_csv("data/macro.csv", parse_dates=["timestamp"])

In [147]:
y_train = train["price_doc"]
x_train = train.drop(["timestamp", "price_doc", "id"], axis=1)

In [148]:
# transform non-numerical variables
for c in x_train.columns:
    if x_train[c].dtype == "object":
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_train[c].values)) 
        x_train[c] = lbl.transform(list(x_train[c].values))

# replace missing values with mean values
for c in x_train.columns:
    x_train[c].fillna(x_train[c].mean(), inplace=True)

In [149]:
x_test = test.drop(["timestamp", "id"], axis=1)

# transform non-numerical variables
for c in x_test.columns:
    if x_test[c].dtype == "object":
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_test[c].values)) 
        x_test[c] = lbl.transform(list(x_test[c].values))
        
# replace missing values with mean values
for c in x_test.columns:
    x_test[c].fillna(x_test[c].mean(), inplace=True)

In [150]:
# join macro with traindata
train_macro = pd.merge(train, macro, how="left", on="timestamp")
test_macro = pd.merge(test, macro, how="left", on="timestamp")

In [151]:
# add price to macro data and filter out
test_macro = test_macro[np.append(macro.columns.values, "id")].copy()

In [152]:
# remove columns that are completly null
test_macro.dropna(axis=1, how="all", inplace=True)

In [153]:
test_macro.shape

(7662, 61)

In [154]:
train_macro = train_macro[np.append(test_macro.columns.values, "price_doc")]

In [155]:
train_macro.shape

(30471, 62)

In [94]:
y_train_macro = train_macro["price_doc"]
x_train_macro = train_macro.drop(["timestamp", "price_doc"], axis=1)

# transform non-numerical variables
for c in x_train_macro.columns:
    if x_train_macro[c].dtype == "object":
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_train_macro[c].values)) 
        x_train_macro[c] = lbl.transform(list(x_train_macro[c].values))
        
# replace missing values with mean values
for c in x_train_macro.columns:
    x_train_macro[c].fillna(x_train_macro[c].mean(), inplace=True)

In [95]:
x_test_macro = test_macro.drop(["timestamp"], axis=1)

# transform non-numerical variables
for c in x_test_macro.columns:
    if x_test_macro[c].dtype == "object":
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_test_macro[c].values)) 
        x_test_macro[c] = lbl.transform(list(x_test_macro[c].values))
        
# replace missing values with mean values
for c in x_test_macro.columns:
    x_test_macro[c].fillna(x_test_macro[c].mean(), inplace=True)

In [98]:
# reset index
x_train_macro.set_index("id", inplace=True)
x_test_macro.set_index("id", inplace=True)

### Modeling

In [100]:
def rmse(y_true, y_pred):
    return K.mean(K.square(y_pred - y_true), axis=-1)**(1/2)

def rmsle(y_true, y_pred):
    a = K.log(y_pred + 1)
    b = K.log(y_true + 1)
    return K.mean(K.square(a - b), axis=-1)**(1/2)

In [104]:
from keras.models import Model
from keras.layers import Dense, Input, concatenate, average
from keras.optimizers import Adam

a_input = Input(shape=(x_train.shape[1], ))
a_1 = Dense(1024, activation="sigmoid")(a_input)
a_2 = Dense(512, activation="sigmoid")(a_1)
a_3 = Dense(256, activation="sigmoid")(a_2)
a_4 = Dense(128, activation="sigmoid")(a_3)
a_5 = Dropout(0.5)(a_4)
a_6 = Dense(64, activation="sigmoid")(a_5)
a_7 = Dense(1, activation="linear")(a_6)

b_input = Input(shape=(x_train_macro.shape[1], ))
b_1 = Dense(256, activation="sigmoid")(b_input)
b_2 = Dense(192, activation="sigmoid")(b_1)
b_3 = Dense(128, activation="sigmoid")(b_2)
b_4 = Dense(64, activation="sigmoid")(b_3)
b_5 = Dropout(0.5)(b_4)
b_6 = Dense(32, activation="sigmoid")(b_5)
b_7 = Dense(1, activation="linear")(b_6)

merge_output = average([a_7, b_7])

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model = Model(inputs=[a_input, b_input], outputs=merge_output)
model.compile(optimizer=adam, loss=rmsle)
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 290)           0                                            
____________________________________________________________________________________________________
input_6 (InputLayer)             (None, 59)            0                                            
____________________________________________________________________________________________________
dense_25 (Dense)                 (None, 1024)          297984      input_5[0][0]                    
____________________________________________________________________________________________________
dense_31 (Dense)                 (None, 256)           15360       input_6[0][0]                    
___________________________________________________________________________________________

In [124]:
early_stop = EarlyStopping(monitor="val_loss", patience=10)
model.fit([preprocessing.normalize(x_train.values), preprocessing.normalize(x_train_macro.values)], y_train.values, epochs=20, validation_split=0.01, callbacks=[early_stop], verbose=1)

Train on 30166 samples, validate on 305 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1295a1c50>

In [125]:
model.predict([x_train.values, x_train_macro.values])

array([[ 1576.546875],
       [ 1576.546875],
       [ 1576.546875],
       ..., 
       [ 1576.546875],
       [ 1576.546875],
       [ 1576.546875]], dtype=float32)

In [135]:
assert 1==2

AssertionError: 