In [3]:
import pandas as pd
import numpy as np
#import gc

import mlcrate as mlc
import pickle as pkl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import keras
from keras.layers.normalization import BatchNormalization
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dot, Reshape, Add, Subtract
from keras import objectives
from keras import backend as K
from keras import regularizers 
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2

from keras.utils import plot_model

Using TensorFlow backend.


In [4]:
data_train = pd.read_csv("./train_sample.csv", header = None)
data_train.drop(0,axis=0,inplace=True)
print(data_train.shape)
data_train.head()

(9000, 4)


Unnamed: 0,0,1,2,3
1,28,36,43,0
2,9,7,11,0
3,48,27,20,1
4,35,8,20,0
5,9,2,41,0


In [5]:
data_test = pd.read_csv("./test_sample.csv", header = None)
data_test.drop(0,axis=0,inplace=True)
print(data_test.shape)
data_test.head()

(1000, 4)


Unnamed: 0,0,1,2,3
1,9000,25,5,3
2,9001,11,48,45
3,9002,8,30,43
4,9003,15,31,26
5,9004,28,21,27


In [26]:
k_latent = 4
embedding_reg = 0.0002 
kernel_reg = 0.1 

In [27]:
def get_embed(x_input, x_size, out_dim,test_weights=False):
    # x_input is index of input (either user or item)
    # x_size is length of vocabulary (e.g. total number of users or items)
    # test_weights is a demo flag to show results with predefined weights
    # out_dim is size of embedding vectors
    if x_size > 0: #category
        if test_weights & (out_dim<=2):
            embed = Embedding(x_size, out_dim, input_length=1,
                          weights=[weight_matrix[:x_size,:out_dim]], 
                          embeddings_regularizer=l2(embedding_reg))(x_input)
        else:
            embed = Embedding(x_size, out_dim, input_length=1,
                              embeddings_regularizer=l2(embedding_reg))(x_input)
        embed = Flatten()(embed)
    else:
        embed = Dense(out_dim, kernel_regularizer=l2(embedding_reg))(x_input)
    return embed

In [28]:
def build_model(f_size):
    dim_input = len(f_size)
    
    input_x = [Input(shape=(1,)) for i in range(dim_input)] 
    
    lin_terms = [get_embed(x, size, 1) for (x, size) in zip(input_x, f_size)]

    factors = [get_embed(x, size, k_latent) for (x, size) in zip(input_x, f_size)]
     
    s = Add()(factors)
    
    diffs = [Subtract()([s, x]) for x in factors]
    
    dots = [Dot(axes=1)([d, x]) for d,x in zip(diffs, factors)]
    
    x = Concatenate()(lin_terms + dots)
    x = BatchNormalization()(x)
    output = Dense(1, activation='relu', kernel_regularizer=l2(kernel_reg))(x)
    model = Model(inputs=input_x, outputs=[output])
    model.compile(optimizer=Adam(clipnorm=0.6), loss='mean_squared_error')
    return model

In [29]:
f_size  = [pd.to_numeric(data_train[f]).max() + 1 for f in range(3)]
f_size

[50, 50, 50]

In [30]:
X_train,X_val,y_train,y_val = train_test_split(data_train.iloc[:,0:3], data_train[3],
test_size=0.3)

In [31]:
X_train = np.hsplit(X_train,3)
X_val = np.hsplit(X_val, 3)

In [32]:
n_epochs = 100

batch_size = 64
print('Batch size: ',batch_size)
model = build_model(f_size)
earlystopper = EarlyStopping(patience=2, verbose=0)
model.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, verbose=0,
          validation_data=(X_val, y_val), callbacks=[earlystopper])  
print('\n')
print('RMSE',model.evaluate(X_val, y_val,
                            batch_size=batch_size,verbose=0)**0.5)
best_epoch = earlystopper.stopped_epoch

Batch size:  64


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




RMSE 0.33806177267543813


In [33]:
best_epoch

40

In [28]:
# k_latent = 2, clipnorm = 0.25, batch_size = 128 => RMSE 0.2807015537143627, best_epoch = 79
# k_latent = 3, clipnorm = 0.5, batch_size = 64 => RMSE 0.2838863304417515, best_epoch = 26
# k_latent = 4, clipnorm = 0.6, batch_size = 64 => RMSE 0.3019954558509389, best_epoch = 47

In [14]:
model.fit(np.hsplit(data_train.iloc[:,0:3], 3), data_train[3], epochs=best_epoch, batch_size=batch_size, verbose=0)
predict1 = model.predict(np.hsplit(data_test.iloc[:,1:4], 3))
predict1

array([[7.11733401e-01],
       [2.11860776e+00],
       [3.05877542e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [2.05074120e+00],
       [5.76581240e-01],
       [1.38629603e+00],
       [0.00000000e+00],
       [2.38188314e+00],
       [7.76066005e-01],
       [0.00000000e+00],
       [1.87857437e+00],
       [2.74665296e-01],
       [3.35418653e+00],
       [4.37030172e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [3.45710230e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [1.03592634e+00],
       [0.00000000e+00],
       [7.57385373e-01],
       [2.79933512e-01],
       [4.63304222e-02],
       [0.00000000e+00],
       [0.00000000e+00],
       [3.58862638e+00],
       [0.00000000e+00],
       [1.54317904e+00],
       [0.00000000e+00],
       [3.40561342e+00],
       [0.00000000e+00],
       [6.27025902e-01],
       [4.93230200e+00],
       [2.48946762e+00],
       [0.00000000e+00],
       [2.94505906e+00],
       [3.63876522e-02],


In [25]:
model.fit(np.hsplit(data_train.iloc[:,0:3], 3), data_train[3], epochs=best_epoch, batch_size=batch_size, verbose=0)
predict2 = model.predict(np.hsplit(data_test.iloc[:,1:4], 3))
predict2

array([[6.63759232e-01],
       [1.94415522e+00],
       [3.12994194e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [1.98649645e+00],
       [3.45945686e-01],
       [1.30808091e+00],
       [0.00000000e+00],
       [2.26228499e+00],
       [8.31610322e-01],
       [0.00000000e+00],
       [1.82726789e+00],
       [3.33225280e-01],
       [3.50223684e+00],
       [4.34352112e+00],
       [0.00000000e+00],
       [2.29495317e-01],
       [3.26344228e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [1.00151694e+00],
       [0.00000000e+00],
       [8.52963865e-01],
       [2.96565503e-01],
       [4.69577014e-02],
       [0.00000000e+00],
       [0.00000000e+00],
       [3.56203794e+00],
       [0.00000000e+00],
       [1.50740957e+00],
       [0.00000000e+00],
       [3.18703938e+00],
       [0.00000000e+00],
       [7.33999610e-01],
       [5.14367294e+00],
       [2.59668064e+00],
       [0.00000000e+00],
       [3.00190544e+00],
       [0.00000000e+00],


In [34]:
model.fit(np.hsplit(data_train.iloc[:,0:3], 3), data_train[3], epochs=best_epoch, batch_size=batch_size, verbose=0)
predict3 = model.predict(np.hsplit(data_test.iloc[:,1:4], 3))
predict3

array([[7.22884297e-01],
       [1.95749521e+00],
       [3.06377554e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [2.04114866e+00],
       [3.32605004e-01],
       [1.62979603e+00],
       [0.00000000e+00],
       [2.20386696e+00],
       [7.49983311e-01],
       [0.00000000e+00],
       [1.74354696e+00],
       [2.34689415e-01],
       [3.48884940e+00],
       [4.49621677e+00],
       [0.00000000e+00],
       [4.44939107e-01],
       [3.28867984e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [9.87435341e-01],
       [0.00000000e+00],
       [8.70937884e-01],
       [2.42718399e-01],
       [5.52715659e-02],
       [0.00000000e+00],
       [0.00000000e+00],
       [3.57239914e+00],
       [0.00000000e+00],
       [1.54398203e+00],
       [0.00000000e+00],
       [3.34995890e+00],
       [0.00000000e+00],
       [5.28425932e-01],
       [5.07089806e+00],
       [2.52268434e+00],
       [0.00000000e+00],
       [2.97203612e+00],
       [1.68863624e-01],


In [35]:
print('RMSE',model.evaluate(np.hsplit(data_train.iloc[:,0:3], 3), data_train[3],
                            batch_size=batch_size,verbose=0)**0.5)

RMSE 0.2797589986637164


In [36]:
p = model.predict(np.hsplit(data_train.iloc[:,0:3], 3))
p[17:29]

array([[0.24434635],
       [0.5588033 ],
       [0.        ],
       [0.        ],
       [3.64636   ],
       [0.46447268],
       [0.        ],
       [4.114974  ],
       [0.31164348],
       [0.        ],
       [2.119837  ],
       [0.        ]], dtype=float32)

In [37]:
pred = (predict1 + predict2 + predict3) / 3

In [38]:
sub = pd.read_csv('./test_sample.csv',usecols=['ID'])
sub['target'] = pred
sub.to_csv('submission.csv',index=False)