In [None]:
import os

import numpy as np
import pandas as pd
from IPython.display import SVG
from keras.backend import mean
from keras.layers import Input, Embedding, Flatten, Lambda, concatenate, Dense, Dropout, dot
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import hashing_trick, one_hot
from keras.utils import to_categorical, plot_model
from keras.utils.vis_utils import model_to_dot

# GPU visibility settings for this kernel: devices are ordered by PCI bus id
# and the visible-device list is set to the empty string.
# NOTE(review): presumably the empty list is meant to force CPU-only execution;
# these variables are set after the keras imports above, so confirm the backend
# has not already initialised CUDA by the time they take effect.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "",
})

# Data

In [None]:
# Song catalogue: inner-join the two song tables on `song_id`, index the result
# by `song_id`, then keep a random subset of at most 500000 rows.
songs = (
    pd.merge(pd.read_csv('data/songs.csv'),
             pd.read_csv('data/song_extra_info.csv'),
             on='song_id',
             how='inner')
      .set_index('song_id', drop=True)
      # Fixed seed so a kernel restart draws the same subset (and therefore the
      # same nb_* vocabulary sizes); the cap on n avoids the ValueError that
      # sample() raises when asked for more rows than the frame holds.
      .pipe(lambda frame: frame.sample(n=min(500000, len(frame)),
                                       random_state=42))
)

In [None]:
# Member table, indexed by the `msno` column so it can be joined on its index.
members = (
    pd.read_csv('data/members.csv')
      .set_index('msno', drop=True)
)

In [None]:
# Training interactions: a random subset of at most 200000 rows.
# The fixed seed makes the subset (and the head/tail split taken from it later)
# identical across kernel restarts; the cap on n avoids sample() raising when
# the file holds fewer rows than requested.
train = (
    pd.read_csv('data/train.csv')
      .pipe(lambda frame: frame.sample(n=min(200000, len(frame)),
                                       random_state=42))
)

# Vocabulary sizes, used both as embedding input dimensions and as the hashing
# space when the raw values are encoded.

# -- keyed by table index
nb_users = members.index.nunique()
nb_songs = songs.index.nunique()

# -- per-song attributes
# rough approximation for genres, as a cell often holds several values
nb_genres = songs['genre_ids'].nunique()
nb_artists = songs['artist_name'].nunique()
nb_composers = songs['composer'].nunique()
nb_languages = songs['language'].nunique()

# -- per-member attributes
nb_cities = members['city'].nunique()

# Neural Net

In [None]:
def _single_id_branch(vocab_size, width):
    """Build an input holding one integer id and its flattened embedding.

    Returns the (input tensor, embedded tensor) pair.
    """
    ids = Input(shape=(1,), dtype='int32')
    embedded = Embedding(output_dim=width, input_dim=vocab_size)(ids)
    embedded = Flatten()(embedded)
    return ids, embedded


def _id_bag_branch(vocab_size):
    """Build an input holding 5 integer ids and the mean of their embeddings.

    Each id is embedded into 128 dimensions and the embeddings are averaged
    over axis 1 (the 5 id positions). Returns the (input tensor, averaged
    tensor) pair.
    """
    ids = Input(shape=(5,), dtype='int32')
    embedded = Embedding(output_dim=128, input_dim=vocab_size)(ids)
    embedded = Lambda(lambda x: mean(x, axis=1))(embedded)
    return ids, embedded


# Branches are created in the same order as the model's input list below.
x1, e1 = _single_id_branch(nb_users, 128)      # user
x2, e2 = _single_id_branch(nb_songs, 128)      # song
x3 = Input(shape=(1,), dtype='float32')        # song length, fed in as-is
x4, e4 = _id_bag_branch(nb_genres)             # genres
x5, e5 = _id_bag_branch(nb_artists)            # artists
x6, e6 = _id_bag_branch(nb_composers)          # composers
x7 = Input(shape=(nb_languages,), dtype='float32')  # language indicator vector
x8, e8 = _single_id_branch(nb_cities, 32)      # city

# Head: concatenate every branch, one hidden layer with dropout, and a single
# sigmoid output unit.
x = concatenate([e1, e2, x3, e4, e5, e6, x7, e8])
h = Dense(128, activation='relu')(x)
h = Dropout(0.2)(h)
o = Dense(1, activation='sigmoid')(h)

model = Model(
    inputs=[x1, x2, x3, x4, x5, x6, x7, x8],
    outputs=[o],
)

In [None]:
# Render the model graph (with tensor shapes) to SVG and display it inline.
model_graph = model_to_dot(model, show_shapes=True)
SVG(model_graph.create(prog='dot', format='svg'))

![Model architecture diagram](model.png)

In [None]:
def _hash_column(column, n, **tokenizer_kwargs):
    """Hash every string of `column` into a list of integer bucket ids.

    Uses keras' `hashing_trick` with the md5 hash function. Unlike `one_hot`,
    which relies on Python's built-in `hash` (randomised per process for
    strings), md5 maps a given string to the same bucket in every kernel
    session, so encodings stay valid across restarts.

    Args:
        column: iterable of strings (e.g. a pandas Series).
        n: size of the hashing space, forwarded to `hashing_trick`.
        **tokenizer_kwargs: forwarded to `hashing_trick` (`filters`, `lower`,
            `split`).

    Returns:
        A list with one list of ints per element of `column`.
    """
    return [hashing_trick(text, n, hash_function='md5', **tokenizer_kwargs)
            for text in column]


def make_inputs_and_target_from_df(df):
    """Encode interaction rows into the model's eight inputs and the target.

    `df` is inner-joined with the module-level `songs` (on `song_id`) and
    `members` (on `msno`) frames, so rows without a match in either table are
    dropped before encoding.

    Args:
        df: DataFrame with at least `msno`, `song_id` and `target` columns.

    Returns:
        A tuple `([x1, ..., x8], y)`; the list follows the order of the
        model's inputs (user, song, song length, genres, artists, composers,
        language, city) and `y` holds the `target` values.
    """
    df = (df.merge(songs, left_on='song_id', right_index=True)
            .merge(members, left_on='msno', right_index=True))

    # Keep a cell as a single token: strip no characters and split only on ' '.
    # With the default filters, '-' and '.' are replaced by the separator, so
    # "-1" would hash like "1" and a value such as "-1.0" would yield two ids
    # and break the one-id-per-row reshape below.
    one_token = dict(filters='', split=' ')

    # lower=False: the tokenizer lowercases by default, which would merge ids
    # differing only by case. NOTE(review): assumes msno / song_id are
    # case-sensitive opaque ids - confirm against the data.
    x1 = _hash_column(df['msno'], nb_users, lower=False, **one_token)
    x1 = np.asarray(x1).reshape(-1, 1)
    x2 = _hash_column(df['song_id'], nb_songs, lower=False, **one_token)
    x2 = np.asarray(x2).reshape(-1, 1)

    # NOTE(review): song_length is passed through unscaled - consider
    # normalising it before it reaches the dense layer.
    x3 = np.asarray(df['song_length']).reshape(-1, 1)

    # Default filters are kept here on purpose so a multi-valued genre cell is
    # split into several ids (presumably '|'-separated - verify); at most 5 are
    # kept and shorter rows are zero-padded at the end.
    x4 = _hash_column(df['genre_ids'].astype(str), nb_genres)
    x4 = pad_sequences(x4, maxlen=5, padding='post')

    # Spaces become underscores so a whole name hashes as one token. Missing
    # names are replaced by '' (as for composers) instead of crashing on NaN;
    # an empty string yields no id and the row is all padding.
    artist_tokens = df['artist_name'].fillna('')\
                                     .apply(lambda r: r.replace(' ', '_'))
    x5 = _hash_column(artist_tokens, nb_artists, **one_token)
    x5 = pad_sequences(x5, maxlen=5, padding='post')
    composer_tokens = df['composer'].fillna('')\
                                    .apply(lambda r: r.replace(' ', '_'))
    x6 = _hash_column(composer_tokens, nb_composers, **one_token)
    x6 = pad_sequences(x6, maxlen=5, padding='post')

    # NOTE(review): hashing a handful of language codes into fewer than
    # nb_languages buckets makes collisions likely; an explicit code -> index
    # mapping would be exact.
    language_tokens = df['language'].fillna(-1).astype(int).astype(str)
    x7 = _hash_column(language_tokens, nb_languages, **one_token)
    x7 = to_categorical(x7, nb_languages)

    city_tokens = df['city'].fillna(-1).astype(str)
    x8 = _hash_column(city_tokens, nb_cities, **one_token)
    x8 = np.asarray(x8).reshape(-1, 1)

    y = df['target'].values

    return ([x1, x2, x3, x4, x5, x6, x7, x8], y)

In [None]:
# Binary classification setup: cross-entropy loss, Adam optimizer, and
# accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Encode the first 10000 sampled interactions for training and the last 5000
# for evaluation.
train_data = make_inputs_and_target_from_df(train.iloc[:10000])
eval_data = make_inputs_and_target_from_df(train.iloc[-5000:])

In [None]:
# Train for 5 epochs, validating on the held-out evaluation rows each epoch.
train_inputs, train_target = train_data
eval_inputs, eval_target = eval_data
model.fit(
    x=train_inputs,
    y=train_target,
    epochs=5,
    validation_data=(eval_inputs, eval_target),
)

In [None]:
# Predicted probabilities for the rows the model was trained on.
model.predict(x=train_data[0])