In [1]:
import pandas as pd
from keras.preprocessing.text import one_hot
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Lambda, concatenate, Dense, Dropout
from keras.backend import mean

Using TensorFlow backend.


# Data

In [2]:
# Song catalogue: the core song table joined with the extra-info table on
# `song_id`. The join is outer, so a song that appears in only one of the two
# files is still kept. The result is indexed by `song_id`.
songs = (
    pd.read_csv('data/songs.csv')
      .merge(pd.read_csv('data/song_extra_info.csv'), on='song_id', how='outer')
      .set_index('song_id', drop=True)
)

In [3]:
# Member table, indexed by the member id column `msno`.
members = (
    pd.read_csv('data/members.csv')
      .set_index('msno', drop=True)
)

In [4]:
# Load the training interactions and shuffle the rows. The shuffle is seeded so
# that it -- and any later `train.head(n)` subset -- is identical after a
# kernel restart.
train = pd.read_csv('data/train.csv').sample(frac=1, random_state=0)

# Vocabulary sizes: number of distinct values per categorical feature
# (`nunique` ignores missing values).
nb_users = members.index.nunique()
nb_songs = songs.index.nunique()
nb_genres = songs.genre_ids.nunique() # rough approximation, as this often has several values
nb_artists = songs.artist_name.nunique()
nb_composers = songs.composer.nunique()
nb_languages = songs.language.nunique()
nb_cities = members.city.nunique()

# Neural Net

In [15]:
# Network definition (Keras functional API).
#
# Each id-like input is embedded and the embeddings are averaged over the
# sequence axis (axis=1), giving one fixed-size vector per example. The vectors
# are concatenated with the raw song length and fed to a stack of
# Dense + Dropout layers ending in a single sigmoid unit.

# embedding on user
x1 = Input(shape=(1,), dtype='int32')
e1 = Embedding(output_dim=256, input_dim=nb_users)(x1)
e1 = Lambda(lambda t: mean(t, axis=1))(e1)

# embedding on song
x2 = Input(shape=(1,), dtype='int32')
e2 = Embedding(output_dim=256, input_dim=nb_songs)(x2)
e2 = Lambda(lambda t: mean(t, axis=1))(e2)

# song length (fed to the network as a raw float, no embedding)
x3 = Input(shape=(1,), dtype='float32')

# embedding on genres (up to 5 indices per example, averaged)
x4 = Input(shape=(5,), dtype='int32')
e4 = Embedding(output_dim=128, input_dim=nb_genres)(x4)
e4 = Lambda(lambda t: mean(t, axis=1))(e4)

# NOTE: the artist, composer, language and city branches below (x5-x8) are
# built but are not part of the concatenation or of the Model inputs.

# embedding on artists
x5 = Input(shape=(5,), dtype='int32')
e5 = Embedding(output_dim=256, input_dim=nb_artists)(x5)
e5 = Lambda(lambda t: mean(t, axis=1))(e5)

# embedding on composers
x6 = Input(shape=(5,), dtype='int32')
e6 = Embedding(output_dim=256, input_dim=nb_composers)(x6)
e6 = Lambda(lambda t: mean(t, axis=1))(e6)

# language
x7 = Input(shape=(1,), dtype='int32')

# embedding on cities
x8 = Input(shape=(1,), dtype='int32')
e8 = Embedding(output_dim=128, input_dim=nb_cities)(x8)
e8 = Flatten()(e8)

# Fully connected head. Every layer takes the previous layer's output `h`;
# feeding `x` to each layer would leave the hidden Dense layers disconnected
# from the output.
x = concatenate([e1, e2, x3, e4])
h = Dense(1024, activation='relu')(x)
h = Dropout(0.3)(h)
h = Dense(512, activation='relu')(h)
h = Dropout(0.3)(h)
h = Dense(256, activation='relu')(h)
h = Dropout(0.3)(h)
h = Dense(128, activation='relu')(h)
h = Dropout(0.3)(h)
o = Dense(1, activation='sigmoid')(h)

model = Model(inputs=[x1, x2, x3, x4],
              outputs=[o])

In [14]:
# Feature preparation on a small slice of the training set.
#
# The first 10000 training rows are joined with the song and member tables
# (pandas' default inner join, so rows without a matching song or member are
# dropped), then each categorical column is hashed to integer indices with
# Keras' `one_hot`.
df = train.head(10000)
df = df.merge(songs, left_on='song_id', right_index=True)\
       .merge(members, left_on='msno', right_index=True)

# User and song ids feed Inputs of shape (1,), so each id must hash to exactly
# one index. `filters=''` disables one_hot's punctuation splitting so that an
# id containing such characters is not broken into several tokens
# (NOTE(review): ids are assumed to be strings without spaces -- confirm).
x1 = df['msno'].apply(lambda r: one_hot(r, nb_users, filters=''))
# song_id is hashed like msno: the model embeds it as an int32 index below
# nb_songs, which the raw id values do not provide.
x2 = df['song_id'].astype(str).apply(lambda r: one_hot(r, nb_songs, filters=''))
x3 = df['song_length']
# Multi-valued text columns: one_hot's default tokenisation yields one index
# per token.
x4 = df['genre_ids'].astype(str).apply(lambda r: one_hot(r, nb_genres))
x5 = df['artist_name'].apply(lambda r: one_hot(r, nb_artists))
x6 = df['composer'].fillna('').apply(lambda r: one_hot(r, nb_composers))
# Missing values are replaced by -1 before the string conversion.
x7 = df['language'].fillna(-1).astype(str).apply(lambda r: one_hot(r, nb_languages))
x8 = df['city'].fillna(-1).astype(str).apply(lambda r: one_hot(r, nb_cities))