In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import pandas as pd
from keras.preprocessing.text import one_hot
from keras.utils import to_categorical
from sklearn.metrics import roc_auc_score
from scipy import sparse
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
songs = pd.merge(pd.read_csv('data/songs.csv'),
                 pd.read_csv('data/song_extra_info.csv'),
                 on = 'song_id',
                 how='inner')\
          .set_index('song_id', drop=True)

members = pd.read_csv('data/members.csv')\
            .set_index('msno', drop=True)

## Feature Analysis

In [None]:
train = pd.read_csv('data/train.csv').sample(20000)

train = train.merge(songs, left_on='song_id', right_index=True)\
             .merge(members, left_on='msno', right_index=True)

In [None]:
pd.concat([train.nunique(),
           train.dtypes],
          axis=1)

## Models

### Linear Model

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

def parse_df_1(df):
    categorical_columns = df.dtypes.loc[df.dtypes == 'object'].index.tolist()
    
    for col in categorical_columns:
        df[col] = (col + '-' + df[col].str.replace(' ', '_')).fillna('')

    h = HashingVectorizer(token_pattern='[\S]+')

    X = sparse.hstack([h.transform([' '.join(row) for row in df[categorical_columns].values]),])
    y = df['target']
    
    return X, y


for df in pd.read_csv('./data/train.csv', chunksize=1000000):
    df = df.merge(songs, left_on='song_id', right_index=True)\
           .merge(members, left_on='msno', right_index=True)

    train, val = train_test_split(df, shuffle=True)
    X_train, y_train = parse_df_1(train)
    X_val, y_val = parse_df_1(val)

    clf = SGDClassifier(loss='log', max_iter=5)
    clf.partial_fit(X_train, y_train, classes=[0, 1])
    
    print(roc_auc_score(clf.predict(X_val), y_val))

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from keras.models import Sequential
from keras.layers import Dense, Dropout

def parse_df_2(df):
    # df['song_length_cut'] = pd.cut(df.song_length, bins=10, labels=range(10))
    
    categorical_columns = df.dtypes.loc[df.dtypes == 'object'].index.tolist()
    
    for col in categorical_columns:
        df[col] = (col + '-' + df[col].str.replace(' ', '_')).fillna('')

    h = HashingVectorizer(n_features=500000,
                          token_pattern='[\S]+')

    X = sparse.hstack([h.transform([' '.join(row) for row in df[categorical_columns].values]),])
    y = df['target']
    
    return X, y


for df in pd.read_csv('./data/train.csv', chunksize=50000):
    df = df.merge(songs, left_on='song_id', right_index=True)\
           .merge(members, left_on='msno', right_index=True)
    
    train, val = train_test_split(df, shuffle=True)
    X_train, y_train = parse_df_2(train)
    X_val, y_val = parse_df_2(val)
    
    print(X_train.shape)
    model = Sequential()
    model.add(Dense(input_shape=(X_train.shape[1],),
                    units=512,
                    activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(units=256,
                    activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(units=128,
                    activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(units=2,
                    activation='softmax'))
    
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    
    model.fit(X_train,
              y_train,
              epochs=1,
              validation_data=(X_val, y_val))
    
    print(roc_auc_score(model.predict_classes(X_val), y_val))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


(37493, 500000)


### Using Embeddings

In [None]:
from keras import layers
from keras import models
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

In [None]:
i_user = layers.Input(shape=(1,), dtype='int32', name='user')
embedding_user = layers.Embedding(input_dim=50000, output_dim=64, name='embedding_user')
embedding_user = embedding_user(i_user)
embedding_user = layers.Flatten()(embedding_user)

i_song = layers.Input(shape=(1,), dtype='int32', name='song')
embedding_song = layers.Embedding(input_dim=50000, output_dim=64, name='embedding_song')
embedding_song = embedding_song(i_song)
embedding_song = layers.Flatten()(embedding_song)

e_dot = layers.dot([embedding_user, embedding_song], axes=1)

i_length = layers.Input(shape=(1,), dtype='float32', name='song_length')

m = layers.concatenate([e_dot, i_length])

h = layers.Dense(5, activation='relu')(m)
o = layers.Dense(1, activation='sigmoid')(h)

model = models.Model(inputs=[i_user, i_song, i_length],
                     outputs=o)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))

In [None]:
x_user = train['msno'].apply(lambda r: one_hot(r, filters=' ', n=50000)[0])
x_song = train['song_id'].apply(lambda r: one_hot(r, filters=' ', n=50000)[0])
x_length = train['song_length']

In [None]:
model.fit([x_user, x_song, x_length],
          train['target'].values,
          epochs=3,
          validation_split=0.3)

In [None]:
train.groupby('msno')['song_id'].count().mean()