In [5]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from sklearn.preprocessing import LabelEncoder

In [6]:
logs = pd.read_csv('data/training_set_mini.csv')
logs.columns

Index(['session_id', 'session_position', 'session_length', 'track_id_clean',
       'skip_1', 'skip_2', 'skip_3', 'not_skipped', 'context_switch',
       'no_pause_before_play', 'short_pause_before_play',
       'long_pause_before_play', 'hist_user_behavior_n_seekfwd',
       'hist_user_behavior_n_seekback', 'hist_user_behavior_is_shuffle',
       'hour_of_day', 'date', 'premium', 'context_type',
       'hist_user_behavior_reason_start', 'hist_user_behavior_reason_end'],
      dtype='object')

In [7]:
unique_tracks = logs['track_id_clean'].nunique()

In [8]:
model_input = logs[['track_id_clean', 'skip_2']]
# Skip_2 to binary
model_input['skip_2'] = model_input['skip_2'].astype(int)
model_input.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,track_id_clean,skip_2
0,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0
1,t_9099cd7b-c238-47b7-9381-f23f2c1d1043,0
2,t_fc5df5ba-5396-49a7-8b29-35d0d28249e0,0
3,t_23cff8d6-d874-4b20-83dc-94e450e8aa20,0
4,t_64f3743c-f624-46bb-a579-0f3f9a07a123,0


In [9]:
skips = model_input['skip_2']

In [10]:
encoder = LabelEncoder()
model_input['track_id_clean'] = encoder.fit_transform(model_input['track_id_clean'])
model_input.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0      890
1    28794
2    49953
3     7133
4    20100
Name: track_id_clean, dtype: int64

In [11]:
tracks = model_input.drop('skip_2', axis=1)

In [12]:
skips.shape

(167880,)

In [13]:
tracks.shape

(167880, 1)

In [14]:
# Note: no hyperparam tuning or anything; random value
embedding_size = 30
embedding = Embedding(input_dim=unique_tracks, output_dim=embedding_size, input_length=1, name='simple_track_embedding')
model = Sequential()
model.add(embedding)
model.add(Flatten())
model.add(Dense(30, activation="relu"))
model.add(Dense(15, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

In [15]:
model.compile(optimizer='adam', loss='binary_crossentropy')

In [16]:
model.fit(x=tracks, y=skips, epochs=10)

Train on 167880 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fece6ce2750>

In [17]:
embedding_layer = model.get_layer(name="simple_track_embedding")
embedding_layer = pd.DataFrame(embedding_layer.get_weights()[0])
embedding_layer.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.039616,0.031245,-0.010624,0.012539,0.016842,0.009916,-0.074926,0.025634,0.000581,-0.007201,...,0.003268,0.025305,0.118738,0.106637,-0.066458,-0.085568,-0.138731,0.060072,-0.058621,0.036566
1,0.106493,0.187047,-0.021124,-0.139072,-0.204719,-0.143086,-0.124873,0.080098,0.133161,0.132346,...,0.166538,-0.164691,-0.290018,0.113933,0.108228,-0.099224,-0.018081,0.217432,0.145408,-0.134952
2,0.013989,-0.003414,0.070037,-0.044737,0.010381,-0.019526,-0.032099,-0.023538,0.017303,-0.069139,...,0.091654,-0.080614,0.013312,0.138682,0.091233,0.031144,0.007486,0.159009,-0.083794,0.058196
3,0.078738,-0.087371,0.076692,0.030285,-0.03674,0.094477,-0.038655,0.079839,-0.069403,0.022919,...,0.038315,-0.026034,0.051578,0.101857,-0.045312,-0.041455,-0.050799,0.004607,-0.024801,-0.023521
4,-0.147822,-0.020228,0.058357,-0.044785,0.242435,0.050384,0.153116,0.098516,0.037036,-0.159087,...,-0.107911,0.221624,0.067732,0.076117,0.118543,0.157262,0.141642,0.078912,-0.111785,0.002963


In [18]:
embedding_layer.shape

(50704, 30)

In [21]:
embedding_layer.index = encoder.inverse_transform(embedding_layer.index)

In [22]:
embedding_layer.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
t_00007fba-6bd3-449d-85dd-54d4aea397c2,0.039616,0.031245,-0.010624,0.012539,0.016842,0.009916,-0.074926,0.025634,0.000581,-0.007201,...,0.003268,0.025305,0.118738,0.106637,-0.066458,-0.085568,-0.138731,0.060072,-0.058621,0.036566
t_0000dc06-0c00-4a09-9dc6-3bdad9c6f0e8,0.106493,0.187047,-0.021124,-0.139072,-0.204719,-0.143086,-0.124873,0.080098,0.133161,0.132346,...,0.166538,-0.164691,-0.290018,0.113933,0.108228,-0.099224,-0.018081,0.217432,0.145408,-0.134952
t_00020dc1-1b82-43e9-8327-77b074bdf626,0.013989,-0.003414,0.070037,-0.044737,0.010381,-0.019526,-0.032099,-0.023538,0.017303,-0.069139,...,0.091654,-0.080614,0.013312,0.138682,0.091233,0.031144,0.007486,0.159009,-0.083794,0.058196
t_0003d374-de7a-44c0-a2b6-9ee6785a0750,0.078738,-0.087371,0.076692,0.030285,-0.03674,0.094477,-0.038655,0.079839,-0.069403,0.022919,...,0.038315,-0.026034,0.051578,0.101857,-0.045312,-0.041455,-0.050799,0.004607,-0.024801,-0.023521
t_00042d9b-e795-41a9-89ad-504373dd4287,-0.147822,-0.020228,0.058357,-0.044785,0.242435,0.050384,0.153116,0.098516,0.037036,-0.159087,...,-0.107911,0.221624,0.067732,0.076117,0.118543,0.157262,0.141642,0.078912,-0.111785,0.002963
