In [None]:
# Install your required packages here
!pip install pandas numpy matplotlib sklearn fsspec gcsfs

In [None]:
%env GOOGLE_APPLICATION_CREDENTIALS=/content/drive/My Drive/CS/AI/Credentials/ai-project-2020-f4dfbc25326c.json

In [1]:
%env GOOGLE_APPLICATION_CREDENTIALS=./credentials/ai-project-2020-f4dfbc25326c.json

env: GOOGLE_APPLICATION_CREDENTIALS=./credentials/ai-project-2020-f4dfbc25326c.json


In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Dropout
from sklearn.preprocessing import LabelEncoder
from google.cloud import storage

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [3]:
# define constants
bucket_name = "ai-project-2020-spotify"
client = storage.Client()
bucket = client.get_bucket(bucket_name)

In [4]:
train_files = list(bucket.list_blobs(prefix='training_set/'))
for blob in [blob for blob in train_files if '20180715' in blob.name]:
  print(blob.name)

training_set/log_0_20180715_000000000000.csv.gz
training_set/log_1_20180715_000000000000.csv.gz
training_set/log_2_20180715_000000000000.csv.gz
training_set/log_3_20180715_000000000000.csv.gz
training_set/log_4_20180715_000000000000.csv.gz
training_set/log_5_20180715_000000000000.csv.gz
training_set/log_6_20180715_000000000000.csv.gz
training_set/log_7_20180715_000000000000.csv.gz


In [33]:
logs = pd.read_csv(f"gs://{bucket_name}/training_set/log_0_20180715_000000000000.csv.gz")
logs.columns

Index(['session_id', 'session_position', 'session_length', 'track_id_clean',
       'skip_1', 'skip_2', 'skip_3', 'not_skipped', 'context_switch',
       'no_pause_before_play', 'short_pause_before_play',
       'long_pause_before_play', 'hist_user_behavior_n_seekfwd',
       'hist_user_behavior_n_seekback', 'hist_user_behavior_is_shuffle',
       'hour_of_day', 'date', 'premium', 'context_type',
       'hist_user_behavior_reason_start', 'hist_user_behavior_reason_end'],
      dtype='object')

In [34]:
unique_tracks = logs['track_id_clean'].nunique()
print(unique_tracks)

In [35]:
logs_dropped = logs[['session_id','session_position','track_id_clean', 'skip_2']]
# Skip_2 to binary
logs_dropped['skip_2'] = logs_dropped['skip_2'].astype(int)
logs_dropped.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,session_id,session_position,track_id_clean,skip_2
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0
1,0_00006f66-33e5-4de7-a324-2d18e439fc1e,2,t_9099cd7b-c238-47b7-9381-f23f2c1d1043,0
2,0_00006f66-33e5-4de7-a324-2d18e439fc1e,3,t_fc5df5ba-5396-49a7-8b29-35d0d28249e0,0
3,0_00006f66-33e5-4de7-a324-2d18e439fc1e,4,t_23cff8d6-d874-4b20-83dc-94e450e8aa20,0
4,0_00006f66-33e5-4de7-a324-2d18e439fc1e,5,t_64f3743c-f624-46bb-a579-0f3f9a07a123,0


In [36]:
encoder = LabelEncoder()
logs_dropped['track_id_clean'] = encoder.fit_transform(logs_dropped['track_id_clean'])
logs_dropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2990609 entries, 0 to 2990608
Data columns (total 4 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   session_id        object
 1   session_position  int64 
 2   track_id_clean    int64 
 3   skip_2            int64 
dtypes: int64(3), object(1)
memory usage: 91.3+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [37]:
def stack_sessions(df):
    """
    Turn matrix representation into vector by stacking the listen events together (as columns)
    For example:
    session_id session_position feature1 feature2
    a          1                ~        ~
    a          2                ~        ~
    b          1                ~        ~
    b          2                ~        ~
    b          3                ~        ~

    Turns into:
    session_id 1_feature1 1_feature2 2_feature1 2_feature2 3_feature1 3_feature2
    a          ~          ~          ~          ~          Nan        Nan
    b          ~          ~          ~          ~          ~          ~
    """
    columns = list(df.columns)
    columns.remove('session_id')
    columns.remove('session_position')
    sessions = df.pivot(index='session_id', columns='session_position', values=columns)
    return sessions

In [38]:
# Stack all Sessions
stacked_sessions = stack_sessions(logs_dropped)
# Drop all features except track_id, skip_2, session_ids
stacked_sessions.head()

Unnamed: 0_level_0,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,...,skip_2,skip_2,skip_2,skip_2,skip_2,skip_2,skip_2,skip_2,skip_2,skip_2
session_position,1,2,3,4,5,6,7,8,9,10,...,11,12,13,14,15,16,17,18,19,20
session_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0_00006f66-33e5-4de7-a324-2d18e439fc1e,5621.0,180239.0,314424.0,44689.0,125848.0,249229.0,281749.0,14825.0,303773.0,53562.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
0_0000a72b-09ac-412f-b452-9b9e79bded8f,290715.0,7754.0,163371.0,189575.0,157360.0,135801.0,246743.0,283374.0,13956.0,222296.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
0_00010fc5-b79e-4cdf-bc4c-f140d0f99a3a,179103.0,303076.0,129833.0,92348.0,226222.0,118026.0,116226.0,51878.0,295719.0,208526.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
0_00016a3d-9076-4f67-918f-f29e3ce160dc,140720.0,286600.0,101915.0,277035.0,236783.0,68598.0,88140.0,31527.0,230808.0,291701.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
0_00018b58-deb8-4f98-ac5e-d7e01b346130,50130.0,58504.0,88612.0,37166.0,68322.0,44687.0,149478.0,136053.0,184454.0,182141.0,...,0.0,,,,,,,,,


In [39]:
input_array = []
for index, row in stacked_sessions.iterrows():
    index = 3
    while index <= 8 :
        arr_row = {'Track1': row[('track_id_clean', index-2)], 'Track2': row[('track_id_clean', index-1)], 'Track3': row[('track_id_clean', index)], 'Track4': row[('track_id_clean', index+1)], 'Track5': row[('track_id_clean', index+2)], 'Track_3_Skip_2': row[('skip_2', index)]}
        input_array.append(arr_row)
        index = index+1
print(len(input_array))

1070052


In [40]:
model_input = pd.DataFrame(input_array)
model_input.head()

Unnamed: 0,Track1,Track2,Track3,Track4,Track5,Track_3_Skip_2
0,5621.0,180239.0,314424.0,44689.0,125848.0,0.0
1,180239.0,314424.0,44689.0,125848.0,249229.0,0.0
2,314424.0,44689.0,125848.0,249229.0,281749.0,0.0
3,44689.0,125848.0,249229.0,281749.0,14825.0,0.0
4,125848.0,249229.0,281749.0,14825.0,303773.0,1.0


In [41]:
model_input.shape


(1070052, 6)

In [42]:
skips = model_input['Track_3_Skip_2']

In [43]:
tracks = model_input.drop('Track_3_Skip_2', axis=1)

In [44]:
skips.shape

(1070052,)

In [45]:
tracks.shape

(1070052, 5)

In [47]:
embedding_size = 50
embedding = Embedding(input_dim=unique_tracks, output_dim=embedding_size, input_length=5, name='sequence_track_embedding')
model = Sequential()
model.add(embedding)
model.add(Flatten())
model.add(Dense(50, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(30, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(15, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))

In [48]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(x=tracks, y=skips, epochs=10, validation_split=0.2)

Train on 856041 samples, validate on 214011 samples
Epoch 1/10
155616/856041 [====>.........................] - ETA: 1:06:23 - loss: 0.6920 - accuracy: 0.5255

In [None]:
embedding_layer = model.get_layer(name="sequence_track_embedding")
embedding_layer = pd.DataFrame(embedding_layer.get_weights()[0])
embedding_layer.head()

In [None]:
embedding_layer.shape

In [None]:
embedding_layer.index = encoder.inverse_transform(embedding_layer.index)

In [None]:
embedding_layer.head()

