## In-Session Analysis

Predict next best song *within* each session, using the previous 15 songs in the session.

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from numpy.linalg import norm
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
import warnings
from itertools import tee

# Model Building
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.inspection import permutation_importance

# Suppress scientific notation
pd.options.display.float_format = '{:20,.2f}'.format

warnings.filterwarnings("ignore")

## Data Pre-processing

- Join session level data with track context/features
- Convert rewards to binary -1 = skipped, 1 = not skipped
- Encode categorical track and dsession features
- Create features dataframe which contains cleaned/OHE vector of track features

In [2]:
# Load in Data
df_sessions_sampled = pd.read_csv('data/training_set/log_sampled.csv')
df_tracks_sampled = pd.read_csv('data/track_features/tf_sampled.csv')
print("Number of sessions in DF: " + str(len(df_sessions_sampled['session_id'].unique())))
print("Number of tracks in DF: " + str(len(df_tracks_sampled['track_id'].unique())))
# Sanity check num tracks in tracks df equals num tracks in sessions df
assert len(df_sessions_sampled['track_id_clean'].unique()) == len(df_tracks_sampled['track_id'].unique())

Number of sessions in DF: 178342
Number of tracks in DF: 319008


In [3]:
## Join dataframes together on 'track_id'
spotify = df_sessions_sampled.merge(df_tracks_sampled, left_on="track_id_clean", right_on="track_id")
#print(spotify.isna().sum())
spotify.head()

Unnamed: 0,session_id,session_position,session_length,track_id_clean,skip_1,skip_2,skip_3,not_skipped,context_switch,no_pause_before_play,...,time_signature,valence,acoustic_vector_0,acoustic_vector_1,acoustic_vector_2,acoustic_vector_3,acoustic_vector_4,acoustic_vector_5,acoustic_vector_6,acoustic_vector_7
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,20,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,False,False,False,True,0,0,...,4,0.15,-0.82,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21
1,0_00079a23-1600-486a-91bd-5208be0c745a,7,12,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,False,False,True,False,0,0,...,4,0.15,-0.82,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21
2,0_012b0fb4-0cc3-429f-9a78-cc6e622153fb,6,20,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,False,False,False,True,0,0,...,4,0.15,-0.82,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21
3,0_013cc010-c476-4ad2-8972-73449e0b2ef4,9,13,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,False,False,False,True,0,1,...,4,0.15,-0.82,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21
4,0_01a5f0dc-9938-48c9-92f1-c7e51f34d290,7,12,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,False,False,True,False,0,0,...,4,0.15,-0.82,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21


In [4]:
## BINARY REWARDS (skip/no skip)
# spotify['response'] = np.where(
#     spotify['not_skipped']==True,
#     1,
#     -1
# )
# spotify = spotify.drop(columns=['skip_1', 'skip_2', 'skip_3', 'not_skipped'])
# spotify.head()

In [4]:
## MULTICLASS REWARDS
# response = -1, track was skipped
# response = 0, track was only played briefly and skipped
# response = 1, most of the track was played and skipped
# response = 2, track was played in its entirety and not skipped

spotify['response'] = np.where(
    spotify['not_skipped']==True,
    2,
    np.where(spotify['skip_1']==True, -1, 
             np.where(spotify['skip_2']==True, 0, 1))
)
spotify = spotify.drop(columns=['skip_1', 'skip_2', 'skip_3', 'not_skipped'])
spotify.head()

Unnamed: 0,session_id,session_position,session_length,track_id_clean,context_switch,no_pause_before_play,short_pause_before_play,long_pause_before_play,hist_user_behavior_n_seekfwd,hist_user_behavior_n_seekback,...,valence,acoustic_vector_0,acoustic_vector_1,acoustic_vector_2,acoustic_vector_3,acoustic_vector_4,acoustic_vector_5,acoustic_vector_6,acoustic_vector_7,response
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,20,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,0,0,0,0,0,...,0.15,-0.82,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,2
1,0_00079a23-1600-486a-91bd-5208be0c745a,7,12,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,0,1,1,0,0,...,0.15,-0.82,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,1
2,0_012b0fb4-0cc3-429f-9a78-cc6e622153fb,6,20,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,0,1,1,0,0,...,0.15,-0.82,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,2
3,0_013cc010-c476-4ad2-8972-73449e0b2ef4,9,13,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,1,0,0,0,0,...,0.15,-0.82,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,2
4,0_01a5f0dc-9938-48c9-92f1-c7e51f34d290,7,12,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,0,1,1,0,0,...,0.15,-0.82,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,1


## Encode relevant user context data - behavior
To booleans:
- shuffle (0/1)
- premium (0/1)

To float:
- all int columns
- date --> year, month

To OHE vectors:
- context (catalog, charts, editorial playlist, personalized playlist, radio, user collection)
- track end reason (appload, backbtn, clickrow, endplay, fwdbtn, logout, remote, trackdone)

In [5]:
# Convert binary columns to booleans
data_bool_cols = spotify.columns[spotify.dtypes=='bool']
for col in data_bool_cols:
    spotify[col]=spotify[col].astype(int)
spotify['mode'] = spotify['mode'].astype(bool).astype(int)

In [6]:
from datetime import datetime

spotify['date'] = pd.to_datetime(spotify.date, format='%Y-%m-%d')
spotify['year']=spotify['date'].dt.year
spotify['month']=spotify['date'].dt.month
spotify = spotify.drop(columns='date')
spotify.head()

Unnamed: 0,session_id,session_position,session_length,track_id_clean,context_switch,no_pause_before_play,short_pause_before_play,long_pause_before_play,hist_user_behavior_n_seekfwd,hist_user_behavior_n_seekback,...,acoustic_vector_1,acoustic_vector_2,acoustic_vector_3,acoustic_vector_4,acoustic_vector_5,acoustic_vector_6,acoustic_vector_7,response,year,month
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,20,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,0,0,0,0,0,...,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,2,2018,7
1,0_00079a23-1600-486a-91bd-5208be0c745a,7,12,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,0,1,1,0,0,...,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,1,2018,7
2,0_012b0fb4-0cc3-429f-9a78-cc6e622153fb,6,20,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,0,1,1,0,0,...,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,2,2018,7
3,0_013cc010-c476-4ad2-8972-73449e0b2ef4,9,13,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,1,0,0,0,0,...,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,2,2018,7
4,0_01a5f0dc-9938-48c9-92f1-c7e51f34d290,7,12,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,0,1,1,0,0,...,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,1,2018,7


In [7]:
# OHE Categorical Columns
categorical_columns =['context_type', 'hist_user_behavior_reason_end', 'hist_user_behavior_reason_start']
enc = OneHotEncoder()
array_hot_encoded = enc.fit_transform(spotify[categorical_columns]).toarray()
feature_labels = enc.categories_
feature_labels = np.concatenate(feature_labels, axis=0)
data_hot_encoded = pd.DataFrame(array_hot_encoded, columns=feature_labels, index=spotify.index) # Convert OHE array to df
spotify = spotify.drop(columns=categorical_columns)
spotify_enc = pd.concat([spotify, data_hot_encoded], axis=1)
spotify_enc.head()

Unnamed: 0,session_id,session_position,session_length,track_id_clean,context_switch,no_pause_before_play,short_pause_before_play,long_pause_before_play,hist_user_behavior_n_seekfwd,hist_user_behavior_n_seekback,...,backbtn,clickrow,endplay,fwdbtn,playbtn,popup,remote,trackdone,trackerror,uriopen
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,20,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0_00079a23-1600-486a-91bd-5208be0c745a,7,12,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,0,1,1,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0_012b0fb4-0cc3-429f-9a78-cc6e622153fb,6,20,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0_013cc010-c476-4ad2-8972-73449e0b2ef4,9,13,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,1,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0_01a5f0dc-9938-48c9-92f1-c7e51f34d290,7,12,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
## Convert int columns to floats for NN
int_columns = spotify.select_dtypes('int64')
spotify[int_columns.columns]= int_columns.astype('float')
spotify

Unnamed: 0,session_id,session_position,session_length,track_id_clean,context_switch,no_pause_before_play,short_pause_before_play,long_pause_before_play,hist_user_behavior_n_seekfwd,hist_user_behavior_n_seekback,...,acoustic_vector_1,acoustic_vector_2,acoustic_vector_3,acoustic_vector_4,acoustic_vector_5,acoustic_vector_6,acoustic_vector_7,response,year,month
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,1.00,20.00,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0.00,0.00,0.00,0.00,0.00,0.00,...,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,2.00,2018.00,7.00
1,0_00079a23-1600-486a-91bd-5208be0c745a,7.00,12.00,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0.00,0.00,1.00,1.00,0.00,0.00,...,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,1.00,2018.00,7.00
2,0_012b0fb4-0cc3-429f-9a78-cc6e622153fb,6.00,20.00,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0.00,0.00,1.00,1.00,0.00,0.00,...,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,2.00,2018.00,7.00
3,0_013cc010-c476-4ad2-8972-73449e0b2ef4,9.00,13.00,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0.00,1.00,0.00,0.00,0.00,0.00,...,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,2.00,2018.00,7.00
4,0_01a5f0dc-9938-48c9-92f1-c7e51f34d290,7.00,12.00,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,0.00,0.00,1.00,1.00,0.00,0.00,...,0.39,0.23,0.03,-0.33,0.02,-0.35,0.21,1.00,2018.00,7.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2990604,0_fffc434b-de94-4a49-ace7-7e574fc94c64,7.00,10.00,t_a136f450-7858-4156-a40a-ef9412014391,0.00,1.00,0.00,0.00,0.00,0.00,...,-0.02,-0.13,-0.42,-0.07,0.16,-0.24,0.00,-1.00,2018.00,7.00
2990605,0_fffc434b-de94-4a49-ace7-7e574fc94c64,8.00,10.00,t_567ae6a1-a420-44e6-8d2f-77d1fe070bdc,0.00,1.00,0.00,0.00,0.00,0.00,...,-0.02,-0.21,-0.49,0.19,0.17,-0.24,-0.16,-1.00,2018.00,7.00
2990606,0_fffc434b-de94-4a49-ace7-7e574fc94c64,10.00,10.00,t_567ae6a1-a420-44e6-8d2f-77d1fe070bdc,0.00,1.00,0.00,0.00,0.00,0.00,...,-0.02,-0.21,-0.49,0.19,0.17,-0.24,-0.16,2.00,2018.00,7.00
2990607,0_fffc434b-de94-4a49-ace7-7e574fc94c64,9.00,10.00,t_81aaf4e3-b544-45b1-8366-a06370398a6a,0.00,1.00,0.00,0.00,0.00,0.00,...,-0.11,-0.30,-0.36,0.10,0.12,-0.26,0.11,2.00,2018.00,7.00


### Convert to numpy arrays for NN

- Separate features from skip outcome
- Remove ID columns

In [31]:
## Remove ID columns
ids = spotify[['session_id', 'track_id_clean', 'track_id']]
data = spotify.drop(columns=ids)

## Separate features from skip outcome
Y = data[['response']]
X = data.drop(columns=Y)
X_feature_names = X.columns # Store feature names for later

# Convert to numpy arrays
X = np.array(X)
Y = np.array(Y)
X = np.array(X).astype(np.float32) # Make sure all columns are floats
print(X)
print(X.shape)
print(Y.shape)

## Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.2)

# Convert y_train and y_test to one hot encoded vectors
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# Drop first column
y_train = y_train[:, 1:]
y_test = y_test[:, 1:]

[[ 1.0000000e+00  2.0000000e+01  0.0000000e+00 ...  2.0582636e-01
   2.0180000e+03  7.0000000e+00]
 [ 7.0000000e+00  1.2000000e+01  0.0000000e+00 ...  2.0582636e-01
   2.0180000e+03  7.0000000e+00]
 [ 6.0000000e+00  2.0000000e+01  0.0000000e+00 ...  2.0582636e-01
   2.0180000e+03  7.0000000e+00]
 ...
 [ 1.0000000e+01  1.0000000e+01  0.0000000e+00 ... -1.6189176e-01
   2.0180000e+03  7.0000000e+00]
 [ 9.0000000e+00  1.0000000e+01  0.0000000e+00 ...  1.1186143e-01
   2.0180000e+03  7.0000000e+00]
 [ 1.1000000e+01  2.0000000e+01  0.0000000e+00 ...  7.7194706e-02
   2.0180000e+03  7.0000000e+00]]
(2990609, 42)
(2990609, 1)


## Feature Selection using NN

- For each session, inputting track/session context and predicting skip outcome for the remaining 5 songs
- NN based feature selection for skip prediction
    - MLP
     - CNN

Multilayed Perceptron (MLP)

In [32]:
print("Training data shape", X_train.shape)
print("Test data shape", X_test.shape)
print("Training response shape", y_train.shape)
print("Test response shape", y_test.shape)

# Number of nodes in output layer (2) match number of classes in target variable (2)

Training data shape (2392487, 42)
Test data shape (598122, 42)
Training response shape (2392487, 2)
Test response shape (598122, 2)


In [34]:
# Define the MLP architecture
model = Sequential()
model.add(Dense(64, input_dim=42, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x250c31a80>

In [37]:
# Compute feature importance using permutation importance
perm_importance = permutation_importance(model, X_test, y_test, scoring="accuracy")



In [38]:
# Print the feature importances
for i in perm_importance.importances_mean.argsort()[::-1]:
    print(f"{X_feature_names[i]}: {perm_importance.importances_mean[i]:.3f}")

month: 0.000
premium: 0.000
dyn_range_mean: 0.000
danceability: 0.000
bounciness: 0.000
beat_strength: 0.000
acousticness: 0.000
us_popularity_estimate: 0.000
release_year: 0.000
duration: 0.000
hour_of_day: 0.000
year: 0.000
hist_user_behavior_is_shuffle: 0.000
hist_user_behavior_n_seekback: 0.000
hist_user_behavior_n_seekfwd: 0.000
long_pause_before_play: 0.000
short_pause_before_play: 0.000
no_pause_before_play: 0.000
context_switch: 0.000
session_length: 0.000
energy: 0.000
flatness: 0.000
instrumentalness: 0.000
key: 0.000
acoustic_vector_7: 0.000
acoustic_vector_6: 0.000
acoustic_vector_5: 0.000
acoustic_vector_4: 0.000
acoustic_vector_3: 0.000
acoustic_vector_2: 0.000
acoustic_vector_1: 0.000
acoustic_vector_0: 0.000
valence: 0.000
time_signature: 0.000
tempo: 0.000
speechiness: 0.000
organism: 0.000
mode: 0.000
mechanism: 0.000
loudness: 0.000
liveness: 0.000
session_position: 0.000
