In [None]:
# %matplotlib inline
import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
#import statsmodels.formula.api as sm
import seaborn as sns
import sklearn as sl
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 350)

# Import Data

In [3]:
data=pd.read_json('./dataset/song.json')

In [4]:
data.head()

Unnamed: 0,id,user_id,user_state,user_sign_up_date,song_played,time_played
0,GOQMMKSQQH,122,Louisiana,2015-05-16,Hey Jude,2015-06-11 21:51:35
1,HWKKBQKNWI,3,Ohio,2015-05-01,We Can Work It Out,2015-06-06 16:49:19
2,DKQSXVNJDH,35,New Jersey,2015-05-04,Back In the U.S.S.R.,2015-06-14 02:11:29
3,HLHRIDQTUW,126,Illinois,2015-05-16,P.s. I Love You,2015-06-08 12:26:10
4,SUKJCSBCYW,6,New Jersey,2015-05-01,Sgt. Pepper's Lonely Hearts Club Band,2015-06-28 14:57:00


In [5]:
data.shape

(4000, 6)

In [6]:
#fixing dates
data['time_played']=  pd.to_datetime(data['time_played'], format='%Y-%m-%d %H:%M:%S')
data['user_sign_up_date'] = pd.to_datetime(data['user_sign_up_date'], format='%Y-%m-%d')

In [7]:
data.head()

Unnamed: 0,id,user_id,user_state,user_sign_up_date,song_played,time_played
0,GOQMMKSQQH,122,Louisiana,2015-05-16,Hey Jude,2015-06-11 21:51:35
1,HWKKBQKNWI,3,Ohio,2015-05-01,We Can Work It Out,2015-06-06 16:49:19
2,DKQSXVNJDH,35,New Jersey,2015-05-04,Back In the U.S.S.R.,2015-06-14 02:11:29
3,HLHRIDQTUW,126,Illinois,2015-05-16,P.s. I Love You,2015-06-08 12:26:10
4,SUKJCSBCYW,6,New Jersey,2015-05-01,Sgt. Pepper's Lonely Hearts Club Band,2015-06-28 14:57:00


In [8]:
data.dtypes

id                           object
user_id                       int64
user_state                   object
user_sign_up_date    datetime64[ns]
song_played                  object
time_played          datetime64[ns]
dtype: object

In [9]:
data.song_played.unique()

array(['Hey Jude', 'We Can Work It Out', 'Back In the U.S.S.R.',
       'P.s. I Love You', "Sgt. Pepper's Lonely Hearts Club Band",
       'Sgt. Pepper Inner Groove', 'Hello Goodbye', 'Cry For A Shadow',
       'Revolution', 'Let It Be', 'I Feel Fine', 'The Fool On The Hill',
       'Get Back', 'Come Together', 'She Loves You',
       'While My Guitar Gently Weeps', 'Here Comes The Sun',
       'A Day In The Life', 'Getting Better', "Baby You're A Rich Man",
       'The Ballad Of John And Yoko', 'Lucy In The Sky With Diamonds',
       "Don't Let Me Down", 'Reprise / Day in the Life',
       "Maxwell's Silver Hammer", 'Across The Universe',
       'Ob-la-di, Ob-la-da', 'Yesterday', 'Fixing A Hole', 'OH DARLING',
       'Birthday', 'A Saturday Club Xmas/Crimble Medley',
       'Got To Get You Into My Life', 'Wild Honey Pie', 'IN MY LIFE',
       'Strawberry Fields Forever', 'Twist and Shout',
       'I Saw Her Standing There', 'Helter Skelter', 'Something',
       'Eleanor Rigby', 'Thing

# Data Analysis

In [10]:
#since songs are characters, let's just make them all lower case, just in case we have some in upper case and some lower. With more time, we would dig deeper to see if there are perhaps cases in which the same song is spelled differently
data['song_played'] = data['song_played'].str.lower()

Q: What are the top 3 and the bottom 3 states in terms number of users?

There are two ways to interpret this question. One option is considering all 50 US states. The other option is considering only the states that have at least 1 user. We will answer the question both ways.


In [12]:
#From how many states we have at least one user?
print("For", 50 - data['user_state'].nunique(), "states we have no users")
#Count # users per state 
data_state_count = data.groupby('user_state').user_id.nunique()
 
#Top 3 states
print(data_state_count.nlargest(3, keep='all'))


For 9 states we have no users
user_state
New York      23
California    21
Texas         15
Name: user_id, dtype: int64


In [14]:
data_state_count.nlargest(3, keep='last')

user_state
New York      23
California    21
Texas         15
Name: user_id, dtype: int64

In [15]:
#Bottom 3 only including states for which we have at least 1 user
data_state_count.nsmallest(3, keep='all')

user_state
Arizona         1
Connecticut     1
Idaho           1
Iowa            1
Kansas          1
Nebraska        1
New Mexico      1
North Dakota    1
Rhode Island    1
Name: user_id, dtype: int64

In [17]:
all_states = [
         'Alabama','Alaska','Arizona','Arkansas','California','Colorado',
         'Connecticut','Delaware','Florida','Georgia','Hawaii','Idaho', 
         'Illinois','Indiana','Iowa','Kansas','Kentucky','Louisiana',
         'Maine', 'Maryland','Massachusetts','Michigan','Minnesota',
         'Mississippi', 'Missouri','Montana','Nebraska','Nevada',
         'New Hampshire','New Jersey','New Mexico','New York',
         'North Carolina','North Dakota','Ohio',    
         'Oklahoma','Oregon','Pennsylvania','Rhode Island',
         'South Carolina','South Dakota','Tennessee','Texas','Utah',
         'Vermont','Virginia','Washington','West Virginia',
         'Wisconsin','Wyoming'
    ]

In [18]:
#Check string format is same for our dataset and all_states
print(len(np.intersect1d(data['user_state'].unique(), all_states))==data['user_state'].nunique())

True


In [19]:
#Now let's find the 9 missing states
print("The following", len(np.setxor1d(data['user_state'].unique(), all_states))  ,"states have no users: ", np.setxor1d(data['user_state'].unique(), all_states))


The following 9 states have no users:  ['Delaware' 'Hawaii' 'Maine' 'Montana' 'Nevada' 'New Hampshire'
 'South Dakota' 'Vermont' 'Wyoming']


Q: What are the top 3 and the bottom 3 states in terms of user engagement?

There are tons of ways to define engagement. This is a very early stage start-ups with low absolute numbers. In this case, you often want a metric that’s sensitive to outliers. After all, outliers mean power users. And the main goal of a small company is identifying where your early power users are and finding more of those people. So we will take avg # of songs listened per user as our metric.


In [21]:
#create a dataset with avg songs listened per user and count unique users for each state
data_engagement = data.groupby(['user_id', 'user_state']).size().reset_index(name='songs_user')


In [22]:
data_engagement.head()

Unnamed: 0,user_id,user_state,songs_user
0,1,Oregon,10
1,2,North Carolina,18
2,3,Ohio,18
3,4,New Mexico,17
4,5,Alabama,21


In [23]:
data_engagement = data_engagement.groupby(['user_state']).apply(
                    lambda x: pd.Series({
                             # avg songs
                            'engagement': x['songs_user'].mean(),
                             # count unique users 
                            'count_unique_users': x['user_id'].nunique()
  })
).reset_index()

In [24]:
data_engagement.head()

Unnamed: 0,user_state,engagement,count_unique_users
0,Alabama,26.0,4.0
1,Alaska,29.0,2.0
2,Arizona,22.0,1.0
3,Arkansas,17.0,2.0
4,California,20.238095,21.0


In [25]:
data_engagement.count_unique_users.median()

3.0

In [26]:
np.median(data_engagement.count_unique_users)

3.0

In [27]:
#Some states have very few users, so avg won't be particularly meaningful if, for instance, there is just 1 user. Let's only consider states in the top 50% in terms of unique user count
data_engagement = data_engagement.loc[data_engagement['count_unique_users'] > np.median(data_engagement.count_unique_users)]

In [28]:
#Top 3 states
print("Top 3 states for user engagement are: \n", data_engagement.nlargest(3, columns = 'engagement', keep='all'))

Top 3 states for user engagement are: 
         user_state  engagement  count_unique_users
0          Alabama   26.000000                 4.0
7          Florida   25.714286                 7.0
26  North Carolina   25.666667                 6.0


In [29]:
#Bottom 3 
print("Bottom 3 states for user engagement are: \n", data_engagement.nsmallest(3, columns = 'engagement', keep='all'))

Bottom 3 states for user engagement are: 
        user_state  engagement  count_unique_users
19      Minnesota   10.500000                 4.0
11        Indiana   13.750000                 4.0
17  Massachusetts   15.166667                 6.0


Q: The CEO wants to send a gift to the first user who signed-up for each state. That is, the first user who signed-up from California, from Oregon, etc. Create a list of those users.


In [32]:
data_min =data[data[["user_state", "user_id", "user_sign_up_date"]].groupby('user_state')['user_sign_up_date'].transform(min)== data['user_sign_up_date']][["user_state", "user_id", "user_sign_up_date"]]

In [33]:
data_min.head()

Unnamed: 0,user_state,user_id,user_sign_up_date
1,Ohio,3,2015-05-01
4,New Jersey,6,2015-05-01
8,Rhode Island,174,2015-05-19
18,Pennsylvania,11,2015-05-02
24,Missouri,85,2015-05-09


In [34]:
data_min.shape

(1005, 3)

In [36]:
data_min = data_min.drop_duplicates()

In [37]:
data_min.shape

(51, 3)

In [35]:
data_min.query('user_state == "Ohio"')

Unnamed: 0,user_state,user_id,user_sign_up_date
1,Ohio,3,2015-05-01
179,Ohio,3,2015-05-01
271,Ohio,3,2015-05-01
436,Ohio,3,2015-05-01
519,Ohio,3,2015-05-01
1166,Ohio,3,2015-05-01
1245,Ohio,3,2015-05-01
1589,Ohio,3,2015-05-01
1636,Ohio,3,2015-05-01
1747,Ohio,3,2015-05-01


Q: Build a function that takes as an input any of the songs in the data and returns the most likely song to be listened next

That is, if, for instance, a user is currently listening to “Eight Days A Week“, which song has the highest probability of being played right after it by the same user? This is going to be V1 of a song recommendation model.

The question specifically asks for the song that is more likely to be played right after another. Therefore, we need to create a dataset that tells usu, for each song, which song is played next. The time factor is crucial and weu don’t want to lose that information. Once we have this, we can directly estimate probabilities for all possible next songs. In short, we should solve this by building a Markov Chain where each song is a state.

Note that if we were building a similarity matrix based on songs played by the same person, we would be answering a different question related to song similarities/preferences that would not take into account the time factor.

In [38]:
#Firstly let's create a dataset where we add a column next_song which is the song played right after by the same user. 
#If there are more than 30 minutes between consecutive songs, we consider that a new session and therefore it is not counted as next_song. 30 minutes of inactivity is standard for defining a new session. If a song is the last of the session, we remove it from the dataset
  
#only keep variables we care about and sort by user_id and time_played
data_MC = data[['song_played', 'time_played', 'user_id']].sort_values(by=['user_id', 'time_played'])

In [40]:
data_MC.query("user_id==1")

Unnamed: 0,song_played,time_played,user_id
1952,yesterday,2015-06-05 14:30:22,1
619,while my guitar gently weeps,2015-06-07 18:54:56,1
2719,the long and winding road,2015-06-08 22:37:41,1
669,reprise / day in the life,2015-06-10 18:00:05,1
353,i feel fine,2015-06-15 15:46:46,1
993,hello goodbye,2015-06-19 14:54:57,1
627,here comes the sun,2015-06-21 21:53:48,1
1373,can't buy me love,2015-06-22 08:05:01,1
1563,birthday,2015-06-25 12:32:22,1
3172,here comes the sun,2015-06-25 20:28:47,1


In [41]:
#add next song and corresponding timestamp
data_MC['next_song'] = data_MC.groupby('user_id')['song_played'].shift(-1)
data_MC['next_time_played'] = data_MC.groupby('user_id')['time_played'].shift(-1)

In [43]:
data_MC.head()

Unnamed: 0,song_played,time_played,user_id,next_song,next_time_played
1952,yesterday,2015-06-05 14:30:22,1,while my guitar gently weeps,2015-06-07 18:54:56
619,while my guitar gently weeps,2015-06-07 18:54:56,1,the long and winding road,2015-06-08 22:37:41
2719,the long and winding road,2015-06-08 22:37:41,1,reprise / day in the life,2015-06-10 18:00:05
669,reprise / day in the life,2015-06-10 18:00:05,1,i feel fine,2015-06-15 15:46:46
353,i feel fine,2015-06-15 15:46:46,1,hello goodbye,2015-06-19 14:54:57


In [44]:
from datetime import datetime
#remove last song for each session. This happens when next_time_played is NA or when next_time_played - time_played > 30 mins
data_MC = data_MC.dropna(subset = ['next_time_played'])
data_MC = data_MC[(data_MC.next_time_played-data_MC.time_played)/pd.Timedelta(minutes=1) < 30]
#also remove when next song is same as current song. Would not make any sense as suggestion
data_MC = data_MC[data_MC.song_played != data_MC.next_song]

In [45]:
data_MC.head()

Unnamed: 0,song_played,time_played,user_id,next_song,next_time_played
2327,eleanor rigby,2015-06-07 20:03:00,3,lucy in the sky with diamonds,2015-06-07 20:24:55
1589,hey jude,2015-06-21 18:34:06,3,eleanor rigby,2015-06-21 18:52:15
3135,revolution,2015-06-12 13:04:57,4,hey jude,2015-06-12 13:33:38
2459,hey jude,2015-06-06 15:21:12,6,sun king,2015-06-06 15:29:28
781,dear prudence,2015-06-15 13:02:50,9,yesterday,2015-06-15 13:20:42


In [47]:
#let's estimate for each song, the next song with the highest count
data_MC = data_MC.groupby(['song_played', 'next_song']).size().reset_index(name = 'count')
data_MC = data_MC[data_MC.groupby(['song_played'])['count'].transform(max) == data_MC['count']]

In [48]:
data_MC[['song_played', 'next_song']].head()

Unnamed: 0,song_played,next_song
0,a day in the life,i am the walrus
1,a day in the life,i saw her standing there
2,a day in the life,i've just seen a face
3,a day in the life,"ob-la-di, ob-la-da"
4,a day in the life,oh darling


In [49]:
#write the function 
def MC_function(song):
  tmp = data_MC [data_MC.song_played == song]
  #if we have no suggestions, pick a song randomly, using as a probability of picking the relative frequency in the original dataset
  if tmp.shape[0]==0:
    return(print("Suggested song after", song, "is:", data[data.song_played != song].song_played.sample(1).values))
  #if we only have one suggestion, simply suggest that one
  if tmp.shape[0]==1:
    return(print("Suggested song after", song, "is:", tmp.next_song.values))
  #if we have more than one suggestion, all equally likely, randomly pick one
  if tmp.shape[0]>1:
    return(print("Suggested song after", song, "is:", tmp.next_song.sample(1).values))


In [50]:
MC_function("eight days a week")

Suggested song after eight days a week is: ['lady madonna']
