In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

%matplotlib inline

First things first -- Read the data

In [2]:
# Load the play records from song.json (path is relative to the notebook's working directory).
song_df = pd.read_json('song.json')

In [3]:
# Summarise column dtypes and non-null counts to spot missing values early.
song_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 6 columns):
id                   4000 non-null object
song_played          4000 non-null object
time_played          4000 non-null object
user_id              4000 non-null int64
user_sign_up_date    4000 non-null object
user_state           4000 non-null object
dtypes: int64(1), object(5)
memory usage: 187.6+ KB


In [4]:
# Preview the first few rows to see what one play record looks like.
song_df.head()

Unnamed: 0,id,song_played,time_played,user_id,user_sign_up_date,user_state
0,GOQMMKSQQH,Hey Jude,2015-06-11 21:51:35,122,2015-05-16,Louisiana
1,HWKKBQKNWI,We Can Work It Out,2015-06-06 16:49:19,3,2015-05-01,Ohio
2,DKQSXVNJDH,Back In the U.S.S.R.,2015-06-14 02:11:29,35,2015-05-04,New Jersey
3,HLHRIDQTUW,P.s. I Love You,2015-06-08 12:26:10,126,2015-05-16,Illinois
4,SUKJCSBCYW,Sgt. Pepper's Lonely Hearts Club Band,2015-06-28 14:57:00,6,2015-05-01,New Jersey


### What are the top 3 and the bottom 3 states in terms of number of users?

In [5]:
# Count the distinct users in each state with the built-in nunique()
# aggregation (no per-group Python lambda needed), then sort ascending so the
# smallest states are listed first and the largest last.
song_df.groupby('user_state')['user_id'].nunique().sort_values()

user_state
Arizona            1
New Mexico         1
Connecticut        1
Idaho              1
Nebraska           1
Rhode Island       1
Iowa               1
Kansas             1
North Dakota       1
Alaska             2
Arkansas           2
Washington         2
Virginia           2
Utah               2
Oklahoma           2
Mississippi        3
South Carolina     3
Kentucky           3
Colorado           3
Oregon             3
West Virginia      3
Alabama            4
Minnesota          4
Indiana            4
Tennessee          5
Missouri           5
Louisiana          5
Maryland           5
Wisconsin          5
Michigan           5
Massachusetts      6
North Carolina     6
New Jersey         6
Georgia            6
Illinois           7
Florida            7
Ohio               9
Pennsylvania       9
Texas             15
California        21
New York          23
Name: user_id, dtype: int64

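The top 3 states are: New York, California, and Texas  
The bottom 3 states are: Arizona, New Mexico, and Connecticut. Note that this is not a unique answer: nine states are tied at 1 user each (Arizona, New Mexico, Connecticut, Idaho, Nebraska, Rhode Island, Iowa, Kansas, and North Dakota), and these three are simply the first ones listed.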

### What are the top 3 and bottom 3 states in terms of user engagement?

We'll define user engagement as the average number of plays per user

In [6]:
def _mean_plays_per_user(state_rows):
    """Return the average number of plays per user within one state's rows."""
    # One entry per user: how many play records (ids) that user has.
    plays_by_user = state_rows.groupby('user_id')['id'].count()
    return plays_by_user.mean()


# Engagement per state, sorted from least to most engaged.
song_df.groupby('user_state').apply(_mean_plays_per_user).sort_values()

user_state
Kansas             8.000000
Virginia           8.500000
Minnesota         10.500000
West Virginia     12.666667
Indiana           13.750000
Massachusetts     15.166667
Texas             15.333333
Connecticut       16.000000
Michigan          16.000000
New Mexico        17.000000
Arkansas          17.000000
Colorado          18.000000
Wisconsin         19.000000
Utah              19.000000
New Jersey        19.500000
Pennsylvania      19.888889
California        20.238095
New York          20.391304
Tennessee         20.400000
Washington        20.500000
Oregon            20.666667
Louisiana         21.000000
Illinois          21.285714
Arizona           22.000000
Maryland          22.400000
Georgia           22.500000
Iowa              23.000000
Ohio              23.222222
Oklahoma          24.500000
Missouri          25.400000
North Carolina    25.666667
Florida           25.714286
Alabama           26.000000
North Dakota      26.000000
Kentucky          26.000000
Idaho    

Top 3 states: Nebraska, Alaska, Mississippi   
Bottom 3 states: Kansas, Virginia, Minnesota

### The CEO wants to send a gift to the first user who signed-up for each state. That is, the first user who signed-up from California, from Oregon, etc. Can you give him a list of those users?

In [7]:
# Convert the object-dtype sign-up column to datetimes so that "earliest"
# is a proper chronological comparison.
signup_dates = pd.to_datetime(song_df['user_sign_up_date'])
song_df['user_sign_up_date'] = signup_dates

In [8]:
def _first_signup_user(state_rows):
    """Return the user_id with the earliest sign-up date among one state's rows.

    If several users share the earliest date, the one whose row appears first
    in the data is returned.
    """
    earliest = state_rows['user_sign_up_date'].min()
    is_earliest = state_rows['user_sign_up_date'] == earliest
    return state_rows.loc[is_earliest, 'user_id'].iloc[0]


# One user_id per state: the first person who signed up there.
song_df.groupby('user_state').apply(_first_signup_user)

user_state
Alabama             5
Alaska            106
Arizona           105
Arkansas           78
California         39
Colorado          173
Connecticut       127
Florida            41
Georgia            20
Idaho             165
Illinois           45
Indiana           102
Iowa              178
Kansas            177
Kentucky           34
Louisiana          50
Maryland           18
Massachusetts      15
Michigan           13
Minnesota           8
Mississippi        23
Missouri           85
Nebraska          134
New Jersey          6
New Mexico          4
New York           19
North Carolina      2
North Dakota      135
Ohio                3
Oklahoma          119
Oregon              1
Pennsylvania       11
Rhode Island      174
South Carolina     64
Tennessee          70
Texas               7
Utah               29
Virginia          142
Washington        125
West Virginia      60
Wisconsin          32
dtype: int64

### Build a function that takes as an input any of the songs in the data and returns the most likely song to be listened next. That is, if, for instance, a user is currently listening to "Eight Days A Week", which song has the highest probability of being played right after it by the same user? This is going to be v1 of a song recommendation model. ###

For each song, we'll recommend a second song by examining its similarity to the other songs played by users. The assumption is that two songs played by largely the same set of users are the most likely to be played one after the other.

In [9]:
# Number of times each user played each song, indexed by (song, user).
play_counts = song_df.groupby(['song_played', 'user_id'])['id'].count()

# Pivot users into columns: one row per song, one column per user.
# A missing (song, user) pair means the user never played it, i.e. 0 plays.
utility_matrix = play_counts.unstack(level='user_id').fillna(0)

In [10]:
# Pairwise cosine similarity between the rows of utility_matrix, i.e. between
# the per-user play-count vectors of every pair of songs.
item_item_similarity = cosine_similarity(utility_matrix)

In [11]:
# Zero out each song's similarity with itself (modifies the array in place) so
# that the per-row maximum taken later never selects the song itself.
np.fill_diagonal(item_item_similarity, 0)

In [12]:
# Label both axes of the similarity matrix with the song titles so that rows
# and columns can be looked up by name.
song_titles = utility_matrix.index
item_similarity_df = pd.DataFrame(
    item_item_similarity,
    index=song_titles,
    columns=song_titles,
)

In [13]:
# Preview the first ten rows of the song-to-song similarity table.
item_similarity_df.head(10)

song_played,A Day In The Life,A Hard Day's Night,A Saturday Club Xmas/Crimble Medley,ANYTIME AT ALL,Across The Universe,All My Loving,All You Need Is Love,And Your Bird Can Sing,BAD BOY,BALLAD OF JOHN AND YOKO,...,We Can Work It Out,When I'm 64,While My Guitar Gently Weeps,Wild Honey Pie,With a Little Help From My Friends,YOUR MOTHER SHOULD KNOW,Yellow Submarine,Yesterday,You Never Give Me Your Money,You're Going To Lose That Girl
song_played,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Day In The Life,0.0,0.235702,0.074536,0.119523,0.212132,0.355023,0.329404,0.152145,0.210819,0.172133,...,0.464938,0.030429,0.508964,0.223607,0.359092,0.037268,0.318198,0.35322,0.087841,0.0
A Hard Day's Night,0.235702,0.0,0.0,0.0,0.1,0.136931,0.111803,0.0,0.0,0.091287,...,0.259548,0.129099,0.210099,0.0,0.0,0.0,0.05,0.195468,0.074536,0.0
A Saturday Club Xmas/Crimble Medley,0.074536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.109435,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0
ANYTIME AT ALL,0.119523,0.0,0.0,0.0,0.0,0.154303,0.094491,0.109109,0.0,0.0,...,0.116991,0.0,0.138107,0.089087,0.183942,0.0,0.0,0.146845,0.0,0.0
Across The Universe,0.212132,0.1,0.0,0.0,0.0,0.091287,0.0,0.0,0.0,0.0,...,0.138426,0.0,0.116722,0.0,0.0,0.0,0.0,0.043437,0.0,0.0
All My Loving,0.355023,0.136931,0.0,0.154303,0.091287,0.0,0.204124,0.353553,0.136083,0.083333,...,0.205343,0.0,0.309001,0.048113,0.19868,0.144338,0.273861,0.178437,0.068041,0.0
All You Need Is Love,0.329404,0.111803,0.0,0.094491,0.0,0.204124,0.0,0.0,0.166667,0.204124,...,0.17411,0.0,0.313197,0.176777,0.162221,0.088388,0.111803,0.291386,0.166667,0.0
And Your Bird Can Sing,0.152145,0.0,0.0,0.109109,0.0,0.353553,0.0,0.0,0.19245,0.0,...,0.178707,0.0,0.195893,0.136083,0.140488,0.102062,0.258199,0.112154,0.0,0.0
BAD BOY,0.210819,0.0,0.0,0.0,0.0,0.136083,0.166667,0.19245,0.0,0.136083,...,0.103176,0.19245,0.191398,0.157135,0.108148,0.0,0.0,0.064752,0.111111,0.0
BALLAD OF JOHN AND YOKO,0.172133,0.091287,0.0,0.0,0.0,0.083333,0.204124,0.0,0.136083,0.0,...,0.25273,0.235702,0.234414,0.19245,0.066227,0.144338,0.0,0.079305,0.0,0.0


In [14]:
def recommend_next_song(song, similarity=None):
    """Return the recommended next song for ``song``.

    The recommendation is the song with the highest cosine similarity to
    ``song`` in the song-to-song similarity table. The table is built from
    per-user play counts only, so this is a co-listening proxy for "played
    next"; it does not use the order in which songs were played.

    Parameters
    ----------
    song : str
        Title of the song currently being played; must be a row label of the
        similarity table.
    similarity : pandas.DataFrame, optional
        Square song-by-song similarity table with a zeroed diagonal.
        Defaults to the notebook-level ``item_similarity_df``.

    Returns
    -------
    str
        Title of the most similar song.

    Raises
    ------
    KeyError
        If ``song`` is not present in the similarity table.
    """
    table = item_similarity_df if similarity is None else similarity
    if song not in table.index:
        raise KeyError('Unknown song: {!r}'.format(song))
    return table.loc[song].idxmax()


# Recommendation for every song at once: the column label of each row's
# maximum similarity (vectorised equivalent of calling idxmax row by row).
item_similarity_df.idxmax(axis=1)

song_played
A Day In The Life                                                                  Come Together
A Hard Day's Night                                                                 Come Together
A Saturday Club Xmas/Crimble Medley                                                         GIRL
ANYTIME AT ALL                                                                 Can't Buy Me Love
Across The Universe                                                                   Revolution
All My Loving                                                                          Let It Be
All You Need Is Love                                                           A Day In The Life
And Your Bird Can Sing                                                             All My Loving
BAD BOY                                                                                 Hey Jude
BALLAD OF JOHN AND YOKO                                                          Golden Slumbers
Baby You're A Rich

### How would you set up a test to check whether your model works well and is improving engagement?

I would set up an A/B Test. 

I'd start by dividing the user base into two groups:
* A control group who will not receive any song recommendations, and
* A treatment group who will receive recommendations from the model

I'd then measure the user engagement rate within both groups (that is, the number of songs played by each user) after a period of time (say a week or a month) and then perform a one-tailed t-test on the measurements in both groups.
* My null hypothesis will be that there's no difference in engagement between the control and treatment groups
* My alternative hypothesis will be that the treatment group has greater engagement than the control group