# Recommendation system for the Amazon Music dataset


author: Elvira Dzhuraeva

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from lightfm import LightFM
from sklearn.metrics.pairwise import cosine_similarity




In [2]:
# Input files, resolved relative to the notebook's working directory.
REVIEWS_PATH = 'Digital_Music_5.json'
METADATA_PATH = 'amazon_music_metadata.csv'

# The reviews file is read in JSON Lines mode (one record per line).
reviews = pd.read_json(REVIEWS_PATH, lines=True)
# Item metadata; its `asin` and `title` columns are used further down.
music = pd.read_csv(METADATA_PATH)

In [3]:
# User x item rating matrix: one row per reviewerID, one column per asin.
# Each cell holds the summed `overall` rating of that pair; pairs without a
# review are filled with 0.
interactions = (
    reviews
    .groupby(['reviewerID', 'asin'])['overall']
    .sum()
    .unstack()
    .fillna(0)
)
interactions.shape

(5541, 3568)

In [4]:
# Preview the first five rows of the user x item matrix.
interactions.head(n=5)

asin,5555991584,B0000000ZW,B00000016T,B00000016W,B00000017R,B0000001P4,B0000002HZ,B0000002J9,B0000002JR,B0000002ME,...,B00II5VHBU,B00IOVH8AW,B00IXZ9QP4,B00J80ED9M,B00JJCQRDE,B00JJOG5D4,B00JRBLSR2,B00JTHVWO8,B00JYKU6BK,B00KILDVEI
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A08161909WK3HU7UYTMW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1020L7BWW9RAX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10323WWTFPSGP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A103KNDW8GN92L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A103W7ZPKGOCC9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
def create_user_dict(interactions):
    """Map every index label of ``interactions`` to its positional row number.

    Parameters
    ----------
    interactions : pandas.DataFrame
        Frame whose index labels become the dictionary keys.

    Returns
    -------
    dict
        ``{label: position}`` with positions counted from 0 in index order.
        If a label occurs more than once, its last position is kept.
    """
    return {label: position for position, label in enumerate(interactions.index)}

In [6]:
# Row position of every index label of the interaction matrix.
user_dict = create_user_dict(interactions)

In [7]:
def create_item_dict(df,id_col,name_col):
    """Build a lookup table from the values of one column to those of another.

    The two columns are paired row by row (positionally), so the result does
    not depend on the frame's index labels. The frame may have been filtered,
    sorted or re-indexed beforehand.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    id_col : str
        Name of the column whose values become the dictionary keys.
    name_col : str
        Name of the column whose values become the dictionary values.

    Returns
    -------
    dict
        ``{df[id_col][row]: df[name_col][row]}``; when a key occurs in several
        rows the value of the last such row wins.

    Raises
    ------
    KeyError
        If ``id_col`` or ``name_col`` is not a column of ``df``.
    """
    return dict(zip(df[id_col], df[name_col]))

In [8]:
# Lookup table asin -> title built from the metadata frame.
music_dict = create_item_dict(music, 'asin', 'title')

In [9]:
# Sparse (CSR) copy of the dense rating matrix; this is what gets fitted.
x = sparse.csr_matrix(interactions.values)
# A fixed seed makes the model's random initialisation repeatable between runs.
# NOTE(review): with num_threads > 1 the fit may still not be bit-for-bit
# reproducible — confirm against the LightFM documentation if that matters.
# NOTE(review): `k` appears to belong to the k-OS variant of the loss and is
# presumably ignored for loss='warp' — confirm.
model = LightFM(no_components=30, loss='warp', k=15, random_state=42)
model.fit(x, epochs=30, num_threads=4)

<lightfm.lightfm.LightFM at 0x1a207afb70>

In [10]:
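# NOTE: disabled alternative, kept for reference only. `runMF` is not defined
# in the visible part of this notebook; the model is fitted directly with
# LightFM in the previous cell using the same hyper-parameters.
# mf_model = runMF(interactions = interactions,
#                  n_components = 30,
#                  loss = 'warp',
#                  k =15,
#                  epoch = 30,
#                  n_jobs = 4)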

In [11]:
def sample_recommendation_user(model, interactions, user_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 10, show = True):
    """Recommend the best-scoring items a user has not already rated highly.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(user_index, item_indices)`` that
        returns one score per requested item.
    interactions : pandas.DataFrame
        User x item matrix (index: user ids, columns: item ids).
    user_id : hashable
        Label of the user in ``interactions.index`` and key of ``user_dict``.
    user_dict : dict
        Maps user ids to the integer index passed to ``model.predict``.
    item_dict : dict
        Maps item ids to display names. Every known and recommended item must
        be present, otherwise ``KeyError`` is raised.
    threshold : number, default 0
        Items whose interaction value is strictly greater than this are
        treated as already known and excluded from the recommendations.
    nrec_items : int, default 10
        Maximum number of recommendations returned.
    show : bool, default True
        When truthy, print the known and the recommended item names.

    Returns
    -------
    list
        Ids of the recommended items, best score first.
    """
    n_items = interactions.shape[1]
    user_x = user_dict[user_id]

    # Score every item for this user and rank the item ids by score.
    scores = pd.Series(model.predict(user_x, np.arange(n_items)))
    scores.index = interactions.columns
    ranked_items = list(scores.sort_values(ascending=False).index)

    # Items the user rated above the threshold, ordered by item id (descending).
    user_row = interactions.loc[user_id, :]
    known_items = list(pd.Series(user_row[user_row > threshold].index)
                       .sort_values(ascending=False))

    # Set membership keeps the filtering linear in the number of items.
    known_lookup = set(known_items)
    candidates = [item for item in ranked_items if item not in known_lookup]
    return_score_list = candidates[0:nrec_items]

    known_names = [item_dict[item] for item in known_items]
    recommended_names = [item_dict[item] for item in return_score_list]

    if show:
        # format() instead of string concatenation: names that are not strings
        # (e.g. a missing title read as NaN) no longer raise TypeError.
        print("Known Likes:")
        for counter, name in enumerate(known_names, start=1):
            print('{0}- {1}'.format(counter, name))

        print("\n Recommended Items:")
        for counter, name in enumerate(recommended_names, start=1):
            print('{0}- {1}'.format(counter, name))
    return return_score_list

In [12]:
# Up to 10 recommendations for one reviewer; ratings above 4 count as known likes.
rec_list = sample_recommendation_user(model, interactions, 'A103KNDW8GN92L',
                                      user_dict, music_dict,
                                      threshold=4, nrec_items=10)

Known Likes:
1- Pretzel Logic
2- Royal Scam
3- Cant Buy a Thrill
4- Havana Daydreamin
5- Son of a Son of a Sailor
6- Tap Root Manuscript
7- Changes in Latitudes Changes in Attitudes
8- Living &amp; Dying in 3/4 Time
9- Stones
10- The Best Of 3 Dog Night
11- Second Helping
12- A1a
13- But Seriously
14- Abandoned Luncheonette
15- Face Value
16- Days of Future Passed
17- Beautiful Noise
18- Silk Degrees
19- Blood on the Tracks
20- Madman Across the Water
21- Captain Fantastic and the Brown Dirt Cowboy
22- Dont Shoot Me Im Only the Piano Player
23- Unorthodox Behavior

 Recommended Items:
1- Elton John
2- Honky Ch&acirc;teau
3- Rock of the Westies
4- Caribou
5- Aja
6- 52nd Street
7- 21 At 33
8- Empty Sky
9- Long Distance Voyager
10- Seventh Sojourn


In [13]:
def sample_recommendation_item(model,interactions,item_id,user_dict,item_dict,number_of_user):
    """Return the users with the highest predicted score for one item.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(user_indices, item_indices)`` that
        returns one score per (user, item) pair.
    interactions : pandas.DataFrame
        User x item matrix (index: user ids, columns: item ids).
    item_id : hashable
        Column label of the item of interest.
    user_dict, item_dict : dict
        Unused; kept so existing calls keep working.
    number_of_user : int
        Number of users to return.

    Returns
    -------
    list
        User ids (labels of ``interactions.index``), best score first.

    Raises
    ------
    KeyError
        If ``item_id`` is not a column of ``interactions``.
    ValueError
        If ``item_id`` matches more than one column.
    """
    n_users = interactions.shape[0]

    # Exact label lookup. A binary search over the column labels would quietly
    # return an insertion point for an unknown id (or any id when the columns
    # are not sorted) and therefore score the wrong item.
    item_idx = interactions.columns.get_loc(item_id)
    if not isinstance(item_idx, (int, np.integer)):
        raise ValueError("item_id {0!r} matches more than one column".format(item_id))

    scores = pd.Series(model.predict(np.arange(n_users), np.repeat(item_idx, n_users)))
    # The series has a default 0..n-1 index, so the labels of the best scores
    # are the row positions of the corresponding users.
    top_positions = scores.sort_values(ascending=False).head(number_of_user).index
    return list(interactions.index[top_positions])

In [14]:
# The 15 reviewers with the highest predicted score for item 'B00000016W'.
recommend_users_ids = sample_recommendation_item(
    model, interactions, 'B00000016W', user_dict, music_dict,
    number_of_user=15)


In [15]:
# All reviews written by the reviewers selected above.
reviews.loc[reviews['reviewerID'].isin(recommend_users_ids)]

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
60,B00000016T,"[0, 0]",5,These remarks are aboutASIN B0000262U4'The Car...,"03 4, 2014",A31I5XTQ5BAR9F,Paul H Turley,"Great New Wave album, sounding good",1393891200
162,B00000016W,"[3, 4]",5,"...If fans stopped trying to play ""Can You Top...","04 12, 2006",AGKPTMTR3UX1R,"Konrei ""Everything I need is right here""","""Wouldn't It Be Nice...?""",1144800000
178,B00000016W,"[1, 2]",5,As a landlocked kid growing up without ready a...,"10 10, 2008",A2SYP47BL4TP9Q,Mike B.,Here Comes The Sun King,1223596800
838,B00000064E,"[4, 4]",5,Each album made by this cult hero from the 70'...,"10 29, 2001",A3075RVSKC27HU,"P. Nicholas Keppler ""rorscach12""",Might be his best,1004313600
876,B00000064F,"[0, 1]",4,"After the rainy mournfulness of his debut, Fiv...","10 31, 2001",A3075RVSKC27HU,"P. Nicholas Keppler ""rorscach12""","A lighter, jazzier treatment of Nick's songs",1004486400
878,B00000064F,"[2, 27]",3,This is so totaly relaxing that it takes actua...,"06 25, 2003",A23YXA0AX47YPD,R. Bruynesteyn,UNOBNOXIOUS...,1056499200
941,B00000064G,"[11, 13]",5,"In 1972, Nick Drake, just twenty-four years ol...","10 29, 2001",A3075RVSKC27HU,"P. Nicholas Keppler ""rorscach12""","Emotional, bare boned music",1004313600
1137,B000000EDW,"[1, 4]",1,This is my 2nd time out for Baez's second albu...,"11 16, 2012",A291X75UCH6KN8,Robert G. Daugherty,I'd forgotten how bad this was,1353024000
1303,B000000IS5,"[9, 15]",5,As all concur this is a great jazz CD..know th...,"10 30, 2005",AARJLW29JSF31,"JG ""jg""",2 Versions Out There With Different Bonus Tracks,1130630400
1326,B000000ISE,"[1, 2]",4,This cd is a combination of the U.K. release (...,"08 13, 2001",A280XKGM4KT0KN,doug,Trippy n' groovy,997660800


In [16]:
def create_item_emdedding_distance_matrix(model,interactions):
    """Compute the pairwise cosine similarity of the model's item embeddings.

    Despite the name, the values are similarities (larger means more alike),
    not distances.

    Parameters
    ----------
    model : object
        Fitted model exposing an ``item_embeddings`` array with one row per item.
    interactions : pandas.DataFrame
        Frame whose column labels are used for both axes of the result.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by ``interactions.columns``.
    """
    item_labels = interactions.columns
    embeddings = sparse.csr_matrix(model.item_embeddings)
    return pd.DataFrame(cosine_similarity(embeddings),
                        index=item_labels, columns=item_labels)

In [17]:
# Item x item cosine-similarity matrix derived from the fitted embeddings.
item_item_dist = create_item_emdedding_distance_matrix(model, interactions)

In [18]:
def item_item_recommendation(item_emdedding_distance_matrix, item_id, 
                             item_dict, n_items = 10, show = True):
    """Return the items most similar to ``item_id``.

    Parameters
    ----------
    item_emdedding_distance_matrix : pandas.DataFrame
        Square item x item similarity matrix (larger means more similar).
    item_id : hashable
        Row label of the item of interest.
    item_dict : dict
        Maps item ids to display names; only read when ``show`` is truthy.
    n_items : int, default 10
        Maximum number of similar items returned.
    show : bool, default True
        When truthy, print the item of interest and its neighbours.

    Returns
    -------
    list
        Ids of the most similar items, most similar first. ``item_id`` itself
        is never part of the result.
    """
    similarities = item_emdedding_distance_matrix.loc[item_id, :]
    # Remove the query item by label rather than assuming it sorts first:
    # with ties or rounding another item can outrank it, and the item would
    # then recommend itself while a real neighbour is dropped.
    neighbours = similarities.drop(labels=item_id, errors='ignore')
    recommended_items = list(neighbours.sort_values(ascending=False).head(n_items).index)
    if show:
        print("Item of interest: {0}".format(item_dict[item_id]))
        print("Item similar to the above item:")
        # format() copes with names that are not strings (e.g. NaN titles).
        for counter, item in enumerate(recommended_items, start=1):
            print('{0}- {1}'.format(counter, item_dict[item]))
    return recommended_items

In [19]:
# Ten nearest neighbours of item 'B0000002ME' in embedding space.
rec_list = item_item_recommendation(item_item_dist, 'B0000002ME', music_dict,
                                    n_items=10)

Item of interest: Now That Ive Found You
Item similar to the above item:
1- 9 to 5 &amp; Odd Jobs
2- Forget About It
3- No Place That Far
4- Pieces of the Sky
5- So Long So Wrong
6- Everybodys Got a Story
7- Yes I Am
8- Ive Got That Old Feeling
9- Five Days in July
10- Two Highways
