# Baseline and interactive recommender skeleton.

This notebook shows the skeleton and ideas behind the functionality implemented in the Flask web application.

For final code and documentation see the `flaskapp/` folder.

In [1]:
# Loading dependencies
import operator
from operator import itemgetter
import codecs
import logging
import time
import tqdm
import h5py
from scipy.sparse import coo_matrix, csr_matrix
import scipy.sparse as sp
from random import randint
import os
import json
import pickle

import itertools
import numpy as np
import pandas as pd

from implicit.als import AlternatingLeastSquares
from implicit.approximate_als import (AnnoyAlternatingLeastSquares, FaissAlternatingLeastSquares,
                                      NMSLibAlternatingLeastSquares)
from implicit.bpr import BayesianPersonalizedRanking
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight)
from implicit.evaluation import precision_at_k, train_test_split
os.environ['OPENBLAS_NUM_THREADS'] = "1"
%alias_magic t timeit

ModuleNotFoundError: No module named 'implicit'

In [2]:
# borrowed from https://github.com/benfred/implicit, modified to read the file locally instead of downloading it.
def get_lastfm(filename='../flaskapp/data/lastfm_360k.hdf5'):
    """Load the Last.fm dataset from a local HDF5 file.

    Parameters
    ----------
    filename : str, optional
        Path to the HDF5 file. Defaults to
        ``'../flaskapp/data/lastfm_360k.hdf5'``.

    Returns
    -------
    tuple
        ``(artists, users, plays)`` where ``artists`` and ``users`` are numpy
        arrays read from the ``artist`` and ``user`` datasets, and ``plays``
        is a CSR matrix built from the ``data``/``indices``/``indptr``
        datasets of the ``artist_user_plays`` group.

    Raises
    ------
    KeyError
        If one of the expected groups or datasets is missing from the file.
    """
    with h5py.File(filename, 'r') as f:
        # Index by key instead of .get(): a missing entry then fails here with
        # a KeyError rather than passing None further down.
        group = f['artist_user_plays']
        # Everything is materialised inside the `with` block, while the file
        # is still open.
        plays = csr_matrix((group['data'], group['indices'], group['indptr']))
        return np.array(f['artist']), np.array(f['user']), plays

In [3]:
# loading data
artists, users, plays = get_lastfm()

In [4]:
# Load the pickled tag set. The context manager closes the file handle even
# if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code; only load files from
# a trusted source.
with open("../flaskapp/data/tag_set.pickle", "rb") as pickle_off:
    tag_set = pickle.load(pickle_off)

FileNotFoundError: [Errno 2] No such file or directory: '../flaskapp/data/tag_set.pickle'

In [3]:
tag_set

{3: ['soul',
  'jazz',
  'female vocalists',
  'british',
  'singer-songwriter',
  'pop',
  'funk',
  'amy winehouse',
  'blues',
  'neo-soul'],
 5: ['electronic',
  'big beat',
  'dance',
  'house',
  'electronica',
  'british',
  'alternative',
  'techno',
  'breakbeat'],
 6: ['hip-hop',
  'rap',
  'hip hop',
  'rnb',
  'kanye west',
  'gay fish',
  'american',
  'pop',
  'chicago'],
 7: ['hip-hop',
  'spanish',
  'female vocalists',
  'hip hop',
  'rap',
  'spanish hip-hop',
  'spain',
  'latin',
  'español'],
 9: ['new age',
  'ambient',
  'ethnic',
  'world',
  'chillout',
  'meditative',
  'oriental',
  'easy listening',
  'folk',
  'instrumental'],
 15: ['blues',
  'polish',
  'blues rock',
  'rock',
  'classic rock',
  'polish blues',
  '70s',
  'guitar',
  'polish rock',
  'progressive rock'],
 17: ['stolen mp3'],
 18: ['rock',
  'polish',
  'ska',
  'alternative',
  '80s',
  'polish rock',
  'alternative rock',
  'classic rock',
  'polish ska'],
 21: ['electronic',
  'indie',

# data loaded, now train model

In [70]:
# Request a single OpenBLAS thread for the duration of the fit.
# NOTE(review): OpenBLAS typically reads this variable when the library is
# first loaded, so changing it after numpy has been imported may have no
# effect — confirm.
os.environ['OPENBLAS_NUM_THREADS'] = "1"
# ALS model with 128 latent factors, regularization 20 and 15 iterations.
model = AlternatingLeastSquares(factors=128, regularization=20, iterations=15)      
# train on the full interaction matrix (indexed [artist, user] elsewhere in this notebook)
model.fit(plays)

# Enabling multi threads again (hard-coded to 4 threads)
os.environ['OPENBLAS_NUM_THREADS'] = "4"

100%|██████████| 15.0/15 [02:14<00:00, 10.59s/it]


# baseline recommender

In [71]:
# Print the top-20 recommendations for a single user.
# Pass `plays` or `new_plays`, depending on which interaction matrix to recommend from.
def generateRecommendations(plays, recalc, u_index):
    """Print 20 recommended artists (id, name, score) for one user.

    Relies on the module-level ``model`` (the fitted recommender) and
    ``artists`` (artist names indexed by artist id).

    Parameters
    ----------
    plays : sparse matrix
        Interaction matrix; it is transposed and converted to CSR before
        being handed to ``model.recommend``.
    recalc : bool
        Forwarded as ``recalculate_user``. Use ``True`` when ``plays`` is a
        modified matrix (e.g. after boosting) rather than the one the model
        was trained on.
    u_index : int
        Index of the user to recommend for.

    Returns
    -------
    None
        Results are printed; the elapsed time is logged at DEBUG level.
    """
    started_at = time.time()
    # model.recommend receives the transposed matrix in CSR form.
    items_per_user = plays.T.tocsr()

    recommendations = model.recommend(u_index, items_per_user, recalculate_user=recalc, N=20)
    for artist_id, score in recommendations:
        label = 'artist id: ' + str(artist_id) + ' ' + artists[artist_id]
        print(label + ' score: ' + str(score))
    logging.debug("generated recommendations in %0.2fs",  time.time() - started_at)

In [13]:
# user profile: 
[artists[i] for i in plays.getcol(38345).tocoo().row]

['al di meola',
 'antonio vivaldi',
 'bad religion',
 'beastie boys',
 'beatsteaks',
 'bill withers',
 'bright eyes',
 'cartoons',
 'children of bodom',
 'chimaira',
 'devendra banhart',
 'devildriver',
 'die apokalyptischen reiter',
 'die Ärzte',
 'excrementory grindfuckers',
 'fear factory',
 'finntroll',
 'gorillaz',
 'in flames',
 'infected mushroom',
 'joe pass',
 'juno reactor',
 'k.i.z.',
 'lamb of god',
 'linkin park',
 'mando diao',
 'metallica',
 'mojo club',
 'paco de lucía',
 'pig destroyer',
 'rammstein',
 'red hot chili peppers',
 'rené aubry',
 'richard cheese',
 'robbie williams',
 'santana',
 'seeed',
 'slipknot',
 'soilwork',
 'soulfly',
 'system of a down',
 'the beatles',
 'the doors',
 'the hives',
 'the offspring',
 'the police',
 'the toasters',
 'tool',
 'various house artists']

In [72]:
generateRecommendations(plays, False, 38345)

artist id: 51847 bob marley & the wailers score: 1.4894065
artist id: 91879 dire straits score: 1.4066366
artist id: 83885 dave matthews band score: 1.3683503
artist id: 25250 andy mckee score: 1.3619839
artist id: 149145 jethro tull score: 1.330158
artist id: 269844 tracy chapman score: 1.3260002
artist id: 181203 mark knopfler score: 1.3072503
artist id: 176900 machine head score: 1.2301874
artist id: 144255 jack johnson score: 1.2227654
artist id: 243915 steve vai score: 1.2215725
artist id: 145779 jamiroquai score: 1.2187392
artist id: 51829 bob marley score: 1.197113
artist id: 108936 eric clapton score: 1.1921062
artist id: 183148 matisyahu score: 1.1900365
artist id: 64191 cat stevens score: 1.1722248
artist id: 232379 serj tankian score: 1.1499741
artist id: 60753 cake score: 1.1388681
artist id: 152322 johnny cash score: 1.1374081
artist id: 245357 sublime score: 1.1351407
artist id: 149623 jimi hendrix score: 1.1334281


# The cell below is just an idea; it was not tested in this study

In [54]:
"""Function takes input_artist and boosts them to some value k plays 
    and removes the rest of the entries in the profile by setting plays value to zero
"""
def boost_input_artists(boost_list, u_i, k):
    """Return a copy of ``plays`` with user ``u_i``'s profile reduced to the boosted artists.

    Every entry stored in the user's column is rewritten: artists contained
    in ``boost_list`` get the value ``k``, all other stored entries are set
    to zero. Artists in ``boost_list`` that are not already part of the
    profile are not added.

    Parameters
    ----------
    boost_list : container of int
        Artist row indices to boost.
    u_i : int
        Column index of the user in ``plays``.
    k : number
        Play count assigned to the boosted artists.

    Returns
    -------
    sparse matrix
        The modified copy of the module-level ``plays`` matrix.
    """
    # Work on a copy so the module-level `plays` matrix stays untouched.
    boosted_plays = plays.copy()

    # Row indices of the entries stored in this user's column.
    stored_artists = boosted_plays.getcol(u_i).tocoo().row

    for artist in stored_artists:
        boosted_plays[artist, u_i] = k if artist in boost_list else 0
    return boosted_plays

# interactive recommender prototype

In [16]:
"""Function takes input_artist and boosts them to some value k plays 
    and removes the rest of the entries in the profile by setting plays value to zero
"""
def adjust_click(a, u_i, value):
    """Return user ``u_i``'s column of ``plays`` adjusted for the clicked artist(s).

    Every entry stored in the user's profile is rewritten: entries whose
    artist index is in ``a`` are set to ``value``, all other stored entries
    are set to zero. Artists that are not already in the profile are not
    added. The module-level ``plays`` matrix is not modified.

    Parameters
    ----------
    a : int or iterable of int
        Artist row index (or several indices) to boost.
    u_i : int
        Column index of the user in ``plays``.
    value : number
        Value written for the boosted artist(s).

    Returns
    -------
    scipy.sparse.csr_matrix
        The adjusted user column as a single-column CSR matrix.
    """
    # Membership is tested against the `a` argument. Accept a single index or
    # any iterable of indices and normalise to a set of plain ints.
    boosted = set(np.atleast_1d(a).tolist())

    # Only the user's column is returned, so copy just that column instead of
    # duplicating the whole interaction matrix.
    user_col = sp.csr_matrix(plays.getcol(u_i)).copy()

    for artist in user_col.tocoo().row:
        user_col[artist, 0] = value if artist in boosted else 0
    return user_col

In [35]:
""" Decreasing the interaction/plays value
"""
def plays_down(plays, artist, u_i, decrease_val=5000):
    """Decrease one interaction value and report how long the update took.

    Parameters
    ----------
    plays : sparse matrix
        Interaction matrix indexed as ``plays[artist, u_i]``. It is modified
        IN PLACE, so calling this repeatedly on the same matrix keeps
        lowering the value.
    artist : int
        Row index of the artist.
    u_i : int
        Column index of the user.
    decrease_val : number, optional
        Amount subtracted from the entry (default 5000).

    Returns
    -------
    sparse matrix
        The same ``plays`` object that was passed in.
    """
    start = time.time()
    plays[artist, u_i] -= decrease_val
    print("elapsed: " +str(time.time()-start))
    return plays

In [78]:
#TODO make plays_up()

In [18]:
def adjust_plays_matrix(plays, u_i, pl_data):
    """Overwrite the stored entries of user ``u_i``'s column with ``pl_data``.

    The n-th entry stored in the column receives ``pl_data[n]``. The matrix
    is modified in place and also returned.

    Parameters
    ----------
    plays : sparse matrix
        Interaction matrix indexed as ``plays[artist, u_i]``.
    u_i : int
        Column index of the user.
    pl_data : sequence of number
        New values, one per stored entry of the column, in stored order.

    Returns
    -------
    sparse matrix
        The same ``plays`` object that was passed in.

    Raises
    ------
    IndexError
        If ``pl_data`` has fewer elements than the column has stored entries.
    """
    # Row indices of the entries stored in this user's column.
    artist_rows = plays.getcol(u_i).tocoo().row
    for position, artist in enumerate(artist_rows):
        plays[artist, u_i] = pl_data[position]
    return plays

In [303]:
#b_index = int(np.where(plays_row == 15063)[0])

In [165]:
#str_data = list(map(str, new_data))

In [45]:
# before adjust
print(plays.getcol(38345).tocoo())

  (15063, 0)	-19892.0
  (28676, 0)	419.0
  (38343, 0)	394.0
  (42098, 0)	330.0
  (42394, 0)	173.0
  (47300, 0)	101.0
  (56308, 0)	198.0
  (63632, 0)	139.0
  (68167, 0)	219.0
  (68364, 0)	150.0
  (89636, 0)	96.0
  (89711, 0)	1156.0
  (90588, 0)	1187.0
  (90948, 0)	184.0
  (111541, 0)	485.0
  (113849, 0)	228.0
  (115413, 0)	903.0
  (127243, 0)	339.0
  (140720, 0)	3473.0
  (141240, 0)	268.0
  (150688, 0)	139.0
  (155642, 0)	124.0
  (156316, 0)	106.0
  (166345, 0)	376.0
  (171186, 0)	109.0
  (178835, 0)	978.0
  (186258, 0)	2200.0
  (191031, 0)	121.0
  (206579, 0)	928.0
  (212279, 0)	291.0
  (218594, 0)	249.0
  (220146, 0)	797.0
  (221113, 0)	825.0
  (222082, 0)	222.0
  (223465, 0)	105.0
  (228979, 0)	544.0
  (231520, 0)	420.0
  (237513, 0)	329.0
  (238685, 0)	803.0
  (240037, 0)	123.0
  (247960, 0)	2844.0
  (252512, 0)	347.0
  (255149, 0)	691.0
  (257333, 0)	190.0
  (260396, 0)	141.0
  (261072, 0)	201.0
  (263863, 0)	216.0
  (269151, 0)	96.0
  (275208, 0)	100.0


In [52]:
new_plays = plays_down(plays, 15063, 38345)

elapsed: 0.0005071163177490234


In [44]:
# after adjust
print(new_plays.getcol(38345).tocoo())

  (15063, 0)	-19892.0
  (28676, 0)	419.0
  (38343, 0)	394.0
  (42098, 0)	330.0
  (42394, 0)	173.0
  (47300, 0)	101.0
  (56308, 0)	198.0
  (63632, 0)	139.0
  (68167, 0)	219.0
  (68364, 0)	150.0
  (89636, 0)	96.0
  (89711, 0)	1156.0
  (90588, 0)	1187.0
  (90948, 0)	184.0
  (111541, 0)	485.0
  (113849, 0)	228.0
  (115413, 0)	903.0
  (127243, 0)	339.0
  (140720, 0)	3473.0
  (141240, 0)	268.0
  (150688, 0)	139.0
  (155642, 0)	124.0
  (156316, 0)	106.0
  (166345, 0)	376.0
  (171186, 0)	109.0
  (178835, 0)	978.0
  (186258, 0)	2200.0
  (191031, 0)	121.0
  (206579, 0)	928.0
  (212279, 0)	291.0
  (218594, 0)	249.0
  (220146, 0)	797.0
  (221113, 0)	825.0
  (222082, 0)	222.0
  (223465, 0)	105.0
  (228979, 0)	544.0
  (231520, 0)	420.0
  (237513, 0)	329.0
  (238685, 0)	803.0
  (240037, 0)	123.0
  (247960, 0)	2844.0
  (252512, 0)	347.0
  (255149, 0)	691.0
  (257333, 0)	190.0
  (260396, 0)	141.0
  (261072, 0)	201.0
  (263863, 0)	216.0
  (269151, 0)	96.0
  (275208, 0)	100.0


In [65]:
b = [221113, 28676]

In [55]:
%t new_plays = boost_input_artists(b, 38345,50000)

75.4 ms ± 2.19 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [66]:
new_plays = boost_input_artists(b, 38345,50000)

In [67]:
print(new_plays.getcol(38345).tocoo())

  (15063, 0)	0.0
  (28676, 0)	50000.0
  (38343, 0)	0.0
  (42098, 0)	0.0
  (42394, 0)	0.0
  (47300, 0)	0.0
  (56308, 0)	0.0
  (63632, 0)	0.0
  (68167, 0)	0.0
  (68364, 0)	0.0
  (89636, 0)	0.0
  (89711, 0)	0.0
  (90588, 0)	0.0
  (90948, 0)	0.0
  (111541, 0)	0.0
  (113849, 0)	0.0
  (115413, 0)	0.0
  (127243, 0)	0.0
  (140720, 0)	0.0
  (141240, 0)	0.0
  (150688, 0)	0.0
  (155642, 0)	0.0
  (156316, 0)	0.0
  (166345, 0)	0.0
  (171186, 0)	0.0
  (178835, 0)	0.0
  (186258, 0)	0.0
  (191031, 0)	0.0
  (206579, 0)	0.0
  (212279, 0)	0.0
  (218594, 0)	0.0
  (220146, 0)	0.0
  (221113, 0)	50000.0
  (222082, 0)	0.0
  (223465, 0)	0.0
  (228979, 0)	0.0
  (231520, 0)	0.0
  (237513, 0)	0.0
  (238685, 0)	0.0
  (240037, 0)	0.0
  (247960, 0)	0.0
  (252512, 0)	0.0
  (255149, 0)	0.0
  (257333, 0)	0.0
  (260396, 0)	0.0
  (261072, 0)	0.0
  (263863, 0)	0.0
  (269151, 0)	0.0
  (275208, 0)	0.0


# Visual interactive use case: recommending for the following artists

In [68]:
[artists[i] for i in b]

['rené aubry', 'antonio vivaldi']

In [73]:
generateRecommendations(new_plays, True, 38345)

artist id: 175365 ludwig van beethoven score: 0.7901967176274609
artist id: 151096 johann sebastian bach score: 0.7586437605948044
artist id: 280531 wolfgang amadeus mozart score: 0.7546179994104548
artist id: 127082 goran bregovic score: 0.7072151970239857
artist id: 120312 frédéric chopin score: 0.6934351206175615
artist id: 240337 soundtrack score: 0.6828410160429623
artist id: 123613 georg friedrich händel score: 0.6422260625592194
artist id: 109306 erik satie score: 0.6406746452407777
artist id: 152166 john williams score: 0.639241371211656
artist id: 57349 bruno coulais score: 0.6243103154275138
artist id: 201400 nouvelle vague score: 0.5953689616798346
artist id: 224535 rodrigo y gabriela score: 0.5699598246917814
artist id: 72668 clint mansell score: 0.563986105054543
artist id: 175341 ludovico einaudi score: 0.557928520342298
artist id: 265937 thomas newman score: 0.5484340064534046
artist id: 161427 kings of convenience score: 0.5424736663593734
artist id: 216676 pyotr ilyich

# Based on the above, the following hypothesis is formulated:
*Recommending for a subset of the user's interactions provides better results and transparency than recommending for the user's full interaction history (user items).*