In [1]:
import numpy as np
import pandas as pd
from pandas.io import gbq

import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import glob

from pitched_recommend import Recommender

sns.set(style="whitegrid")
pd.set_option('display.float_format', lambda x: '%.1f' % x)
%matplotlib inline

In [3]:
path =r'/Users/daria/Downloads/'
all_files = glob.glob(path + "/recommender_model_data_20190221_*.csv")

In [4]:
all_files

['/Users/daria/Downloads/recommender_model_data_20190221_playlists_data_000000000002.csv',
 '/Users/daria/Downloads/recommender_model_data_20190221_playlists_data_000000000003.csv',
 '/Users/daria/Downloads/recommender_model_data_20190221_playlists_data_000000000001.csv',
 '/Users/daria/Downloads/recommender_model_data_20190221_playlists_data_000000000000.csv',
 '/Users/daria/Downloads/recommender_model_data_20190221_playlists_data_000000000004.csv',
 '/Users/daria/Downloads/recommender_model_data_20190221_playlists_data_000000000005.csv']

In [5]:
playlists_data = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
playlists_data.head()

Unnamed: 0,playlist_uri,isrc,days_on_playlist
0,spotify:user:paulioh:playlist:3qBHomJ6UDF8zt8q...,UK74K1400365,107
1,spotify:user:1235160051:playlist:76PuSTwbsQPyZ...,GBAAA9300178,343
2,spotify:user:22x5mfof5k3pzrgdoxqkz5zpi:playlis...,US7VG1687854,40
3,spotify:user:thesoundsofspotify:playlist:2DbQN...,USRE11100068,6
4,spotify:user:amirghalil:playlist:6AnT62f6ky8S7...,USIR19905031,90


In [81]:
playlists_data = pd.concat([playlists_data,missing_browse_data], ignore_index = True)

In [82]:
len(playlists_data.playlist_uri.unique())

268490

In [83]:
len(playlists_data.isrc.unique())

4226761

In [84]:
rec_config = {  
    'pct_test': 0.1,
    'alpha': 100,
    'factors': 200,
    'regularization': 0.1,
    'iterations': 1000,
    'rseed': 4393971,
    'save_basedir' : '../saved_models/iter1000_alpha100_factors200_reg01_rseed0_pctTest01'
                   }

In [85]:
rec = Recommender(rec_config)

In [None]:
rec.build_recommender('/Users/daria/Downloads/recommender_model_data_20181124_model_20181112_playlists_data.csv')

Loading and preparing data...
Training recommender...


 37%|███▋      | 374.5/1000 [5:31:08<6:05:42, 35.08s/it]    

In [None]:
with open('model_20190221.pkl', 'wb') as file:
    pickle.dump(rec, file)

In [None]:
gbq.to_gbq(rec.isrcs, project_id='umg-comm-tech-dev', destination_table='recommender.isrcs', 
           if_exists='replace')

## Testing

In [13]:
with open('../playlist_pitching/model_20181112_lighter.pkl', 'rb') as f:
    model_old = pickle.load(f)

In [14]:
jazz_test = pd.read_csv('/Users/daria/Downloads/')

In [15]:
jazz_test = jazz_test.drop(columns=['Name','Description','Image','Label','Duration','Upc','AddedAt'])

col_dict = {
    'PlaylistUri':'playlist_uri',
    'Isrc':'isrc',
    'Artist':'track_artist',
    'TrackName':'track_title',
    'AddedAt':'date_add'
}

jazz_test = jazz_test.rename(columns=col_dict)
jazz_test.head()

Unnamed: 0,playlist_uri,isrc,track_artist,track_title
0,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...,USPR36486918,Chet Baker,The Touch Of Your Lips
1,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...,USPR36212077,Rahsaan Roland Kirk,When The Sun Comes Out
2,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...,USPR36305146,"Stan Getz,João Gilberto,Astrud Gilberto",The Girl From Ipanema
3,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...,USBN20100615,"Charlie Hunter,Norah Jones",More Than This
4,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...,USPR35400074,Erroll Garner,Misty


In [16]:
jazz_test['days_on_playlist'] = 30

In [20]:
jazz_test.shape

(20, 5)

In [19]:
print(len(jazz_test.isrc[jazz_test.isrc.isin(model_old.isrcs)]))
print(len(jazz_test.isrc[jazz_test.isrc.isin(rec.isrcs)]))

12
12


In [21]:
jazz_test_old = jazz_test[jazz_test.isrc.isin(model_old.isrcs)]
jazz_test_old = jazz_test_old.reset_index(drop=True)
jazz_test_old.shape

(12, 5)

In [22]:
jazz_test_new = jazz_test[jazz_test.isrc.isin(rec.isrcs)]
jazz_test_new = jazz_test_new.reset_index(drop=True)
jazz_test_new.shape

(12, 5)

In [41]:
jazz_recs_old = model_old.recommend_outofmodel(jazz_test_old, 
                               model_old.playlists_sparse, N=300)
jazz_recs_new = rec.recommend_outofmodel(jazz_test_new, 
                               rec.playlists_sparse, N=300)



In [31]:
with open('../playlist_pitching/metadata_dict.pkl', 'rb') as f:
    metadata = pickle.load(f)

In [42]:
jazz_recs_old_readable = model_old.rec_to_isrc(jazz_recs_old)
jazz_recs_new_readable = rec.rec_to_isrc(jazz_recs_new)

In [43]:
jazz_recs_old_final = [(x[0],x[1],metadata['track_artist'][x[0]], metadata['track_title'][x[0]], metadata['major_label'][x[0]], metadata['label_studio'][x[0]],
               metadata['original_release_date'][x[0]], metadata['genre_name'][x[0]], metadata['parent_genre_name'][x[0]], 
               x[2]) for x in list(jazz_recs_old_readable)]

jazz_recs_new_final = [(x[0],x[1],metadata['track_artist'][x[0]], metadata['track_title'][x[0]], metadata['major_label'][x[0]], metadata['label_studio'][x[0]],
               metadata['original_release_date'][x[0]], metadata['genre_name'][x[0]], metadata['parent_genre_name'][x[0]], 
               x[2]) for x in list(jazz_recs_new_readable)]

all_columns = ['isrc','score','artist','title','major_label','label_studio','original_release_date','genre_name',
               'parent_genre_name','seed_playlist_uri']
rec_df_old = pd.DataFrame(jazz_recs_old_final, columns = all_columns)
rec_df_new = pd.DataFrame(jazz_recs_new_final, columns = all_columns)

In [44]:
rec_df_old.head(100)

Unnamed: 0,isrc,score,artist,title,major_label,label_studio,original_release_date,genre_name,parent_genre_name,seed_playlist_uri
0,USSM15900108,0.9,The Dave Brubeck Quartet,Take Five,Sony,Columbia/Legacy,2003-03-11 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
1,USSM13300772,0.9,Art Tatum,Tea for Two,Sony,Columbia/Legacy,2011-03-07 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
2,JPCO05000120,0.9,Dizzy Gillespie,All The Things You Are,UMG,Savoy,1955-01-01 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
3,USSM15600497,0.9,Miles Davis,'Round Midnight,Sony,Columbia/Legacy,1980-01-01 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
4,USSM15900113,0.9,Miles Davis,So What,Sony,Sony Music Entertainment,2012-07-10 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
5,USMC16351954,0.9,Duke Ellington,In A Sentimental Mood,UMG,Impulse!,2001-01-01 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
6,USPR36200025,0.9,Stan Getz,Desafinado,UMG,Verve Records,1989-10-01 00:00:00,Brazilian,Brazilian,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
7,USMC16352622,0.9,Duke Ellington,My Little Brown Book,UMG,U-5,2014-10-14 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
8,USAT20102797,0.9,John Coltrane,Naima,Other,Warner Music Group - X5 Music Group,2018-01-15 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
9,USSM10021825,0.9,Chet Baker,Autumn Leaves,Sony,Sony Classical,2015-02-13 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...


In [45]:
rec_df_new.head(100)

Unnamed: 0,isrc,score,artist,title,major_label,label_studio,original_release_date,genre_name,parent_genre_name,seed_playlist_uri
0,USSM15900108,0.9,The Dave Brubeck Quartet,Take Five,Sony,Columbia/Legacy,2003-03-11 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
1,USSM13300772,0.9,Art Tatum,Tea for Two,Sony,Columbia/Legacy,2011-03-07 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
2,USSM15900115,0.9,Miles Davis,Blue in Green,Sony,Columbia,2003-03-27 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
3,USSM15900113,0.9,Miles Davis,So What,Sony,Sony Music Entertainment,2012-07-10 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
4,JPCO05000120,0.9,Dizzy Gillespie,All The Things You Are,UMG,Savoy,1955-01-01 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
5,USSM15901082,0.9,Gerry Mulligan,My Funny Valentine,Sony,Columbia,1998-06-01 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
6,USMC16351954,0.9,Duke Ellington,In A Sentimental Mood,UMG,Impulse!,2001-01-01 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
7,USMC16353721,0.9,John Coltrane,My One And Only Love,UMG,Universal Music Division Decca Records France,2015-07-10 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
8,USF095900250,0.9,Louis Armstrong,I've Got The World On A String,UMG,Verve Reissues,2018-04-13 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...
9,USBN28900078,0.9,Chet Baker,It's Always You - Vocal,UMG,U-5,2015-10-10 00:00:00,Jazz,Jazz,spotify:user:0gd7eg9rkw25b3cvhwj5fqpl6:playlis...


In [48]:
rec_df_new[rec_df_new.major_label=='UMG'].shape

(180, 10)

## Checking if all browse category playlists are in the model

In [49]:
get_browse_uri_sql = """
SELECT
  playlist_uri 
FROM
  `umg-tools.metadata.spotify_playlist_browse`
WHERE
  report_date = DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY)
GROUP BY playlist_uri 
"""

browse_uris = gbq.read_gbq(get_browse_uri_sql, project_id='umg-comm-tech-dev', dialect='standard')

In [50]:
browse_uris.shape

(3968, 1)

In [70]:
not_in_model = ['' if x in rec.playlists else x for x in browse_uris.playlist_uri]

In [71]:
not_in_model=list(set(not_in_model))
len(not_in_model)

1445

In [72]:
not_in_model = not_in_model[1:]

In [73]:
len(not_in_model)

1444

In [76]:
not_in_model_df = pd.DataFrame(not_in_model, columns=['playlist_uri'])

In [77]:
gbq.to_gbq(not_in_model_df, destination_table='recommender_model.missing_browse_playlists', if_exists='replace',
          project_id = 'umg-comm-tech-dev')

1it [00:05,  5.59s/it]


In [79]:
playlists_tracks_browse_sql = """
SELECT
  playlist_uri,
  isrc,
  COUNT(DISTINCT playlist_date) AS days_on_playlist
FROM
  `umg-partner.spotify.playlist_track_history`
WHERE
  _PARTITIONTIME >= TIMESTAMP(DATE_SUB(CURRENT_DATE(), INTERVAL 367 DAY))
  AND _PARTITIONTIME < TIMESTAMP(DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY))
  AND playlist_uri IN (
  SELECT
    playlist_uri
  FROM
    `umg-comm-tech-dev.recommender_model.missing_browse_playlists`
  GROUP BY
    playlist_uri)
GROUP BY
  playlist_uri,
  isrc
"""

In [80]:
missing_browse_data = gbq.read_gbq(playlists_tracks_browse_sql, dialect='standard', project_id='umg-comm-tech-dev')