# Update spotify net/assign tracks

> Update the Spotify net by assigning tracks whose playcount is above a threshold to an existing public playlist and removing them from the net.

In [46]:
#| default_exp prep_model

In [47]:
# export
import pandas as pd
import requests
import boto3
import json
from io import BytesIO
import joblib
import pickle
import numpy as np
pd.set_option('display.max_columns', None)

In [48]:
#hide
from nbdev.showdoc import *

In [49]:
class ModelPrep:
    """Fetch the fitted transformers and track tables used to prepare model input."""

    def __init__(self):
        pass

    @staticmethod
    def _read_series(path):
        """Read a CSV holding an index column plus one data column as a Series."""
        # ``read_csv(..., squeeze=True)`` was deprecated in pandas 1.4 and
        # removed in 2.0; squeezing the parsed frame is the replacement and
        # works on older versions as well.
        return pd.read_csv(path, index_col=0).squeeze('columns')

    def load_s3(self, bucket='spotify-net'):
        """Load every artifact needed for preprocessing from S3.

        Loaded, in order:
        1. ``scaler`` - unpickled object stored under the key ``scaler``
        2. ``svd`` - unpickled object stored under the key ``svd``
        3. ``recent_spotify_tracks`` - ``newer_tracks.csv`` as a DataFrame
        4. ``recent_lastFM_tracks`` - ``last_fm_recent_tracks.csv`` as a DataFrame
        5. ``genre_series`` - ``genres_svd.csv`` as a Series
        6. ``key_series`` - ``key_list.csv`` as a Series
        7. ``time_series`` - ``timeSig_list.csv`` as a Series

        Args:
            bucket: Name of the S3 bucket that holds all of the objects above.

        Returns:
            dict mapping the names listed above to the loaded objects.

        TODO: consider storing a single scikit-learn pipeline instead of a
        separate scaler and SVD.
        TODO: the key and time-signature lists are small and static; they do
        not really need to be fetched from S3.
        """
        s3_resource = boto3.resource('s3')

        def load_pickle(key):
            # NOTE(security): pickle.loads can execute arbitrary code. This is
            # only safe while the bucket contents are fully trusted.
            payload = s3_resource.Object(bucket, key).get()['Body'].read()
            return pickle.loads(payload)

        def s3_path(filename):
            return f's3://{bucket}/{filename}'

        # The dict literal is evaluated top to bottom, which keeps the load
        # order documented above.
        return {
            'scaler': load_pickle('scaler'),
            'svd': load_pickle('svd'),
            'recent_spotify_tracks': pd.read_csv(s3_path('newer_tracks.csv'), index_col=0),
            'recent_lastFM_tracks': pd.read_csv(s3_path('last_fm_recent_tracks.csv'), index_col=0),
            'genre_series': self._read_series(s3_path('genres_svd.csv')),
            'key_series': self._read_series(s3_path('key_list.csv')),
            'time_series': self._read_series(s3_path('timeSig_list.csv')),
        }

In [50]:
# Instantiate the loader and fetch all artifacts from S3 into the dict `d`.
m = ModelPrep()
d = m.load_s3()

In [51]:
# List the names of the objects returned by load_s3.
d.keys()

dict_keys(['scaler', 'svd', 'recent_spotify_tracks', 'recent_lastFM_tracks', 'genre_series', 'key_series', 'time_series'])

In [52]:
# Preview the Last.fm tracks table that was loaded from S3.
d['recent_lastFM_tracks']

Unnamed: 0,name,artist,playcount
0,Fine Day Anthem,Skrillex,18
1,Zebra,Jackboy,16
2,Baby Again..,Fred again..,12
3,Trojan Horse,Dave,11
4,Now U Do,DJ Seinfeld,11
5,GMT - Wolfgang Tillmans / Marc Krether Remix,Oliver Sim,11
6,Schweigen,The True Spacemen,11
7,Sirens (feat. Caroline Polachek) [Tourist Remix],Flume,10
8,Growing Pains,mindchatter,9
9,Tell Me One More Time - Mona Yim Remix,DJ Seinfeld,8


In [53]:
# Upper-case the join keys so that capitalisation differences between the two
# sources do not prevent a match. `Series.str.upper` passes missing values
# through, whereas `applymap(lambda x: x.upper())` raises AttributeError on NaN
# (and `applymap` is deprecated in recent pandas releases).
join_cols = ['name', 'artist']
df_classify = pd.merge(
    d['recent_spotify_tracks'][join_cols].apply(lambda col: col.str.upper()),
    d['recent_lastFM_tracks'][join_cols].apply(lambda col: col.str.upper()),
    on=join_cols,
    )

In [54]:
# Display the tracks whose (name, artist) pair appears in both sources.
df_classify

Unnamed: 0,name,artist
0,FINE DAY ANTHEM,SKRILLEX
1,ZEBRA,JACKBOY
2,SCHWEIGEN,THE TRUE SPACEMEN


In [None]:
# export
def dummies_and_scale(df_classify, constant, scaler):
    """Log-transform, one-hot encode and scale the track features.

    Steps:
    1. Add ``constant`` to ``speechiness``, ``acousticness`` and
       ``instrumentalness`` and take the natural log (the offset keeps
       ``log(0)`` from producing ``-inf``).
    2. One-hot encode ``key`` and ``time_signature`` into ``key_*`` and
       ``time_signature_*`` columns.
    3. Replace the ten numeric audio-feature columns with
       ``scaler.transform`` of those columns.

    Args:
        df_classify: DataFrame containing the audio-feature columns plus
            ``key`` and ``time_signature``. It is not modified.
        constant: Small positive offset added before the log transform
            (the script entry point passes ``0.0000001``).
        scaler: Object exposing ``transform``; presumably the fitted
            scikit-learn scaler loaded from S3 - confirm against caller.

    Returns:
        A new DataFrame with the transformed, encoded and scaled columns.
    """
    log_cols = ['speechiness', 'acousticness', 'instrumentalness']
    scale_cols = ['danceability', 'energy', 'loudness',
                  'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                  'valence', 'tempo', 'duration_ms']

    # Work on a copy: the original assigned into the caller's frame, silently
    # log-transforming the caller's data as a side effect.
    shifted = df_classify.copy()
    shifted[log_cols] = np.log(shifted[log_cols] + constant)

    # one-hot
    df_track = pd.get_dummies(shifted, prefix=['key', 'time_signature'], columns=['key', 'time_signature'])

    # standardScaler
    df_track[scale_cols] = scaler.transform(df_track[scale_cols])

    return df_track

In [None]:
# export
def _align_columns(frame, columns):
    """Return ``frame`` restricted to ``columns``; absent columns and NaNs become 0.0."""
    return frame.astype(float).reindex(columns=columns, fill_value=0.0).fillna(0.0)


def full_frame(df_track, gen_series, svd, key_series, time_series,
               output_path='s3://spotify-net/for_prediction.csv'):
    """Expand the encoded columns to the full training layout and save the result.

    The ``genre_*`` columns of ``df_track`` are expanded to one column per
    entry of ``gen_series`` (missing genres are 0), passed through
    ``svd.transform``, and replaced by the resulting ``genre_<i>`` component
    columns. The ``key_*`` and ``time_signature_*`` columns are then expanded
    to exactly the columns named in ``key_series`` and ``time_series``
    (missing ones are 0, unlisted ones are dropped). The final frame is
    written as CSV to ``output_path``.

    Args:
        df_track: DataFrame with string column names, as produced by
            ``dummies_and_scale``.
        gen_series: Series of genre names, without the ``genre_`` prefix.
        svd: Object exposing ``transform``; presumably the fitted SVD the
            genre list belongs to - confirm against caller.
        key_series: Series of the full ``key_*`` column names.
        time_series: Series of the full ``time_signature_*`` column names.
        output_path: Destination passed to ``DataFrame.to_csv``.

    Returns:
        The DataFrame that was written.
    """
    genres = gen_series.tolist()
    is_genre = df_track.columns.str.startswith('genre_')

    # Align on df_track's own index. Building the zero frame on a fresh
    # RangeIndex (as before) silently loses values whenever df_track carries
    # any other index.
    full_gen = _align_columns(df_track.loc[:, is_genre], [f'genre_{g}' for g in genres])
    # Restore the bare genre names directly; str.replace('genre_', '') would
    # also strip the substring from inside a genre name.
    full_gen.columns = [str(g) for g in genres]

    components = pd.DataFrame(np.asarray(svd.transform(full_gen)), index=df_track.index)
    components = components.add_prefix('genre_')
    df_track = pd.concat([df_track.loc[:, ~is_genre], components], axis=1)

    # Same expansion for the two one-hot groups, appended in this order.
    for prefix, wanted in (('key_', key_series), ('time_signature_', time_series)):
        in_group = df_track.columns.str.startswith(prefix)
        expanded = _align_columns(df_track.loc[:, in_group], wanted.tolist())
        df_track = pd.concat([df_track.loc[:, ~in_group], expanded], axis=1)

    df_track.to_csv(output_path)
    print(df_track.shape)

    print('Uploaded to S3')

    return df_track


In [None]:
# export
if __name__ == '__main__':
    # load_s3 is a method of ModelPrep, not a module-level function, and the
    # dict it returns uses the keys 'recent_spotify_tracks',
    # 'recent_lastFM_tracks' and 'genre_series'.
    s3_objects = ModelPrep().load_s3()
    # NOTE(review): merge_frame is not defined in the visible part of this
    # file - confirm it is provided elsewhere before running this script.
    merged_df = merge_frame(s3_objects['recent_spotify_tracks'], s3_objects['recent_lastFM_tracks'])
    transformed = dummies_and_scale(merged_df, 0.0000001, s3_objects['scaler'])
    full_frame(transformed, s3_objects['genre_series'], s3_objects['svd'], s3_objects['key_series'], s3_objects['time_series'])