## Load packages and data

In [1]:
# remove hashtag and run code below to install packages

#!pip install -U sentence-transformers
#!pip install --user annoy

In [11]:
import numpy as np
import pandas as pd
import json
import copy
import time
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:

# n is the number of json files we wish to read (max is 1000)
n = 5

for i in range(n):
    # create the file_name string in order to read in file
    file_name = 'spotify_million_playlist_dataset/data/mpd.slice.' \
        + str(i*1000 ) \
        + '-' + str(i*1000+999) + '.json'

    # Uncomment the following line to show progress
    #print(file_name)

    # open the file and store its contents in file_contents
    with open(file_name) as user_file:
        file_contents = user_file.read()

    # we only care about the "playlists" part of this dictionary
    # save the list of playlists in playlist_list
    parsed_json = json.loads(file_contents)
    playlist_list = parsed_json["playlists"]


    # create dataframe if it's first playlist, otherwise append info to existing dataframe
    # the record_path argument tells the json_normalize function how to flatten the data
    # the meta argument tells the json_nomralize function what meta data to keep
    if i == 0:
        data = pd.json_normalize(
            playlist_list,
            record_path="tracks",
            meta=["name", "collaborative", "pid", "num_followers", "num_edits"]
        )
    else:
        data = pd.concat(
            [
                data,
                pd.json_normalize(
                    playlist_list,
                    record_path="tracks",
                    meta=["name", "collaborative", "pid", "num_followers", "num_edits"]
                )
            ],
            ignore_index = True
        )

In [4]:
## make copy of data to prevent any funny business
df = data.copy()
## drop duplicate pid as well
df = df.drop_duplicates(subset = ['pid'])

# Generate playlist vectors using sentence transformers and load into annoy index


In [12]:
#Building the database here
#make vectors using sentence transformer
#load vectors into annoy index along with pid
t0 = time.time()

#dimension of vectors the model uses
f = model.get_sentence_embedding_dimension()

#build database we will load vectors in
#angular means we are using cosine similarty metric
t = AnnoyIndex(f , 'angular')

for (_,name,pid) in df[['name', 'pid']].itertuples(name=None):
    v = model.encode(name.lower().strip() )
    t.add_item(pid , v)

#number of trees for search
t.build(100)
t.save('playlist_vectors.ann')

t1 =time.time()

print(t1-t0)

32.310036420822144
