## Load packages and data

In [None]:
# Remove the leading '#' from the commands below and run this cell to install the required packages

#!pip install spacy
#!python -m spacy download en_core_web_sm

In [None]:
import numpy as np
import pandas as pd
import json
import copy
import spacy

In [None]:
# n holds the indices of the json slice files to read. The dataset has 1000 slice
# files (indices 0-999); `size` sets how many of them are drawn at random.

# uncomment the following line to ensure you get the same json files when running the code
#np.random.seed(110)

# Draw distinct indices from 0-999: index 1000 would name a file that does not
# exist, and a repeated index would load the same playlists twice.
n = np.random.choice(1000, size = 5, replace = False)

# flattened track tables, one per slice file; concatenated once after the loop so the
# result does not depend on the order in which the indices were drawn
slice_frames = []

for i in n:
    file_name = '../spotify_million_playlist_dataset/data/mpd.slice.' \
    + str(i*1000) \
    + "-" + str(i*1000+999) + '.json'

    # comment out the following line to hide progress
    print(file_name)

    # open the file and parse its JSON contents
    # (explicit utf-8 so the read does not depend on the platform default encoding)
    with open(file_name, encoding = 'utf-8') as user_file:
        parsed_json = json.load(user_file)

    # only care about the "playlists" part of the dictionary
    playlist_list = parsed_json['playlists']

    # the record_path argument tells the json_normalize function how to flatten the data
    # (one output row per track); the meta argument tells it which playlist-level
    # fields to repeat on every track row
    slice_frames.append(
        pd.json_normalize(
            playlist_list,
            record_path = 'tracks',
            meta = ['name', 'collaborative', 'pid', 'num_followers', 'num_edits']
        )
    )

# stack the per-file tables into a single dataframe with a fresh 0..N-1 index
data = pd.concat(slice_frames, ignore_index = True)

## Clean Playlist Titles (Lemmatize, lowercase)

In [None]:
# Load the spaCy "en_core_web_sm" pipeline. The model must already be installed
# (see the commented-out `spacy download en_core_web_sm` command in the install cell).
nlp = spacy.load("en_core_web_sm")


# Function to lemmatize texts using nlp.pipe
def text_processing(title):
    """Lazily yield a lemmatized, lowercased string for each text in *title*.

    *title* is an iterable of strings. The texts are streamed through the
    module-level ``nlp`` pipeline with the "parser" and "ner" components
    disabled. For each resulting document, the lowercased lemma of every
    token is joined with single spaces and yielded.
    """
    documents = nlp.pipe(title, disable=["parser", "ner"])
    for parsed in documents:
        lowered_lemmas = (tok.lemma_.lower() for tok in parsed)
        yield ' '.join(lowered_lemmas)

# Apply the lemmatization function to the distinct values of the "name" column and
# map the results back onto every row. `data` holds one row per track, so a playlist
# name is repeated on each of its tracks; lemmatizing each distinct name only once
# gives the same column while avoiding that repeated spaCy work.
# Adds new column "playlist_title" to dataframe
unique_names = data['name'].unique()
lemmatized_by_name = dict(zip(unique_names, text_processing(unique_names)))
data['playlist_title'] = data['name'].map(lemmatized_by_name)

In [None]:
# Build an independent copy of the dataframe that keeps only the first row seen for
# each pid; every later row with an already-seen pid is masked out.
# The comparison uses the pid column only, so rows with different pids are all kept
# even when their playlist titles are now identical. Note, this step can also be done
# before applying the text_processing function. Just change the name of the reference
# dataframe accordingly.

df = data.loc[~data.duplicated(subset = ["pid"])].copy()