In [1]:
# this version reads a chunk of posts with the score embedded
# ranks by summing scores

import os
import glob
import pickle
from datetime import datetime
import time
import dotenv
import re
from tqdm import tqdm
from schema import Schema
import csv

import pandas as pd
import pandas_dedupe

import requests
import requests.auth

import praw

import openai
import tiktoken

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# load secrets from .env into environment variables
dotenv.load_dotenv()

print(f"{'Praw:':<20} {praw.__version__ :>10}")
print(f"{'OpenAI:':<20} {openai.version.VERSION :>10}")


Praw:                     7.7.0
OpenAI:                  0.27.4


See README.md
 - objective is to use OpenAI for named entity extraction to extract all the songs form [this reddit thread](https://www.reddit.com/r/AskReddit/comments/12viv4v/what_is_the_prettiest_song_you_ever_heard_in_your/) and make Spotify playlist
 - use Reddit PRAW API to download all the comments (get [Reddit API key](https://www.reddit.com/prefs/apps))
 - use OpenAI API with a prompt like, extract all the songs from this text to CSV get ([OpenAI API key](https://platform.openai.com/account/api-keys))
 - use Spotify API to make a playlist (get [Spotify API key](https://developer.spotify.com/documentation/web-api/tutorials/getting-started))
 - works, needed a lot of scrubbing, but about 1 day of work, wouldn't have been possible to do a 700-song playlist manually without a team of Mechanical Turks or something
 - If I wanted to go nuts, would process comments individually, save a file for each comment's extracted songs, would make it easier to track down what OpenAI gets wrong, have a resumable, retryable, repeatable process and 
 - Spotify playist is [here](https://open.spotify.com/playlist/08YFkbtTV6GBfNtjJ4PHDu?si=f4761d983ac84091) 
 
 needs a .env file per dot-env-template
 

# Configs

In [2]:
# model
gptmodel = 'gpt-3.5-turbo'

# a thread 
submission = "12viv4v"

# minimum karma to process a reply 
minkarma = 1

# an output file to accumulate all the responses
savefile = 'bronze.txt'

# main prompt 
prompt_prefix1="""You will act as a research assistant finding all the artists and track titles mentioned in a series of messages about music, and returning them in a CSV format.
Define a post delimited below by ===
===
post_id: "abcdefg"
post_score: "6996"
I love Yesterday by the Beatles. Also Hotel California from The Eagles. And Bruce Springsteen's Born To Run!
===

Define a CSV format delimited below by ---
---
"post_id","post_score","artist","track"
"abcdefg","6996","The Beatles","Yesterday"
"abcdefg","6996","The Eagles","Hotel California"
"abcdefg","6996","Bruce Springsteen","Born To Run"
---

You will extract all artists and tracks from each post below delimited by ~~~ .
You will return a list of records containing the artist and track extracted from the input, and the post_id and post_score of the post the artist and track is mentioned in.
You will return the records in a CSV format.
The header row should contain `"post_id","post_score","artist","track"`. 
The input is:
"""

# an output file to accumulate all the responses
outdir = 'out'
logdir = 'logs'

savefile = 'bronze.txt'

# to speed things we'll cumulate posts til we get to nposts posts or maxchars total chars, whichever comes first
max_post_size=300  # redditor needs to put any songs in 1st couple hundred chars
maxtokens = 1024   # max tokens to send to get_response (with room for response)
# maxchars = 6000  # max tokens (words/fragments) is 4096 but I think stuffing the prompt maybe reduces quality?
nposts = 1000 # max posts to combine into a chunk



# Get all comments from a reddit posting

In [None]:
def getPraw():
    return praw.Reddit(user_agent="prettiest_song/0.001", 
                       client_id=os.getenv('CLIENT_ID'), 
                       client_secret=os.getenv('CLIENT_SECRET'))


def getAll(r, submissionId, verbose=True):
    submission = r.submission(submissionId)
    submission.comments.replace_more(limit=None)
    commentsList=submission.comments.list()
    return commentsList


In [None]:
# print(datetime.now())
# r = getPraw()
# res = getAll(r, submission)
# print(datetime.now())

# print("retrieved ", len(res), 'comments')


In [None]:
# # we have a list of comment objects
# # filter comments with at least some karma
# res3 = [r for r in res if r.score >= minkarma]
# print('filtered to ', len(res3), 'comments')
# res3[0].body, res3[0].score


In [3]:
# save so we can reload it later without downloading

# with open('reddit_full.pkl', 'wb') as f:
#     pickle.dump(res3, f)
    
with open('reddit_full.pkl', 'rb') as f:
    res3 = pickle.load(f)


# Extract artists and song titles using OpenAI

In [4]:
# check lengths of posts
shorties = []
big_ones = []
for i in range(len(res3)):
    if len(res3[i].body) <3:
        print (i, res3[i].body)
        shorties.append(i)
    if len(res3[i].body) > 1024:
        print(i, len(res3[i].body))
        big_ones.append(i)
        

812 26
11565 Up
21388 W
21557 -🤓
21562 W
21907 :)
22168 t
23326 <3
23401 ✨️
24352 Ye


In [5]:
# avg length
sum([len(r.body) for r in res3]) / len(res3)

70.90953465668727

In [6]:
[i for i in range(len(res3)) if res3[i].score <= 0]

[]

In [None]:
# already truncated
print (res3[big_ones[0]].body[:500])

In [7]:
csv_validate_re = re.compile(r'''
    \s*                # Any whitespace.
    (                  # Start capturing here.
      [^,"']+?         # Either a series of non-comma non-quote characters.
      |                # OR
      "(?:             # A double-quote followed by a string of characters...
          [^"\\]|\\.   # That are either non-quotes or escaped...
       )*              # ...repeated any number of times.
      "                # Followed by a closing double-quote.
      |                # OR
      '(?:[^'\\]|\\.)*'# Same as above, for single quotes.
    )                  # Done capturing.
    \s*                # Allow arbitrary space before the comma.
    (?:,|$)            # Followed by a comma or the end of a string.
    ''', re.VERBOSE)


In [8]:
# use tokenizer to get accurate token count

# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model(gptmodel)
assert enc.decode(enc.encode("hello world")) == "hello world"

def count_tokens(s):
    return len(enc.encode(s))

count_tokens('four score and 7 years go our forefathers brought forth')

13

In [9]:
openai.api_key = os.getenv('OPENAI_API_KEY')

models = openai.Model.list()
print([(i, m.id) for i, m in enumerate(models["data"])])
models['data'][2]

[(0, 'whisper-1'), (1, 'babbage'), (2, 'gpt-3.5-turbo'), (3, 'davinci'), (4, 'text-davinci-edit-001'), (5, 'text-davinci-003'), (6, 'babbage-code-search-code'), (7, 'text-similarity-babbage-001'), (8, 'code-davinci-edit-001'), (9, 'text-davinci-001'), (10, 'ada'), (11, 'babbage-code-search-text'), (12, 'babbage-similarity'), (13, 'code-search-babbage-text-001'), (14, 'text-curie-001'), (15, 'code-search-babbage-code-001'), (16, 'text-ada-001'), (17, 'text-embedding-ada-002'), (18, 'text-similarity-ada-001'), (19, 'curie-instruct-beta'), (20, 'ada-code-search-code'), (21, 'ada-similarity'), (22, 'code-search-ada-text-001'), (23, 'text-search-ada-query-001'), (24, 'davinci-search-document'), (25, 'ada-code-search-text'), (26, 'text-search-ada-doc-001'), (27, 'davinci-instruct-beta'), (28, 'text-similarity-curie-001'), (29, 'code-search-ada-code-001'), (30, 'ada-search-query'), (31, 'text-search-davinci-query-001'), (32, 'curie-search-query'), (33, 'davinci-search-query'), (34, 'babbage-s

<Model model id=gpt-3.5-turbo at 0x7fdc5048d130> JSON: {
  "created": 1677610602,
  "id": "gpt-3.5-turbo",
  "object": "model",
  "owned_by": "openai",
  "parent": null,
  "permission": [
    {
      "allow_create_engine": false,
      "allow_fine_tuning": false,
      "allow_logprobs": true,
      "allow_sampling": true,
      "allow_search_indices": false,
      "allow_view": true,
      "created": 1684434433,
      "group": null,
      "id": "modelperm-Gsp3SyIu7GamHB3McQv3rMf5",
      "is_blocking": false,
      "object": "model_permission",
      "organization": "*"
    }
  ],
  "root": "gpt-3.5-turbo"
}

In [10]:
MAX_TOKENS = 4096   # https://platform.openai.com/docs/models

def get_response(messages, prompt_prefix="", verbose=False):

    prompt = prompt_prefix
    
    if type(messages) == list:
        for msg in messages:
            prompt += f"""
~~~
{msg}
~~~
"""
    else:
        prompt += messages
        
    if verbose:
        print(prompt)
        
    # retry loop, have received untrapped 502 error
    if count_tokens(prompt) > MAX_TOKENS:
        print("WARNING: %d tokens > %d" % (count_tokens(prompt), MAX_TOKENS))
        
    RETRIES = 3
    success = False    
    for i in range(RETRIES):
        try:
            response = openai.ChatCompletion.create(
                model=gptmodel,
                messages=[{"role":"user", 
                           "content": prompt}],
                temperature=0,
            )
            # no exception thrown
            success=True
            break   
        except Exception as error:
            print("An exception occurred:", error)
            print("Retrying chunk...")
            time.sleep(5)
            continue  # try again
    if success:
        # check response payload for any error message?
        response_msg = response['choices'][0]['message']
        if len(response_msg['content'])==0:
            print("there was a problem, content is empty, full payload follows:")
            print(response)
        if verbose:
            print(response_msg)
        return response_msg['content']
    else:
        return None



In [None]:
# for each comment object we will extract the body 
# then submit as part of a prompt to chatgpt
print(datetime.now())

nposts = 1000
slist = res3.copy()
total_posts = len(slist)
print("processing %d posts" % total_posts)

# make sure out and logs are empty
for f in glob.glob('%s/*' % outdir):
    os.remove(f)
for f in glob.glob('%s/*' % logdir):
    os.remove(f)
count = 0
c = 0
maxtokens=2048

while(slist):  # still comments to process
    tokens_to_date = count_tokens(prompt_prefix1)
    reply_ids = []
    messages = []
    for _ in range(nposts):  # add up to this many posts to the prompt
        if slist:
            # make sure no single post > max_post_size, truncate in place as nec 
            slist[0].body = slist[0].body[:max_post_size]
            if tokens_to_date + count_tokens(slist[0].body) < maxtokens:
            # total post content < maxchars
            # if chars_to_date + len(slist[0].body) < maxchars:
                reply = slist.pop(0)
                reply_ids.append(reply.id)
                body = reply.body
                
                messages.append(f"""
post_id: "{reply.id}"
post_score: "{reply.score}"
{body}
"""
                )
                tokens_to_date += count_tokens(messages[-1])
                # chars_to_date += len(messages[-1])
                c += 1
            
    response = get_response(messages, prompt_prefix1, verbose=False)
    if response is None:   # FAIL - retries exhausted
        print('Bailing to next chunk')
        continue

    # do basic validation and cleanup
    # should check first line is valid header and doesn't reverse columns
    csv_valid, csv_err = [], []
    for line in response.split("\n"):
        try:
            csv_values = csv_validate_re.findall(line)
            if len(csv_values) == 4:
                csv_valid.append(line)
            else:
                csv_err.append(line)
        except:
            csv_err.append(line)
    csv_output = "\n".join(csv_valid)
        
    with open("%s/%04d.csv" % (outdir, count), 'w') as outfile:
        outfile.write(csv_output)
    
    if csv_err:
        with open("%s/%04d.err" % (outdir, count), 'w') as outfile:
            outfile.write("\n".join(csv_err))
        
    with open("%s/%04d.log" % (logdir, count), 'w') as logfile:
        logfile.write(str(reply_ids))
        logfile.write('\n\n===== raw prompt =====\n\n')        
        logfile.write("\n=====\n".join(messages))
        logfile.write('\n\n===== raw response =====\n\n')
        logfile.write(response)
        logfile.write('\n\n===== failed validation =====\n\n')
        logfile.write("\n".join(csv_err))
 
    count += 1
#     print(c)
    outcount = total_posts-len(slist)
    print(outcount, end=' ')
    
    
print()
print(datetime.now())



In [None]:
## concatenate outputs as bronze.txt
# may still have to tweak the files to get them to load
# should inspect .err files and clean up as necessary


In [None]:
# filelist = glob.glob('%s/*.csv' % outdir)

# output_df = None
# count = 0
# for f in sorted(filelist):
#     print(f)
#     try:
#         tempdf = pd.read_csv("%s" % (f), header=None)
#     except Exception as exc:
#         print(str(exc))
#         continue
#     colcount = len(tempdf.columns)
#     if len(tempdf.columns) != 4:
#         print('%s has %d columns, skipped' % (f, colcount))
#         continue
        
#     # ok
#     # truncate header row if it looks like a header
#     if tempdf.iloc[0][0]=='post_id':
#         tempdf = tempdf[1:]
#     # set the header explicitly
#     tempdf.columns=["post_id","post_score","artist","track"]

#     if output_df is not None:        
#         output_df = pd.concat([output_df, tempdf], axis=0)
#     else:
#         output_df = tempdf
#     count += 1
#     if count % 10 == 0:
#         print(count, end=' ')

        
        
        

In [12]:
def valid_post_id(s):
    s = s.strip()
    valid = 3 < len(s) < 10
    return valid
# validator.add_record_check(check_post_id)

def valid_post_score(s):
    s = s.strip()
    valid = all([c.isdigit() for c in s]) and int(s) < 99999
    return valid
    
schema = Schema([{'post_id': valid_post_id,
                  'post_score': valid_post_score, 
                  'artist': str,
                  'track': str,
                 }])

filelist = glob.glob('%s/*.csv' % outdir)

objlist = []
with open(savefile, 'w') as outfile:
    for f in tqdm(filelist, desc = 'File concat'):
        with open(f, 'r') as csvfile:
            reader = csv.DictReader(csvfile, delimiter=",", fieldnames=[
                "post_id",
                "post_score",
                "artist",
                "track"
            ])
            data=[row for row in reader]
                                   
        data = data[1:]
        
        try:
            objlist.extend(schema.validate(data))
        except Exception as error:
            print(f)
            print(error)
            break



File concat: 100%|██████████| 482/482 [00:05<00:00, 94.52it/s]


In [20]:
tempdf = pd.DataFrame.from_dict(objlist)
tempdf.columns=['post_id','post_score','artist','track']
tempdf

Unnamed: 0,post_id,post_score,artist,track
0,jhfyzqd,1,REM,Find the River
1,jhfyzqd,1,Chris Cornell,Sunshower
2,jhfyzqd,1,Chris Cornell,Moonchild
3,jhfz0ky,1,Irma Thomas,Anyone who knows what love is
4,jhfz0rf,1,Mac Miller,You
...,...,...,...,...
26778,jhfvmdt,1,Queen,Teo Torriate
26779,jhfvmdt,1,Queen,Who Wants to Live Forever
26780,jhfvmdt,1,Queen,Dear Friends
26781,jhfvoqv,1,Disturbed,The Sound of Silence


In [21]:
tempdf = tempdf.drop_duplicates() \
    .sort_values("post_score", ascending=False)
# drop header row
tempdf = tempdf.loc[~(tempdf['post_id'].str.strip()=='post_id')]
# na to ""
tempdf.loc[tempdf['post_id'].isna(), 'post_id'] = ''
tempdf.loc[tempdf['post_score'].isna(), 'post_score'] = ''
tempdf.loc[tempdf['artist'].isna(), 'artist'] = ''
tempdf.loc[tempdf['track'].isna(), 'track'] = ''
# strip spaces
tempdf['post_id'] = tempdf['post_id'].str.strip()
tempdf['post_score'] = tempdf['post_score'].str.strip()
tempdf['artist'] = tempdf['artist'].str.strip()
tempdf['track'] = tempdf['track'].str.strip()
# clean up post_score to valid int
tempdf['post_score'] = tempdf['post_score'].apply(lambda s: "".join([c for c in s if c.isdigit()]))
tempdf['post_score'] = tempdf['post_score'].apply(lambda x: x[-5:])
tempdf['post_score'] = tempdf['post_score'].apply(lambda s: int(s) if s else 1)
# drop missing tracks, cleanup track
tempdf = tempdf.drop(tempdf.loc[tempdf['track']==''].index)
tempdf = tempdf.drop(tempdf.loc[tempdf['track'].str.lower()=='unknown'].index)
tempdf = tempdf.drop(tempdf.loc[tempdf['track'].str.lower()=='track'].index)
tempdf = tempdf.sort_values('post_score', ascending=False)
tempdf.loc[tempdf['post_score']==0, 'post_score'] = 1
# any test examples
tempdf = tempdf.loc[~(tempdf['post_id']=='abcdefg')]
tempdf


Unnamed: 0,post_id,post_score,artist,track
20230,jhbktrn,13941,Claude Debussy,Claire de Lune
20216,jhc2dyv,6996,Erik Satie,Gymnopédies
20221,jhc6oud,6144,Simon & Garfunkel,Scarborough Fair
20237,jhcc2q8,5014,Israel Kamakawiwoʻole,Over the Rainbow
20249,jhbjlwj,4332,The Cranberries,Dreams
...,...,...,...,...
3118,jhgycrq,1,Nicki Minaj,Anaconda
3110,jhdsdrq,1,Nightwish,The Islander
3107,jhdd0mb,1,Three Dog Night,Pieces of April
3109,jhdpp17,1,Unknown,billy beane’s daughter singing ‘the show’ in m...


In [22]:
tempdf.to_csv('bronze.csv', index=False)

tempdf.to_csv('silver.csv', index=False)
with open('silver.pkl', 'wb') as f:
    pickle.dump(tempdf, f)

len(tempdf)



23654

In [27]:
df = tempdf
df.loc[df['artist']=='N/A', 'artist']=''
df.loc[df['artist'].str.lower().str.startswith('unknown'), 'artist']=''
df.loc[df['artist'].str.lower().str.startswith('various'), 'artist']=''

artist_df = df[['artist','track']].groupby('artist') \
    .count() \
    .sort_values('track', ascending=False) \
    .reset_index()

artist_df


Unnamed: 0,artist,track
0,,3158
1,The Beatles,328
2,Radiohead,216
3,Fleetwood Mac,150
4,Pink Floyd,139
...,...,...
6930,La Oreja de VanGogh,1
6931,La Oreja De Van Gogh,1
6932,La Monte Young,1
6933,La La Land Soundtrack,1


# Impute missing artists
if someone just says 'Clair de Lune', or 'Let it be', without specifying the artist, maybe we can impute that?

In [28]:
missing_map = {}
try:
    artist_map = pd.read_csv("missing_artists.csv")
    missing_map = dict(zip(artist_map['track'],artist_map['artist']))
except:
    pass

missing_map

{' what comes to mind is either simple and clean, or dear sunshine': 'Utada Hikaru',
 '23': 'Jimmy Eat World',
 '26': 'Paramore',
 '3 little birds': 'Bob Marley',
 '74-75': 'The Connells',
 '86d - no escort': 'Mitski',
 "Don't Break My Heart": 'UB40',
 "Don't Know Much": 'Linda Ronstadt and Aaron Neville',
 "Don't Think Twice, It's All Right": 'Bob Dylan',
 'Don’t L': 'Missy Elliott',
 'Don’t Let Me Down': 'The Beatles',
 'Don’t Look Back': 'Boston',
 'Don’t Talk': 'The Beach Boys',
 'Doschitaii': 'Tatu',
 'Down in a Hole': 'Alice in Chains',
 'Down to You': 'Joni Mitchell',
 'Down to the River to Pray': 'Alison Krauss',
 'Dream Sweet in Sea Major': 'Miracle Musical',
 'Dream a Little Dream': 'The Mamas & The Papas',
 'Dreaming Again': 'Jim Croce',
 'Dreaming My Dreams': 'Waylon Jennings',
 'Dreams': 'Fleetwood Mac',
 'Drips//Auntie’s Harp': 'Flying Lotus',
 'Dry Hands': 'C418',
 'Duo des Fluers': 'Léo Delibes',
 'Dust in the Wind': 'Kansas',
 'Duvet': 'Boa',
 'Dylan Version': 'The Ave

In [29]:
# for missing artists, try to impute the artist based on the track
missing_artist_df = df.loc[(df['artist']=='')]
missing_artist_df


Unnamed: 0,post_id,post_score,artist,track
20561,jhc6oqe,4163,,Clair de Lune
20562,jhbxc5p,2812,,Claire de Lune
19249,jhd065k,1284,,
12909,jhc5hb0,782,,
17538,jhdrv5t,731,,Avril 14th
...,...,...,...,...
3093,jhd5gso,1,,wow you guys suck at this
3117,jhgx5i7,1,,Suprised no one is mentioning gucci gang
3094,jhdwbl5,1,,"Every time I encounter the prettiest song, I l..."
3096,jhclx96,1,,Untitled


In [30]:
df['artist2'] = df.apply(lambda row: missing_map[row.track.lower().strip()] if row.artist=="" and row.track.lower().strip() in missing_map else row.artist, axis=1)
df.loc[df['artist'] != df['artist2']]



Unnamed: 0,post_id,post_score,artist,track,artist2
20561,jhc6oqe,4163,,Clair de Lune,Claude Debussy
20562,jhbxc5p,2812,,Claire de Lune,Claude Debussy
17538,jhdrv5t,731,,Avril 14th,Aphex Twin
9772,jhc7g4r,573,,Linger,The Cranberries
13086,jhclthe,356,,Little Green,Joni Mitchell
...,...,...,...,...,...
19333,jhd1f37,1,,Nocturne in E flat major,Chopin
1871,jhegm4p,1,,Murmaider,Dethklok
12980,jhcuma2,1,,You Needed Me,Anne Murray
6272,jhcppp7,1,,Somewhere over the rainbow,Israel Kamakawiwo'ole


In [31]:
df['artist'] = df.apply(lambda row: missing_map[row.track.lower().strip()] if row.artist=="" and row.track.lower().strip() in missing_map else row.artist, axis=1)



In [32]:
prompt_prefix3 = """I will provide you a list of well-known recordings.
I would like you to review each recording, and provide the name of the artist most closely associated with the recording.
You will provide them in CSV format, one record per line in the following order: recording, artist. Enclose each field in double-quotes.
The input is:

"""

missing_artist_df = df.loc[(df['artist']=='')]

def missing_artists(missing_artist_df):
    
    missing_track_map = {}
    
    slist = missing_artist_df['track'] \
        .dropna() \
        .str.lower() \
        .str.strip() \
        .drop_duplicates() \
        .tolist()

    slist.sort()
    n_missing = len(slist)

    while(slist):  # still artists to process
        print(datetime.now(), end=" ")

        tokens_to_date = count_tokens(prompt_prefix3)
        prompt = ''
        rows = 0
        for _ in range(nposts):  # add up to nposts posts to the prompt
            if slist and tokens_to_date + count_tokens(slist[0]) < 1024:
                track = f'"{slist.pop(0)}"\n'
                prompt += track
                tokens_to_date += count_tokens(track)
                rows += 1
            else:
                break
        print(f"sending {rows} rows...", end=" ")
        response = get_response(prompt, prompt_prefix3, verbose=False)

        if response is None:   # FAIL - retries exhausted
            print('Bailing to next chunk')
            continue

        if not response:
            print("nothing returned ... check returned dict for errors")

        lines = response.split("\n")
        print(f"received {len(lines)} lines...")
        c=0        
        for line in lines:
            try:
                csv_values = csv_validate_re.findall(line)
                if len(csv_values) != 2:
                    print(f"{len(csv_values)} values found: ", line)
                    continue
                track_input, artist_correct = csv_values[0], csv_values[1]
                # fix artist enclosed in quotes, parens, etc.
                while len(artist_correct) >=2 and (not artist_correct[0].isalnum()) and artist_correct[0] == artist_correct[-1]:
                    artist_correct = artist_correct[1:-1]
                while len(track_input) >=2 and (not track_input[0].isalnum()) and track_input[0] == track_input[-1]:
                    track_input = track_input[1:-1]
                # if it wasn't found then skip
                if len(artist_correct) == 0:
                    print("empty artist returned")
                    continue
                if artist_correct.lower().startswith('unknown'):
                    continue
                if artist_correct.lower() == 'n/a':
                    continue
                # store in dict to update df
                c += 1
                missing_track_map[track_input]=artist_correct
                print(f'{track_input}: {artist_correct}')                    
            except Exception as error:
                print('error', line)
                print(error)
                continue
                
        print(f"{c} lines processed, total {n_missing-len(slist)}, {len(slist)} of {n_missing} remaining")
        
    return missing_track_map
                
missing_track_map = missing_artists(missing_artist_df)



2023-05-21 20:25:02.696233 sending 139 rows... received 139 lines...
( ): Sigur Rós
..... in paris: Frank Sinatra
1, 2, buckle my shoes.: Mother Goose Club
10,000 days (wings pt. 2): Tool
12 stout street: The Pogues
23: Jimmy Eat World
26: Paramore
3 little birds: Bob Marley
7 weeks and 3 days: Maria Taylor
74-75: The Connells
86d - no escort: Cass McCombs
a bitter sweet genesis for him and her: The Dear Hunter
a boy and a girl: Eric Whitacre
a capella arrangement, agnus dei: Samuel Barber
a case of you: Joni Mitchell
a change is gonna come: Sam Cooke
a dream is a wish your heart makes: Ilene Woods
a glimpse of us: Steve Gibbs
a la claire fontaine: Traditional
a lack of color: Death Cab for Cutie
a long way past the past: Ray LaMontagne
a love once new has now grown old: The Caretaker
a million dreams: Ziv Zaifman, Hugh Jackman, Michelle Williams
a mortal heart: The Paper Kites
a nightingale sang in berkeley square: Vera Lynn
a nova vida: Carter Burwell
a pillow of winds: Pink Floyd
a 

2023-05-21 20:28:42.652101 sending 132 rows... received 132 lines...
chamber of reflection: Mac DeMarco
champagne supernova: Oasis
chan chan: Buena Vista Social Club
chances are: Johnny Mathis
charging fort wagner: James Horner
che gelida manina: Giacomo Puccini
che herizade: Ravi Shankar
chelsea hotel: Leonard Cohen
cherry waves: Deftones
cherry wine: Hozier
chevaliers de sangreal: Hans Zimmer
chicken thigh: Clarence Williams
china: Tori Amos
choot volume 1: Badshah
chopin: Frédéric Chopin
chopin nocturne op 27 no 2: Frédéric Chopin
chorale from jupiter: Johann Sebastian Bach
christmas ep: She & Him
cigarettes and coffee: Otis Redding
cinema paradiso: Ennio Morricone
circle game: Joni Mitchell
circles: Post Malone
city of angels: Thirty Seconds to Mars
city of stars: Ryan Gosling & Emma Stone
city of stars duet: Ryan Gosling & Emma Stone
clair de lune: Claude Debussy
clair de lune - leopold stokowski’s orchestral version: Claude Debussy
claire de lune: Claude Debussy
claire de lune, m

2023-05-21 20:32:26.247815 sending 147 rows... received 147 lines...
follow you live at the royal albert hall: bring me the horizon
following: eden
for emily wherever i may find her: simon & garfunkel
for emily, whenever i may find her: simon & garfunkel
for emma, forever ago: bon iver
for my lady: the moody blues
for river: to the moon soundtrack
for you: coldplay
forever autumn: justin hayward
foxtrot fleur: the world is a beautiful place & i am no longer afraid to die
free bird: lynyrd skynyrd
freebird: lynyrd skynyrd
freezing moon: mayhem
from the window,to the wall: lil jon
fruits basket opening song: rhapsody
fuck the doomed, you're on your own: clarence clarity
fuiste tu: ricardo arjona
funeral song: the rasmus
fungi forest from the donkey kong 64 ost: grant kirkhope
fur elise: ludwig van beethoven
futile devices: sufjan stevens
fuyu biyori: sato naoki
für elise: ludwig van beethoven
gabriel's oboe: ennio morricone
garden state soundtrack: various artists
gently on my mind: glen

2023-05-21 20:37:18.191379 sending 153 rows... received 153 lines...
jupiter from the planets: Gustav Holst
just a tribute: Tenacious D
just as i am: Air Supply
just breathe: Pearl Jam
just monika: Dan Salvato
just the two of us: Grover Washington Jr.
k-rose radio station from gta san andreas: Various Artists
kainé salvation: Keiichi Okabe
kairi's theme: Yoko Shimomura
kaisi hai ye rut: Asha Bhosle
kalimankou denkou: Bulgarian State Radio & Television Female Vocal Choir
kalinka: Ivan Larionov
kaneda's death: Geinoh Yamashirogumi
karelian- lament for boromir: Karelian Folk Music Ensemble
karma police: Radiohead
kathy's song: Simon & Garfunkel
kathy’s song: Simon & Garfunkel
kenya's national anthem: Graham Hyslop
key of the twilight: Yuki Kajiura
khooneye ma: Mohsen Namjoo
kill the sun: Xandria
killing me softly: Roberta Flack
kimi ga soba ni iru youni: Maiko Fujita
kiss from a rose: Seal
kiss from a rose - wake me: Seal
kiss me: Sixpence None the Richer
knife dance: Erutan
koi: Nobuo Ue

2023-05-21 20:40:46.640122 sending 136 rows... received 136 lines...
never let me go: Florence + The Machine
never my love: The Association
new favorite: Alison Krauss & Union Station
new magic wand: Tyler, The Creator
new moon melody: George Winston
new york nagaram: A. R. Rahman
nico’s red truck: Atmosphere
night flight: Led Zeppelin
nights in white satin: The Moody Blues
njosnavelin: Sigur Rós
no need to argue: The Cranberries
no one: Alicia Keys
no quarter: Led Zeppelin
nocturne in c minor: Chopin
nocturne in e flat major: Chopin
nocturne in e minor: Chopin
non nobis and te deum: Patrick Doyle
none: Morphine
norman fucking rockwell: Lana Del Rey
norman's walk: Jon Brion
norwegian wood: The Beatles
not tomorrow: Silent Hill
nothing compares 2 u: Sinead O'Connor
nothing else matters live era 1996-2011: Metallica
nothing gonna change my love for you: George Benson
nothing matters when we're dancing: The Magnetic Fields
notions: The Cranberries
nude: Radiohead
nuns singing in saint-pie

2023-05-21 20:44:20.948911 sending 148 rows... received 148 lines...
scriabin piano concerto second movement: Alexander Scriabin
sea of dreams: Oberhofer
sea of tranquility: Ernest Hood
sea shanty 2: Ian Taylor
sea shanty- wellerman: Nathan Evans
secret library daguerreo: Nobuo Uematsu
secret of kells - aisling's song (pangur bán): Christen Mooney
secret of the deep sea: Yasunori Mitsuda
see you again: Wiz Khalifa
seeing you again: Dan Fogelberg
seize the day or die regretting the time you lost its empty and cold without you here please tell me what we had is reeeeeeaaaal: Avenged Sevenfold
sellerisången: Traditional Swedish Folk Song
send her my love: Journey
send in the clowns: Judy Collins
send me a peach: Blossom Dearie
serenade for strings 2nd movement: Pyotr Ilyich Tchaikovsky
serenata de amor: Agustín Lara
set list: The Frames
set the fire to the third bar: Snow Patrol
seven spanish angels: Willie Nelson and Ray Charles
seventh petal: Makoto Shinkai
sh-boom: The Chords
shame in 

2023-05-21 20:48:11.825169 sending 104 rows... received 104 lines...
the look of love: Dusty Springfield
the love theme from cinema paradiso: Ennio Morricone
the love you take: The Beatles
the lovers waltz: Jay Ungar and Molly Mason
the magic of the orchid: Yanni
the margarita song: Jimmy Buffett
the middle: Zedd, Maren Morris, Grey
the mind electric: Miranda Cosgrove
the moon represents my heart: Teresa Teng
the music box dancer: Frank Mills
the music of stillness: Deuter
the name of life from spirited away: Joe Hisaishi
the neverending story theme: Limahl
the night we met: Lord Huron
the nightingale: Carole King
the noose: A Perfect Circle
the noose!!!: A Perfect Circle
the numbers: Radiohead
the nurse walks in with her head hung low, and the cardinal hits the window.: mewithoutYou
the nurse who loved me (cover): A Perfect Circle
the one: Kodaline
the one about blonde hair and blue eyes.: Luke Bryan
the one from my near death experience: Kenny Chesney
the one i've just had written fo

In [33]:
missing_track_map 


{'( )': 'Sigur Rós',
 '..... in paris': 'Frank Sinatra',
 '1, 2, buckle my shoes.': 'Mother Goose Club',
 '10,000 days (wings pt. 2)': 'Tool',
 '12 stout street': 'The Pogues',
 '23': 'Jimmy Eat World',
 '26': 'Paramore',
 '3 little birds': 'Bob Marley',
 '7 weeks and 3 days': 'Maria Taylor',
 '74-75': 'The Connells',
 '86d - no escort': 'Cass McCombs',
 'a bitter sweet genesis for him and her': 'The Dear Hunter',
 'a boy and a girl': 'Eric Whitacre',
 'a capella arrangement, agnus dei': 'Samuel Barber',
 'a case of you': 'Joni Mitchell',
 'a change is gonna come': 'Sam Cooke',
 'a dream is a wish your heart makes': 'Ilene Woods',
 'a glimpse of us': 'Steve Gibbs',
 'a la claire fontaine': 'Traditional',
 'a lack of color': 'Death Cab for Cutie',
 'a long way past the past': 'Ray LaMontagne',
 'a love once new has now grown old': 'The Caretaker',
 'a million dreams': 'Ziv Zaifman, Hugh Jackman, Michelle Williams',
 'a mortal heart': 'The Paper Kites',
 'a nightingale sang in berkeley s

In [34]:
# check for reasonableness, clean up and apply
df['track']=df['track'].astype(str)
df['artist2'] = df.apply(lambda row: missing_track_map[row.track.lower().strip()] if row.artist=="" and row.track.lower().strip() in missing_track_map else row.artist, axis=1)
df.loc[df['artist'] != df['artist2']]



Unnamed: 0,post_id,post_score,artist,track,artist2
18972,jhdmda9,253,,Blue Ridge Mountains,Fleet Foxes
11409,jhbxdhd,120,,Mona Lisas and Mad Hatters,Elton John
18655,jhc6zgm,52,,Send in the Clowns,Judy Collins
19690,jhc6j1p,33,,Chopin,Frédéric Chopin
9718,jhd93b4,32,,The Shrine/An Argument,Fleet Foxes
...,...,...,...,...,...
6287,jhd15nl,1,,I’m on Fir,Bruce Springsteen
3103,jhd9vtw,1,,Two sparrows in a hurricane,Tanya Tucker
3116,jhg11nu,1,,Can i put my balls in your jaws,Eminem
3117,jhgx5i7,1,,Suprised no one is mentioning gucci gang,Lil Pump


In [35]:
df['artist'] = df.apply(lambda row: missing_track_map[row.track.lower()] if row.artist=="" and row.track.lower() in missing_track_map else row.artist, axis=1)



In [36]:
dupes = [k for k in missing_track_map.keys() if k in missing_map]

['..... in paris',
 '23',
 '26',
 '3 little birds',
 '74-75',
 '86d - no escort',
 'a bitter sweet genesis for him and her',
 'a case of you',
 'a change is gonna come',
 'a la claire fontaine',
 'a lack of color',
 'a million dreams',
 'a nightingale sang in berkeley square',
 'a pillow of winds',
 'a summer place theme',
 'a thing on strings',
 'a thousand years',
 'a tine for us',
 'a whiter shade of pale',
 'a whole new world',
 'achilles come down',
 'across the universe',
 'adagio for sparatacus & phrygia',
 'adagio for strings',
 'adagio in g minor',
 "adam's song",
 'africa',
 'agape',
 "ain't no sunshine",
 'aisatsana 102',
 'ajj din chadheya',
 'akuma no ko',
 'alberto balsalm',
 'alesund',
 'all across the universe.',
 'all i ask of you',
 'all is full of love',
 'all love can be',
 'all mine',
 'all songs of pink floyd',
 'all star',
 'always gold',
 'amazing grace',
 'amelia',
 'america',
 'american pie',
 'american tune',
 'amish paradise',
 'amor de mi alma',
 'amy ray',

In [37]:
# add new ones to missing_artists.csv
temp = pd.DataFrame({'track': missing_track_map.keys(),
              'artist': missing_track_map.values()}) \
    .sort_values('track')

temp.loc[]
    .to_csv('missing_artists_new.csv', index=False)


# Fix typos, abbreviations, missing artists using ChatGPT

In [51]:
artist_map = {}
try:
    artist_map = pd.read_csv("artist_map.csv")
    artist_map = dict(zip(artist_map['artist'],artist_map['map']))
except:
    pass
artist_map

{" Kamakawiwo'ole'": "Israel Kamakawiwo'ole",
 ' Otubanjo': 'Oliver Otubanjo',
 'Andrea Bocelli, Sarah Brightman': 'Andrea and Matteo Bocelli',
 'Andrew Lloyd Webber, Sarah B': 'Andrew Lloyd Webber, Sarah Brightman',
 'Bon Iver, St. V': 'Bon Iver',
 'Bon Iver, St. V.': 'Bon Iver',
 'Bon Iver, St. Vincent': 'Bon Iver',
 'Brad Paisley, Allison Krauss': 'Alison Krauss',
 'Bryce Dessner, James McAlister, Nico Muhly and Sufjan Stevens': 'Bryce Dessner, James McAlister, Nico Muhly, Sufjan Stevens',
 'Case, lang, Veirs': 'Case / Lang / Veirs',
 'Crosby, Stills & Nash': 'Crosby Stills & Nash',
 'Crosby, Stills Nash': 'Crosby Stills & Nash',
 'Crosby, Stills and Nash': 'Crosby Stills & Nash',
 'Crosby, Stills, & Nash': 'Crosby Stills & Nash',
 'Crosby, Stills, Nash & Young': 'Crosby, Stills, Nash, Young',
 'Crosby, Stills, Nash and Young': 'Crosby, Stills, Nash, Young',
 'Crosby, Stills, Nash': 'Crosby Stills & Nash',
 'Crosby, Stills, Nash, & Young': 'Crosby, Stills, Nash, Young',
 'Crosby, St

In [52]:
df['artist2'] = df['artist'].apply(lambda s: artist_map[s] if s in artist_map else s)
df.loc[df['artist'] != df['artist2']]


Unnamed: 0,post_id,post_score,artist,track,artist2
20237,jhcc2q8,5014,Israel Kamakawiwoʻole,Over the Rainbow,Israel Kamakawiwoole
20239,jhc74qz,2978,Edith Piaf,La Vie en Rose,Édith Piaf
20226,jhbm8ne,2461,Sigur Ros,Hoppipolla,Sigur Rós
20245,jhc4ym3,1372,Ben E. King,Stand By Me,Ben E King
19633,jhc1tl5,832,Israel Kamakawiwoʻole,Somewhere Over the Rainbow/Wonderful World,Israel Kamakawiwoole
...,...,...,...,...,...
3121,jhhoaxo,1,Israel Kamakawiwoʻole,Somewhere Over The Rainbow/What A Wonderful World,Israel Kamakawiwoole
3113,jhere0a,1,Queensrÿche,Silent Lucidity,Queensryche
3097,jhcwka0,1,MGŁA,Exercises is Futility V,Mgla
3091,jhd1ecx,1,City and Color,Woman,City and Colour


In [53]:
# apply the map
df['artist'] = df['artist'].apply(lambda s: artist_map[s] if s in artist_map else s)


In [54]:
prompt_prefix2 = """You will act as a proofreader. I will provide you a list of recording artists or composers.
You will review each input artist for any spelling errors or abbreviations and provide the corrected full artist without abbreviation. 
You will provide them in CSV format, one record per line in the following order: input_artist, corrected_artist. Enclose each field in double-quotes.
The input is:

"""


In [55]:
# proofread / dedupe artists
# may want to run this whole sequence a couple of times and update df, silver.csv

def dedupe_artists(artist_df):
    
    nposts = 1000
    artist_map

    slist = sorted(artist_df['artist'].tolist())
    n_artists=len(slist)

    while(slist):  # still artists to process
        print(datetime.now(), end=" ")

        prompt = ""
        tokens_to_date = count_tokens(prompt_prefix2)
        rows = 0
        for _ in range(nposts):  # add up to 100 posts to the prompt
            if slist:
                if tokens_to_date + count_tokens(slist[0]) < 1024:
                    artist = f'{slist.pop(0)}\n'
                    prompt += artist
                    tokens_to_date += count_tokens(artist)
                    rows += 1
            else:
                break
        print(f"sending {rows} rows...", end=" ")

        response = get_response(prompt, prompt_prefix2, verbose=False)
        if response is None:   # FAIL - retries exhausted
            print('Bailing to next chunk')
            continue

        if not response:
            print("there was a problem, check the payload")


        lines = response.split("\n")
        print(f"received {len(lines)} lines...")
        # sometimes doesn't match, chatgpt monkeys skip some

        c=0
        for line in lines:
            try:
                csv_values = csv_validate_re.findall(line) 
                if len(csv_values) != 2:
                    print('%d values found' % len(csv_values), line)
                    continue
                artist_input, artist_correct = csv_values[0].strip(), csv_values[1].strip()
                # fix artist enclosed in quotes, parens, etc.
                while len(artist_input) >= 2 and (not artist_input[0].isalnum()) and artist_input[0] == artist_input[-1]:
                    artist_input = artist_input[1:-1]
                while len(artist_correct) and (not artist_correct[0].isalnum()) and artist_correct[0] == artist_correct[-1]:
                    artist_correct = artist_correct[1:-1]
                # if it matches modulo case then skip
                if artist_input.lower() == artist_correct.lower():
                    continue
                # if it wasn't found then skip
                if len(artist_correct) == 0:
                    print("empty artist returned")
                    continue
                if artist_correct.lower().startswith('unknown'):
                    continue
                if artist_correct.lower() == 'n/a':
                    continue
                if artist_correct.lower() == "no correction needed":
                    continue
                # store in dict to update df
                c+=1
                artist_map[artist_input]=artist_correct
                print(f'"{artist_input}", "{artist_correct}"')
            except Exception as error:
                print('error', line)
                print(error)
                continue
        print(f"{c} lines processed, total {n_artists-len(slist)}, {len(slist)} of {n_artists} remaining")
        
    return artist_map

artist_map=dedupe_artists(artist_df)

print(datetime.now())


2023-05-21 21:30:34.642756 sending 211 rows... An exception occurred: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID be14bcec1b65bf69ffb872865b0c589e in your message.)
Retrying chunk...
received 209 lines...
"$not", "Snot"
"uicideboy", "Suicideboys"
"(G)-IDLE", "(G)I-DLE"
"*nsync", "NSYNC"
empty artist returned
"1975", "The 1975"
"2 cellos", "2Cellos"
"2Pac", "Tupac Shakur"
"2pac", "Tupac Shakur"
"3", "Three"
"30 seconds to mars", "Thirty Seconds to Mars"
"411", "Four One One"
"42 dougg and lil baby", "42 Dugg and Lil Baby"
"5sos", "5 Seconds of Summer"
"8485", "88rising"
"88lien", "88rising"
"A R Rahman", "A. R. Rahman"
"A Silver Mt. Zion", "Thee Silver Mt. Zion Memorial Orchestra"
"A Touch of Class aka ATC", "ATC"
"A$AP Rocky", "ASAP Rocky"
"A.R. Rahman", "A. R. Rahman"
"A.R.Rehman", "A. R. Rahman"
"ALanis Morisette", "Alanis Morissette"


KeyboardInterrupt: 

In [None]:
print(artist_map)

In [None]:
len(old_artist_map)

In [None]:
len(artist_map)

In [None]:
# save in artist_map.csv but no dupes
old_artist_map = pd.read_csv("artist_map.csv")
old_artist_map = dict(zip(old_artist_map['artist'],old_artist_map['map']))
not_dupes = {k: artist_map[k] for k in artist_map.keys() if k not in old_artist_map}
not_dupes
len(not_dupes)


In [None]:
pd.DataFrame({'artist': not_dupes.keys(), 'map': not_dupes.values()}).to_csv('artist_map_new.csv', index=False)

In [None]:
# check the map for reasonableness
# it does pretty smart stuff like map nin to Nine Inch Nails 
# but if it screws up that artist probably won't show up in spotify
df['artist2'] = df['artist'].apply(lambda s: artist_map[s] if s in artist_map else s)
df.loc[df['artist'] != df['artist2']]


In [None]:
# run again if desired

In [56]:
df.loc[df['artist']=="Simon And Garfunkel", 'artist'] = "Simon & Garfunkel"

In [57]:
df.loc[df['artist'].isna(), 'artist']=""
df = df.drop(df.loc[df['artist'].str.lower().str.startswith('unknown')].index)
df = df.drop(df.loc[df['artist'].str.lower().str.startswith('various')].index)
df = df.drop(df.loc[df['artist'].str.lower().str.startswith('no correction needed')].index)
df = df.drop(df.loc[df['artist']==''].index)

artist_df = df[['artist','track']].groupby('artist') \
    .count() \
    .sort_values('track', ascending=False) \
    .reset_index()

artist_df.head(20)

Unnamed: 0,artist,track
0,The Beatles,407
1,Simon & Garfunkel,247
2,Radiohead,243
3,Fleetwood Mac,178
4,Jeff Buckley,160
5,Pink Floyd,158
6,Claude Debussy,155
7,Led Zeppelin,152
8,Sigur Rós,137
9,The Beach Boys,129


# Dedupe with pandas_dedupe

In [58]:
def fix_leading_trailing(s):
    """First and last should be alphanumeric"""
    # regex prob better if re.match('^\W+(.*)\W+$',playerName): 

    s = str(s)
    while len(s) >= 2 and (not s[0].isalnum()) and s[0] == s[-1]:
        s = s[1:-1]
            
    return s.lower().strip()


In [59]:
df['artist_dedupe'] = df['artist'].apply(fix_leading_trailing)
df = df.drop(df.loc[df['artist_dedupe'].str.startswith('unknown')].index)
df = df.drop(df.loc[df['artist_dedupe'].str.startswith('various')].index)
df = df.drop(df.loc[df['artist_dedupe'].str.startswith('no artist found')].index)
df = df.drop(df.loc[df['artist_dedupe']=='none'].index)
df = df.drop(df.loc[df['artist_dedupe']==''].index)
df = df.drop(df.loc[df['artist_dedupe']=='post_score'].index)


In [60]:
df['artist_dedupe'] = df['artist_dedupe'].apply(lambda s: s[4:] if s[:4]=='the ' else s)

df.loc[df['artist_dedupe']=='band', 'artist_dedupe']='the band'



In [61]:
df[['artist_dedupe', 'post_score']] \
    .groupby('artist_dedupe') \
    .agg( \
         count=('post_score', 'count'), \
        ) \
    .reset_index() \
    .sort_values('count', ascending=False) 

Unnamed: 0,artist_dedupe,count
498,beatles,410
5070,simon & garfunkel,248
4591,radiohead,243
1870,fleetwood mac,179
2616,jeff buckley,160
...,...,...
2470,isaac albéniz-andrés segovia,1
2469,irresponsibles,1
2468,irresistible force,1
2465,iron butterfly,1


In [62]:
dedupe_df = df[['artist', 'artist_dedupe', 'post_score']] \
    .groupby(['artist', 'artist_dedupe']) \
    .count() \
    .sort_values('post_score', ascending=False) \
    .reset_index() \
    .reset_index() 

dedupe_df


Unnamed: 0,index,artist,artist_dedupe,post_score
0,0,The Beatles,beatles,407
1,1,Simon & Garfunkel,simon & garfunkel,247
2,2,Radiohead,radiohead,243
3,3,Fleetwood Mac,fleetwood mac,178
4,4,Jeff Buckley,jeff buckley,160
...,...,...,...,...
6207,6207,"Jeff Buckley, The Righteous Brothers, Johann P...","jeff buckley, the righteous brothers, johann p...",1
6208,6208,Jeannette,jeannette,1
6209,6209,Jeanette,jeanette,1
6210,6210,Jean-Yves Thibaudet,jean-yves thibaudet,1


In [65]:
# reset dedupe learned settings
# !rm dedupe_dataframe_learned_settings 
# !rm dedupe_dataframe_training.json   
dedupe_df2 = pandas_dedupe.dedupe_dataframe(dedupe_df, ['artist_dedupe'])


Importing data ...
Reading from dedupe_dataframe_learned_settings
Clustering...


  dedupe_df2 = pandas_dedupe.dedupe_dataframe(dedupe_df, ['artist_dedupe'])


# duplicate sets 5660


In [66]:
dedupe_df2

Unnamed: 0,index,artist,artist_dedupe,post_score,cluster id,confidence
0,0,the beatles,beatles,407,0,0.736693
1,1,simon garfunkel,simon garfunkel,247,1,0.736693
2,2,radiohead,radiohead,243,485,1.000000
3,3,fleetwood mac,fleetwood mac,178,2,0.606397
4,4,jeff buckley,jeff buckley,160,3,0.473825
...,...,...,...,...,...,...
6207,6207,"jeff buckley, the righteous brothers, johann p...","jeff buckley, the righteous brothers, johann p...",1,5656,1.000000
6208,6208,jeannette,jeannette,1,5657,1.000000
6209,6209,jeanette,jeanette,1,5658,1.000000
6210,6210,jean-yves thibaudet,jean-yves thibaudet,1,5659,1.000000


In [67]:
dedupe_df['cluster id'] = dedupe_df2['cluster id']
name2i = {a: i for i, a in zip(dedupe_df['cluster id'].tolist(), dedupe_df['artist_dedupe'].tolist())}
df['artist_index'] = df['artist_dedupe'].apply(lambda s: name2i[s])
df


Unnamed: 0,post_id,post_score,artist,track,artist2,artist_dedupe,artist_index
20230,jhbktrn,13941,Claude Debussy,Claire de Lune,Claude Debussy,claude debussy,487
20216,jhc2dyv,6996,Erik Satie,Gymnopédies,Erik Satie,erik satie,49
20221,jhc6oud,6144,Simon & Garfunkel,Scarborough Fair,Simon & Garfunkel,simon & garfunkel,1
20237,jhcc2q8,5014,Israel Kamakawiwoole,Over the Rainbow,Israel Kamakawiwoole,israel kamakawiwoole,22
20249,jhbjlwj,4332,The Cranberries,Dreams,The Cranberries,cranberries,26
...,...,...,...,...,...,...,...
3118,jhgycrq,1,Nicki Minaj,Anaconda,Nicki Minaj,nicki minaj,2552
3110,jhdsdrq,1,Nightwish,The Islander,Nightwish,nightwish,547
3107,jhdd0mb,1,Three Dog Night,Pieces of April,Three Dog Night,three dog night,1897
3109,jhdpp17,1,Lenka,billy beane’s daughter singing ‘the show’ in m...,Lenka,lenka,1157


In [68]:
df.loc[(df['artist_index'].isna())]
df.loc[(df['artist_index']==0)]

Unnamed: 0,post_id,post_score,artist,track,artist2,artist_dedupe,artist_index
20228,jhc1e1u,3675,The Beatles,In my Life,The Beatles,beatles,0
20232,jhbf5ng,2314,The Beatles,Blackbird,The Beatles,beatles,0
18226,jhc2oi2,347,The Beatles,Let it be,The Beatles,beatles,0
11078,jhc943z,291,The Beatles,Yesterday,The Beatles,beatles,0
19666,jhc8851,218,The Beatles,Let it Be,The Beatles,beatles,0
...,...,...,...,...,...,...,...
19098,jhecz4i,1,The Beatles,In My Life,The Beatles,beatles,0
19103,jheo8ur,1,The Beatles,Julia,The Beatles,beatles,0
19104,jheyto3,1,The Beatles,In My Life,The Beatles,beatles,0
18909,jhdp48d,1,The Beatles,Blackbird,The Beatles,beatles,0


In [69]:
dedupe_df

Unnamed: 0,index,artist,artist_dedupe,post_score,cluster id
0,0,The Beatles,beatles,407,0
1,1,Simon & Garfunkel,simon & garfunkel,247,1
2,2,Radiohead,radiohead,243,485
3,3,Fleetwood Mac,fleetwood mac,178,2
4,4,Jeff Buckley,jeff buckley,160,3
...,...,...,...,...,...
6207,6207,"Jeff Buckley, The Righteous Brothers, Johann P...","jeff buckley, the righteous brothers, johann p...",1,5656
6208,6208,Jeannette,jeannette,1,5657
6209,6209,Jeanette,jeanette,1,5658
6210,6210,Jean-Yves Thibaudet,jean-yves thibaudet,1,5659


In [70]:
# map to artist 
tempdf = dedupe_df[['artist_dedupe', 'artist', 'cluster id', 'post_score']] \
    .groupby(['artist_dedupe', 'cluster id']) \
    .agg( \
         count=('post_score', 'count'), \
         artist=('artist', 'first') \
        ) \
    .reset_index() \
    .sort_values('count', ascending=False) 
with pd.option_context("display.max_rows", 9999):
    display(tempdf.head(100))

Unnamed: 0,artist_dedupe,cluster id,count,artist
2865,juice wrld,257,4,Juice Wrld
4301,paper kites,103,3,The Paper Kites
911,carpenters,55,3,The Carpenters
1205,cranberries,26,3,The Cranberries
4027,national,148,3,The National
5224,staves,374,3,The Staves
3469,lumineers,134,3,The Lumineers
4521,pretenders,202,3,The Pretenders
2910,jvke,81,3,JVKE
5034,shins,164,2,The Shins


In [71]:
i2name = {i: a for i, a in zip(tempdf['cluster id'].tolist(), tempdf['artist'].tolist())}
df['artist'] = df.apply(lambda r: i2name[r.artist_index], axis=1)
df

Unnamed: 0,post_id,post_score,artist,track,artist2,artist_dedupe,artist_index
20230,jhbktrn,13941,Claude Debussy,Claire de Lune,Claude Debussy,claude debussy,487
20216,jhc2dyv,6996,Erik Satie,Gymnopédies,Erik Satie,erik satie,49
20221,jhc6oud,6144,Simon & Garfunkel,Scarborough Fair,Simon & Garfunkel,simon & garfunkel,1
20237,jhcc2q8,5014,Israel Kamakawiwoole,Over the Rainbow,Israel Kamakawiwoole,israel kamakawiwoole,22
20249,jhbjlwj,4332,The Cranberries,Dreams,The Cranberries,cranberries,26
...,...,...,...,...,...,...,...
3118,jhgycrq,1,Nicki Minaj,Anaconda,Nicki Minaj,nicki minaj,2552
3110,jhdsdrq,1,Nightwish,The Islander,Nightwish,nightwish,547
3107,jhdd0mb,1,Three Dog Night,Pieces of April,Three Dog Night,three dog night,1897
3109,jhdpp17,1,Lenka,billy beane’s daughter singing ‘the show’ in m...,Lenka,lenka,1157


In [72]:
df.loc[df['artist'].str.lower().str.find('carp') >=0]

Unnamed: 0,post_id,post_score,artist,track,artist2,artist_dedupe,artist_index
19643,jhc74vp,360,Karen Carpenter,(They Long to Be) Close to You,Karen Carpenter,karen carpenter,1024
18193,jhcpyx8,31,The Carpenters,We've only just begun,Carpenters,carpenters,55
15286,jhcptrv,14,The Carpenters,Close to You,The Carpenters,carpenters,55
7627,jhd2fwe,11,The Carpenters,Superstar,The Carpenters,carpenters,55
13980,jhbtyu5,8,The Carpenters,Top of the World,The Carpenters,carpenters,55
...,...,...,...,...,...,...,...
20289,jhda2s0,1,The Carpenters,Top of the world,The Carpenters,carpenters,55
24677,jhf2jc7,1,The Carpenters,We've Only Just Begun,The Carpenters,carpenters,55
19577,jhdaead,1,Carpenter Brut,You're Mine,Carpenter Brut,carpenter brut,4334
19621,jhdalqp,1,The Carpenters,Close to you,The Carpenters,carpenters,55


In [73]:
df.groupby('track') \
    .count() \
    .reset_index() \
    .sort_values('artist', ascending=False) \
    .head(20)


Unnamed: 0,track,post_id,post_score,artist,artist2,artist_dedupe,artist_index
4035,Hallelujah,159,159,159,159,159,159
3802,God Only Knows,65,65,65,65,65,65
988,Ave Maria,63,63,63,63,63,63
9353,Songbird,62,62,62,62,62,62
6956,,55,55,55,55,55,55
11739,What a Wonderful World,53,53,53,53,53,53
11419,Vincent,51,51,51,51,51,51
8672,Saturn,51,51,51,51,51,51
7197,No track mentioned,44,44,44,44,44,44
5660,Landslide,43,43,43,43,43,43


In [74]:
df.loc[df['track']=='Claire de Lune', 'track']='Clair de Lune'


In [75]:
df['track2'] = df['track'].apply(fix_leading_trailing)


In [76]:
df = df.drop(df.loc[df['track2'].str.startswith('unknown')].index)
df = df.drop(df.loc[df['track2'].str.startswith('no track')].index)
df = df.drop(df.loc[df['track2']=='cover'].index)
df = df.drop(df.loc[df['track2']=='version'].index)
df = df.drop(df.loc[df['track2']=='anything'].index)
df = df.drop(df.loc[df['track2']=='none'].index)
df = df.drop(df.loc[df['track2'].str.startswith('no artist')].index)
df = df.drop(df.loc[df['track2'].str.startswith('various')].index)
df = df.drop(df.loc[df['track2']==''].index)
len(df)

22763

In [77]:
df[['artist', 'track', 'post_score', 'track2']] \
    .groupby(['artist', 'track2']) \
    .agg(sum= ('post_score', 'sum'), \
         track= ('track', 'first') \
        ) \
    .reset_index() \
    .sort_values('sum', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'sum': 'score'})



Unnamed: 0,artist,track2,score,track
0,Claude Debussy,clair de lune,21979,Clair de Lune
1,Erik Satie,gymnopédies,7124,Gymnopédies
2,Simon & Garfunkel,scarborough fair,6175,Scarborough Fair
3,Neil Young and Crazy Horse,harvest moon,5337,Harvest Moon
4,Israel Kamakawiwoole,over the rainbow,5092,Over the Rainbow
...,...,...,...,...
13448,Island Boys,imma just islaaaand boooooiiiiii,1,IMMA JUST ISLAAAAND BOOOOOIIIIII
13449,Isley Brothers,footsteps in the dark,1,Footsteps in the Dark
13450,Isley Brothers,summer breeze,1,Summer Breeze
13451,Isley Brothers,the highways of my life,1,The Highways of My Life


In [78]:
dedupe_track_df = df[['artist', 'track', 'track2', 'post_score']] \
    .groupby(['artist', 'track', 'track2']) \
    .count() \
    .sort_values('post_score', ascending=False) \
    .reset_index() \
    .reset_index() \
    .rename(columns={'post_score': 'count'})


dedupe_track_df


Unnamed: 0,index,artist,track,track2,count
0,0,Jeff Buckley,Hallelujah,hallelujah,96
1,1,Claude Debussy,Clair de Lune,clair de lune,65
2,2,The Beach Boys,God Only Knows,god only knows,62
3,3,Sleeping at Last,Saturn,saturn,51
4,4,Don McLean,Vincent,vincent,50
...,...,...,...,...,...
14931,14931,Hans Zimmer & Lisa Gerrard,Honor,honor,1
14932,14932,Hans Zimmer & Lisa Gerrard,Interstellar Album,interstellar album,1
14933,14933,Hans Zimmer & Lisa Gerrard,Interstellar Intro,interstellar intro,1
14934,14934,Hans Zimmer & Lisa Gerrard,Interstellar piano cover,interstellar piano cover,1


In [79]:
!rm dedupe_dataframe_learned_settings 
!rm dedupe_dataframe_training.json   
dedupe_track_df2 = pandas_dedupe.dedupe_dataframe(dedupe_track_df[['artist', 'track2']], 
                                                  ['artist','track2'],
                                                  canonicalize=True,)


Importing data ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dedupe_track_df2 = pandas_dedupe.dedupe_dataframe(dedupe_track_df[['artist', 'track2']],
  dedupe_track_df2 = pandas_dedupe.dedupe_dataframe(dedupe_track_df[['artist', 'track2']],
artist : yiruma
track2 : river flows in you

artist : yiruma
track2 : river flows in you

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


Starting active labeling...
y


artist : explosions in the sky
track2 : your hand in mine

artist : explosions in the sky
track2 : your hand in mine

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : billy joel
track2 : the downeaster alexa

artist : billy joel
track2 : the downeaster alex

2/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : joanna newsom
track2 : sawdust and diamonds

artist : joanna newsom
track2 : sawdust and diamond

3/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : opeth
track2 : patterns in the ivy ii

artist : opeth
track2 : patterns in the ivy

4/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : cass elliot
track2 : dream a little dream

artist : cass elliot
track2 : dream a little dream of me

5/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : elvis presley
track2 : cant help falling in love

artist : elvis presley
track2 : cant help falling in love with you

6/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : sigur ros
track2 : saeglopur

artist : sigur ros
track2 : saeglopur

7/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : vangelis
track2 : reve

artist : vangelis
track2 : reve

8/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : claude debussy
track2 : arabesque

artist : claude debussy
track2 : arabesque 1

9/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : franz liszt
track2 : liebestraum

artist : franz liszt
track2 : liebestraum no 3

10/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : sampha
track2 : (no one knows me) like the piano

artist : sampha
track2 : (no one knows me) like the piano in my mothers home

11/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : pink floyd
track2 : shine on you crazy diamond

artist : pink floyd
track2 : shine in you crazy diamond

12/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : pink floyd
track2 : shine on you crazy diamond

artist : pink floyd
track2 : shine in you crazy diamond

13/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : leo delibes
track2 : lakme - duo des fleurs (flower duet)

artist : leo delibes
track2 : lakme

14/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : johann sebastian bach
track2 : air

artist : johann sebastian bach
track2 : air on g string

15/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : gordon lightfoot
track2 : if you could read my mind

artist : gordon lightfoot
track2 : if ypu could read my mind

16/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : ella fitzgerald
track2 : evry time we say goodbye

artist : ella fitzgerald
track2 : every time we say goodbye

17/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : sarah brightman and andrea bocelli
track2 : time to say goodbye

artist : sarah brightman
track2 : time to say goodbye

18/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : gorillaz
track2 : empire ants

artist : gorillaz feat. little dragon
track2 : empire ants

19/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : alt-j
track2 : bloodflood

artist : alt-j
track2 : bloodflood part ii

20/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : patrick
track2 : je te laisserais de mots

artist : patrick
track2 : je te laisserai de mots

20/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : yiruma
track2 : river flows in you

artist : yiruma
track2 : river flow in you

21/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : jim croce
track2 : i got a name

artist : jim croce
track2 : i gotta name

22/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : jim croce
track2 : i got a name

artist : jim croce
track2 : i gotta name

23/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : elvis presley
track2 : cant help falling in love

artist : elvis presley
track2 : i cant help falling in love

24/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : elvis presley
track2 : cant help falling in love

artist : elvis presley
track2 : i cant help falling in love

25/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : simon
track2 : sound of silence

artist : simon garfunkel
track2 : sound of silence

26/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : jimi hendrix
track2 : little wing

artist : jimi
track2 : little wing

27/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : the moody blues
track2 : never comes the day

artist : x
track2 : never

28/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : israel kamakawiwoole
track2 : over the rainbow / what a wonderful world

artist : isreal iz kamakawiwoole
track2 : over the rainbow

28/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : simon
track2 : sound of silence

artist : simon garfunkel
track2 : sound of the silence

28/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : dave matthews
track2 : the maker

artist : dave matthews tim reynolds
track2 : 41

29/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : joji
track2 : glimpse of us

artist : joji
track2 : a glimpse of us

29/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : u2
track2 : one

artist : u2
track2 : sometimes you cant make it on your own

30/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : franz liszt
track2 : un suspiro

artist : franz liszt
track2 : un sospiro

30/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : edith piaf
track2 : la vie en rose

artist : edith piaf
track2 : la vei en rose

31/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : liana flores
track2 : rises the moon

artist : laura flores
track2 : rises the moon

32/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : doves
track2 : satellites

artist : dave matthews
track2 : satellite

33/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : roberta flack
track2 : first time ever i saw your face

artist : roberta flack
track2 : the first time ever i saw your face

33/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : simon garfunkel
track2 : 59th street bridge song

artist : simon garfunkel
track2 : the 59th street bridge song (feeling groovy)

34/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : stevie wonder
track2 : as

artist : stevie wonder
track2 : isnt she lovely

35/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : iron wine
track2 : trapeze swinger

artist : iron wine
track2 : the trapeze swinger

35/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : iron wine
track2 : the trapeze swinger

artist : iron wine
track2 : trapeze swinger

36/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : delirium
track2 : silence

artist : delirium ft. sarah mclachlan
track2 : silence

37/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : derek the dominos
track2 : layla

artist : derek and the dominos
track2 : layla

38/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : yes
track2 : fragile

artist : dustin ohalloran
track2 : fragile n.4

39/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : stephan mathieu
track2 : until i found you3

artist : stephan sanchez
track2 : until i found you

39/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : minmi samurai champloo
track2 : shiki no uta (samurai champloo ending)

artist : nujabes, samurai champloo, and minmi
track2 : shiki no uta (samurai champloo ending)

40/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : u2
track2 : bad

artist : u2
track2 : sometimes you cant make it on your own

41/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : u2
track2 : mlk

artist : u2
track2 : sometimes you cant make it on your own

41/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : radiohead
track2 : creep

artist : radiohead
track2 : street spirit and reckoner

41/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : lakme
track2 : flower duo

artist : lakme
track2 : the flower duet from lakme

41/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : u2
track2 : all i want is you

artist : u2
track2 : mlk

42/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : fleetwood mac / peter green
track2 : crystal

artist : pink
track2 : crystal ball

42/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : sia
track2 : breathe me

artist : skinshape
track2 : breathe

42/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : ben e king
track2 : stand by me

artist : florence and the nightingales
track2 : stand by me

42/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : leo delibes
track2 : the flower duet

artist : leo delibes
track2 : flower duet sung by anna netrebko elina garanca

42/10 positive, 15/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : death cab for cutie
track2 : soul meets body

artist : death cab for cutie
track2 : where soul meets body

42/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : joe satriani
track2 : always with you, always with me

artist : joe satriani
track2 : always with me always with you

43/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : the smashing pumpkins
track2 : 33

artist : the smashing pumpkins
track2 : appels and oranjes

44/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : pyotr ilyich tchaikovsky
track2 : the nutcracker op 71 pas de deux

artist : pyotr ilyich tchaikovsky
track2 : the nutcracker, op 71, act ii: no. 14, pas de duex

44/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : kanye west
track2 : 24

artist : kanye west
track2 : bittersweet poetry

45/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : sigur ros
track2 : n/a

artist : sigur ros
track2 : untitled 3 - samskeyti

45/10 positive, 18/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : howard shore featuring sir james galway
track2 : lord of the rings trilogy

artist : howard shore featuring sir james galway
track2 : lord of the rings soundtrack

45/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : johann sebastian bach
track2 : n/a

artist : johann sebastian bach
track2 : alle menschen mussen sterben

46/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : ludwig van beethoven
track2 : ode to joy

artist : ludwig van beethoven
track2 : piano concerto no. 5 in e-flat major op. 73 emperor: ii adagio un poco mosso

46/10 positive, 20/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : u2
track2 : all i want is you

artist : u2
track2 : mlk

46/10 positive, 21/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : kanye west
track2 : 24

artist : kanye west
track2 : devil in a new dress

46/10 positive, 22/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : stan rogers
track2 : lies

artist : stan rogers
track2 : the witch of the westmoreland

46/10 positive, 23/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : sufjan stevens
track2 : carrie and lowell

artist : sufjan stevens
track2 : carrie lowell

46/10 positive, 24/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : pink floyd
track2 : comfortably numb live in pompeii

artist : pink floyd
track2 : dogs

47/10 positive, 24/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : ludovico einaudi ft. greta svabo bech
track2 : fly

artist : ludovico einaudi ft. greta svabo bech
track2 : elegy for the arctic

47/10 positive, 25/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : otis redding
track2 : sitting on the dock of the bay

artist : otis redding
track2 : sitting on a dock of a bay

47/10 positive, 26/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : billy joel
track2 : n/a

artist : billy joel
track2 : new york state of mind

48/10 positive, 26/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : joe satriani
track2 : always with me, always with you

artist : joe satriani
track2 : always with you, always with me

48/10 positive, 27/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : led zeppelin
track2 : all my love

artist : led zeppelin
track2 : all of my love

49/10 positive, 27/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : johann sebastian bach
track2 : n/a

artist : johann sebastian bach
track2 : st matthews passion, erbarme dich

50/10 positive, 27/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : tool
track2 : wings for marie (pt 1)

artist : tool
track2 : wings for marie part 1

50/10 positive, 28/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : phoebe bridgers
track2 : song

artist : phoebe bridgers
track2 : motion sickness (demo/acoustic)

51/10 positive, 28/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : gustav holst
track2 : the planets, op. 32: iv. jupiter, the bringer of jollity

artist : gustav holst
track2 : jupiter

51/10 positive, 29/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : coldplay avicii
track2 : o

artist : coldplay avicii

52/10 positive, 29/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : la la land soundtrack
track2 : mia sebastian theme

artist : la la land soundtrack
track2 : mia and sebastians theme

52/10 positive, 30/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : stevie wonder
track2 : as

artist : stevie wonder
track2 : knocks me off my feet

53/10 positive, 30/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : louis armstrong jr.
track2 : what a wonderful world

artist : louis armstrong jr.
track2 : (its a )wonderful world

53/10 positive, 31/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : frederic chopin
track2 : nocturne op. 9 no. 2

artist : frederic chopin
track2 : nocturne no. 2 in e-flat major, op. 9 no. 2

54/10 positive, 31/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : bette midler
track2 : the rose

artist : bette midler
track2 : god help the outcasts from the hunchback of notre dame

55/10 positive, 31/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : johann sebastian bach
track2 : air

artist : johann sebastian bach
track2 : chello suite no. 1 in g major

55/10 positive, 32/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : michael nyman
track2 : the heart asks pleasure first

artist : michael nyman
track2 : the piano

55/10 positive, 33/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : prince tui teka
track2 : sometimes it snows in april

artist : prince and the revolution
track2 : sometimes it snows in april

55/10 positive, 34/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : neil young and crazy horse
track2 : little wing

artist : jimi
track2 : little wing

56/10 positive, 34/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : alpine
track2 : forever young

artist : audra mae the forest rangers
track2 : forever young

56/10 positive, 35/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : john frusciante
track2 : anne

artist : john frusciante
track2 : song to sing when im lonely

56/10 positive, 36/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : the rolling stones
track2 : shes a rainbow

artist : the rolling stones
track2 : shes like a rainbow

56/10 positive, 37/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : the rolling stones
track2 : shes a rainbow

artist : the rolling stones
track2 : shes like a rainbow

57/10 positive, 37/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : elvis presley
track2 : cant help falling in love with you

artist : elvis presley
track2 : falling in love with you

58/10 positive, 37/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : ludwig van beethoven
track2 : fur elise

artist : ludwig van beethoven
track2 : piano concerto no. 5 in e-flat major op. 73 emperor: ii adagio un poco mosso

59/10 positive, 37/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : johann sebastian bach
track2 : air

artist : johann sebastian bach
track2 : cello suite no 1 in g major

59/10 positive, 38/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : johann sebastian bach
track2 : air

artist : johann sebastian bach
track2 : cello suite no 1 in g major

59/10 positive, 39/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : prince tui teka
track2 : sometimes it snows in april

artist : prince tui teka
track2 : n/a

59/10 positive, 40/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : johann sebastian bach
track2 : cello suite no 1 in g major

artist : johann sebastian bach
track2 : n/a

59/10 positive, 41/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : johann sebastian bach
track2 : jesu, joy of mans desiring

artist : johann sebastian bach
track2 : n/a

59/10 positive, 42/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : ludwig van beethoven
track2 : ninth symphony

artist : ludwig van beethoven
track2 : 7th symphony in a major opus 92

59/10 positive, 43/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


Clustering...
# duplicate sets 5026


In [80]:
dedupe_track_df2

Unnamed: 0,artist,track2,cluster id,confidence,canonical_artist,canonical_track2
0,jeff buckley,hallelujah,53,0.578420,"leonard cohen, brandi carlile",hallelujah
1,claude debussy,clair de lune,54,0.578384,claude debussy,clair de lune - leopold stokowskis orchestral ...
2,the beach boys,god only knows,55,0.578356,the beach boys,dont talk (put your head on my shoulder)
3,sleeping at last,saturn,56,0.578371,sleeping at last,saturn (feat. tim fain)
4,don mclean,vincent,57,0.578382,don mclean,vincent (starry starry night)
...,...,...,...,...,...,...
14931,hans zimmer lisa gerrard,honor,163,0.578383,hans zimmer lisa gerrard,what are you going to do when you are not savi...
14932,hans zimmer lisa gerrard,interstellar album,163,0.578356,hans zimmer lisa gerrard,what are you going to do when you are not savi...
14933,hans zimmer lisa gerrard,interstellar intro,163,0.578358,hans zimmer lisa gerrard,what are you going to do when you are not savi...
14934,hans zimmer lisa gerrard,interstellar piano cover,163,0.578364,hans zimmer lisa gerrard,what are you going to do when you are not savi...


In [81]:
dedupe_track_df['track_id']=dedupe_track_df2['cluster id']
dedupe_track_df['confidence']=dedupe_track_df2['confidence']
dedupe_track_df

Unnamed: 0,index,artist,track,track2,count,track_id,confidence
0,0,Jeff Buckley,Hallelujah,hallelujah,96,53,0.578420
1,1,Claude Debussy,Clair de Lune,clair de lune,65,54,0.578384
2,2,The Beach Boys,God Only Knows,god only knows,62,55,0.578356
3,3,Sleeping at Last,Saturn,saturn,51,56,0.578371
4,4,Don McLean,Vincent,vincent,50,57,0.578382
...,...,...,...,...,...,...,...
14931,14931,Hans Zimmer & Lisa Gerrard,Honor,honor,1,163,0.578383
14932,14932,Hans Zimmer & Lisa Gerrard,Interstellar Album,interstellar album,1,163,0.578356
14933,14933,Hans Zimmer & Lisa Gerrard,Interstellar Intro,interstellar intro,1,163,0.578358
14934,14934,Hans Zimmer & Lisa Gerrard,Interstellar piano cover,interstellar piano cover,1,163,0.578364


In [82]:
dedupe_track_df3 = dedupe_track_df.loc[dedupe_track_df['confidence'] > 0.4][['artist', 'track', 'track_id']]
dedupe_track_df3


Unnamed: 0,artist,track,track_id
0,Jeff Buckley,Hallelujah,53
1,Claude Debussy,Clair de Lune,54
2,The Beach Boys,God Only Knows,55
3,Sleeping at Last,Saturn,56
4,Don McLean,Vincent,57
...,...,...,...
14931,Hans Zimmer & Lisa Gerrard,Honor,163
14932,Hans Zimmer & Lisa Gerrard,Interstellar Album,163
14933,Hans Zimmer & Lisa Gerrard,Interstellar Intro,163
14934,Hans Zimmer & Lisa Gerrard,Interstellar piano cover,163


In [83]:
names2i = {(artist, track): id for artist, track, id in zip(dedupe_track_df3['artist'], 
                               dedupe_track_df3['track'],
                               dedupe_track_df3['track_id'])
         }
names2i

{('Jeff Buckley', 'Hallelujah'): 53,
 ('Claude Debussy', 'Clair de Lune'): 54,
 ('The Beach Boys', 'God Only Knows'): 55,
 ('Sleeping at Last', 'Saturn'): 56,
 ('Don McLean', 'Vincent'): 57,
 ('Louis Armstrong Jr.', 'What a Wonderful World'): 58,
 ('Fleetwood Mac / Peter Green', 'Songbird'): 59,
 ('Ludwig van Beethoven', 'Moonlight Sonata'): 60,
 ('Bon Iver and St. Vincent', 'Holocene'): 61,
 ('Ludovico Einaudi ft. Greta Svabo Bech', 'Nuvole Bianche'): 62,
 ('Leonard Cohen, Brandi Carlile', 'Hallelujah'): 53,
 ('The Beatles', 'Blackbird'): 63,
 ('Fleetwood Mac / Peter Green', 'Landslide'): 64,
 ('The Beatles', 'Something'): 63,
 ('Johann Pachelbel', 'Canon in D'): 1046,
 ('The Cranberries', 'Linger'): 65,
 ('The Beatles', 'Yesterday'): 63,
 ('Led Zeppelin', 'The Rain Song'): 66,
 ('Radiohead', 'Nude'): 67,
 ('Neil Young and Crazy Horse', 'Harvest Moon'): 68,
 ('Lord Huron', 'The Night We Met'): 69,
 ('Aphex Twin', 'Avril 14th'): 70,
 ('The Goo Goo Dolls', 'Iris'): 71,
 ('The Beatles', 

In [84]:
df['track_index']= df.apply(lambda r: str(names2i[(r['artist'], r['track'])]) if (r['artist'], r['track']) in names2i else r['track'], axis=1)

In [85]:
df

Unnamed: 0,post_id,post_score,artist,track,artist2,artist_dedupe,artist_index,track2,track_index
20230,jhbktrn,13941,Claude Debussy,Clair de Lune,Claude Debussy,claude debussy,487,clair de lune,54
20216,jhc2dyv,6996,Erik Satie,Gymnopédies,Erik Satie,erik satie,49,gymnopédies,122
20221,jhc6oud,6144,Simon & Garfunkel,Scarborough Fair,Simon & Garfunkel,simon & garfunkel,1,scarborough fair,73
20237,jhcc2q8,5014,Israel Kamakawiwoole,Over the Rainbow,Israel Kamakawiwoole,israel kamakawiwoole,22,over the rainbow,92
20249,jhbjlwj,4332,The Cranberries,Dreams,The Cranberries,cranberries,26,dreams,65
...,...,...,...,...,...,...,...,...,...
3118,jhgycrq,1,Nicki Minaj,Anaconda,Nicki Minaj,nicki minaj,2552,anaconda,2619
3110,jhdsdrq,1,Nightwish,The Islander,Nightwish,nightwish,547,the islander,307
3107,jhdd0mb,1,Three Dog Night,Pieces of April,Three Dog Night,three dog night,1897,pieces of april,1678
3109,jhdpp17,1,Lenka,billy beane’s daughter singing ‘the show’ in m...,Lenka,lenka,1157,billy beane’s daughter singing ‘the show’ in m...,1389


In [86]:
tempdf = df[['artist', 'track', 'track_index', 'post_score']] \
    .groupby(['artist', 'track_index']) \
    .agg(sum= ('post_score', 'sum'), \
         track= ('track', 'first') \
        ) \
    .reset_index() \
    .sort_values(['sum'], ascending=False) \
    .rename(columns={'sum': 'score'}) \
    .reset_index(drop=True)
tempdf

Unnamed: 0,artist,track_index,score,track
0,Claude Debussy,54,22088,Clair de Lune
1,Simon & Garfunkel,73,11922,Scarborough Fair
2,The Beatles,63,9049,In my Life
3,Erik Satie,122,7620,Gymnopédies
4,The Cranberries,65,7305,Dreams
...,...,...,...,...
6359,Jens Lekman,4369,1,Night Falls Over Kortedala/The Linden Trees Ar...
6360,Jenny Hval,507,1,Die
6361,Jennifer Hudson,4368,1,Carry That Weight
6362,Jenevieve,4367,1,Babe Powder


In [87]:
df

Unnamed: 0,post_id,post_score,artist,track,artist2,artist_dedupe,artist_index,track2,track_index
20230,jhbktrn,13941,Claude Debussy,Clair de Lune,Claude Debussy,claude debussy,487,clair de lune,54
20216,jhc2dyv,6996,Erik Satie,Gymnopédies,Erik Satie,erik satie,49,gymnopédies,122
20221,jhc6oud,6144,Simon & Garfunkel,Scarborough Fair,Simon & Garfunkel,simon & garfunkel,1,scarborough fair,73
20237,jhcc2q8,5014,Israel Kamakawiwoole,Over the Rainbow,Israel Kamakawiwoole,israel kamakawiwoole,22,over the rainbow,92
20249,jhbjlwj,4332,The Cranberries,Dreams,The Cranberries,cranberries,26,dreams,65
...,...,...,...,...,...,...,...,...,...
3118,jhgycrq,1,Nicki Minaj,Anaconda,Nicki Minaj,nicki minaj,2552,anaconda,2619
3110,jhdsdrq,1,Nightwish,The Islander,Nightwish,nightwish,547,the islander,307
3107,jhdd0mb,1,Three Dog Night,Pieces of April,Three Dog Night,three dog night,1897,pieces of april,1678
3109,jhdpp17,1,Lenka,billy beane’s daughter singing ‘the show’ in m...,Lenka,lenka,1157,billy beane’s daughter singing ‘the show’ in m...,1389


In [88]:
tempdf = tempdf[['artist', 'track', 'score']]
display(tempdf.loc[tempdf['score'] > 4].head(20))
display(tempdf.loc[tempdf['score'] > 4].tail(20))


Unnamed: 0,artist,track,score
0,Claude Debussy,Clair de Lune,22088
1,Simon & Garfunkel,Scarborough Fair,11922
2,The Beatles,In my Life,9049
3,Erik Satie,Gymnopédies,7620
4,The Cranberries,Dreams,7305
5,Israel Kamakawiwoole,Over the Rainbow,6681
6,Neil Young and Crazy Horse,Harvest Moon,5337
7,Jim Croce,Time in a Bottle,5285
8,Mazzy Star,Fade into you,4357
9,Don McLean,"Vincent (Starry, Starry Night)",3989


Unnamed: 0,artist,track,score
1033,Ana Carolina Seu Jorge,Love Is A Losing Game,5
1034,Trevor Morris,Lost Elf,5
1035,Yuka Kitamura,Lord of Cinder,5
1036,Whiskey Myers,broken window serenade,5
1037,Leon Bridges,Beyond,5
1038,Bloc Party,This Modern Love,5
1039,Arijit Singh,Laal ishq,5
1040,Nikolai Rimsky-Korsakov,Scheherezade,5
1041,"10,000 Maniacs",Verdi Cries,5
1042,Yasunori Mitsuda,Scars of Time,5


In [89]:
df = tempdf.loc[tempdf['score'] > 4]
df.to_csv('silver.csv', index=False)
with open('silver.pkl', 'wb') as f:
    pickle.dump(df, f)


In [107]:
# I think dedupe may not be matching most popular form of artist
z = {'Hans Zimmer Radiohead':'Hans Zimmer',
'Fleetwood Mac / Peter Green':'Fleetwood Mac',
'Jeremy Soule featuring Asja':'Jeremy Soule',
'Cardi B featuring Megan Thee Stallion':'Cardi B',
'FWM':'Fleetwood Mac',
'Eric Whitacre ft. Voces8':'Eric Whitacre',
'Gorillaz feat. Little Dragon':'Gorillaz',
'RAC (feat. Katie Herzig)':'RAC',
'Jamie XX featuring Romy':'Jamie XX',
'Spell Songs featuring Julie Fowlis':'Spell Songs',
'Kenny G featuring Aaron Neville':'Kenny G',
'Charlie Haden Quartet featuring Norah Jones':'Charlie Haden Quartet',
'Wiz Khalifa feat. Charlie Puth':'Wiz Khalifa',
'Marshmello feat. Khalid':'Marshmello',
'YOSHIKI feat. HYDE':'YOSHIKI',
'Charlie Hunter Quartet featuring Norah Jones':'Charlie Hunter Quartet',
'Alison Krauss featuring Natalie MacMaster':'Alison Krauss',
'Hodson featuring Jay-Z':'Hodson',
'Nujabes feat. Cise Starr':'Nujabes',
'Flume feat. Tove Lo':'Flume',
'Drake feat. Yebba':'Drake',
'Radwimps featuring Toaka':'Radwimps',
'Weezer featuring Hayley Williams':'Weezer',
'Post Malone featuring Swae Lee':'Post Malone',
'Polyphia featuring Ichika':'Polyphia',
'Eminem featuring Dido':'Eminem',
'PJ Morton featuring Yebba':'PJ Morton',
'Aurora featuring Pomme':'Aurora',
'Kaskade featuring Haley':'Kaskade',
'XXXTENTACION featuring Scott James':'XXXTENTACION',
'Snow Patrol feat. Martha Wainwright':'Snow Patrol',
'Direct featuring Danyka Nadeau':'Direct',
'Erra feat. Courtney LaPlante from Spirit Box':'Erra',
'Ursine Vulpine featuring Annaca':'Ursine Vulpine',
'Howard Shore featuring Sir James Galway':'Howard Shore',
'Daft Punk feat. Paul Williams':'Daft Punk',
'Wooli featuring Delaney Kai':'Wooli',
'UNKLE featuring Thom Yorke':'UNKLE',
'Bastille feat. The Chamber Orchestra of London':'Bastille',
"Marty O'Donnell, Stan LePard, & Michael Salvatori":"Marty O'Donnell",
"Stan Getz, João Gilberto":"Stan Getz",
'RAC (feat. Katie Herzig)':'RAC',
'Taylor Swift ft. Bon Iver':'Taylor Swift',
'Calvin Harris ft. Florence Welch':'Calvin Harris',
'Brad Paisley ft. Alison Krauss':'Brad Paisley',
'Delirium ft. Sarah McLachlan':'Delirium',
'T-Pain ft. Akon, Mary J. Blige':'T-Pain',
'Manchester Orchestra ft. Daniel Radcliffe and Paul Dano':'Manchester Orchestra',
'Nujabes ft. MINMI':'Nujabes',
'Samurai Champloo ft. MINMI & Nujabes':'Samurai Champloo',
'MINMI ft. Nujabes':'MINMI',
'ODESZA ft. MARO':'ODESZA',
'Sharon Jones & The Dap-Kings ft. Lee Fields':'Sharon Jones & The Dap-Kings',
'Sarah Barrios ft. Eric Nam':'Sarah Barrios',
'David Arkenstone ft. Charlee Brooks':'David Arkenstone',
'XXYYXX ft. Anneka':'XXYYXX',
'Leonard Cohen, Brandi Carlile':'Leonard Cohen',
'Harry Waters Jr., Marvin Berry, and the Starlighters':'Harry Waters Jr.',
'Roberta Flack, Donny Hathaway':'Roberta Flack',
'Zedd, Maren Morris, Grey':'Zedd',
'Frank Ocean, James Blake':'Frank Ocean',
'Susan Suh, Robert Koch':'Susan Suh',
'Black Country, New Road':'Black Country',
'Khruangbin, Leon Bridges':'Khruangbin',
'Friendship, Emily Warren':'Friendship',
'Jonsi, Alex':'Jonsi',
'Khalid, Future':'Khalid',
'Duke Ellington, John Coltrane':'Duke Ellington',
'Clams Casino, Imogen Heap':'Clams Casino',
'Burial, Sacred Tapestry':'Burial',
'foudeqush, Ludwig Goransson':'foudeqush',
'Conjure One, Poe':'Conjure One',
'Cyril Giroux, Chloé Lacan':'Cyril Giroux',
'Carrie Underwood, Travis Cottrell, Debby Boone':'Carrie Underwood',
'Aska, Chage':'Aska',
'T-Pain ft. Akon, Mary J. Blige':'T-Pain',
'Dan Balan, Katerina Begu':'Dan Balan',
'Jose Padilla, Seal':'Jose Padilla',
'Ratso, Nick Cave':'Ratso',
'Slaughter Beach, Dog':'Slaughter Beach',
'Paganini, Liszt':'Paganini',
'May Erlewine, Woody Goss':'May Erlewine',
'Cyua, Hiroyuki Sawano':'Cyua',
'Dan Zanes, Natalie Merchant':'Dan Zanes',
'Sting, Ray Chen':'Sting',
'Coco and Clair Clair, Okthxbb':'Coco and Clair Clair',
'Edgar Meyer, Mike Marshall, Bela Fleck':'Edgar Meyer',
'Ray Charles, Willie Nelson':'Ray Charles',
'Wildlight, The Polish Ambassador and Ayla Nereo':'Wildlight',
'Max Richter, Dinah Washington':'Max Richter',
'Steve Martin, Dolly Parton, Vince Gill':'Steve Martin',
'Elis Regina, Antonio Carlos Jobim':'Elis Regina',
'James Blunt, The Righteous Brothers, Brad Paisley':'James Blunt',
'Leprous, Dream Theater, Periphery':'Leprous',
'Frank Sinatra, Glenn Miller, Van Morrison':'Frank Sinatra',
'Pink, Willow Sage Hart':'Pink',
'Jessye Norman, Stephen Adams, Christopher Bowers-Broadbent':'Jessye Norman',
'Nujabes, MINMI, and Samurai Champloo':'Nujabes',
'Nujabes, Samurai Champloo, and MINMI':'Nujabes',
'Steve Martin, Steep Canyon Rangers':'Steve Martin',
'Celine Dion, Barbra Streisand':'Celine Dion',
'Bryan Adams, Luciano Pavarotti':'Bryan Adams',
'MUZZ (Mat Zo, Olan and A&B)':'MUZZ',
'Pink, Sage (The Gemini':'Pink',
'Moby, Sinead Oconnor':'Moby',
'I Vow to Thee, My Country':'I Vow to Thee',
'Jose Padilla, Kirsty Keach':'Jose Padilla',
'Don Francisco, Wendy Francisco, Jerry Palmer':'Don Francisco',
'Dave Grohl, Josh Homme, & Trent Reznor':'Dave Grohl',
'Steve Conte, Maaya Sakamoto':'Steve Conte',
'Sting, Stevie Wonder':'Sting',
'Jacob Collier, Lizzy McAlpine, John Mayer':'Jacob Collier',
'Joseph Shabason, Nicholas Krgovich, Shabason & Krgovich':'Joseph Shabason',
'Khalid, Benny Blanco, Halsey':'Khalid',
'Ed Sheeran, Andrea Bocelli':'Ed Sheeran',
'Kim Petras, Nicki Minaj':'Kim Petras',
'Bryce Dessner, James McAlister, Nico Muhly, Sufjan Stevens':'Bryce Dessner',
'Appleseed, YouSeeBigGirl, T:T':'Appleseed',
'Solarstone, Andy Bury':'Solarstone',
'Carti, Summertime Sadness':'Carti',
'Dolly Parton, Linda Ronstadt, and Emmylou Harris':'Dolly Parton',
'LMM, Hwasa':'LMM',
'Snowgoons, Viro the Virus':'Snowgoons',
'Sarah Class, Cantamus Choir':'Sarah Class',
'Boy meets Girl, Brian McKnight, Vanessa Williams, Bonnie Tyler, Jax':'Boy meets Girl',
'Jeff Buckley, The Righteous Brothers, Johann Pachelbel':'Jeff Buckley',
'Nate J, Nate Traveller':'Traveller',
'Debussy, Flight Facilities':'Flight Facilities',
'Death Cab for Cutie/The Postal Service':'Death Cab for Cutie',
'Sting (musician)':'Sting',
'Mick Hucknall Simply Red':'Simply Red',
'Johnny Cash and Bob Dylan':'Johnny Cash',
'John Prine and Bonnie Raitt':'Bonnie Raitt',
'Andrea Bocelli and Celine Dion':'Andrea Bocelli',
'Bob Dylan & Johnny Cash':'Bob Dylan',
'Minnie Riperton and Richard Rudolph':'Minnie Riperton',
'Eric Whitacre featuring Voces8':'Eric Whitacre',
'Hans Zimmer and Benjamin Wallfisch':'Hans Zimmer',
'Grover Washington Jr. & Bill Withers':'Grover Washington Jr.',
'Barry DeVorzon and Perry Botkin Jr.':'Barry DeVorzon',
'Porter Robinson and Madeon':'Porter Robinson',
'Ed Sheeran ft Yebba':'Ed Sheeran',
'Sara Bareilles & Josh Groban':'Sara Bareilles',
'St. Vincent (musician)':'St. Vincent',
'Glen Hansard and Marketa Irglova':'Glen Hansard',
'Willie Nelson & Ray Charles':'Willie Nelson',
'Alina Baraz Galimatias':'Alina Baraz',
'Hans Zimmer & Lisa Gerrard':'Hans Zimmer' ,
'Louis Armstrong Jr.': 'Louis Armstrong',
'Ludovico Einaudi ft. Greta Svabo Bech': 'Ludovico Einaudi',
'Henry Mancini & Audrey Hepburn': 'Henri Mancini',
'Bon Iver and St. Vincent': 'Bon Iver',
'Coldplay Avicii': 'Coldplay',
'Bruce Springsteen, Melissa Etheridge': 'Bruce Springsteen',
'Billie Eilish ft. Khalid': 'Billie eilish',
'Andrea Bocelli and Josh Groban': 'Andrea Bocelli',
'Norah Jones and Danger Mouse': 'Norah Jones',
'Ennio Morricone & Joan Baez': 'Ennio Morricone',
"Des'ree": 'Desree',
'Porter Robinson, Madeon': 'Porter Robinson',
'Ray Charles & Willie Nelson': 'Ray Charles',
'Ludvig Forssell and Jenny Plant': 'Ludvig Forssell',
'Nicole Kidman & Ewan McGregor': 'Ewan McGregor',
'Deadmau5 and Kaskade': 'Deadmau5',
'Beabadoobee feat. Clairo': 'Beabadoobee',
'John Coltrane, Duke Ellington': 'John COltrane',
'Nicholas Britell and Spring 1 - Max Richter': 'Nicholas Britell',
'Dave Matthews & Tim Reynolds': 'Dave Matthews',
'MINMI & Nujabes': 'MINMI',
'Nu Deco Ensemble Kishi Bashi': 'Kishi Bashi',
'Ana Carolina Seu Jorge': 'Ana Carolina',
'Skillet (band': 'Skillet',
'Soccer Mommy (Sophie Allison': 'Soccer Mommy',

}

In [108]:
for k,v in z.items(): 
    tdf = df.loc[df['artist']==k]
    if len(tdf) > 0:
        print(k, len(tdf))        
        df.loc[df['artist']==k, 'artist']=v


Hans Zimmer & Lisa Gerrard 1
Louis Armstrong Jr. 2
Ludovico Einaudi ft. Greta Svabo Bech 1
Henry Mancini & Audrey Hepburn 2
Bon Iver and St. Vincent 2
Coldplay Avicii 1
Bruce Springsteen, Melissa Etheridge 1
Billie Eilish ft. Khalid 2
Andrea Bocelli and Josh Groban 2
Norah Jones and Danger Mouse 1
Ennio Morricone & Joan Baez 1
Des'ree 1
Porter Robinson, Madeon 1
Ray Charles & Willie Nelson 1
Ludvig Forssell and Jenny Plant 1
Nicole Kidman & Ewan McGregor 1
Deadmau5 and Kaskade 1
Beabadoobee feat. Clairo 1
John Coltrane, Duke Ellington 1
Nicholas Britell and Spring 1 - Max Richter 1
Dave Matthews & Tim Reynolds 1
MINMI & Nujabes 1
Nu Deco Ensemble Kishi Bashi 1
Ana Carolina Seu Jorge 1
Skillet (band 1
Soccer Mommy (Sophie Allison 1


In [None]:
# tempdf = df[['artist', 'post_score']] \
#     .groupby('artist') \
#     .sum() \
#     .reset_index() 

# tempdf.loc[tempdf['post_score']> 2].to_csv('x.csv', index=False)

In [109]:
df

Unnamed: 0,artist,track,score
200,Claude Debussy,Clair de Lune,22088
831,Simon & Garfunkel,Scarborough Fair,11922
892,The Beatles,In my Life,9049
309,Erik Satie,Gymnopédies,7620
900,The Cranberries,Dreams,7305
...,...,...,...
860,Steve Reich,Duet for two violins,5
361,Future,The Fountain,5
359,Fun,The gambler,5
140,Bobby Caldwell,What you won’t do for love,5


In [110]:
df = df[['artist', 'track', 'score']].groupby(["artist", "track"]) \
    .sum() \
    .reset_index() \
    .sort_values('score', ascending=False)

df.head(20)



Unnamed: 0,artist,track,score
200,Claude Debussy,Clair de Lune,22088
831,Simon & Garfunkel,Scarborough Fair,11922
892,The Beatles,In my Life,9049
309,Erik Satie,Gymnopédies,7620
900,The Cranberries,Dreams,7305
443,Israel Kamakawiwoole,Over the Rainbow,6681
677,Neil Young and Crazy Horse,Harvest Moon,5337
479,Jim Croce,Time in a Bottle,5285
635,Mazzy Star,Fade into you,4357
267,Don McLean,"Vincent (Starry, Starry Night)",3989


## Filter by minimum score


In [111]:
df = df.loc[df['score'] >4]
df

Unnamed: 0,artist,track,score
200,Claude Debussy,Clair de Lune,22088
831,Simon & Garfunkel,Scarborough Fair,11922
892,The Beatles,In my Life,9049
309,Erik Satie,Gymnopédies,7620
900,The Cranberries,Dreams,7305
...,...,...,...
688,NieR: Automata,Peaceful Sleep,5
380,Giovanni Pierluigi da Palestrina,Sicut Cervus,5
233,Dario Marianelli,Dawn,5
318,Eva Cassidy,time after time,5


In [112]:
df.to_csv('silver.csv', index=False)
with open('silver.pkl', 'wb') as f:
    pickle.dump(df, f)


In [147]:
with open('silver.pkl', 'rb') as f:
    df = pickle.load(f)
df.head(20)

Unnamed: 0,artist,track,score
200,Claude Debussy,Clair de Lune,22088
831,Simon & Garfunkel,Scarborough Fair,11922
892,The Beatles,In my Life,9049
309,Erik Satie,Gymnopédies,7620
900,The Cranberries,Dreams,7305
443,Israel Kamakawiwoole,Over the Rainbow,6681
677,Neil Young and Crazy Horse,Harvest Moon,5337
479,Jim Croce,Time in a Bottle,5285
635,Mazzy Star,Fade into you,4357
267,Don McLean,"Vincent (Starry, Starry Night)",3989


# Load into a Spotify playlist


In [114]:
# log in
client_credentials_manager = SpotifyClientCredentials(client_id=os.getenv('SPOTIFY_CLIENT_ID'), 
                                                      client_secret=os.getenv('SPOTIFY_CLIENT_SECRET'),
                                                      )

sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


In [115]:
df = pd.read_csv("silver.csv")
df = df.groupby(['artist','track']).sum().reset_index().sort_values('score', ascending=False)
df.to_csv('silver.csv', index=False)


In [138]:
# check artists
# update to spotify canonical name as necessary

df = pd.read_csv("silver.csv")
df = df.groupby(['artist','track']).sum().reset_index().sort_values('score', ascending=False)

dedupe = {}
fail_list = []
artist_map = {}
for index, artist, title, score in df.itertuples():
    artist = str(artist)
    if artist in dedupe:
        continue
    dedupe[artist]=1
    query_str = 'artist:%s' % (artist)
    artist_results = sp.search(q=query_str, type='artist', limit=3, offset=0, market='US')
    artist_names = [artist['name'] for artist in artist_results['artists']['items']]
    if artist_names:
        if artist.lower() != artist_names[0].lower():
            artist_map[artist] = artist_names[0]
            print(artist, '->', artist_names[0])
    else:
        fail_list.append((artist, title))
        print("not found:", artist, "-", title)

# then clean up manually as appropriate

Israel Kamakawiwoole -> Israel Kamakawiwo'ole
Neil Young and Crazy Horse -> Neil Young
Sigur Ros -> Sigur Rós
Edith Piaf -> Édith Piaf
Ben E King -> Ben E. King
Sinead O'connor -> Sinéad O'Connor
Dream Academy -> The Dream Academy
Emerson -> Emerson, Lake & Palmer
Annie -> Annie Lennox
Crosby, Stills, Nash, Young -> Crosby, Stills, Nash & Young
Hollies -> The Hollies
Cat Stevens -> Yusuf / Cat Stevens
Alison Krauss -> Alison Krauss & Union Station
Vince -> St. Vincent
Taylor -> Taylor Swift
Bjork -> Björk
Traditional -> Chinese Traditional
Flaming Lips -> The Flaming Lips
Loggins Messina -> Loggins & Messina
Dave Matthews -> Dave Matthews Band
Les Miserables -> Les Misérables - 10th Anniversary Cast
Fray -> The Fray
Celine Dion -> Céline Dion
José Gonzalez -> José González
Desree -> Des'ree
Saint Saens -> Camille Saint-Saëns
Patrick -> Patrick Watson
Nick Cave -> Nick Cave & The Bad Seeds
Doors -> The Doors
Guns N Roses -> Guns N' Roses
Everly Brothers -> The Everly Brothers
Crosby Sti

In [139]:
[f[0] for f in fail_list]

['The Lord of the Rings: The Fellowship of the Ring',
 'college a capella',
 'Steven Universe (soundtrack)',
 'Guitars and Dragons',
 'Bladee, Ecco2k',
 'Amadeus soundtrack',
 'South Park (soundtrack)',
 'Star Trek: The Next Generation']

In [140]:
artist_map

{'Israel Kamakawiwoole': "Israel Kamakawiwo'ole",
 'Neil Young and Crazy Horse': 'Neil Young',
 'Sigur Ros': 'Sigur Rós',
 'Edith Piaf': 'Édith Piaf',
 'Ben E King': 'Ben E. King',
 "Sinead O'connor": "Sinéad O'Connor",
 'Dream Academy': 'The Dream Academy',
 'Emerson': 'Emerson, Lake & Palmer',
 'Annie': 'Annie Lennox',
 'Crosby, Stills, Nash, Young': 'Crosby, Stills, Nash & Young',
 'Hollies': 'The Hollies',
 'Cat Stevens': 'Yusuf / Cat Stevens',
 'Alison Krauss': 'Alison Krauss & Union Station',
 'Vince': 'St. Vincent',
 'Taylor': 'Taylor Swift',
 'Bjork': 'Björk',
 'Traditional': 'Chinese Traditional',
 'Flaming Lips': 'The Flaming Lips',
 'Loggins Messina': 'Loggins & Messina',
 'Dave Matthews': 'Dave Matthews Band',
 'Les Miserables': 'Les Misérables - 10th Anniversary Cast',
 'Fray': 'The Fray',
 'Celine Dion': 'Céline Dion',
 'José Gonzalez': 'José González',
 'Desree': "Des'ree",
 'Saint Saens': 'Camille Saint-Saëns',
 'Patrick': 'Patrick Watson',
 'Nick Cave': 'Nick Cave & Th

In [142]:
ignore_list = [ 

'1',     #One Direction',
 'Hem', #'Natalie Hemby',
    'Priscilla',     #: 'Priscilla Chan',
     'Drake',     #: 'Nick Drake',
 'William Ackerman',     #: 'Mark Ackerman, William James Ross',
 'Jason',     #: 'Jason Mraz',
 'Juice',     #: 'Juice WRLD',
 'Origa',     #: 'Origami Angel',
 'Nico',     #: 'Nico & Vinz',
 'Mako',     #: 'Mako Road',
 'Low',     #: 'All Time Low',
 'La La Land Soundtrack',     #: 'LAND Soundtrack',
 'Flamingos',     #: 'Flamingosis',
 'BoA',     #: 'Boards of Canada',
 'Traditional',     #: 'Chinese Traditional',
 'Future',     #: 'Future Islands',
'ASAP Rocky',     # -> Seth Narley feat. ASAP Rocky
'Acoustic',     # -> Acoustic Alchemy
'Adeem',     #Adeem the Artist',
'Al Stewart',     #Alexander Stewart',
'Alpine' ,     #-> Alpine Universe
'America',     #The All-American Rejects',
'Arrow',     # -> Arrows in Action
'Berlin',     #Berliner Philharmoniker',
'Brian Wilson',     # -> Brian Courtney Wilson
'CSNY',     #Csnyee_',
'Choir Choir Choir!',     #Mav City Gospel Choir',
'Dallas Green',     # -> Jimmy Carter and Dallas County Green
'Death',     #Five Finger Death Punch',
'Dixie Chicks',     # -> Karaoke - Dixie Chicks
'Eileen',     #Eileen Walker',
'Eric Johnson',     #Eric D. Johnson',
'Frente',     #Frente Cumbiero',
'IZ',     #Izzamuzzic',
'Japanese House',     # -> The Japanese House
'Jewel',     # -> Run The Jewels
'LP',     #LP Giobbi',
'La La Land Soundtrack' ,     #-> LAND Soundtrack
'Live',     # -> DPR LIVE
'MCR',     #Tate McRae',
'Meatloaf',     #meatloafi',
'Múm',     #Mumford & Sons',
'Nico',     #Nicki Nicole',
'One',     # -> One Direction
'Phil',     #Phil Collins',
'Pink',     #PinkPantheress',
'Priscilla',     # -> Priscilla Block
'Rainbow',     #Rainbow Kitten Surprise',
'Seal',     #Seals and Crofts',
'South Park',     #South Park Mexican',
'The Band',     #The Band CAMINO',
'The La’s',     #The Kid LAROI',
'The Philadelphia Orchestra',     #The Philadelphia Virtuosi Chamber Orchestra',
'The Promise',     #Lukas Nelson and Promise of the Real',
'Train',     #Meghan Trainor',
'Vince',     #Vince Staples',
'a-ha',     #Daryl Hall & John Oates',

]

for k in ignore_list:
    try:
        print(k, artist_map.get(k))
        artist_map.pop(k)
    except:
        print('error', k)
        pass





1 None
error 1
Hem Natalie Hemby
Priscilla Priscilla Chan
Drake Nick Drake
William Ackerman Mark Ackerman, William James Ross
Jason Jason Mraz
Juice Juice WRLD
Origa Origami Angel
Nico Nico & Vinz
Mako Mako Road
Low All Time Low
La La Land Soundtrack LAND Soundtrack
Flamingos Flamingosis
BoA Boards of Canada
Traditional Chinese Traditional
Future Future Islands
ASAP Rocky None
error ASAP Rocky
Acoustic None
error Acoustic
Adeem None
error Adeem
Al Stewart None
error Al Stewart
Alpine None
error Alpine
America None
error America
Arrow Arrow Benjamin
Berlin None
error Berlin
Brian Wilson None
error Brian Wilson
CSNY None
error CSNY
Choir Choir Choir! None
error Choir Choir Choir!
Dallas Green None
error Dallas Green
Death Death Cab for Cutie
Dixie Chicks Karaoke - Dixie Chicks
Eileen None
error Eileen
Eric Johnson None
error Eric Johnson
Frente None
error Frente
IZ None
error IZ
Japanese House The Japanese House
Jewel None
error Jewel
LP None
error LP
La La Land Soundtrack None
error La 

In [120]:
artist_map.get('Train')

In [143]:
df['artist'] = df['artist'].apply(lambda s: artist_map[s] if s in artist_map else s)
df.head(20)


Unnamed: 0,artist,track,score
197,Claude Debussy,Clair de Lune,22088
826,Simon & Garfunkel,Scarborough Fair,11922
887,The Beatles,In my Life,9049
306,Erik Satie,Gymnopédies,7620
895,The Cranberries,Dreams,7305
441,Israel Kamakawiwo'ole,Over the Rainbow,6681
670,Neil Young,Harvest Moon,5337
477,Jim Croce,Time in a Bottle,5285
629,Mazzy Star,Fade into you,4357
264,Don McLean,"Vincent (Starry, Starry Night)",3989


In [144]:
df = df.groupby(['artist','track']).sum().reset_index().sort_values('score', ascending=False)
df.to_csv('silver.csv', index=False)


In [146]:
# check tracks
# possibly update tracks to spotify canonical name

df = pd.read_csv("silver.csv")

dedupe = {}
mylist = []
fail_list = []
artist_list, track_list, uri_list, album_list, score_list = [], [], [], [], []
orig_artist, orig_track = [], []

for index, artist, title, score in df.itertuples():
    query_str = 'artist:%s track:%s' % (artist, title)
    track_results = sp.search(q=query_str, type='track', limit=1, offset=0, market='US')
    results = track_results['tracks']['items']
    
    if results:
        r = results[0]
        # failsafe to never put same track twice
        if dedupe.get(r['id']):
            continue
        dedupe[r['id']]=True
        if title.lower() != r['name'].lower():
            print ("%s|%s : %s|%s" % (artist, title, r['artists'][0]['name'], r['name']))
        uri_list.append(r['uri'])
        artist_list.append(r['artists'][0]['name'])
        track_list.append(r['name'])
        album_list.append(r['album']['name'])
        orig_artist.append(artist)
        orig_track.append(title)
        score_list.append(score)
#         print('  ',
#               r['artists'][0]['name'],'|',
#               r['name'], '|',
#               r['album']['name'],'|',
#               r['album']['release_date'],'|',
#               r['popularity'])
    else:
        fail_list.append((artist, title))
        print("not found:", artist, "-", title)
        

Claude Debussy|Clair de Lune : Claude Debussy|Suite bergamasque, L. 75: III. Clair de lune
Simon & Garfunkel|Scarborough Fair : Simon & Garfunkel|Scarborough Fair / Canticle
The Beatles|In my Life : The Beatles|In My Life - Remastered 2009
Erik Satie|Gymnopédies : Erik Satie|3 Gymnopédies: No. 1 Lent et douloureux
not found: Neil Young - Harvest Moon
The Beach Boys|God Only Knows : The Beach Boys|God Only Knows - Mono
Sigur Rós|Hoppipolla : Sigur Rós|Hoppípolla
John Denver|Annie’s Song : John Denver|Annie's Song
not found: Hans Zimmer - Interstellar Intro
Peter Gabriel|In Your Eyes : Peter Gabriel|In Your Eyes - 2012 Remaster
Samuel Barber|Adagio for Strings : Samuel Barber|Barber: Adagio for Strings
Grateful Dead|Standing on the Moon : Grateful Dead|Standing on the Moon - 2013 remaster
The Smiths|There is a light that never goes out : The Smiths|There Is a Light That Never Goes Out - 2011 Remaster
Jeff Buckley|Lover you should’ve come over : Jeff Buckley|Lover, You Should've Come Over

not found: Kishi Bashi - I am the Anti Christ
Art Garfunkel|Bridge Over Troubled Water : Art Garfunkel jr.|Geh mit mir durch den Regenbogen (Bridge Over Troubled Water)
Bibio|Lover's Carvings : Bibio|lovers’ carvings
not found: Muse - Exogenesis Symphony 1-3
not found: Jewel - Angels standing by
The Shins|New Slang : The Shins|New Slang - 2021 Remaster
The Paper Kites|Bloom : The Paper Kites|Bloom - Bonus Track
not found: Kermit the Frog - Rainbow Connection
Loreen|Dante’s Prayer : Loreena McKennitt|Dante's Prayer
Procol Harum|Whiter Shade of Pale : Procol Harum|A Whiter Shade of Pale - Original Single Version
not found: Requiem Inc. - live in concert
Franz Schubert|Ave Maria : Franz Schubert|Ave Maria, D. 839 "Ellens Gesang III"
not found: La La Land Soundtrack - Mia & Sebastian's Theme
Talking Heads|This must be the place : Talking Heads|This Must Be the Place (Naive Melody) - 2005 Remaster
not found: Tom Waits - Weeping Willow
not found: Ludvig Forssell - Quiet's Theme
not found: Ho

Ms. Lauryn Hill|Ex factor : Ms. Lauryn Hill|Ex-Factor
not found: Mogwai - Mogwai Fear Satan (Kevin Shields remix)
Modest Mouse|White Teeth : Modest Mouse|White Lies, Yellow Teeth
not found: Dimash Qudaibergen - SOS d'un terrien en détresse
Harry Chapin|Cats in the cradle : Harry Chapin|Cat's in the Cradle
not found: Emmylou Harris with Mark Knopfler & His Band - Till I Gain Control Again
not found: Trey Parker - Jacking it in San Diego
not found: Barbra Streisand - Avinu Malkenu
not found: Skyrim - Skyrim soundtrack
The Waterboys|The Whole of the Moon : The Waterboys|The Whole of the Moon - 2004 Remaster
Clint Mansell|Death is the road to awe : Clint Mansell|Death Is the Road to Awe (feat. Kronos Quartet)
not found: Amadeus soundtrack - best pacing
Feist|1 2 3 4 : Feist|1, 2, 3, 4 (Van She Tech Remix) - Mixed
Grover Washington, Jr.|Just the two of us : Grover Washington, Jr.|Just the Two of Us (feat. Bill Withers)
Gabriel Fauré|Pavane, Op. 50 : Gabriel Fauré|Pavane, Op. 50 (Version for

not found: Alison Krauss & Union Station - I Will
Kishi Bashi|Am I the antichrist to you : Kishi Bashi|I Am the Antichrist to You
not found: Kenshi Yonezu - Might*U
not found: Kenny Rogers - Thru the Years
not found: Keaton Henson - About Sophie
not found: William Ackerman - The Impending Death of the Virgin Spirit
Calvin Harris|sweet nothing : Calvin Harris|Sweet Nothing (feat. Florence Welch)
Charlie Megira|Yesterday : Charlie Megira|Yesterday, Today and Tomorrow
Robert Schumann|Träumerei : Robert Schumann|Kinderszenen, Op. 15: Träumerei (Arr. Lewin for Guitar)
Bedřich Smetana|The Moldau : Bedřich Smetana|Má Vlast (My Fatherland): Vltava (The Moldau River)
Basil Poledouris|Theology/Civilization : Basil Poledouris|Conan the Barbarian (arr. P. Pelster for organ): Theology / Civilization
not found: Rebekah Del Rio - Llorando (Crying)
Dmitri Shostakovich|Waltz No 2 : Dmitri Shostakovich|Jazz Suite No. 2: VI. Waltz 2
Bobby Caldwell|What you won’t do for love : Bobby Caldwell|What You Won'

## Save gold.csv


In [124]:

gold_df = pd.DataFrame({'score': score_list,
                        'input_artist': orig_artist,
                        'artist': artist_list,
                        'input_track': orig_track,
                        'track': track_list,
                        'album': album_list,
                        'uri': uri_list})

with pd.option_context("display.max_rows", 9999):
    display(gold_df)



Unnamed: 0,score,input_artist,artist,input_track,track,album,uri
0,22088,Claude Debussy,Claude Debussy,Clair de Lune,"Suite bergamasque, L. 75: III. Clair de lune","Debussy: Suite bergamasque, L. 75, 3. Clair de...",spotify:track:1cmigB9I6IRpFqjIbzvSQB
1,11922,Simon & Garfunkel,Simon & Garfunkel,Scarborough Fair,Scarborough Fair / Canticle,"Parsley, Sage, Rosemary And Thyme",spotify:track:3g2fYZW5v2od8KIF7VktT0
2,9049,The Beatles,The Beatles,In my Life,In My Life - Remastered 2009,Rubber Soul (Remastered),spotify:track:3KfbEIOC7YIv90FIfNSZpo
3,7620,Erik Satie,Erik Satie,Gymnopédies,3 Gymnopédies: No. 1 Lent et douloureux,Satie: The Magic of Satie,spotify:track:7kTVe6XhIveidvkt8nb7jK
4,7305,The Cranberries,The Cranberries,Dreams,Dreams,"Everybody Else Is Doing It, So Why Can't We?",spotify:track:4JGKZS7h4Qa16gOU3oNETV
5,6681,Israel Kamakawiwoole,Israel Kamakawiwo'ole,Over the Rainbow,Over the Rainbow,Alone In Iz World,spotify:track:3oQomOPRNQ5NVFUmLJHbAV
6,5285,Jim Croce,Jim Croce,Time in a Bottle,Time in a Bottle,You Don't Mess Around With Jim,spotify:track:561F1zqRwGPCTMRsLsXVtL
7,4357,Mazzy Star,Mazzy Star,Fade into you,Fade Into You,So Tonight That I Might See,spotify:track:1LzNfuep1bnAUR9skqdHCK
8,3989,Don McLean,Don McLean,"Vincent (Starry, Starry Night)","Vincent (Starry, Starry Night)",Rearview Mirror: An American Musical Journey,spotify:track:2YDyH60Vro33KkDtNZCXIk
9,3882,The Beach Boys,The Beach Boys,God Only Knows,God Only Knows - Mono,Pet Sounds (Original Mono & Stereo Mix),spotify:track:6iGU74CwXuT4XVepjc9Emf


In [125]:
# inspect where the track name differs
with pd.option_context("display.max_rows", 999):
    display(gold_df.loc[gold_df['input_artist'].str.lower().str[:8] != gold_df['artist'].str.lower().str[:8]])
    

Unnamed: 0,score,input_artist,artist,input_track,track,album,uri
10,3819,Sigur Ros,Sigur Rós,Hoppipolla,Hoppípolla,Takk...,spotify:track:0yQPpUq5BJyqah5m2Q5Stt
12,3205,Edith Piaf,Édith Piaf,La Vie en Rose,La Vie en rose,Edith Piaf - The Best Of,spotify:track:3lAun9V0YdTlCSIEXPvfsY
28,1388,Ben E King,Ben E. King,Stand By Me,Stand by Me,Don't Play That Song (Mono),spotify:track:3SdTKo2uVsxFblQjpScoHy
38,848,Sinead O'connor,Sinéad O'Connor,Nothing Compares 2 U,Nothing Compares 2 U,I Do Not Want What I Haven't Got,spotify:track:5GHY1DFWKz3Prg2V0Iodqo
50,720,Sigur Ros,Sigur Rós,staralfur,Starálfur,Ágætis byrjun,spotify:track:55gISxV37mffOW2DbSskT3
51,668,Queen,Queensrÿche,Silent lucidity,Silent Lucidity - Remastered 2003,Empire - 20th Anniversary Edition,spotify:track:6OSyCAmXT4Gkd3OQ2aPOaF
57,565,Dream Academy,The Dream Academy,Life in a Northern Town,Life in a Northern Town,The Dream Academy,spotify:track:4m3OS54KWywYhP7WD7z1cg
67,441,Emerson,"Emerson, Lake & Palmer",Lucky Man,Lucky Man - 2012 Remaster,"Emerson, Lake & Palmer",spotify:track:6qj0OV5w2PlGTASSx34Lrl
73,395,Annie,Annie Lennox,Into The West,Into the West,The Lord of the Rings: The Return of the King ...,spotify:track:0gSEyG7pOFuHM05433EibX
79,365,Hollies,The Hollies,"He Ain't Heavy, He's My Brother","He Ain't Heavy, He's My Brother",Super Hits,spotify:track:41iPmvB2ogl3dzEHw9EZh0


In [126]:
# these are songs that look like covers or otherwise not the expected response from spotify search 
# (which is a bit wonky, doesn't like quotes and such)
# remove from df and add manually
bad_lookups = [
421,
494,
557,
598,
669,
823,
]

for i in bad_lookups:
    print(gold_df.iloc[i])
    
# add manually, plus 'not found'


score                                             13
input_artist                                    Pink
artist                                The Pink Noise
input_track                             Crystal Ball
track                    Crystal Ball, Crystal Skull
album                                       Birdland
uri             spotify:track:6xVzY4D3mwWzT5mR9ue4ak
Name: 421, dtype: object
score                                             10
input_artist                            Garth Brooks
artist                                 Brandon Garth
input_track                                The Dance
track                                      The Dance
album                             The Best of Brooks
uri             spotify:track:08MhzVza8Qc4M172KZ37JT
Name: 494, dtype: object
score                                                       9
input_artist                                     Dixie Chicks
artist                                 Karaoke - Dixie Chicks
input_track           

In [127]:
gold_df = gold_df.drop(
    axis='index',
    labels=bad_lookups)


In [None]:
gold_df

In [128]:
# this you could upload and make a new playlist
# existing playlist is result of multiple iterations

gold_df[['artist', 'track', 'score']].to_csv('gold.csv', index=False)

with pd.option_context("display.max_rows", 999):
    display(gold_df)

Unnamed: 0,score,input_artist,artist,input_track,track,album,uri
0,22088,Claude Debussy,Claude Debussy,Clair de Lune,"Suite bergamasque, L. 75: III. Clair de lune","Debussy: Suite bergamasque, L. 75, 3. Clair de...",spotify:track:1cmigB9I6IRpFqjIbzvSQB
1,11922,Simon & Garfunkel,Simon & Garfunkel,Scarborough Fair,Scarborough Fair / Canticle,"Parsley, Sage, Rosemary And Thyme",spotify:track:3g2fYZW5v2od8KIF7VktT0
2,9049,The Beatles,The Beatles,In my Life,In My Life - Remastered 2009,Rubber Soul (Remastered),spotify:track:3KfbEIOC7YIv90FIfNSZpo
3,7620,Erik Satie,Erik Satie,Gymnopédies,3 Gymnopédies: No. 1 Lent et douloureux,Satie: The Magic of Satie,spotify:track:7kTVe6XhIveidvkt8nb7jK
4,7305,The Cranberries,The Cranberries,Dreams,Dreams,"Everybody Else Is Doing It, So Why Can't We?",spotify:track:4JGKZS7h4Qa16gOU3oNETV
5,6681,Israel Kamakawiwoole,Israel Kamakawiwo'ole,Over the Rainbow,Over the Rainbow,Alone In Iz World,spotify:track:3oQomOPRNQ5NVFUmLJHbAV
6,5285,Jim Croce,Jim Croce,Time in a Bottle,Time in a Bottle,You Don't Mess Around With Jim,spotify:track:561F1zqRwGPCTMRsLsXVtL
7,4357,Mazzy Star,Mazzy Star,Fade into you,Fade Into You,So Tonight That I Might See,spotify:track:1LzNfuep1bnAUR9skqdHCK
8,3989,Don McLean,Don McLean,"Vincent (Starry, Starry Night)","Vincent (Starry, Starry Night)",Rearview Mirror: An American Musical Journey,spotify:track:2YDyH60Vro33KkDtNZCXIk
9,3882,The Beach Boys,The Beach Boys,God Only Knows,God Only Knows - Mono,Pet Sounds (Original Mono & Stereo Mix),spotify:track:6iGU74CwXuT4XVepjc9Emf


# Get Spotify playlist and add songs

In [130]:
# get playlist id
# first create a playlist in Spotify UI to load songs
playlists = sp.user_playlists(os.getenv('SPOTIFY_USERNAME'))
while playlists:
    for i, playlist in enumerate(playlists['items']):
        if playlist['name'] != 'RPS2':
            continue
        print(playlist['id'])
        playlist_id = playlist['id']
        print("%4d %s %s" % (i + 1 + playlists['offset'], playlist['uri'],  playlist['name']))
    if playlists['next']:
        playlists = sp.next(playlists)
    else:
        playlists = None
        

7rY5fhzbW7wCTXGMhEbZwk
   1 spotify:playlist:7rY5fhzbW7wCTXGMhEbZwk RPS2


In [131]:
# must follow an oauth workflow to write a playlist in Spotify
# running this cell should request a spotify login and then redirect to an url
# paste whole url with id into form to authenticate

scope = "playlist-modify-public"

sp = spotipy.Spotify(auth_manager=spotipy.SpotifyOAuth(scope=scope,
                                                       client_id=os.getenv('SPOTIFY_CLIENT_ID'),
                                                       client_secret=os.getenv('SPOTIFY_CLIENT_SECRET'),
                                                       redirect_uri="https://druce.ai"
                                                      ))


In [None]:
# add songs to playlist 

addlist = gold_df['uri'].to_list()
print (len(addlist))

# while(addlist):
#     sp.user_playlist_add_tracks(os.getenv('SPOTIFY_USERNAME'), 
#                                 playlist_id=playlist_id, 
#                                 tracks=addlist[-100:])
#     addlist = addlist[:-100]
#     print("added items, remaining ", len(addlist))


In [None]:
# manually add the ones that weren't found for some reason


# Compare Spotify playlist to gold data
after initial population, we may want to run again and add new songs 


In [132]:
# compare to existing playlist
# can run again and add any new tracks, either because OpenAI is a bit random, or new replies in thread
results = sp.user_playlist(os.getenv('SPOTIFY_USERNAME'), playlist_id,
                                fields='tracks,next,name')
tracks = results['tracks']

playlist_dict_by_uri = {}
playlist_dict_by_str = {}

artist_list = []
track_list = []
uri_list = []
popularity_list = []
album_list=[]

while True:
    for track_item in tracks['items']:
        track_dict = track_item['track']
        track_str = track_dict['artists'][0]['name']  + ' | ' + track_dict['name'][:15]
        uri = track_dict['uri']
        if track_str in playlist_dict_by_str:
            print(track_str)
        playlist_dict_by_str[track_str] = uri
        playlist_dict_by_uri[uri] = track_str
        
        uri_list.append(uri)
        artist_list.append(track_dict['artists'][0]['name'])
        track_list.append(track_dict['name'])
        album_list.append(track_dict['album']['name'])
        popularity_list.append(track_dict['popularity'])
        
    # check if there are more pages
    if tracks['next']:
        tracks = sp.next(tracks)
    else:
        break

print (len(list(playlist_dict_by_str.keys())))
print (len(list(playlist_dict_by_uri.keys())))


Enter the URL you were redirected to: https://druce.ai/?code=AQAA7XDCar6jlpEbzhomc-FNHnORkQ_RjXl2J3jG1QracWfvNOYf_MokSPnTz2sCJvdNJrme8-do8D66-4E907D7CYyqd13ZXU8BaDrRtOp3sR01tVzTcAPzntqlfbYH7g1tNWo-1xmsisXxeIkg8tX6amTQAob_3066lhwyBX8VVXlg29wFICY_EFo
The Beatles | Golden Slumbers
Pyotr Ilyich Tchaikovsky | Tchaikovsky: Th
Israel Kamakawiwo'ole | Over the Rainbo
Claude Debussy | Préludes / Book
Tori Amos | Little Earthqua
Simon & Garfunkel | For Emily, When
Linkin Park | One More Light
Blur | Tender
Daft Punk | Touch (feat. Pa
Franz Schubert | Ave Maria, D. 8
Yusuf / Cat Stevens | Wild World
Procol Harum | A Whiter Shade 
Jeff Buckley | Just Like a Wom
Nine Inch Nails | A Warm Place
Joni Mitchell | Both Sides Now
Joni Mitchell | Both Sides Now
Edvard Grieg | Peer Gynt Suite
Darude | Sandstorm
The Cure | Untitled - 2010
The Kinks | Waterloo Sunset
Van Morrison | Madame George -
The Beatles | A Day In The Li
1624
1646


In [133]:
with pd.option_context("display.max_rows", 9999):
    display(gold_df.loc[~gold_df['uri'].isin(playlist_dict_by_uri.keys())])
    

Unnamed: 0,score,input_artist,artist,input_track,track,album,uri
87,251,George Harrison,George Harrison,While My Guitar Gently Weeps,While My Guitar Gently Weeps - Live At Madison...,Let It Roll - Songs of George Harrison,spotify:track:4Egi6XuC0rbLlXfqmQeuFa
106,176,Bruce Springsteen,Bruce Springsteen,Born To Run,Born to Run,Born To Run,spotify:track:6hTcuIQa0sxrrByu9wTD7s
115,142,Elvis Presley,Elvis Presley,Unchained Melody,"Unchained Melody - Live at Ann Arbor, MI",Moody Blue,spotify:track:0OavtQSojULqejmC4Qbstr
202,45,Slipknot,Slipknot,Disasterpiece,Disasterpiece,Iowa,spotify:track:47VSmPTydr0saGjbQGwCeg
203,44,Birdy,Birdy,Skinny Love (cover),Skinny Love,Ballad Covers,spotify:track:7pLZTYZJgT0kO6lrYrlTrS
216,41,Antonio Vivaldi,Antonio Vivaldi,Vivaldis Four,"Vivaldi, De Courson & Le Berre: Eirin Sonata (...","O'stravaganza, Fantasy on Vivaldi and the Celt...",spotify:track:3lDMDHV6hxv9j4pS4CsgRB
220,40,Sergei Rachmaninoff,Sergei Rachmaninoff,Adagio sostenuto piano concerto no. 2,"Piano Concerto No. 2 in C Minor, Op. 18: 2. Ad...",Rachmaninov: Piano Concerto No.2,spotify:track:4rrrn8OLrttq7r9RgNXalU
227,38,Samuel Barber,Samuel Barber,Agnus Dei,Agnus Dei,Enchanted Isle,spotify:track:02vw0tjLamMJAzMlCSiNH3
234,37,Paul Simon,Paul Simon,The Boxer,"The Boxer - Live at Central Park, New York, NY...","Paul Simon's Concert In The Park August 15, 1991",spotify:track:3MUHmdHaQSQ79hnmJRk5uk
243,34,Everly Brothers,The Everly Brothers,Cathy's Clown,Cathy's Clown - 2007 Remaster,A Date with The Everly Brothers,spotify:track:27bw4i8LnECzVlbMZ4kFHw


In [None]:
playlist_df = pd.DataFrame({'artist': artist_list,
                           'track': track_list,
                           'album': album_list,
                           'popularity': popularity_list,
                           })



In [None]:
with pd.option_context("display.max_rows", 9999):
    display(playlist_df.sort_values('popularity'))
    

In [None]:
gold_dict_by_uri = {}
gold_dict_by_str = {}
addlist = []
c = 0
for i, artist, track, uri in gold_df[['artist', 'track', 'uri']].itertuples():
    # print(artist, track, uri)
    track_str = artist + ' | ' + track[:15]
    if track_str not in playlist_dict_by_str:
        addlist.append([artist, track, uri])
        print(artist, track, uri)
    gold_dict_by_uri[uri]=track_str
    gold_dict_by_str['track_str']= uri
#     if track_str not in playlist_dict_by_str:
#         c += 1
#         print (c, track_str)
        
print(len(gold_dict_by_str.items()))
print(len(gold_dict_by_uri.items()))

In [None]:
addlist

In [None]:
addlist = [['ABBA', 'One Of Us', 'spotify:track:6zgtBUEkAfilJ2YEOvNexR'],
 ['Gregorio Allegri',
  'Miserere mei, Deus',
  'spotify:track:6es7DmrhnDoKj5rsFvh3XU'],
 ['Amy Winehouse',
  'Love Is A Losing Game',
  'spotify:track:3uliGwmB52ZA7brgpZMzyH'],
 ['Barbara',
  "Ma plus belle histoire d'amour",
  'spotify:track:0qBVET4VkHsQAoboWlQ2pJ'],
 ['Ludwig van Beethoven',
  'Symphony No. 5 in C Minor, Op. 67: I. Allegro con brio',
  'spotify:track:2ygeBLTP9uu3OW3VTulD8N'],
 ['Benny Goodman', 'Sing, Sing, Sing', 'spotify:track:5L8ta4ECl5zeA6bGqY7G38'],
 ['Bill Withers', 'Lean on Me', 'spotify:track:3M8FzayQWtkvOhqMn2V4T2'],
 ['Billy Joel', 'Piano Man', 'spotify:track:70C4NyhjD5OZUMzvWZ3njJ'],
 ['Bob Dylan', 'Ballad of a Thin Man', 'spotify:track:0f5N14nB8xi0p3o4BlVvbx'],
 ['Bob Dylan', "Blowin' in the Wind", 'spotify:track:18GiV1BaXzPVYpp9rmOg0E'],
 ['Bob Dylan', 'Desolation Row', 'spotify:track:4n1ZGm3TxYmoYe1YR8cMus'],
 ['Bob Dylan', 'Duquesne Whistle', 'spotify:track:5kKW4bszhKSCYVPDO0sMbX'],
 ['Bob Dylan',
  'Forever Young - Slow Version',
  'spotify:track:4yWl0tnEanf3zmZzl9kbQn'],
 ['Bob Dylan', 'Gotta Serve Somebody', 'spotify:track:760420tYNmNjFgi8bWvbop'],
 ['Bob Dylan', 'Highway 61 Revisited', 'spotify:track:6os5B6xjuke9YfBKH3tu1e'],
 ['Bob Dylan',
  'I Shall Be Released - Studio Outtake - 1971',
  'spotify:track:5vyw005QQ42hrzrLxb3xEX'],
 ['Bob Dylan', 'I Want You', 'spotify:track:7tJQ4Ekp2vN3NlI3vJJW3v'],
 ['Bob Dylan', "It Ain't Me Babe", 'spotify:track:5nbNWAfT1S6V1vqj3snHxS'],
 ['Bob Dylan', 'Jokerman', 'spotify:track:6cuHkcRUqtQhtJ4sWCkd1q'],
 ['Bob Dylan',
  "Knockin' On Heaven's Door",
  'spotify:track:6HSXNV0b4M4cLJ7ljgVVeh'],
 ['Bob Dylan', 'Lay, Lady, Lay', 'spotify:track:4uYwlMp841PLJmj1gJJwIq'],
 ['Bob Dylan', 'Like a Rolling Stone', 'spotify:track:3AhXZa8sUQht0UEdBJgpGc'],
 ['Bob Dylan', 'Love Sick', 'spotify:track:3O1hpSOaJDW4SelgUG2XT3'],
 ['Bob Dylan', "Maggie's Farm", 'spotify:track:5rGD8FFgHw74cp3RPhucyg'],
 ['Bob Dylan',
  'Make You Feel My Love',
  'spotify:track:6rfGPGghQL7SJmZPXprXIc'],
 ['Bob Dylan',
  'Mississippi - Version 2',
  'spotify:track:6JWHNd8QMxTvojYkmZtKGI'],
 ['Bob Dylan', 'Mr. Tambourine Man', 'spotify:track:3RkQ3UwOyPqpIiIvGVewuU'],
 ['Bob Dylan', 'Murder Most Foul', 'spotify:track:1LfTvT9JPYuuZanwxLtZCr'],
 ['Bob Dylan', 'Not Dark Yet', 'spotify:track:1qbn6QrHG8XfnqVFKgNzKP'],
 ['Bob Dylan',
  'Rainy Day Women #12 & 35',
  'spotify:track:7BkAlVpGwXXl3sYNn5OoJ7'],
 ['Bob Dylan',
  'Sad-Eyed Lady of the Lowlands',
  'spotify:track:4jdtLLyEL7wY0TlCdMKhxq'],
 ['Bob Dylan', 'She Belongs to Me', 'spotify:track:2itBkHBUxGl4VfDj4HNyoD'],
 ['Bob Dylan',
  'Stuck Inside of Mobile with the Memphis Blues Again',
  'spotify:track:1NYTj6JEw3IOh4ggiBh82h'],
 ['Bob Dylan',
  'Subterranean Homesick Blues',
  'spotify:track:6k9DUKMJpWvu6eFG3O64Lg'],
 ['Bob Dylan', 'Tangled up in Blue', 'spotify:track:6Vcwr9tb3ZLO63F8DL8cqu'],
 ['Bob Dylan', 'Tempest', 'spotify:track:19scNzd4ogVsHrNWsms8Rg'],
 ['Bob Dylan',
  "The Times They Are A-Changin'",
  'spotify:track:52vA3CYKZqZVdQnzRrdZt6'],
 ['Bob Dylan',
  'Things Have Changed - Single Version',
  'spotify:track:5KOi77ameCimkAdw0DMNoy'],
 ['Bob Dylan',
  'Thunder on the Mountain',
  'spotify:track:4wo2eRp6aHcAlmhmfwiTAH'],
 ['Bob Dylan', 'Visions of Johanna', 'spotify:track:2rslQV48gNv3r9pPrQFPW1'],
 ['Brian Wilson', 'God Only Knows', 'spotify:track:2SznAUigFh6rMdGpcS5d7e'],
 ['Bright Eyes',
  'First Day of My Life',
  'spotify:track:0eBryM7ePQH3Klt3jz8xZd'],
 ['Crowded House',
  'Don’t Dream It’s Over - Home Demo',
  'spotify:track:0fiSpF9mvRFQWy0ca64d1g'],
 ['Léo Delibes', 'Flower Duet', 'spotify:track:5K8jqeLAxZIqHR6e5w5so1'],
 ['Dire Straits', 'Brothers In Arms', 'spotify:track:6XYBbVpu455ZdGWZNRLGbG'],
 ['Don McLean',
  'Vincent (Starry, Starry Night)',
  'spotify:track:2YDyH60Vro33KkDtNZCXIk'],
 ['Ed Sheeran', 'Photograph', 'spotify:track:41xNsY82OWtWbIfnRMK2ky'],
 ['Elvis Presley',
  'Can’t Help Falling in Love - Acoustic Cover',
  'spotify:track:0ghQkNDYLSl4GsqfkjTjWx'],
 ['Enya', 'Amarantine', 'spotify:track:0VmzazQQ0Mo1vJldr5NxTW'],
 ['Evan Rachel Wood', 'If I Fell', 'spotify:track:0gd3hRBQAEAw096YOcUrmR'],
 ['Fleetwood Mac', 'Rhiannon', 'spotify:track:05oETzWbd4SI33qK2gbJfR'],
 ['George Harrison',
  'All Things Must Pass - 2014 Remaster',
  'spotify:track:16OwZQuzMqnwn3FZsCBZly'],
 ['George Harrison',
  'Apple Scruffs - 2014 Remaster',
  'spotify:track:2K7WhpfZX3TCCMiwebp0W7'],
 ['George Harrison',
  'Art of Dying - 2014 Remaster',
  'spotify:track:6Jod7qrtYBhU3HcUmKk4hX'],
 ['George Harrison',
  'Awaiting on You All - 2014 Remaster',
  'spotify:track:0b65WkrBrg2qOkzQeDtQ9d'],
 ['George Harrison',
  'Ballad of Sir Frankie Crisp (Let It Roll) - 2014 Remaster',
  'spotify:track:0FWeRrB8T5R6maHbWQw4Kk'],
 ['George Harrison',
  'Behind That Locked Door',
  'spotify:track:2VVbLn8nMcWJzjcL1tZsUr'],
 ['George Harrison',
  'Beware of Darkness - 2014 Remaster',
  'spotify:track:606MCyZFMBlc52Ojnn1nvU'],
 ['George Harrison',
  'Give Me Love (Give Me Peace on Earth)',
  'spotify:track:71fXxvXqo1zxWDtBmjoEVk'],
 ['George Harrison',
  'Hear Me Lord - 2014 Remaster',
  'spotify:track:3kopbNyRj10XO1actGZexP'],
 ['George Harrison',
  'I Dig Love - 2014 Remaster',
  'spotify:track:42yK1Wy62c7malKSRwy0Qk'],
 ['George Harrison',
  'I Remember Jeep - 2014 Remaster',
  'spotify:track:058AE5M3ifbCh8VWOV7903'],
 ['George Harrison',
  "It's Johnny's Birthday - 2014 Remaster",
  'spotify:track:6Cv05rcW8HWwCC6wyEp1fC'],
 ['George Harrison',
  'Let It Down - 2014 Remaster',
  'spotify:track:5FFruMKbVg8AhwHnX4xBov'],
 ['George Harrison',
  'My Sweet Lord - 2014 Remaster',
  'spotify:track:6vE90mi4yKsQGY3YD2OOv1'],
 ['George Harrison',
  'Out of the Blue - 2014 Remaster',
  'spotify:track:1KHMyFaGvwVQ7ax4yjq4BZ'],
 ['George Harrison',
  'Plug Me In - 2014 Remaster',
  'spotify:track:0tyk2xHVjBd3nk16cGktTG'],
 ['George Harrison',
  'Run of the Mill - 2014 Remaster',
  'spotify:track:4uSlUBg3NVOA77E7wwKFTO'],
 ['George Harrison',
  'Thanks for the Pepperoni - 2014 Remaster',
  'spotify:track:3smkwfPqFsTmwfnBztMXaM'],
 ['George Harrison',
  'The Inner Light (Alternative Take) - Instrumental',
  'spotify:track:7gWPnvhaBFMlQsTBWEGcSC'],
 ['George Harrison',
  'Wah-Wah - 2014 Remaster',
  'spotify:track:5j3aqkMO2fl0s5eaSuVnQ8'],
 ['George Harrison',
  'What Is Life - 2014 Remaster',
  'spotify:track:44fw7RulJyj7dGIi9qR86N'],
 ['George Harrison',
  'While My Guitar Gently Weeps - Live At Madison Square Garden; 2009 Remaster',
  'spotify:track:4Egi6XuC0rbLlXfqmQeuFa'],
 ['Glenn Miller', 'In the Mood', 'spotify:track:1xsY8IFXUrxeet1Fcmk4oC'],
 ['Hans Zimmer', 'Cornfield Chase', 'spotify:track:6pWgRkpqVfxnj3WuIcJ7WP'],
 ['Hans Zimmer',
  'Day One (Interstellar Theme)',
  'spotify:track:4WmB04GBqS4xPMYN9dHgBw'],
 ["Israel Kamakawiwo'ole",
  'Maui Medley',
  'spotify:track:6TSJ3L9pBQsYIlCD5pk7ju'],
 ['James Taylor',
  'You’ve Got a Friend',
  'spotify:track:3nK4hWsTEr7fVXziI5bTmh'],
 ['Jay Ungar', 'Ashoken Farewell', 'spotify:track:2s6pqLeVialgt5l5TTSeas'],
 ['Jeff Buckley',
  'If You Knew - Live at Sin-é, New York, NY - July/August 1993',
  'spotify:track:1nd2JEHXbUuQFDiQzCBpsv'],
 ['Jimi Hendrix', 'One Rainy Wish', 'spotify:track:5Zyv0v4rPcrXjkaeImuodv'],
 ['Jimi Hendrix',
  'Spanish Castle Magic',
  'spotify:track:2KFE98Iw0X23sf4vJYcbLH'],
 ['Jimi Hendrix',
  'Wait Until Tomorrow',
  'spotify:track:2YtVzmZzew1ILUdNueyWd7'],
 ['John Lennon',
  'Imagine - Remastered 2010',
  'spotify:track:7pKfPomDEeI4TPT6EOYjn9'],
 ['John Mayer', 'Queen of California', 'spotify:track:0CETmgFGt8Ne8vLnaLcduU'],
 ['Johnny Cash',
  'I Walk The Line - Single Version',
  'spotify:track:1TKPfF2fvn6gVLVfp3iG4j'],
 ['Joni Mitchell',
  'Mitchell: Urge for Going (Instrumental Arrangement of the B-Side Track of the Joni Mitchell Single "You Turn Me on I\'m a Radio")',
  'spotify:track:1I1u9aTdxxQ7SDLgBB3V7b'],
 ['Kanye West', 'Come to Life', 'spotify:track:5xvXeuxISyXJDRbZZf4uzd'],
 ['Leonard Cohen', 'Chelsea Hotel #2', 'spotify:track:4krhCfJg0znykZoyjeMXRe'],
 ['Leonard Cohen', 'Dear Heather', 'spotify:track:3MTKMphPprAcBFG1uIhzPZ'],
 ['Leonard Cohen',
  "Death of a Ladies' Man",
  'spotify:track:5wrylUGwZugelovhryPYg2'],
 ['Leonard Cohen', 'The Future', 'spotify:track:5l8lYrnPEM1ln3J4XaTcy5'],
 ['Leonard Cohen',
  'You Want It Darker',
  'spotify:track:5zb7npjQqoJ7Kcpq4yD9qn'],
 ['Lingers.On', 'In Lingerie', 'spotify:track:6FH3kGlJbFVJDCG9RcERf7'],
 ['Louis Armstrong',
  'La vie en rose - Single Version',
  'spotify:track:3yYfoYGVpriV4fG9L1ogsD'],
 ['The Lovecats', 'The Lovecats', 'spotify:track:7iJUiiTfnuY5cTIeEBnqHr'],
 ['Ludovico Einaudi', 'Primavera', 'spotify:track:4BMHp3DkI8VLsuB9Kr0pzu'],
 ['Mazzy Star', 'Flowers In December', 'spotify:track:0G6Ws8Gbdt0S7pZeuYmkmm'],
 ['Metallica',
  'Fade To Black (Remastered)',
  'spotify:track:0dqGfCMAGyDgpUAgLNOjWd'],
 ['Wolfgang Amadeus Mozart',
  'Requiem in D Minor, K. 626: III. Sequenz No. 6, Lacrimosa dies illa',
  'spotify:track:4bvzJZXpkI3bkjxMCWOSu1'],
 ['My Chemical Romance',
  'The Light Behind Your Eyes',
  'spotify:track:3HyDpKAuR3e4l6QB7hSB2l'],
 ['Paul McCartney',
  'Here Today - Remixed 2015',
  'spotify:track:0QtnwXDziZN1K55fXuLN6q'],
 ['Paul McCartney',
  'I’ll Follow The Sun - Live At Amoeba 2007',
  'spotify:track:3xT59EeQdq0TPGtOlXXI8t'],
 ['Puscifer', 'The Humbling River', 'spotify:track:69GE6yPZZldvqtgBHrKXxg'],
 ['Ray LaMontagne',
  'Such A Simple Thing',
  'spotify:track:4PuUa8e5s7P3Zv1IdCGIsa'],
 ['Ray Manzarek',
  'Riders on the Storm',
  'spotify:track:3FvYcTXO2QtDY7kZQHku2d'],
 ['Red Hot Chili Peppers', 'Dosed', 'spotify:track:1iFIZUVDBCCkWe705FLXto'],
 ['Sky Cries Mary',
  "Don't Forget The Sky",
  'spotify:track:4sVpjCJRClVetRrdxVBolP'],
 ['Stevie Nicks', 'Landslide', 'spotify:track:5fprEY6WEN1wvFXkgfb22C'],
 ['Stevie Wonder', 'Isn’t She Lovely', 'spotify:track:6wGlAaMfyhKdEPr2zycAnN'],
 ['Taylor Swift',
  'Fearless (Taylor’s Version)',
  'spotify:track:77sMIMlNaSURUAXq5coCxE'],
 ['Taylor Swift',
  'the lakes - bonus track',
  'spotify:track:0eFQWVz0qIxDOvhLpZ40P7'],
 ['The Band',
  'When I Paint My Masterpiece - Remastered',
  'spotify:track:76WChUuOPeIK027IeUgr0l'],
 ['The Beach Boys',
  "I Just Wasn't Made For These Times - Mono",
  'spotify:track:4CuO8TINNqM3D7aUdNQ3zG'],
 ['The Beach Boys',
  "Let's Go Away For A While - Mono",
  'spotify:track:3GsgJI1aBrvUtqX8f3MhKT'],
 ['The Beatles',
  "Don't Let Me Down - Naked Version / Remastered 2013",
  'spotify:track:5BhMoGrz5KzG2fA5uzHjZ1'],
 ['The Beatles',
  'Love Me Do - Remastered 2009',
  'spotify:track:3VbGCXWRiouAq8VyMYN2MI'],
 ['The Chemical Brothers',
  'The Boxer',
  'spotify:track:1EUeDFq2zNP784GPaRs9aH'],
 ['The Cure',
  'A Night like This - 2006 Remaster',
  'spotify:track:7cKCz7gG84i1XLvDeM3ByT'],
 ['The Cure',
  'Disintegration - 2010 Remaster',
  'spotify:track:0zY8t5dC1KQXcPUKByWMJM'],
 ['The Cure',
  'From the Edge of the Deep Green Sea',
  'spotify:track:2vwBL9RVyr0vA4Og5VH0i3'],
 ['The Cure',
  'In Between Days - 2006 Remaster',
  'spotify:track:07CyrZF9eVd02zzIse7tZA'],
 ['The Cure', 'A Letter to Elise', 'spotify:track:4DdXOLc1VMAY34ourCn1Xa'],
 ['The Cure',
  'Lullaby - 2010 Remaster',
  'spotify:track:4d4oXk7O2lEhZ83ivV93li'],
 ['The Cure', 'Underneath The Stars', 'spotify:track:0PKVjYlKw7z3IvKAoxrYTR'],
 ['The Eagles', 'The Desperadoes', 'spotify:track:10ppF835WJMYI5v65gFLZ3'],
 ['The Helio Sequence',
  'Keep Your Eyes Ahead',
  'spotify:track:3yatRBsGMJ7wMoUIgDBzzo'],
 ['The Moldy Peaches',
  'Anyone Else But You',
  'spotify:track:2pKi1lRvXNASy7ybeQIDTy'],
 ['The Strokes', 'Someday', 'spotify:track:7hm4HTk9encxT0LYC0J6oI'],
 ['Traditional',
  'Scarborough Fair (Arr. Parkin)',
  'spotify:track:4wlNPczIullwvmwb4x0ltz'],
 ['Van Morrison',
  'Madame George - 1999 Remaster',
  'spotify:track:1N4MKISvC1ddfRCRQDXDd2'],
 ['Various Artists',
  'The Girl From Ipanema',
  'spotify:track:0JgH7g0kwsIs1THEVqhlUS'],
 ['Víg Mihály',
  'Öreg - From "Werckmeister Harmóniák"',
  'spotify:track:63wMgkXQuomlkW4an4O9b4'],
 ['Willie Nelson', 'Crazy', 'spotify:track:0xqtcLB45iKNfHroi5y1em']]


In [None]:
len(addlist)

In [None]:
addlist2 = [a[2] for a in addlist]

print (len(addlist2), 'items')

while(addlist2):
    sp.user_playlist_add_tracks(os.getenv('SPOTIFY_USERNAME'), 
                                playlist_id=playlist_id, 
                                tracks=addlist2[-100:])
    addlist2 = addlist2[:-100]
    print("added items, remaining ", len(addlist2))


In [50]:
pd.read_csv('artist_map.csv') \
.groupby('map').count().reset_index().to_csv('q.csv', index=False)