In [1]:
# this version reads a chunk of posts with the score embedded
# ranks by summing scores

import os
import glob
import pickle
from datetime import datetime
import time
import dotenv
import pandas as pd
import re
from tqdm import tqdm

import pandas_dedupe

import requests
import requests.auth

import praw

import openai
import tiktoken

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# load secrets from .env into environment variables
dotenv.load_dotenv()

print(f"{'Praw:':<20} {praw.__version__ :>10}")
print(f"{'OpenAI:':<20} {openai.version.VERSION :>10}")


Praw:                     7.7.0
OpenAI:                  0.27.4


See README.md
 - objective is to use OpenAI for named entity extraction to extract all the songs form [this reddit thread](https://www.reddit.com/r/AskReddit/comments/12viv4v/what_is_the_prettiest_song_you_ever_heard_in_your/) and make Spotify playlist
 - use Reddit PRAW API to download all the comments (get [Reddit API key](https://www.reddit.com/prefs/apps))
 - use OpenAI API with a prompt like, extract all the songs from this text to CSV get ([OpenAI API key](https://platform.openai.com/account/api-keys))
 - use Spotify API to make a playlist (get [Spotify API key](https://developer.spotify.com/documentation/web-api/tutorials/getting-started))
 - works, needed a lot of scrubbing, but about 1 day of work, wouldn't have been possible to do a 700-song playlist manually without a team of Mechanical Turks or something
 - If I wanted to go nuts, would process comments individually, save a file for each comment's extracted songs, would make it easier to track down what OpenAI gets wrong, have a resumable, retryable, repeatable process and 
 - Spotify playist is [here](https://open.spotify.com/playlist/08YFkbtTV6GBfNtjJ4PHDu?si=f4761d983ac84091) 
 
 needs a .env file per dot-env-template
 

# Configs

In [10]:
# model
gptmodel = 'gpt-3.5-turbo'

# a thread 
submission = "12viv4v"

# minimum karma to process a reply 
minkarma = 1

# an output file to accumulate all the responses
savefile = 'bronze.txt'

# main prompt 
prompt_prefix1="""You will act as a research assistant finding all the artists and track titles mentioned in a series of messages about music, and returning them in a CSV format.
Define a post delimited below by ===
===
post_id: "abcdefg"
post_score: "6996"
I love Yesterday by the Beatles. Also Hotel California from The Eagles. And Bruce Springsteen's Born To Run!
===

Define a CSV format delimited below by ---
---
"post_id","post_score","artist","track"
"abcdefg","6996","The Beatles","Yesterday"
"abcdefg","6996","The Eagles","Hotel California"
"abcdefg","6996","Bruce Springsteen","Born To Run"
---

You will extract all artists and tracks from each post below delimited by ~~~ .
You will return a list of records containing the artist and track extracted from the input, and the post_id and post_score of the post the artist and track is mentioned in.
You will return the records in a CSV format.
The header row should contain `"post_id","post_score","artist","track"`. 
The input is:
"""

# an output file to accumulate all the responses
savefile = 'bronze.txt'

# to speed things we'll cumulate posts til we get to nposts posts or maxchars total chars, whichever comes first
max_post_size=300  # redditor needs to put any songs in 1st couple hundred chars
maxtokens = 1024   # max tokens to send to get_response (with room for response)
# maxchars = 6000  # max tokens (words/fragments) is 4096 but I think stuffing the prompt maybe reduces quality?
nposts = 1000 # max posts to combine into a chunk



# Get all comments from a reddit posting

In [4]:
def getPraw():
    return praw.Reddit(user_agent="prettiest_song/0.001", 
                       client_id=os.getenv('CLIENT_ID'), 
                       client_secret=os.getenv('CLIENT_SECRET'))


def getAll(r, submissionId, verbose=True):
    submission = r.submission(submissionId)
    submission.comments.replace_more(limit=None)
    commentsList=submission.comments.list()
    return commentsList


In [5]:
# print(datetime.now())
# r = getPraw()
# res = getAll(r, submission)
# print(datetime.now())

# print("retrieved ", len(res), 'comments')


In [6]:
# # we have a list of comment objects
# # filter comments with at least some karma
# res3 = [r for r in res if r.score >= minkarma]
# print('filtered to ', len(res3), 'comments')
# res3[0].body, res3[0].score


In [4]:
# save so we can reload it later without downloading

# with open('reddit_full.pkl', 'wb') as f:
#     pickle.dump(res3, f)
    
with open('reddit_full.pkl', 'rb') as f:
    res3 = pickle.load(f)


# Extract artists and song titles using OpenAI

In [5]:
# check lengths of posts
shorties = []
big_ones = []
for i in range(len(res3)):
    if len(res3[i].body) <3:
        print (i, res3[i].body)
        shorties.append(i)
    if len(res3[i].body) > 4096:
        print(i, len(res3[i].body))
        big_ones.append(i)
        

423 4162
812 26
8405 4399
11597 Up
17225 5225
21450 W
21621 -🤓
21626 W
21977 :)
22240 t
23405 <3
23481 ✨️
24440 4543
24443 Ye


In [6]:
# avg length
sum([len(r.body) for r in res3]) / len(res3)

78.88987480247964

In [None]:
[i for i in range(len(res3)) if res3[i].score <= 0]

In [7]:
print (res3[big_ones[0]].body[:500])

Saturn by Sleeping at Last:
https://www.youtube.com/watch?v=dzNvk80XY9s

The version they did with Tim Fain is even more beautiful: 
https://www.youtube.com/watch?v=0nRpeAiur9Q

I'm not good at choosing one thing from a list of favorites as the best, so I've got about 30+ answers that are really a 30+ -way tie, and the one that I would consider as "prettiest" at any given moment is heavily influenced by my current mood. So, it could be any one of these from my "Heart Wrenchingly Beautiful" playl


In [3]:
csv_validate_re = re.compile(r'''
    \s*                # Any whitespace.
    (                  # Start capturing here.
      [^,"']+?         # Either a series of non-comma non-quote characters.
      |                # OR
      "(?:             # A double-quote followed by a string of characters...
          [^"\\]|\\.   # That are either non-quotes or escaped...
       )*              # ...repeated any number of times.
      "                # Followed by a closing double-quote.
      |                # OR
      '(?:[^'\\]|\\.)*'# Same as above, for single quotes.
    )                  # Done capturing.
    \s*                # Allow arbitrary space before the comma.
    (?:,|$)            # Followed by a comma or the end of a string.
    ''', re.VERBOSE)


In [None]:
# tokenizer to get accurate token count

# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model(gptmodel)
assert enc.decode(enc.encode("hello world")) == "hello world"

def count_tokens(s):
    return len(enc.encode(s))

count_tokens('four score and 7 years go our forefathers brought forth')

In [79]:
openai.api_key = os.getenv('OPENAI_API_KEY')

models = openai.Model.list()
print([(i, m.id) for i, m in enumerate(models["data"])])
models['data'][2]

[(0, 'whisper-1'), (1, 'babbage'), (2, 'gpt-3.5-turbo'), (3, 'davinci'), (4, 'text-davinci-edit-001'), (5, 'text-davinci-003'), (6, 'babbage-code-search-code'), (7, 'text-similarity-babbage-001'), (8, 'code-davinci-edit-001'), (9, 'text-davinci-001'), (10, 'ada'), (11, 'babbage-code-search-text'), (12, 'babbage-similarity'), (13, 'code-search-babbage-text-001'), (14, 'text-curie-001'), (15, 'code-search-babbage-code-001'), (16, 'text-ada-001'), (17, 'text-embedding-ada-002'), (18, 'text-similarity-ada-001'), (19, 'curie-instruct-beta'), (20, 'ada-code-search-code'), (21, 'ada-similarity'), (22, 'code-search-ada-text-001'), (23, 'text-search-ada-query-001'), (24, 'davinci-search-document'), (25, 'ada-code-search-text'), (26, 'text-search-ada-doc-001'), (27, 'davinci-instruct-beta'), (28, 'text-similarity-curie-001'), (29, 'code-search-ada-code-001'), (30, 'ada-search-query'), (31, 'text-search-davinci-query-001'), (32, 'curie-search-query'), (33, 'davinci-search-query'), (34, 'babbage-s

<Model model id=gpt-3.5-turbo at 0x7fc6d3e499f0> JSON: {
  "created": 1677610602,
  "id": "gpt-3.5-turbo",
  "object": "model",
  "owned_by": "openai",
  "parent": null,
  "permission": [
    {
      "allow_create_engine": false,
      "allow_fine_tuning": false,
      "allow_logprobs": true,
      "allow_sampling": true,
      "allow_search_indices": false,
      "allow_view": true,
      "created": 1684434433,
      "group": null,
      "id": "modelperm-Gsp3SyIu7GamHB3McQv3rMf5",
      "is_blocking": false,
      "object": "model_permission",
      "organization": "*"
    }
  ],
  "root": "gpt-3.5-turbo"
}

In [8]:
MAX_TOKENS = 4096   # https://platform.openai.com/docs/models

def get_response(messages, prompt_prefix="", verbose=False):

    prompt = prompt_prefix
    
    if type(messages) == list:
        for msg in messages:
            prompt += f"""
~~~
{msg}
~~~
"""
    else:
        prompt += messages
        
    if verbose:
        print(prompt)
        
    # retry loop, have received untrapped 502 error
    if count_tokens(prompt) > MAX_TOKENS:
        print("WARNING: %d tokens > %d" % (count_tokens(prompt), MAX_TOKENS))
        
    RETRIES = 3
    success = False    
    for i in range(RETRIES):
        try:
            response = openai.ChatCompletion.create(
                model=gptmodel,
                messages=[{"role":"user", 
                           "content": prompt}],
                temperature=0,
            )
            # no exception thrown
            success=True
            break   
        except Exception as error:
            print("An exception occurred:", error)
            print("Retrying chunk...")
            time.sleep(5)
            continue  # try again
    if success:
        # check response payload for any error message?
        response_msg = response['choices'][0]['message']
        if len(response_msg['content'])==0:
            print("there was a problem, content is empty, full payload follows:")
            print(response)
        if verbose:
            print(response_msg)
        return response_msg['content']
    else:
        return None



In [14]:
# for each comment object we will extract the body 
# then submit as part of a prompt to chatgpt
print(datetime.now())

nposts = 1000
slist = res3.copy()
total_posts = len(slist)
print("processing %d posts" % total_posts)

outdir = 'out'
logdir = 'logs'
# make sure out and logs are empty
for f in glob.glob('%s/*' % outdir):
    os.remove(f)
for f in glob.glob('%s/*' % logdir):
    os.remove(f)
count = 0
c = 0
maxtokens=2048

while(slist):  # still comments to process
    tokens_to_date = count_tokens(prompt_prefix1)
    reply_ids = []
    messages = []
    for _ in range(nposts):  # add up to this many posts to the prompt
        if slist:
            # make sure no single post > max_post_size, truncate in place as nec 
            slist[0].body = slist[0].body[:max_post_size]
            if tokens_to_date + count_tokens(slist[0].body) < maxtokens:
            # total post content < maxchars
            # if chars_to_date + len(slist[0].body) < maxchars:
                reply = slist.pop(0)
                reply_ids.append(reply.id)
                body = reply.body
                
                messages.append(f"""
post_id: "{reply.id}"
post_score: "{reply.score}"
{body}
"""
                )
                tokens_to_date += count_tokens(messages[-1])
                # chars_to_date += len(messages[-1])
                c += 1
            
    response = get_response(messages, prompt_prefix1, verbose=False)
    if response is None:   # FAIL - retries exhausted
        print('Bailing to next chunk')
        continue

    # do basic validation and cleanup
    # should check first line is valid header and doesn't reverse columns
    csv_valid, csv_err = [], []
    for line in response.split("\n"):
        try:
            csv_values = csv_validate_re.findall(line)
            if len(csv_values) == 4:
                csv_valid.append(line)
            else:
                csv_err.append(line)
        except:
            csv_err.append(line)
    csv_output = "\n".join(csv_valid)
        
    with open("%s/%04d.csv" % (outdir, count), 'w') as outfile:
        outfile.write(csv_output)
    
    if csv_err:
        with open("%s/%04d.err" % (outdir, count), 'w') as outfile:
            outfile.write("\n".join(csv_err))
        
    with open("%s/%04d.log" % (logdir, count), 'w') as logfile:
        logfile.write(str(reply_ids))
        logfile.write('\n\n===== raw prompt =====\n\n')        
        logfile.write("\n=====\n".join(messages))
        logfile.write('\n\n===== raw response =====\n\n')
        logfile.write(response)
        logfile.write('\n\n===== failed validation =====\n\n')
        logfile.write("\n".join(csv_err))
 
    count += 1
#     print(c)
    outcount = total_posts-len(slist)
    print(outcount, end=' ')
    
    
print()
print(datetime.now())



2023-05-20 01:03:03.547203
processing 24681 posts
59 107 167 216 280 332 384 427 481 534 578 633 689 733 779 831 887 938 991 1050 1101 1150 1203 1247 An exception occurred: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Retrying chunk...
1304 1352 1404 1454 1501 1547 1602 1659 1705 1758 1816 1873 1924 1971 2023 2081 2129 2179 2227 2272 2322 2377 An exception occurred: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID de1b3cfe1f6e91f2d868ccd128777aaa in your message.)
Retrying chunk...
2428 2480 2536 2595 2656 2713 2767 2816 2869 2931 2984 3038 3103 3164 3220 3272 3322 3377 3434 3490 3543 3598 3655 3703 3759 3818 An exception occurred: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Retrying chunk...
3880 3934 3990 4042 4096 

In [None]:
## concatenate outputs as bronze.txt
# may still have to tweak the files to get them to load
# should inspect .err files and clean up as necessary


In [None]:
# filelist = glob.glob('%s/*.csv' % outdir)

# output_df = None
# count = 0
# for f in sorted(filelist):
#     print(f)
#     try:
#         tempdf = pd.read_csv("%s" % (f), header=None)
#     except Exception as exc:
#         print(str(exc))
#         continue
#     colcount = len(tempdf.columns)
#     if len(tempdf.columns) != 4:
#         print('%s has %d columns, skipped' % (f, colcount))
#         continue
        
#     # ok
#     # truncate header row if it looks like a header
#     if tempdf.iloc[0][0]=='post_id':
#         tempdf = tempdf[1:]
#     # set the header explicitly
#     tempdf.columns=["post_id","post_score","artist","track"]

#     if output_df is not None:        
#         output_df = pd.concat([output_df, tempdf], axis=0)
#     else:
#         output_df = tempdf
#     count += 1
#     if count % 10 == 0:
#         print(count, end=' ')

        
        
        

In [15]:
filelist = glob.glob('%s/*.csv' % outdir)

with open(savefile, 'w') as outfile:
    for f in tqdm(filelist, desc = 'File concat'):
        with open(f, 'r') as infile:
            data = infile.read().strip()
            outfile.write(data)
            outfile.write("\n")
    
    

File concat: 100%|██████████| 481/481 [00:00<00:00, 2422.95it/s]


In [20]:
tempdf = pd.read_csv(savefile, header=None)
tempdf.columns=['post_id','post_score','artist','track']
tempdf

Unnamed: 0,post_id,post_score,artist,track
0,post_id,post_score,artist,track
1,jhfyzqd,1,REM,Find the River
2,jhfyzqd,1,Chris Cornell,Sunshower
3,jhfyzqd,1,Chris Cornell,Moonchild
4,jhfz0ky,1,Irma Thomas,Anyone who knows what love is
...,...,...,...,...
27240,jhfvmdt,1,Queen,Teo Torriate
27241,jhfvmdt,1,Queen,Who Wants to Live Forever
27242,jhfvmdt,1,Queen,Dear Friends
27243,jhfvoqv,1,Disturbed,The Sound of Silence


In [22]:
tempdf = tempdf.drop_duplicates() \
    .sort_values("post_score", ascending=False)
# drop header row
tempdf = tempdf.loc[~(tempdf['post_id'].str.strip()=='post_id')]
# na to ""
tempdf.loc[tempdf['post_id'].isna(), 'post_id'] = ''
tempdf.loc[tempdf['post_score'].isna(), 'post_score'] = ''
tempdf.loc[tempdf['artist'].isna(), 'artist'] = ''
tempdf.loc[tempdf['track'].isna(), 'track'] = ''
# strip spaces
tempdf['post_id'] = tempdf['post_id'].str.strip()
tempdf['post_score'] = tempdf['post_score'].str.strip()
tempdf['artist'] = tempdf['artist'].str.strip()
tempdf['track'] = tempdf['track'].str.strip()
# clean up post_score to valid int
tempdf['post_score'] = tempdf['post_score'].apply(lambda s: "".join([c for c in s if c.isdigit()]))
tempdf['post_score'] = tempdf['post_score'].apply(lambda x: x[-5:])
tempdf['post_score'] = tempdf['post_score'].apply(lambda s: int(s) if s else 1)
# drop missing tracks, cleanup track
tempdf = tempdf.drop(tempdf.loc[tempdf['track']==''].index)
tempdf = tempdf.drop(tempdf.loc[tempdf['track'].str.lower()=='unknown'].index)
tempdf = tempdf.drop(tempdf.loc[tempdf['track'].str.lower()=='track'].index)
tempdf = tempdf.sort_values('post_score', ascending=False)
tempdf.loc[tempdf['post_score']==0, 'post_score'] = 1
tempdf


Unnamed: 0,post_id,post_score,artist,track
14071,REM,3,1,Night swimming
14070,Gene Kelly,306,1,singing in the rain
14069,Andrzej Żarnecki,306,1,róza i bez
14068,Ellie Goulding,3,1,Mirrors
14066,Adagio for Strings,3,1,Agnus Dei
...,...,...,...,...
6383,jhcppp7,0,Unknown,Somewhere over the rainbow
7837,jheyi2f,0,Amy Lee,Broken
19450,jhcf41p,0,Dave Matthews,In My Life
17914,jhdf3sj,0,The Rolling Stones,Beast of Burden


In [35]:
tempdf.to_csv('bronze.csv', index=False)

tempdf.to_csv('silver.csv', index=False)
with open('silver.pkl', 'wb') as f:
    pickle.dump(tempdf, f)

len(tempdf)



23134

In [71]:
df = tempdf


In [38]:
artist_df = df[['artist','track']].groupby('artist') \
    .count() \
    .sort_values('track', ascending=False) \
    .reset_index()
artist_df = artist_df.drop(artist_df.loc[artist_df['artist'].str.lower().str.startswith('unknown')].index)
artist_df = artist_df.drop(artist_df.loc[artist_df['artist'].str.lower().str.startswith('various')].index)
artist_df = artist_df.drop(artist_df.loc[artist_df['artist']==''].index)

artist_df

Unnamed: 0,artist,track
2,The Beatles,326
3,Radiohead,216
5,Fleetwood Mac,149
6,Pink Floyd,138
7,Jeff Buckley,131
...,...,...
6897,Jean-Jaques Goldman,1
6898,Jean-Jacques Goldman,1
6899,Jean Mouton,1
6900,Jean Michel Blais,1


# Impute missing artists
if someone just says 'Clair de Lune', or 'Let it be', without specifying the artist, maybe we can impute that?

In [94]:
# for missing artists, try to impute the artist based on the track
missing_artist_df = df.loc[(df['artist'].isna()) | (df['artist']=='')]
missing_artist_df


Unnamed: 0,post_id,post_score,artist,track,artist2
20912,jhc6oqe,4163,,Clair de Lune,
20913,jhbxc5p,2812,,Claire de Lune,
17873,jhdrv5t,731,,Avril 14th,
9946,jhc7g4r,573,,Linger,
13323,jhclthe,356,,Little Green,
...,...,...,...,...,...
4036,jhe4d8r,1,,real,
1904,jhegm4p,1,,Murmaider,
19703,jhd1f37,1,,Nocturne in E flat major,
18026,jhcmea3,1,,The Blackbird,


In [97]:
prompt_prefix3 = """I will provide you a list of well-known recordings.
I would like you to review each recording, and provide the name of the artist most closely associated with the recording.
You will provide them in CSV format, one record per line in the following order: recording, artist. Enclose each field in double-quotes.
The input is:

"""

def missing_artists(missing_artist_df):
    
    missing_track_map = {}
    
    slist = missing_artist_df['track'] \
        .dropna() \
        .str.lower() \
        .str.strip() \
        .drop_duplicates() \
        .tolist()

    slist.sort()
    n_missing = len(slist)

    while(slist):  # still artists to process
        print(datetime.now(), end=" ")

        tokens_to_date = count_tokens(prompt_prefix3)
        prompt = ''
        rows = 0
        for _ in range(nposts):  # add up to nposts posts to the prompt
            if slist and tokens_to_date + count_tokens(slist[0]) < 1024:
                track = f'"{slist.pop(0)}"\n'
                prompt += track
                tokens_to_date += count_tokens(track)
                rows += 1
            else:
                break
        print(f"sending {rows} rows...", end=" ")
        response = get_response(prompt, prompt_prefix3, verbose=False)

        if response is None:   # FAIL - retries exhausted
            print('Bailing to next chunk')
            continue

        if not response:
            print("nothing returned ... check returned dict for errors")

        lines = response.split("\n")
        print(f"received {len(lines)} lines...")
        c=0        
        for line in lines:
            try:
                csv_values = csv_validate_re.findall(line)
                if len(csv_values) != 2:
                    print(f"{len(csv_values)} values found: ", line)
                    continue
                track_input, artist_correct = csv_values[0], csv_values[1]

                if len(artist_correct) == 0:
                    print("empty artist returned")
                    continue
                # fix artist enclosed in quotes, parens, etc.
                while len(artist_correct) >=2 and (not artist_correct[0].isalnum()) and artist_correct[0] == artist_correct[-1]:
                    artist_correct = artist_correct[1:-1]
                while len(track_input) >=2 and (not track_input[0].isalnum()) and track_input[0] == track_input[-1]:
                    track_input = track_input[1:-1]
                # if it wasn't found then skip
                if artist_correct.lower().startswith('unknown'):
                    continue
                if artist_correct.lower() == 'n/a':
                    continue
                # store in dict to update df
                c += 1
                missing_track_map[track_input]=artist_correct
                print(f'{track_input}: {artist_correct}')                    
            except Exception as error:
                print('error', line)
                print(error)
                continue
                
        print(f"{c} lines processed, total {n_missing-len(slist)}, {len(slist)} of {n_missing} remaining")
        
    return missing_track_map
                
missing_track_map = missing_artists(missing_artist_df)



2023-05-20 19:31:41.034822 sending 156 rows... received 156 lines...
""( )"", "Sigur Rós"
""..... in paris"", "Frank Sinatra"
""23"", "Jimmy Eat World"
""26"", "Paramore"
""3 little birds"", "Bob Marley"
""74-75"", "The Connells"
""86d - no escort"", "Mitski"
""a bitter sweet genesis for him and her"", "The Dear Hunter"
""a case of you"", "Joni Mitchell"
""a change is gonna come"", "Sam Cooke"
""a la claire fontaine"", "Traditional"
""a lack of color"", "Death Cab for Cutie"
""a million dreams"", "The Greatest Showman Cast"
""a nightingale sang in berkeley square"", "Vera Lynn"
""a pillow of winds"", "Pink Floyd"
""a river runs through you"", "Yiruma"
""a summer place theme"", "Percy Faith"
""a thing on strings"", "The American Dollar"
""a thousand years"", "Christina Perri"
""a tine for us"", "Nino Rota"
""a voz do violão"", "João Gilberto"
""a whiter shade of pale"", "Procol Harum"
""a whole new world"", "Peabo Bryson and Regina Belle"
""achilles come down"", "Gang of Youths"
""acros

2023-05-20 19:34:55.787615 sending 140 rows... received 140 lines...
""Don't Break My Heart"", "UB40"
""Don't Know Much"", "Linda Ronstadt and Aaron Neville"
""Don't Think Twice, It's All Right"", "Bob Dylan"
""Don’t L"", "Missy Elliott"
""Don’t Let Me Down"", "The Beatles"
""Don’t Look Back"", "Boston"
""Don’t Talk"", "The Beach Boys"
""Doschitaii"", "Tatu"
""Down in a Hole"", "Alice in Chains"
""Down to the River to Pray"", "Alison Krauss"
""Down to You"", "Joni Mitchell"
""Dream a Little Dream"", "The Mamas & The Papas"
""Dream Sweet in Sea Major"", "Miracle Musical"
""Dreaming Again"", "Jim Croce"
""Dreaming My Dreams"", "Waylon Jennings"
""Dreams"", "Fleetwood Mac"
""Drips//Auntie’s Harp"", "Flying Lotus"
""Dry Hands"", "C418"
""Duo des Fluers"", "Léo Delibes"
""Dust in the Wind"", "Kansas"
""Duvet"", "Boa"
""Dylan Version"", "The Avett Brothers"
""Earth Song"", "Michael Jackson"
""Easy Way Out"", "Elliott Smith"
""Ebb Tide"", "The Righteous Brothers"
""Ecailles de Lune - Part 1 &

2023-05-20 19:39:57.957797 sending 137 rows... received 137 lines...
""liebestraum no 3"", "Franz Liszt"
""life is strange ost"", "Jonathan Morali"
""life letters"", "Lucas King"
""like real people do"", "Hozier"
""lillium"", "Kumiko Noma"
""linger"", "The Cranberries"
""listen to this"", "John Williams"
""listen to your heart"", "Roxette"
""little dru"", "Lucas King"
""little earthquakes"", "Tori Amos"
""little green"", "Joni Mitchell"
""little ship of dreams"", "Dean Martin"
""little things mean a lot"", "Kitty Kallen"
""little wing"", "Jimi Hendrix"
""living life in the night"", "Drew Holcomb & The Neighbors"
""location"", "Khalid"
""loch lomond"", "Traditional"
""lochs of the tay"", "Traditional"
""lonesome prison blues version"", "Traditional"
""long after you're gone"", "Chris Jones"
""long and winding road"", "The Beatles"
""look on down from the bridge"", "Mazzy Star"
""lord of cinder"", "Yuka Kitamura"
""lord of the rings"", "J.R.R. Tolkien"
""lord of the rings soundtrack"", "

2023-05-20 19:43:07.176518 sending 144 rows... received 144 lines...
""rachmaninoff piano concerto 2"", "Sergei Rachmaninoff"
""rachmaninoff’s piano concerto no. 2"", "Sergei Rachmaninoff"
""rachmaninov 2nd piano concerto"", "Sergei Rachmaninoff"
""rain song"", "Led Zeppelin"
""rainbow connection"", "Kermit the Frog"
""rainbow road n64 version :3"", "Nintendo"
""raindrop prelude"", "Frederic Chopin"
""rainy night in soho"", "The Pogues"
""ram ranch"", "Grant MacDonald"
""real"", "Khalid"
""reba"", "Phish"
""reckoner"", "Radiohead"
""red dress"", "Lucy Hale"
""red rain"", "Peter Gabriel"
""red swan"", "YOSHIKI feat. HYDE"
""reflection"", "Christina Aguilera"
""release"", "Pearl Jam"
""reluctant heroes"", "Hiroyuki Sawano"
""remember me"", "Coco Soundtrack"
""reminiscing"", "Little River Band"
""restless"", "Alison Krauss & Union Station"
""rhapsody in blue"", "George Gershwin"
""rhapsody on a theme of paganini"", "Sergei Rachmaninoff"
""rhapsody on a theme of paganini, variation no. 18 

2023-05-20 19:46:20.590719 sending 121 rows... received 121 lines...
""the middle"", "Jimmy Eat World"
""the moon represents my heart"", "Teresa Teng"
""the night we met"", "Lord Huron"
""the nightingale"", "Demi Lovato"
""the noose"", "A Perfect Circle"
""the noose!!!"", "A Perfect Circle"
""the numbers"", "Radiohead"
""the nurse who loved me (cover)"", "A Perfect Circle"
""the one"", "Kodaline"
""the one about blonde hair and blue eyes."", "The Avett Brothers"
""the one in my head"", "The 1975"
""the opening from jobless reincarnation season 1"", "Yuiko Oohara"
""the opening song of tonikawa: over the moon for you"", "Akari Kitou"
""the parting glass"", "Traditional"
""the perfect girl"", "Mareux"
""the piano"", "Michael Nyman"
""the planets"", "Gustav Holst"
""the planets, op. 32: iv. jupiter, the bringer of jollity"", "Gustav Holst"
""the prayer"", "Celine Dion and Andrea Bocelli"
""the prettiest song in the world"", "Ween"
""the rain song"", "Led Zeppelin"
""the recitation of the 

In [98]:
missing_track_map 


{'"( )"': 'Sigur Rós',
 '"..... in paris"': 'Frank Sinatra',
 '"23"': 'Jimmy Eat World',
 '"26"': 'Paramore',
 '"3 little birds"': 'Bob Marley',
 '"74-75"': 'The Connells',
 '"86d - no escort"': 'Mitski',
 '"a bitter sweet genesis for him and her"': 'The Dear Hunter',
 '"a case of you"': 'Joni Mitchell',
 '"a change is gonna come"': 'Sam Cooke',
 '"a la claire fontaine"': 'Traditional',
 '"a lack of color"': 'Death Cab for Cutie',
 '"a million dreams"': 'The Greatest Showman Cast',
 '"a nightingale sang in berkeley square"': 'Vera Lynn',
 '"a pillow of winds"': 'Pink Floyd',
 '"a river runs through you"': 'Yiruma',
 '"a summer place theme"': 'Percy Faith',
 '"a thing on strings"': 'The American Dollar',
 '"a thousand years"': 'Christina Perri',
 '"a tine for us"': 'Nino Rota',
 '"a voz do violão"': 'João Gilberto',
 '"a whiter shade of pale"': 'Procol Harum',
 '"a whole new world"': 'Peabo Bryson and Regina Belle',
 '"achilles come down"': 'Gang of Youths',
 '"across the universe"': 'T

In [100]:
missing_track_map

{'"( )"': 'Sigur Rós',
 '"..... in paris"': 'Frank Sinatra',
 '"23"': 'Jimmy Eat World',
 '"26"': 'Paramore',
 '"3 little birds"': 'Bob Marley',
 '"74-75"': 'The Connells',
 '"86d - no escort"': 'Mitski',
 '"a bitter sweet genesis for him and her"': 'The Dear Hunter',
 '"a case of you"': 'Joni Mitchell',
 '"a change is gonna come"': 'Sam Cooke',
 '"a la claire fontaine"': 'Traditional',
 '"a lack of color"': 'Death Cab for Cutie',
 '"a million dreams"': 'The Greatest Showman Cast',
 '"a nightingale sang in berkeley square"': 'Vera Lynn',
 '"a pillow of winds"': 'Pink Floyd',
 '"a river runs through you"': 'Yiruma',
 '"a summer place theme"': 'Percy Faith',
 '"a thing on strings"': 'The American Dollar',
 '"a thousand years"': 'Christina Perri',
 '"a tine for us"': 'Nino Rota',
 '"a voz do violão"': 'João Gilberto',
 '"a whiter shade of pale"': 'Procol Harum',
 '"a whole new world"': 'Peabo Bryson and Regina Belle',
 '"achilles come down"': 'Gang of Youths',
 '"across the universe"': 'T

In [117]:
# check for reasonableness, clean up and apply
df['track']=df['track'].astype(str)
df['artist2'] = df.apply(lambda row: missing_track_map[row.track.lower().strip()] if row.artist=="" and row.track.lower().strip() in missing_track_map else row.artist, axis=1)
df.loc[df['artist'] != df['artist2']]



Unnamed: 0,post_id,post_score,artist,track,artist2
20912,jhc6oqe,4163,,Clair de Lune,Claude Debussy
20913,jhbxc5p,2812,,Claire de Lune,Claude Debussy
17873,jhdrv5t,731,,Avril 14th,Aphex Twin
9946,jhc7g4r,573,,Linger,The Cranberries
13323,jhclthe,356,,Little Green,Joni Mitchell
...,...,...,...,...,...
4036,jhe4d8r,1,,real,Khalid
1904,jhegm4p,1,,Murmaider,Dethklok
19703,jhd1f37,1,,Nocturne in E flat major,Chopin
18026,jhcmea3,1,,The Blackbird,The Beatles


In [211]:
df['artist'] = df.apply(lambda row: missing_track_map[row.track.lower()] if row.artist=="" and row.track.lower() in missing_track_map else row.artist, axis=1)



# Fix typos, abbreviations, missing artists using ChatGPT

In [39]:
prompt_prefix2 = """You will act as a proofreader. I will provide you a list of recording artists or composers.
You will review each input artist for any spelling errors or abbreviations and provide the corrected full artist without abbreviation. 
You will provide them in CSV format, one record per line in the following order: input_artist, corrected_artist. Enclose each field in double-quotes.
The input is:

"""


In [48]:
# proofread / dedupe artists
# may want to run this whole sequence a couple of times and update df, silver.csv

def dedupe_artists(artist_df):
    
    nposts = 1000
    artist_map = {}

    slist = sorted(artist_df['artist'].tolist())
    n_artists=len(slist)

    while(slist):  # still artists to process
        print(datetime.now(), end=" ")

        prompt = ""
        tokens_to_date = count_tokens(prompt_prefix2)
        rows = 0
        for _ in range(nposts):  # add up to 100 posts to the prompt
            if slist:
                if tokens_to_date + count_tokens(slist[0]) < 1024:
                    artist = f'{slist.pop(0)}\n'
                    prompt += artist
                    tokens_to_date += count_tokens(artist)
                    rows += 1
            else:
                break
        print(f"sending {rows} rows...", end=" ")

        response = get_response(prompt, prompt_prefix2, verbose=False)
        if response is None:   # FAIL - retries exhausted
            print('Bailing to next chunk')
            continue

        if not response:
            print("there was a problem, check the payload")


        lines = response.split("\n")
        print(f"received {len(lines)} lines...")
        # sometimes doesn't match, chatgpt monkeys skip some

        c=0
        for line in lines:
            try:
                c+=1
                csv_values = csv_validate_re.findall(line) 
                if len(csv_values) != 2:
                    print('%d values found' % len(csv_values), line)
                    continue
                artist_input, artist_correct = csv_values[0].strip(), csv_values[1].strip()
                if len(artist_correct) == 0:
                    print("empty artist returned")
                    continue
                # fix artist enclosed in quotes, parens, etc.
                while (not artist_input[0].isalnum()) and artist_input[0] == artist_input[-1]:
                    artist_input = artist_input[1:-1]
                while (not artist_correct[0].isalnum()) and artist_correct[0] == artist_correct[-1]:
                    artist_correct = artist_correct[1:-1]
                # if it matches modulo case then skip
                if artist_input.lower() == artist_correct.lower():
                    continue
                # if it wasn't found then skip
                if artist_correct.lower().startswith('unknown'):
                    continue
                if artist_correct.lower() == 'n/a':
                    continue
                if artist_correct.lower() == "no correction needed":
                    continue
                # store in dict to update df
                artist_map[artist_input]=artist_correct
                print(f'"{artist_input}", "{artist_correct}"')
            except Exception as error:
                print('error', line)
                print(error)
                continue
        print(f"{c} lines processed, total {n_artists-len(slist)}, {len(slist)} of {n_artists} remaining")
        
    return artist_map

artist_map=dedupe_artists(artist_df)

print(datetime.now())


2023-05-20 15:52:23.810711 sending 105 rows... received 105 lines...
"$not", "Snot"
"uicideboy", "Suicideboys"
"(G)-IDLE", "(G)I-DLE"
"*nsync", "NSYNC"
error "-","-"
string index out of range
"1", "One"
"1975", "The 1975"
"2 cellos", "2Cellos"
"2Pac", "Tupac Shakur"
"2pac", "Tupac Shakur"
"3", "Three"
"30 seconds to mars", "Thirty Seconds to Mars"
"3rd secret", "Third Secret"
"411", "Four One One"
"42 dougg and lil baby", "42 Dugg and Lil Baby"
"5sos", "5 Seconds of Summer"
"8485", "88rising"
"88lien", "88rising"
"A R Rahman", "A. R. Rahman"
"A Silver Mt. Zion", "Thee Silver Mt. Zion Memorial Orchestra"
"A Touch of Class aka ATC", "ATC"
"A$AP Rocky", "ASAP Rocky"
"A.R. Rahman", "A. R. Rahman"
"A.R.Rehman", "A. R. Rahman"
"ALanis Morisette", "Alanis Morissette"
"APC", "A Perfect Circle"
"AR Rehman", "A. R. Rahman"
"ASMZ", "Thee Silver Mt. Zion Memorial Orchestra"
"Aaron Copeland", "Aaron Copland"
"Ab soul", "Ab-Soul"
"Abel Korzeniowsku", "Abel Korzeniowski"
"Above and Beyond", "Above & 

2023-05-20 16:07:50.028063 sending 88 rows... received 88 lines...
"Carle commando", "Carl Craig"
"Carlos Nunez", "Carlos Núñez"
"Carole and Tuesday", "Carole & Tuesday"
"Caroline Polacheck", "Caroline Polachek"
"Caroline Spence & Matt Berninger", "Caroline Spence, Matt Berninger"
"Carrie Underwood and Vince Gill", "Carrie Underwood, Vince Gill"
"Carti and Summertime Sadness", "Carti, Summertime Sadness"
"Case, lang, Veirs", "Case / Lang / Veirs"
"Casey Muskgraves", "Kacey Musgraves"
"Cass Eliot", "Cass Elliot"
"Cass Elliott", "Cass Elliot"
"Cat Steven's", "Cat Stevens"
"Cat Stevenson", "Cat Stevens"
"Cbool", "C-Bool"
"Cecelia Castleman", "Cecelia Condit"
"Cecile Corbel", "Cécile Corbel"
"Celine Dion & Barbara Streisand", "Celine Dion, Barbra Streisand"
"Celtic Women", "Celtic Woman"
"Celtic women", "Celtic Woman"
"Celtric woman", "Celtic Woman"
"Cerys Mathews", "Cerys Matthews"
"Cesaria Evora", "Cesária Évora"
"Chappelle Show", "Chappelle's Show"
88 lines processed, total 1088, 5808 o

2023-05-20 16:20:19.991489 sending 84 rows... received 84 lines...
"Emerson, Lake and Palmer", "Emerson, Lake & Palmer"
"Emerson, Lake, and Palmer", "Emerson, Lake & Palmer"
"Emiliana Torini", "Emiliana Torrini"
"Emilíana Torrini", "Emiliana Torrini"
"Emmett Otter's Jug-Band Christmas", "Emmet Otter's Jug-Band Christmas"
"Emmy Lou Harris", "Emmylou Harris"
"Ennis Morricone", "Ennio Morricone"
"Eric Clapman", "Eric Clapton"
"Eric Satie", "Erik Satie"
"Eric Whitaker", "Eric Whitacre"
"Esther García", "Esther Garcia"
"Everley Brothers", "Everly Brothers"
84 lines processed, total 1890, 5006 of 6896 remaining
2023-05-20 16:21:41.571126 sending 94 rows... received 94 lines...
"F. Liszt", "Franz Liszt"
"FFIV", "Final Fantasy IV"
"FFXIV", "Final Fantasy XIV"
"Fabrizio de Andrè", "Fabrizio de André"
"Fairport Conversation", "Fairport Convention"
"Fairuz", "Fairouz"
"Fantasie Impromptu", "Fantaisie-Impromptu"
"Felix Mendelssohn-Bartholdy", "Felix Mendelssohn"
"Final Fantasy 7", "Final Fantasy V

2023-05-20 16:29:14.676874 sending 87 rows... received 87 lines...
"Herb Alpert and the Tijuana Brass", "Herb Alpert & The Tijuana Brass"
"Herb Alpert and the Tijuana brass", "Herb Alpert & The Tijuana Brass"
"Her’s", "Her's"
"Hosier", "Hozier"
"Howard Shore and London Philharmonic Orchestra", "Howard Shore & London Philharmonic Orchestra"
"Howl’s Moving Castle", "Howl's Moving Castle"
"Hudson Mohawk", "Hudson Mohawke"
"Hudson Mowhawke", "Hudson Mohawke"
"Hugo Alfven", "Hugo Alfvén"
"Human Experience, Saint Sinner", "Human Experience & Saint Sinner"
87 lines processed, total 2511, 4385 of 6896 remaining
2023-05-20 16:30:33.092030 sending 90 rows... received 90 lines...
"Imogene Heap", "Imogen Heap"
"India Aire", "India.Arie"
"India Arie", "India.Arie"
"Iron and Wine", "Iron & Wine"
"Iron and wine", "Iron & Wine"
"Israel Kamakawiwoʻole", "Israel Kamakawiwo'ole"
"Israel Kamakawiwo’ole", "Israel Kamakawiwo'ole"
"Israel Kamikawiwol’ole", "Israel Kamakawiwo'ole"
" Kamakawiwo'ole'", "Israel 

2023-05-20 16:48:05.806690 sending 84 rows... received 84 lines...
"Mancini", "Henry Mancini"
"Mancini/Hepburn", "Henry Mancini & Audrey Hepburn"
"Mano chao", "Manu Chao"
"Maranda Lambert", "Miranda Lambert"
"Marchyplayground", "Marching Playground"
"Marcos Fernandez", "Marcos Fernández"
"Marcy's Playground", "Marcy Playground"
"Maria Gadu", "Maria Gadú"
"Maria McKey", "Maria McKee"
"Marie Callas", "Maria Callas"
"Mario Galaxy", "Super Mario Galaxy"
"Mark Knophler", "Mark Knopfler"
"Mark O’Connor and Yo-Yo Ma", "Mark O'Connor & Yo-Yo Ma"
"Mark Vincent and 2 Cellos", "Mark Vincent & 2Cellos"
"Marmalade", "The Marmalade"
"Marmelade", "The Marmalade"
"Marshall Mathers", "Eminem"
"Marshall Tucker Band", "The Marshall Tucker Band"
"Marta Wainwright", "Martha Wainwright"
"Martin Garrix & Shaun Farrugia", "Martin Garrix & Shaun Frank"
"Martin Michael Murphy", "Michael Martin Murphey"
"Marty", "Marty Robbins"
"Marty O'Donnell, Stan LePard, and Michael Salvatori", "Marty O'Donnell, Stan LePard,

2023-05-20 17:01:19.941820 sending 93 rows... received 93 lines...
"Planet DOB OST", "Planet D.O.B. Original Soundtrack"
"Playboy Carti", "Playboi Carti"
"Porter Robinson, Madeon", "Porter Robinson & Madeon"
"Postal Service", "The Postal Service"
"Prince of Egypt", "The Prince of Egypt Soundtrack"
"Procal Harum", "Procol Harum"
"Procol Harem", "Procol Harum"
"Procul Harum", "Procol Harum"
"Psychedelic Furs", "The Psychedelic Furs"
"Public Image", "Public Image Ltd."
"Puccini", "Giacomo Puccini"
"Pucsifer", "Puscifer"
"Puddle of Mud", "Puddle of Mudd"
"Pure Praire League", "Pure Prairie League"
"Purtiy Ring", "Purity Ring"
"Pytor Ilch Tchaikovsky", "Pyotr Ilyich Tchaikovsky"
"Queensreich", "Queensrÿche"
"Queensryche", "Queensrÿche"
"R. E. M.", "R.E.M."
"R. Strauss", "Richard Strauss"
"R.E.M", "R.E.M."
"R.I.P", "R.I.P."
93 lines processed, total 4678, 2218 of 6896 remaining
2023-05-20 17:02:41.957267 sending 88 rows... received 88 lines...
"RAC (Katie Herzig)", "RAC (feat. Katie Herzig)"

2023-05-20 17:10:36.884681 sending 88 rows... received 82 lines...
"Sinnead O'Connor and The Chieftans", "Sinéad O'Connor"
"Siouxsie & the Banshees", "Siouxsie and the Banshees"
"Sir Mix A Lot", "Sir Mix-A-Lot"
"Sisko", "Sissel"
"Sissel Kyrkjebo", "Sissel"
"Sister Soliel", "Sister Soleil"
"Sivert Høyem", "Sivert Hoyem"
"Six Pence, none the richer", "Sixpence None The Richer"
"Slaughter Beach Dog", "Slaughter Beach, Dog"
"Slim Shady", "Eminem"
"Slow Show", "The Slow Show"
"Smetana", "Bedrich Smetana"
"Smith-Thell", "Smith & Thell"
82 lines processed, total 5299, 1597 of 6896 remaining
2023-05-20 17:11:47.646061 sending 91 rows... received 90 lines...
"Sonny & Cher", "Sonny and Cher"
"Sophie B Hawkins", "Sophie B. Hawkins"
"Sopor Aeternus & The Ensemble of Shadows", "Sopor Aeternus and The Ensemble of Shadows"
"Sort Sol feat. Sissel Kyrkjebø", "Sort Sol featuring Sissel Kyrkjebø"
"Sparklehourse", "Sparklehorse"
"Spell Songs feat. Julie Fowlis", "Spell Songs featuring Julie Fowlis"
"Sprit

2023-05-20 17:17:27.057172 sending 89 rows... received 89 lines...
"The King’s Singers", "The King's Singers"
"The La’s", "The La's"
"The Mamas and the Papa", "The Mamas and the Papas"
"The Maria's", "The Marias"
"The Marías", "The Marias"
"The Oh Hello's", "The Oh Hellos"
"The Oh Hello’s", "The Oh Hellos"
"The Oh, Hellos", "The Oh Hellos"
"The Onc", "The Once"
89 lines processed, total 5846, 1050 of 6896 remaining
2023-05-20 17:18:38.594845 sending 90 rows... received 90 lines...
"The Purity Ring", "Purity Ring"
"The Pyrates Royale", "Pyrates Royale"
"The Red Hot Chili Peppers", "Red Hot Chili Peppers"
"The Red Jumpsuit Apparatus", "Red Jumpsuit Apparatus"
"The Scissor Sisters", "Scissor Sisters"
"The Spice Girls", "Spice Girls"
"The Sunday's", "The Sundays"
"The Talking Heads", "Talking Heads"
"The Tedeschi Trucks Band", "Tedeschi Trucks Band"
"The Villagers", "Villagers"
"The Wailin’ Jennys", "The Wailin' Jennys"
90 lines processed, total 5936, 960 of 6896 remaining
2023-05-20 17:19

In [69]:
# most variations
z = pd.DataFrame(zip(artist_map.keys(), artist_map.values()))
z.groupby(1).count().reset_index().sort_values([0], ascending=False).head(20)

Unnamed: 0,1,0
165,"Crosby, Stills & Nash",13
113,Camille Saint-Saëns,8
545,Ludovico Einaudi,7
297,Florence and The Machine,7
459,Johann Sebastian Bach,7
366,Guns N' Roses,6
421,Israel Kamakawiwo'ole,6
817,Sigur Rós,6
516,Led Zeppelin,5
792,Sarah McLachlan,5


In [51]:
print(artist_map)

{'$not': 'Snot', 'uicideboy': 'Suicideboys', '(G)-IDLE': '(G)I-DLE', '*nsync': 'NSYNC', '1': 'One', '1975': 'The 1975', '2 cellos': '2Cellos', '2Pac': 'Tupac Shakur', '2pac': 'Tupac Shakur', '3': 'Three', '30 seconds to mars': 'Thirty Seconds to Mars', '3rd secret': 'Third Secret', '411': 'Four One One', '42 dougg and lil baby': '42 Dugg and Lil Baby', '5sos': '5 Seconds of Summer', '8485': '88rising', '88lien': '88rising', 'A R Rahman': 'A. R. Rahman', 'A Silver Mt. Zion': 'Thee Silver Mt. Zion Memorial Orchestra', 'A Touch of Class aka ATC': 'ATC', 'A$AP Rocky': 'ASAP Rocky', 'A.R. Rahman': 'A. R. Rahman', 'A.R.Rehman': 'A. R. Rahman', 'ALanis Morisette': 'Alanis Morissette', 'APC': 'A Perfect Circle', 'AR Rehman': 'A. R. Rahman', 'ASMZ': 'Thee Silver Mt. Zion Memorial Orchestra', 'Aaron Copeland': 'Aaron Copland', 'Ab soul': 'Ab-Soul', 'Abel Korzeniowsku': 'Abel Korzeniowski', 'Above and Beyond': 'Above & Beyond', 'Acker Bill': 'Acker Bilk', 'Adam Baldych and Yaron Herman': 'Adam Ba

In [72]:
# check the map for reasonableness
# it does pretty smart stuff like map nin to Nine Inch Nails 
# but if it screws up that artist probably won't show up in spotify
df['artist2'] = df['artist'].apply(lambda s: artist_map[s] if s in artist_map else s)

In [73]:
df.loc[df['artist'] != df['artist2']]

Unnamed: 0,post_id,post_score,artist,track,artist2
20582,jhcc2q8,5014,Israel Kamakawiwoʻole,Over the Rainbow,Israel Kamakawiwo'ole
20571,jhbm8ne,2461,Sigur Ros,Hoppipolla,Sigur Rós
20590,jhc4ym3,1372,Ben E. King,Stand By Me,Ben E King
20008,jhc1tl5,832,Israel Kamakawiwoʻole,Somewhere Over the Rainbow/Wonderful World,Israel Kamakawiwo'ole
20607,jhc8m5y,553,The Dream Academy,Life in a Northern Town,Dream Academy
...,...,...,...,...,...
3174,jhhhfrx,1,Ziggy Albert,Simple things,Ziggy Alberts
3176,jhhoaxo,1,Israel Kamakawiwoʻole,Somewhere Over The Rainbow/What A Wonderful World,Israel Kamakawiwo'ole
3155,jhd30g7,1,Tobias Jesso Jr,True Love,Tobias Jesso Junior
3146,jhd1ecx,1,City and Color,Woman,City and Colour


In [74]:
# apply the map
df['artist'] = df['artist'].apply(lambda s: artist_map[s] if s in artist_map else s)

In [None]:
# run again if desired

In [93]:
artist_df = df[['artist','track']].groupby('artist') \
    .count() \
    .sort_values('track', ascending=False) \
    .reset_index()
artist_df = artist_df.drop(artist_df.loc[artist_df['artist'].str.lower().str.startswith('unknown')].index)
artist_df = artist_df.drop(artist_df.loc[artist_df['artist'].str.lower().str.startswith('various')].index)
artist_df = artist_df.drop(artist_df.loc[artist_df['artist']==''].index)

artist_df

Unnamed: 0,artist,track
2,The Beatles,335
3,Radiohead,216
4,Simon & Garfunkel,203
6,Fleetwood Mac,150
7,Pink Floyd,138
...,...,...
6081,Jay Foreman,1
6082,Jay Chou,1
6083,Jax,1
6084,Javier Solís,1


# Dedupe with pandas_dedupe

In [237]:
def fix_leading_trailing(s):
    """First and last should be alphanumeric"""
    # regex prob better if re.match('^\W+(.*)\W+$',playerName): 

    while len(s) >= 2 and (not s[0].isalnum()) and s[0] == s[-1]:
        s = s[1:-1]
            
    return s.lower().strip()


In [219]:
df[['artist', 'artist2', 'post_score']] \
    .groupby('artist') \
    .agg( \
         count=('post_score', 'count'), \
        ) \
    .reset_index() \
    .sort_values('count', ascending=False) 

Unnamed: 0,artist,count
5800,The Beatles,97
4861,Radiohead,65
4718,Pink Floyd,51
5369,Simon & Garfunkel,45
1167,Chopin,44
...,...,...
2755,James Blake & Bon Iver,1
2752,James,1
2751,Jal,1
2750,Jake Xerxes Fussell,1


In [238]:
df['artist_dedupe'] = df['artist'].apply(fix_leading_trailing)
df = df.drop(df.loc[df['artist_dedupe'].str.startswith('unknown')].index)
df = df.drop(df.loc[df['artist_dedupe'].str.startswith('various')].index)
df = df.drop(df.loc[df['artist_dedupe']=='none'].index)
df = df.drop(df.loc[df['artist_dedupe']==''].index)
df = df.drop(df.loc[df['artist_dedupe']=='post_score'].index)


In [239]:
df['artist_dedupe'] = df['artist_dedupe'].apply(lambda s: s[4:] if s[:4]=='the ' else s)

df.loc[df['artist_dedupe']=='band', 'artist_dedupe']='the band'



In [240]:
dedupe_df = df[['artist', 'artist_dedupe', 'post_score']] \
    .groupby(['artist', 'artist_dedupe']) \
    .count() \
    .sort_values('post_score', ascending=False) \
    .reset_index() \
    .reset_index() 

dedupe_df


Unnamed: 0,index,artist,artist_dedupe,post_score
0,0,The Beatles,beatles,104
1,1,Radiohead & Hans Zimmer,radiohead & hans zimmer,69
2,2,Sigur Rós,sigur rós,64
3,3,Lana Del Rey,lana del rey,56
4,4,Pink Floyd,pink floyd,53
...,...,...,...,...
6225,6225,Jamie Duffy,jamie duffy,1
6226,6226,Jamestown,jamestown,1
6227,6227,James Vincent mcmorrow,james vincent mcmorrow,1
6228,6228,James Vincent McMarrow,james vincent mcmarrow,1


In [241]:
# reset dedupe learned settings
# !rm dedupe_dataframe_learned_settings 
# !rm dedupe_dataframe_training.json   
dedupe_df2 = pandas_dedupe.dedupe_dataframe(dedupe_df, ['artist_dedupe'])


Importing data ...
Reading from dedupe_dataframe_learned_settings
Clustering...


  dedupe_df2 = pandas_dedupe.dedupe_dataframe(dedupe_df, ['artist_dedupe'])


# duplicate sets 6209


In [242]:
dedupe_df2

Unnamed: 0,index,artist,artist_dedupe,post_score,cluster id,confidence
0,0,the beatles,beatles,104,21,1.0
1,1,radiohead hans zimmer,radiohead hans zimmer,69,22,1.0
2,2,sigur ros,sigur ros,64,23,1.0
3,3,lana del rey,lana del rey,56,24,1.0
4,4,pink floyd,pink floyd,53,25,1.0
...,...,...,...,...,...,...
6225,6225,jamie duffy,jamie duffy,1,6204,1.0
6226,6226,jamestown,jamestown,1,6205,1.0
6227,6227,james vincent mcmorrow,james vincent mcmorrow,1,6206,1.0
6228,6228,james vincent mcmarrow,james vincent mcmarrow,1,6207,1.0


In [243]:
dedupe_df['cluster id'] = dedupe_df2['cluster id']
name2i = {a: i for i, a in zip(dedupe_df['cluster id'].tolist(), dedupe_df['artist_dedupe'].tolist())}
df['artist_index'] = df['artist_dedupe'].apply(lambda s: name2i[s])
df


Unnamed: 0,artist,track2,score,track,artist2,post_score,artist_dedupe,artist_index
0,Claude Debussy,claire de lune,13966,Claire de Lune,Claude Debussy,13966,claude debussy,89
1,Bruce Springsteen,born to run,7129,Born To Run,Bruce Springsteen,7129,bruce springsteen,205
2,Erik Satie,gymnopédies,7124,Gymnopédies,Erik Satie,7124,erik satie,61
3,Simon & Garfunkel,scarborough fair,6155,Scarborough Fair,Simon & Garfunkel,6155,simon & garfunkel,28
4,Neil Young,harvest moon,5328,Harvest Moon,Neil Young,5328,neil young,73
...,...,...,...,...,...,...,...,...
15444,Halsey,love you more,1,Love You More,Halsey,1,halsey,1393
15445,Hamid Al Shaeri,ayonha,1,ayonha,Hamid Al Shaeri,1,hamid al shaeri,5636
15446,Hamilton Leithauser,in a black out,1,In a Black out,Hamilton Leithauser,1,hamilton leithauser,5613
15447,Hammock,now and not yet,1,Now and Not Yet,Hammock,1,hammock,1477


In [244]:
df.loc[(df['artist_index'].isna())]
df.loc[(df['artist_index']==0)]

Unnamed: 0,artist,track2,score,track,artist2,post_score,artist_dedupe,artist_index
59,Simon and G,the boxer,599,The Boxer,Simon and Garfunkel,599,simon and g,0
163,Simon and G,the 59th street bridge song,151,The 59th Street Bridge Song,Simon and Garfunkel,151,simon and g,0
180,Simon and G,bridge over troubled water,125,Bridge Over Troubled Water,Simon and Garfunkel,125,simon and g,0
181,Simon and G,the sound of silence,124,The Sound of Silence,Simon and Garfunkel,124,simon and g,0
200,Simon and G,the only living boy in new york,93,The Only Living Boy in New York,Simon and Garfunkel,93,simon and g,0
245,Simon and G,april come she will,63,April Come She Will,Simon and Garfunkel,63,simon and g,0
368,Simon and G,scarborough fair/canticle,37,Scarborough Fair/Canticle,Simon and Garfunkel,37,simon and g,0
430,Simon and G,feeling groovy,27,Feeling Groovy,Simon and Garfunkel,27,simon and g,0
623,Simon and G,scarborough fair,17,Scarborough Fair,Simon and Garfunkel,17,simon and g,0
721,Simon and G,kathy’s song,14,Kathy’s Song,Simon and Garfunkel,14,simon and g,0


In [245]:
dedupe_df

Unnamed: 0,index,artist,artist_dedupe,post_score,cluster id
0,0,The Beatles,beatles,104,21
1,1,Radiohead & Hans Zimmer,radiohead & hans zimmer,69,22
2,2,Sigur Rós,sigur rós,64,23
3,3,Lana Del Rey,lana del rey,56,24
4,4,Pink Floyd,pink floyd,53,25
...,...,...,...,...,...
6225,6225,Jamie Duffy,jamie duffy,1,6204
6226,6226,Jamestown,jamestown,1,6205
6227,6227,James Vincent mcmorrow,james vincent mcmorrow,1,6206
6228,6228,James Vincent McMarrow,james vincent mcmarrow,1,6207


In [246]:
df.loc[(df['artist_index'].isna())]

Unnamed: 0,artist,track2,score,track,artist2,post_score,artist_dedupe,artist_index


In [247]:
# map to artist 
tempdf = dedupe_df[['artist_dedupe', 'artist', 'cluster id', 'post_score']] \
    .groupby(['artist_dedupe', 'cluster id']) \
    .agg( \
         count=('post_score', 'count'), \
         artist=('artist', 'first') \
        ) \
    .reset_index() \
    .sort_values('count', ascending=False) 
tempdf

Unnamed: 0,artist_dedupe,cluster id,count,artist
0,$not,2492,1,$not
4138,my useless life,2096,1,My Useless Life
4160,nameless song,2377,1,Nameless Song
4159,name taken,2380,1,Name Taken
4158,nalin and kane,2381,1,Nalin and Kane
...,...,...,...,...
2073,francoise hardy,886,1,Francoise Hardy
2072,franck keller & ygal amar,5785,1,Franck Keller & Ygal Amar
2071,francisco tarrega,710,1,Francisco Tarrega
2070,francisca valenzuela,5786,1,Francisca Valenzuela


In [248]:
i2name = {i: a for i, a in zip(tempdf['cluster id'].tolist(), tempdf['artist'].tolist())}
df['artist'] = df.apply(lambda r: i2name[r.artist_index], axis=1)
df

Unnamed: 0,artist,track2,score,track,artist2,post_score,artist_dedupe,artist_index
0,Claude Debussy,claire de lune,13966,Claire de Lune,Claude Debussy,13966,claude debussy,89
1,Bruce Springsteen,born to run,7129,Born To Run,Bruce Springsteen,7129,bruce springsteen,205
2,Erik Satie,gymnopédies,7124,Gymnopédies,Erik Satie,7124,erik satie,61
3,Simon & Garfunkel,scarborough fair,6155,Scarborough Fair,Simon & Garfunkel,6155,simon & garfunkel,28
4,Neil Young,harvest moon,5328,Harvest Moon,Neil Young,5328,neil young,73
...,...,...,...,...,...,...,...,...
15444,Halsey,love you more,1,Love You More,Halsey,1,halsey,1393
15445,Hamid Al Shaeri,ayonha,1,ayonha,Hamid Al Shaeri,1,hamid al shaeri,5636
15446,Hamilton Leithauser,in a black out,1,In a Black out,Hamilton Leithauser,1,hamilton leithauser,5613
15447,Hammock,now and not yet,1,Now and Not Yet,Hammock,1,hammock,1477


In [249]:
df.loc[df['artist'].str.find('carp') >=0]

Unnamed: 0,artist,track2,score,track,artist2,post_score,artist_dedupe,artist_index


In [250]:
df.groupby('track') \
    .count() \
    .reset_index() \
    .sort_values('artist', ascending=False) \
    .head(20)


Unnamed: 0,track,artist,track2,score,artist2,post_score,artist_dedupe,artist_index
3614,Hallelujah,24,24,24,24,24,24,24
884,Ave Maria,21,21,21,21,21,21,21
1382,Breathe,14,14,14,14,14,14,14
17,(unknown),13,13,13,13,13,13,13
6039,Moon River,12,12,12,12,12,12,12
631,Angel,12,12,12,12,12,12,12
3777,Helplessly Hoping,12,12,12,12,12,12,12
3907,Home,11,11,11,11,11,11,11
8408,Somewhere Over the Rainbow,11,11,11,11,11,11,11
5647,Lullaby,11,11,11,11,11,11,11


In [251]:
df

Unnamed: 0,artist,track2,score,track,artist2,post_score,artist_dedupe,artist_index
0,Claude Debussy,claire de lune,13966,Claire de Lune,Claude Debussy,13966,claude debussy,89
1,Bruce Springsteen,born to run,7129,Born To Run,Bruce Springsteen,7129,bruce springsteen,205
2,Erik Satie,gymnopédies,7124,Gymnopédies,Erik Satie,7124,erik satie,61
3,Simon & Garfunkel,scarborough fair,6155,Scarborough Fair,Simon & Garfunkel,6155,simon & garfunkel,28
4,Neil Young,harvest moon,5328,Harvest Moon,Neil Young,5328,neil young,73
...,...,...,...,...,...,...,...,...
15444,Halsey,love you more,1,Love You More,Halsey,1,halsey,1393
15445,Hamid Al Shaeri,ayonha,1,ayonha,Hamid Al Shaeri,1,hamid al shaeri,5636
15446,Hamilton Leithauser,in a black out,1,In a Black out,Hamilton Leithauser,1,hamilton leithauser,5613
15447,Hammock,now and not yet,1,Now and Not Yet,Hammock,1,hammock,1477


In [252]:
df['track2'] = df['track'].str.lower()


In [253]:
df = df.drop(df.loc[df['track2'].str.startswith('unknown')].index)
df = df.drop(df.loc[df['track2']=='cover'].index)
df = df.drop(df.loc[df['track2']=='version'].index)
df = df.drop(df.loc[df['track2']=='anything'].index)
df = df.drop(df.loc[df['track2']=='none'].index)
df = df.drop(df.loc[df['track2']==''].index)
df = df.drop(df.loc[df['track2'].str.startswith('no track')].index)
df = df.drop(df.loc[df['track2'].str.startswith('no artist')].index)
df = df.drop(df.loc[df['track2'].str.startswith('various')].index)


In [263]:
df.loc[df['artist']=='Fleetwood', 'artist']='Fleetwood Mac'
df.loc[df['track']=='Claire de Lune', 'track']='Clair de Lune'


In [267]:
df = df[['artist', 'track', 'post_score', 'track2']] \
    .groupby(['artist', 'track2']) \
    .agg(sum= ('post_score', 'sum'), \
         track= ('track', 'first') \
        ) \
    .reset_index() \
    .sort_values('sum', ascending=False) \
    .reset_index(drop=True) \
    .rename({'sum': 'score', axis=1})
df

Unnamed: 0,artist,track,score
2389,Claude Debussy,Clair de Lune,16850
1828,Bruce Springsteen,Born To Run,7129
3754,Erik Satie,Gymnopédies,7124
10429,Simon & Garfunkel,Scarborough Fair,6157
8264,Neil Young,Harvest Moon,5334
...,...,...,...
5324,Jada Facer,7 Minutes in Heaven,1
5325,Jada Facer,Creep,1
5327,Jaden,Ninety,1
5328,Jades Goudreault,Purple Circles,1


In [256]:
df.loc[df['artist']=='']

Unnamed: 0,artist,track2,score,track


In [69]:
# tempdf = df[['artist', 'post_score']] \
#     .groupby('artist') \
#     .sum() \
#     .reset_index() 

# tempdf.loc[tempdf['post_score']> 2].to_csv('x.csv', index=False)

In [258]:
df

Unnamed: 0,artist,track2,score,track
0,Claude Debussy,claire de lune,16850,Claire de Lune
1,Bruce Springsteen,born to run,7129,Born To Run
2,Erik Satie,gymnopédies,7124,Gymnopédies
3,Simon & Garfunkel,scarborough fair,6157,Scarborough Fair
4,Neil Young,harvest moon,5334,Harvest Moon
...,...,...,...,...
13390,Jada Facer,7 minutes in heaven,1,7 Minutes in Heaven
13391,Jada Facer,creep,1,Creep
13392,Jaden,ninety,1,Ninety
13393,Jades Goudreault,purple circles,1,Purple Circles


In [259]:
df = df[['artist', 'track', 'score']].groupby(["artist", "track"]) \
    .sum() \
    .reset_index() \
    .sort_values('score', ascending=False)

df.head(20)



Unnamed: 0,artist,track,score
2389,Claude Debussy,Claire de Lune,16850
1828,Bruce Springsteen,Born To Run,7129
3754,Erik Satie,Gymnopédies,7124
10429,Simon & Garfunkel,Scarborough Fair,6157
8264,Neil Young,Harvest Moon,5334
2386,Claude Debussy,Clair de Lune,5130
5217,Israel Kamakawiwoʻole,Over the Rainbow,5075
11439,The Cranberries,Dreams,4408
7672,Mazzy Starr,Fade into you,3982
11298,The Beatles,In my Life,3954


## Filter by minimum score


In [274]:
df = df.loc[df['score'] >4]
df

Unnamed: 0,artist,track,score
0,Claude Debussy,Clair de Lune,21980
1,Bruce Springsteen,Born To Run,7129
2,Erik Satie,Gymnopédies,7124
3,Simon & Garfunkel,Scarborough Fair,6157
4,Neil Young,Harvest Moon,5334
...,...,...,...
1447,Keane,Russian Farmers Song,5
1448,Death Cab,What a Wonderful World,5
1449,Dethklok,Detharmonic,5
1450,Katamari Damacy,Cherry blossom color season,5


In [275]:
df.to_csv('silver.csv', index=False)
with open('silver.pkl', 'wb') as f:
    pickle.dump(df, f)


In [276]:
with open('silver.pkl', 'rb') as f:
    df = pickle.load(f)
df

Unnamed: 0,artist,track,score
0,Claude Debussy,Clair de Lune,21980
1,Bruce Springsteen,Born To Run,7129
2,Erik Satie,Gymnopédies,7124
3,Simon & Garfunkel,Scarborough Fair,6157
4,Neil Young,Harvest Moon,5334
...,...,...,...
1447,Keane,Russian Farmers Song,5
1448,Death Cab,What a Wonderful World,5
1449,Dethklok,Detharmonic,5
1450,Katamari Damacy,Cherry blossom color season,5


# Load into a Spotify playlist


In [277]:
# log in
client_credentials_manager = SpotifyClientCredentials(client_id=os.getenv('SPOTIFY_CLIENT_ID'), 
                                                      client_secret=os.getenv('SPOTIFY_CLIENT_SECRET'),
                                                      )

sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


In [305]:
df = pd.read_csv("silver.csv")
df = df.groupby(['artist','track']).sum().reset_index().sort_values('score', ascending=False)
df.to_csv('silver.csv', index=False)


In [293]:
# check artists
# update artists to spotify canonical name as necessary

df = pd.read_csv("silver.csv")
df = df.groupby(['artist','track']).sum().reset_index().sort_values('score', ascending=False)

dedupe = {}
fail_list = []
artist_map = {}
for index, artist, title, score in df.itertuples():
    artist = str(artist)
    if artist in dedupe:
        continue
    dedupe[artist]=1
    query_str = 'artist:%s' % (artist)
    artist_results = sp.search(q=query_str, type='artist', limit=3, offset=0, market='US')
    artist_names = [artist['name'] for artist in artist_results['artists']['items']]
    if artist_names:
        if artist.lower() != artist_names[0].lower():
            artist_map[artist] = artist_names[0]
            print(artist, '->', artist_names[0])
    else:
        fail_list.append((artist, title))
        print("not found:", artist, "-", title)

# then clean up manually as appropriate

The Band -> The Band CAMINO
1 -> One Direction
not found: Harry Waters Jr., Marvin Berry, and Starlighters - Earth Angel (Will You Be Mine)
America -> The All-American Rejects
Seal -> Seals and Crofts
not found: Iron & Wine x Calexico - The Trapeze Swinger
not found: Rufus Wainwright feat. PSP - Hallelujah
not found: Des'ree - Kissing You
not found: Frank Ocean / James Blake - pink + white
Acoustic -> Acoustic Alchemy
Al Stewart -> Alexander Stewart
a-ha -> Daryl Hall & John Oates
Alison -> Alison Krauss
Phil -> Phil Collins
not found: Paul McCartney and Wings - Here Today
La La Land Soundtrack -> LAND Soundtrack
Jewel -> Run The Jewels
The La’s -> The Kid LAROI
Rainbow -> Rainbow Kitten Surprise
Meatloaf -> meatloafi
Frente -> Frente Cumbiero
not found: Fellowship of the Ring - Passing of the Elves
not found: Guitars and Dragons - Concerning Hobbits
The Promise -> Lukas Nelson and Promise of the Real
not found: college a capella - Landslide
Dallas Green -> Jimmy Carter and Dallas Coun

In [287]:
artist_map

{'Israel Kamakawiwoʻole': "Israel Kamakawiwo'ole",
 'Edith Piaf': 'Édith Piaf',
 'Righteous Brother’s': 'The Righteous Brothers',
 'Joni Mitchel': 'Joni Mitchell',
 'Death Cab': 'Death Cab for Cutie',
 'Goo Goo Dolls': 'The Goo Goo Dolls',
 'Pachelbel': 'Johann Pachelbel',
 'Sinead O’Conner': 'Sinead O Conner',
 'The Band': 'The Band CAMINO',
 'Emerson, Lake, and Palmer': 'Emerson, Lake & Palmer',
 '1': 'One Direction',
 'Queensryche': 'Queensrÿche',
 'Crosby, Stills, Nash': 'Crosby, Stills & Nash',
 'Beethoven': 'Ludwig van Beethoven',
 'America': 'The All-American Rejects',
 "Sinead O'Connor": "Sinéad O'Connor",
 'Seal': 'Seals and Crofts',
 'Yaz': 'Yazoo',
 'Israel Kamakawiwo’ole': "Israel Kamakawiwo'ole",
 "Cat Steven's": 'Yusuf / Cat Stevens',
 "The Sunday's": 'The Sundays',
 'Sophie Hawkins': 'Sophie B. Hawkins',
 'Heart': 'Tom Petty and the Heartbreakers',
 'Puccini': 'Giacomo Puccini',
 'Selena': 'Selena Gomez',
 'K. D. Lang': 'k.d. lang',
 'The Mamas and the Papa': 'The Mamas 

In [289]:
ignore_list = [ 'Vince',     #Vince Staples',
 'Eileen',     #Eileen Walker',
 'IZ',     #Izzamuzzic',
 'Choir Choir Choir!',     #Mav City Gospel Choir',
 'LP',     #LP Giobbi',
 'The Philadelphia Orchestra',     #The Philadelphia Virtuosi Chamber Orchestra',
 'Live',     #DPR LIVE',
 'South Park',     #South Park Mexican',
 'Nico',     #Nicki Nicole',
 'CSNY',     #Csnyee_',
 'Brian Wilson',     #Brian Courtney Wilson',
 'Adeem',     #Adeem the Artist',
 'Train',     #Meghan Trainor',
 'MCR',     #Tate McRae',
 'Berlin',     #Berliner Philharmoniker',
 'Pink',     #PinkPantheress',
 'Múm',     #Mumford & Sons',
 'The Band',     #The Band CAMINO',
 '1',     #One Direction',
 'America',     #The All-American Rejects',
 'Seal',     #Seals and Crofts',
 'Acoustic',     #Acoustic Alchemy',
 'Al Stewart',     #Alexander Stewart',
 'a-ha',     #Daryl Hall & John Oates',
 'Alison',     #Alison Krauss',
 'Phil',     #Phil Collins',
 'La La Land Soundtrack',     #LAND Soundtrack',
 'Jewel',     #Run The Jewels',
 'The La’s',     #The Kid LAROI',
 'Rainbow',     #Rainbow Kitten Surprise',
 'Meatloaf',     #meatloafi',
 'Frente',     #Frente Cumbiero',
 'The Promise',     #Lukas Nelson and Promise of the Real',
 'Dallas Green',     #Jimmy Carter and Dallas County Green',
 'Arrow',     #Arrows in Action',
 'Eric Johnson',     #Eric D. Johnson',
 'Dixie Chicks',     #Karaoke - Dixie Chicks',
 'Death',     #Five Finger Death Punch',

]

for k in ignore_list:
    try:
        print(k, artist_map.get(k))
        artist_map.pop(k)
    except:
        print('error', k)
        pass





Vince Vince Staples
Eileen Eileen Walker
IZ Izzamuzzic
Choir Choir Choir! Mav City Gospel Choir
LP LP Giobbi
The Philadelphia Orchestra The Philadelphia Virtuosi Chamber Orchestra
Live DPR LIVE
South Park South Park Mexican
Nico Nicki Nicole
CSNY Csnyee_
Brian Wilson Brian Courtney Wilson
Adeem Adeem the Artist
Train Meghan Trainor
MCR Tate McRae
Berlin Berliner Philharmoniker
Pink PinkPantheress
Múm Mumford & Sons
The Band The Band CAMINO
1 One Direction
America The All-American Rejects
Seal Seals and Crofts
Acoustic Acoustic Alchemy
Al Stewart Alexander Stewart
a-ha Daryl Hall & John Oates
Alison Alison Krauss
Phil Phil Collins
La La Land Soundtrack LAND Soundtrack
Jewel Run The Jewels
The La’s The Kid LAROI
Rainbow Rainbow Kitten Surprise
Meatloaf meatloafi
Frente Frente Cumbiero
The Promise Lukas Nelson and Promise of the Real
Dallas Green Jimmy Carter and Dallas County Green
Arrow Arrows in Action
Eric Johnson Eric D. Johnson
Dixie Chicks Karaoke - Dixie Chicks
Death Five Finger D

In [290]:
artist_map.get('Meatloaf')

In [291]:
df['artist'] = df['artist'].apply(lambda s: artist_map[s] if s in artist_map else s)
df.head(20)


Unnamed: 0,artist,track,score
249,Claude Debussy,Clair de Lune,21980
213,Bruce Springsteen,Born To Run,7129
386,Erik Satie,Gymnopédies,7124
1123,Simon & Garfunkel,Scarborough Fair,6175
859,Neil Young,Harvest Moon,5334
568,Israel Kamakawiwo'ole,Over the Rainbow,5075
1273,The Cranberries,Dreams,4408
815,Mazzy Star,Fade into you,3982
1246,The Beatles,In my Life,3954
1215,The Beach Boys,God Only Knows,3678


In [292]:
df.to_csv('silver.csv', index=False)


In [306]:
# check tracks
# possibly update tracks to spotify canonical name

df = pd.read_csv("silver.csv")

dedupe = {}
mylist = []
fail_list = []
artist_list, track_list, uri_list, album_list, score_list = [], [], [], [], []
orig_artist, orig_track = [], []

for index, artist, title, score in df.itertuples():
    query_str = 'artist:%s track:%s' % (artist, title)
    track_results = sp.search(q=query_str, type='track', limit=1, offset=0, market='US')
    results = track_results['tracks']['items']
    
    if results:
        r = results[0]
        # failsafe to never put same track twice
        if dedupe.get(r['id']):
            continue
        dedupe[r['id']]=True
        if title.lower() != r['name'].lower():
            print ("%s|%s : %s|%s" % (artist, title, r['artists'][0]['name'], r['name']))
        uri_list.append(r['uri'])
        artist_list.append(r['artists'][0]['name'])
        track_list.append(r['name'])
        album_list.append(r['album']['name'])
        orig_artist.append(artist)
        orig_track.append(title)
        score_list.append(score)
#         print('  ',
#               r['artists'][0]['name'],'|',
#               r['name'], '|',
#               r['album']['name'],'|',
#               r['album']['release_date'],'|',
#               r['popularity'])
    else:
        fail_list.append((artist, title))
        print("not found:", artist, "-", title)
        

Claude Debussy|Clair de Lune : Claude Debussy|Suite bergamasque, L. 75: III. Clair de lune
Erik Satie|Gymnopédies : Erik Satie|3 Gymnopédies: No. 1 Lent et douloureux
Simon & Garfunkel|Scarborough Fair : Simon & Garfunkel|Scarborough Fair / Canticle
not found: Neil Young - Harvest Moon
The Beatles|In my Life : The Beatles|In My Life - Remastered 2009
The Beach Boys|God Only Knows : The Beach Boys|God Only Knows - Mono
Sigur Rós|Hoppipolla : Sigur Rós|Hoppípolla
The Beatles|Blackbird : The Beatles|Blackbird - Remastered 2009
John Denver|Annie’s Song : John Denver|Annie's Song
Jim Croce|I Got a Name : Jim Croce|I'll Have To Say I Love You In A Song
Samuel Barber|Adagio for Strings : Samuel Barber|Barber: Adagio for Strings
Israel Kamakawiwo'ole|Somewhere Over the Rainbow/Wonderful World : Israel Kamakawiwo'ole|Somewhere Over The Rainbow_What A Wonderful World
Peter Gabriel|In Your Eyes : Peter Gabriel|In Your Eyes - 2012 Remaster
not found: Hans Zimmer - Interstellar Intro
Jim Croce|Oper

not found: Les Misérables Cast - I dreamed a dream
not found: Rufus Wainwright feat. PSP - Hallelujah
Carpenters|We've only just begun : Carpenters|We’ve Only Just Begun
The Beatles|A Day In The Life : The Beatles|A Day In The Life - Remastered 2009
The Smashing Pumpkins|1979 : The Smashing Pumpkins|1979 - Remastered 2012
not found: Des'ree - Kissing You
not found: Johann Pachelbel - Canon in 7 versions
The Beatles|Across the universe : The Beatles|Across The Universe - Remastered 2009
Camille Saint-Saëns|The Swan : Camille Saint-Saëns|The Carnival of the Animals, R. 125: XIII. The Swan (Arr. for Cello and Piano)
Antonio Vivaldi|Vivaldis Four : Antonio Vivaldi|Vivaldi, De Courson & Le Berre: Eirin Sonata (After Vivaldi's Cello Concerto, RV 407)
Aphex Twin|Rhubarb : Aphex Twin|Donkey Rhubarb
Claude Debussy|Arabesque No. 1 : Claude Debussy|2 Arabesques, L. 66: No. 1 in E Major
Led Zeppelin|Going to California : Led Zeppelin|Going to California - Remaster
Hozier|Cherry Wine : Hozier|Cherr

not found: Neil Young - Old Man
The Beatles|Golden Slumbers : The Beatles|Golden Slumbers - Remastered 2009
The Beatles|Golden Slumbers/Carry That Weight/The End : The Beatles|The Long One - Comprising of ‘You Never Give Me Your Money’, ’Sun King’/’Mean Mr Mustard’, ‘Her Majesty’, ‘Polythene Pam’/’She Came In Through The Bathroom Window’, ’Golden Slumbers’/ ’Carry That Weight’, ’The End’
The Beach Boys|In My Room : The Beach Boys|In My Room - Remastered
Bonnie Raitt|I can’t make you love me : Bonnie Raitt|I Can't Make You Love Me
Claude Debussy|La fille aux cheveux de lin : Claude Debussy|Préludes / Book 1, L. 117: 8. La fille aux cheveux de lin
Deadmau5|Strobe : deadmau5|Strobe - Radio Edit
not found: Fellowship of the Ring - Passing of the Elves
Joaquín Rodrigo|Concerto de Aranjuez : Joaquín Rodrigo|Aranjuez Concerto BWV 1056
not found: Jimmy Eat World - May Angels You In
Simon & Garfunkel|Bookends : Simon & Garfunkel|Bookends Theme - Reprise
Tomaso Albinoni|Adagio : Tomaso Albinoni|

Franz Liszt|Liebestraum : Franz Liszt|Liebestraum No. 3 in A-Flat Major, S. 541 / 3
not found: Les Misérables Cast - Bring Him Home
not found: Missy Elliott - Angeles
Morrissey|Everyday is like Sunday : Morrissey|Everyday Is Like Sunday - 2011 Remaster
not found: Ella Fitzgerald - Stars over Alabama
Pyotr Ilyich Tchaikovsky|Waltz of the Flowers : Pyotr Ilyich Tchaikovsky|Tchaikovsky: The Nutcracker, Op. 71, Act II: No. 13, Waltz of the Flowers
not found: Bon Iver - Roslyn
John Prine|In Spite of Ourselves : John Prine|In Spite of Ourselves (feat. Iris DeMent)
John Williams|Across the Stars : John Williams|Across the Stars (Love Theme from "Star Wars: Attack of the Clones")
Jeff Buckley|Calling You : Jeff Buckley|Calling You - Live at Sin-é, New York, NY - July/August 1993
not found: Jim Croce - Croce Plays Croce
Black Sabbath|Planet Caravan : Black Sabbath|Planet Caravan - 2012 - Remaster
Berlin|Take My Breath Away : Berlin|Take My Breath Away - Love Theme from "Top Gun"
Pyotr Ilyich Tc

## Save gold.csv


In [307]:

gold_df = pd.DataFrame({'score': score_list,
                        'input_artist': orig_artist,
                        'artist': artist_list,
                        'input_track': orig_track,
                        'track': track_list,
                        'album': album_list,
                        'uri': uri_list})

with pd.option_context("display.max_rows", 9999):
    display(gold_df)



Unnamed: 0,score,input_artist,artist,input_track,track,album,uri
0,21997,Claude Debussy,Claude Debussy,Clair de Lune,"Suite bergamasque, L. 75: III. Clair de lune","Debussy: Suite bergamasque, L. 75, 3. Clair de...",spotify:track:1cmigB9I6IRpFqjIbzvSQB
1,7129,Bruce Springsteen,Bruce Springsteen,Born To Run,Born to Run,Born To Run,spotify:track:6hTcuIQa0sxrrByu9wTD7s
2,7124,Erik Satie,Erik Satie,Gymnopédies,3 Gymnopédies: No. 1 Lent et douloureux,Satie: The Magic of Satie,spotify:track:7kTVe6XhIveidvkt8nb7jK
3,6175,Simon & Garfunkel,Simon & Garfunkel,Scarborough Fair,Scarborough Fair / Canticle,"Parsley, Sage, Rosemary And Thyme",spotify:track:3g2fYZW5v2od8KIF7VktT0
4,5075,Israel Kamakawiwo'ole,Israel Kamakawiwo'ole,Over the Rainbow,Over the Rainbow,Alone In Iz World,spotify:track:3oQomOPRNQ5NVFUmLJHbAV
5,4408,The Cranberries,The Cranberries,Dreams,Dreams,"Everybody Else Is Doing It, So Why Can't We?",spotify:track:4JGKZS7h4Qa16gOU3oNETV
6,3982,Mazzy Star,Mazzy Star,Fade into you,Fade Into You,So Tonight That I Might See,spotify:track:1LzNfuep1bnAUR9skqdHCK
7,3954,The Beatles,The Beatles,In my Life,In My Life - Remastered 2009,Rubber Soul (Remastered),spotify:track:3KfbEIOC7YIv90FIfNSZpo
8,3678,The Beach Boys,The Beach Boys,God Only Knows,God Only Knows - Mono,Pet Sounds (Original Mono & Stereo Mix),spotify:track:6iGU74CwXuT4XVepjc9Emf
9,3383,Don McLean,Don McLean,"Vincent (Starry, Starry Night)","Vincent (Starry, Starry Night)",Rearview Mirror: An American Musical Journey,spotify:track:2YDyH60Vro33KkDtNZCXIk


In [308]:
# inspect where the track name differs
with pd.option_context("display.max_rows", 999):
    display(gold_df.loc[gold_df['input_artist'].str.lower().str[:8] != gold_df['artist'].str.lower().str[:8]])
    

Unnamed: 0,score,input_artist,artist,input_track,track,album,uri
68,466,Sinead O'Connor,Sinéad O'Connor,Nothing Compares 2 U,Nothing Compares 2 U,I Do Not Want What I Haven't Got,spotify:track:5GHY1DFWKz3Prg2V0Iodqo
90,315,The Eagles,The Illegal Eagles,Hotel California,Hotel California,Live & Symphonic,spotify:track:5Viurwg2LYZjky18hqJN7J
92,311,The Verve,The Cover Crew,Bittersweet Symphony,Bittersweet Symphony (Acoustic Version) [The V...,"Acoustified Hits, Vol. 2",spotify:track:714doH50K9qbrE9py6izzV
362,23,Alison,Alison Krauss,Baby Mine,Baby Mine,A Hundred Miles Or More: A Collection,spotify:track:7GJ8aXAGdfjo2OAh6XreB6
461,16,The La’s,The La's,There She Goes,There She Goes,The La's,spotify:track:0SMkzFGJOBFDI9KfYD55L0
462,16,The Beatles,"Peel, David & The Apple Band",Abbey Road,The wonderful world of Abbey Road,Bring Back The Beatles,spotify:track:6zZU0UM3IWCGqUnEJBI8cT
491,15,Björk,Björkliden,Yoga,Yoga Nidra,Meditation Spa yoga healing therapy relaxation...,spotify:track:3AdIPVFDkqSSvBjKFcN0mr
539,13,Frente,Frente!,Bizarre Love Triangle,Bizarre Love Triangle - 2014 Remaster,Marvin The Album - 21st Anniversary Edition,spotify:track:7sgi66biRYpAXuRZJBDuli
610,11,The Promise,The Karaoke Channel,When in Rome,The Promise (In the Style of When in Rome) [Ka...,The Karaoke Channel - Sing the Promise Like Wh...,spotify:track:0DcP8Iqq4lYMsdzLHojSwY
646,10,Garth Brooks,Brandon Garth,The Dance,The Dance,The Best of Brooks,spotify:track:08MhzVza8Qc4M172KZ37JT


In [309]:
# these are songs that look like covers or otherwise not the expected response from spotify search 
# (which is a bit wonky, doesn't like quotes and such)
# remove from df and add manually
bad_lookups = [
     90, 92, 462, 491, 610,	 646, 646, 911, 944, 947,
    
]

for i in bad_lookups:
    print(gold_df.iloc[i])
    
# add manually, plus 'not found'


score                                            315
input_artist                              The Eagles
artist                            The Illegal Eagles
input_track                         Hotel California
track                               Hotel California
album                               Live & Symphonic
uri             spotify:track:5Viurwg2LYZjky18hqJN7J
Name: 90, dtype: object
score                                                         311
input_artist                                            The Verve
artist                                             The Cover Crew
input_track                                  Bittersweet Symphony
track           Bittersweet Symphony (Acoustic Version) [The V...
album                                    Acoustified Hits, Vol. 2
uri                          spotify:track:714doH50K9qbrE9py6izzV
Name: 92, dtype: object
score                                             16
input_artist                             The Beatles
artist       

In [310]:
gold_df = gold_df.drop(
    axis='index',
    labels=bad_lookups)


In [311]:
gold_df

Unnamed: 0,score,input_artist,artist,input_track,track,album,uri
0,21997,Claude Debussy,Claude Debussy,Clair de Lune,"Suite bergamasque, L. 75: III. Clair de lune","Debussy: Suite bergamasque, L. 75, 3. Clair de...",spotify:track:1cmigB9I6IRpFqjIbzvSQB
1,7129,Bruce Springsteen,Bruce Springsteen,Born To Run,Born to Run,Born To Run,spotify:track:6hTcuIQa0sxrrByu9wTD7s
2,7124,Erik Satie,Erik Satie,Gymnopédies,3 Gymnopédies: No. 1 Lent et douloureux,Satie: The Magic of Satie,spotify:track:7kTVe6XhIveidvkt8nb7jK
3,6175,Simon & Garfunkel,Simon & Garfunkel,Scarborough Fair,Scarborough Fair / Canticle,"Parsley, Sage, Rosemary And Thyme",spotify:track:3g2fYZW5v2od8KIF7VktT0
4,5075,Israel Kamakawiwo'ole,Israel Kamakawiwo'ole,Over the Rainbow,Over the Rainbow,Alone In Iz World,spotify:track:3oQomOPRNQ5NVFUmLJHbAV
...,...,...,...,...,...,...,...
985,6,Billy Joel,Billy Joel,She’s Always A Woman,She's Always a Woman,The Stranger (Legacy Edition),spotify:track:5RgFlk1fcClZd0Y4SGYhqH
986,6,Jimi Hendrix,Jimi Hendrix,All Along the Watchtower,All Along the Watchtower,Electric Ladyland,spotify:track:2aoo2jlRnM3A0NyLQqMN2f
987,6,The National,The National,Light Years,Light Years,I Am Easy to Find,spotify:track:2GNj9KRwpxBWgEiPQc3jEj
988,6,Henry Mancini,Henry Mancini,Lujon,Lujon,Mr. Lucky Goes Latin,spotify:track:37N37WJQvXqplFdCwkNgX3


In [312]:
# this you could upload and make a new playlist
# my playlist is result of multiple iterations

gold_df[['artist', 'track', 'score']].to_csv('gold.csv', index=False)

with pd.option_context("display.max_rows", 999):
    display(gold_df)

Unnamed: 0,score,input_artist,artist,input_track,track,album,uri
0,21997,Claude Debussy,Claude Debussy,Clair de Lune,"Suite bergamasque, L. 75: III. Clair de lune","Debussy: Suite bergamasque, L. 75, 3. Clair de...",spotify:track:1cmigB9I6IRpFqjIbzvSQB
1,7129,Bruce Springsteen,Bruce Springsteen,Born To Run,Born to Run,Born To Run,spotify:track:6hTcuIQa0sxrrByu9wTD7s
2,7124,Erik Satie,Erik Satie,Gymnopédies,3 Gymnopédies: No. 1 Lent et douloureux,Satie: The Magic of Satie,spotify:track:7kTVe6XhIveidvkt8nb7jK
3,6175,Simon & Garfunkel,Simon & Garfunkel,Scarborough Fair,Scarborough Fair / Canticle,"Parsley, Sage, Rosemary And Thyme",spotify:track:3g2fYZW5v2od8KIF7VktT0
4,5075,Israel Kamakawiwo'ole,Israel Kamakawiwo'ole,Over the Rainbow,Over the Rainbow,Alone In Iz World,spotify:track:3oQomOPRNQ5NVFUmLJHbAV
5,4408,The Cranberries,The Cranberries,Dreams,Dreams,"Everybody Else Is Doing It, So Why Can't We?",spotify:track:4JGKZS7h4Qa16gOU3oNETV
6,3982,Mazzy Star,Mazzy Star,Fade into you,Fade Into You,So Tonight That I Might See,spotify:track:1LzNfuep1bnAUR9skqdHCK
7,3954,The Beatles,The Beatles,In my Life,In My Life - Remastered 2009,Rubber Soul (Remastered),spotify:track:3KfbEIOC7YIv90FIfNSZpo
8,3678,The Beach Boys,The Beach Boys,God Only Knows,God Only Knows - Mono,Pet Sounds (Original Mono & Stereo Mix),spotify:track:6iGU74CwXuT4XVepjc9Emf
9,3383,Don McLean,Don McLean,"Vincent (Starry, Starry Night)","Vincent (Starry, Starry Night)",Rearview Mirror: An American Musical Journey,spotify:track:2YDyH60Vro33KkDtNZCXIk


# Get Spotify playlist and add songs

In [316]:
# get playlist id
# first create a playlist in Spotify UI to load songs
playlists = sp.user_playlists(os.getenv('SPOTIFY_USERNAME'))
while playlists:
    for i, playlist in enumerate(playlists['items']):
        if playlist['name'] != 'Reddit Prettiest Songs':
            continue
        print(playlist['id'])
        playlist_id = playlist['id']
        print("%4d %s %s" % (i + 1 + playlists['offset'], playlist['uri'],  playlist['name']))
    if playlists['next']:
        playlists = sp.next(playlists)
    else:
        playlists = None
        

08YFkbtTV6GBfNtjJ4PHDu
   2 spotify:playlist:08YFkbtTV6GBfNtjJ4PHDu Reddit Prettiest Songs


In [314]:
# must follow an oauth workflow to write a playlist in Spotify
# running this cell should request a spotify login and then redirect to an url
# paste whole url with id into form to authenticate

scope = "playlist-modify-public"

sp = spotipy.Spotify(auth_manager=spotipy.SpotifyOAuth(scope=scope,
                                                       client_id=os.getenv('SPOTIFY_CLIENT_ID'),
                                                       client_secret=os.getenv('SPOTIFY_CLIENT_SECRET'),
                                                       redirect_uri="https://druce.ai"
                                                      ))


In [315]:
# add songs to playlist 

# addlist = gold_df['uri'].to_list()
# print (len(addlist))

# while(addlist):
#     sp.user_playlist_add_tracks(os.getenv('SPOTIFY_USERNAME'), 
#                                 playlist_id=playlist_id, 
#                                 tracks=addlist[-100:])
#     addlist = addlist[:-100]
#     print("added items, remaining ", len(addlist))


981
Enter the URL you were redirected to: https://druce.ai/?code=AQAK_Rvv-mF3fGdfdvUt1CW6KYBFDsOff9JrfElb8G3OByV4aCNe6JD_G7YyRv5GBR-e-lw6KNiAcnmwEVFfe_DaD7zBGsN-6RVNZUS4O_2f4-I92IIjVLkI_Nw1Yg7LKsIjIDK9yQqfbXCtwXxBPFq2wpi8jqWtnHnDvw6rONW35eTbkw7hAgWNPvs
added items, remaining  881
added items, remaining  781
added items, remaining  681
added items, remaining  581
added items, remaining  481
added items, remaining  381
added items, remaining  281
added items, remaining  181
added items, remaining  81
added items, remaining  0


In [None]:
# manually add the ones that weren't found for some reason


# Compare Spotify playlist to gold data
after initial population, we may want to run again and add new songs 


In [91]:
# compare to existing playlist
# can run again and add any new tracks, either because OpenAI is a bit random, or new replies in thread
results = sp.user_playlist(os.getenv('SPOTIFY_USERNAME'), playlist_id,
                                fields='tracks,next,name')
tracks = results['tracks']

playlist_dict_by_uri = {}
playlist_dict_by_str = {}

artist_list = []
track_list = []
uri_list = []
popularity_list = []
album_list=[]

while True:
    for track_item in tracks['items']:
        track_dict = track_item['track']
        track_str = track_dict['artists'][0]['name']  + ' | ' + track_dict['name'][:15]
        uri = track_dict['uri']
        if track_str in playlist_dict_by_str:
            print(track_str)
        playlist_dict_by_str[track_str] = uri
        playlist_dict_by_uri[uri] = track_str
        
        uri_list.append(uri)
        artist_list.append(track_dict['artists'][0]['name'])
        track_list.append(track_dict['name'])
        album_list.append(track_dict['album']['name'])
        popularity_list.append(track_dict['popularity'])
        
    # check if there are more pages
    if tracks['next']:
        tracks = sp.next(tracks)
    else:
        break

print (len(list(playlist_dict_by_str.keys())))
print (len(list(playlist_dict_by_uri.keys())))


Nine Inch Nails | A Warm Place
Joni Mitchell | Both Sides Now
1156
1158


In [92]:
with pd.option_context("display.max_rows", 9999):
    display(gold_df.loc[~gold_df['uri'].isin(playlist_dict_by_uri.keys())])
    

Unnamed: 0,score,input_artist,artist,input_track,track,album,uri
0,14007,claude debussy,Claude Debussy,claire de lune,Claire de lune,Träumerei - Liebestraum - Für Elise - Clair de...,spotify:track:6kf7ZCJjEbjZXikivKOsvJ
3,7003,erik satie,Erik Satie,gymnopédies,3 Gymnopédies: No. 1 Lent et douloureux,Satie: The Magic of Satie,spotify:track:7kTVe6XhIveidvkt8nb7jK
10,3383,don mclean,Don McLean,"vincent (starry, starry night","Vincent (Starry, Starry Night)",Rearview Mirror: An American Musical Journey,spotify:track:2YDyH60Vro33KkDtNZCXIk
18,2531,joni mitchell,Joni Mitchell,both sides now,Both Sides Now,Feathers.,spotify:track:5EsPLgSs1UQIDJG0U00RuJ
44,763,Simon & Garfunkel,Simon & Garfunkel,"for emily, whenever i may find her","For Emily, Whenever I May Find Her","Parsley, Sage, Rosemary And Thyme",spotify:track:76Fcbx3T7fUgA7phUkmsn7
45,757,glen hansard and marketa irglova,Glen Hansard,falling slowly,Falling Slowly,Perhaps Love,spotify:track:2lpNVkZb7e1k7IeW8MOzLe
47,693,wolfgang amadeus mozart,Wolfgang Amadeus Mozart,lacrimosa,Lacrimosa,Requiem - Music To Die For,spotify:track:1UvaZaHkh3D9AkmBrrnbFg
58,584,louis armstrong,Louis Armstrong,it's a wonderful world,It's Wonderful - Single Version,What A Wonderful World,spotify:track:7tmOKoxLQFLvQWwxzYaodT
61,504,henry mancini,Henry Mancini,moon river,Moon River(Vocal Audrey Hepburn),Breakfast At Tiffany's (50th Anniversary Edition),spotify:track:5iGleL7HpEThuuYQ3us2jh
87,311,The Verve,The Cover Crew,bittersweet symphony,Bittersweet Symphony (Acoustic Version) [The V...,"Acoustified Hits, Vol. 2",spotify:track:714doH50K9qbrE9py6izzV


In [141]:
playlist_df = pd.DataFrame({'artist': artist_list,
                           'track': track_list,
                           'album': album_list,
                           'popularity': popularity_list,
                           })



In [142]:
with pd.option_context("display.max_rows", 9999):
    display(playlist_df.sort_values('popularity'))
    

Unnamed: 0,artist,track,album,popularity
909,Yusuf / Cat Stevens,Wild World,The Best Of Cat Stevens 20th Century Masters T...,0
1138,John Williams,Missing You,My World,0
1137,ZaZa,Only You,Nights One and a Thousand,0
769,Jaakko Aukusti,What If All Else Fails?,What If All Else Fails?,0
999,Joni Mitchell,Both Sides Now,Clouds,0
300,Jay-Jay Johanson,Poison,Poison,1
42,The Smiths,Asleep,mental health: look out for yourself,2
31,The Niro,No One Must Find You Here,The Complete Jeff Buckley and Gary Lucas Songbook,2
1008,Johann Sebastian Bach,"Orchestral Suite No. 3 in D Major, BWV 1068: I...",Classical Music In the Background,2
504,Claude Debussy,Debussy: Arabesque no. 2 in G major (Deux Arab...,Debussy: Arabesque no. 2 in G major (Deux Arab...,4


In [143]:
gold_dict_by_uri = {}
gold_dict_by_str = {}
addlist = []
c = 0
for i, artist, track, uri in gold_df[['artist', 'track', 'uri']].itertuples():
    # print(artist, track, uri)
    track_str = artist + ' | ' + track[:15]
    if track_str not in playlist_dict_by_str:
        addlist.append([artist, track, uri])
        print(artist, track, uri)
    gold_dict_by_uri[uri]=track_str
    gold_dict_by_str['track_str']= uri
#     if track_str not in playlist_dict_by_str:
#         c += 1
#         print (c, track_str)
        
print(len(gold_dict_by_str.items()))
print(len(gold_dict_by_uri.items()))

Claude Debussy Claire de lune spotify:track:6kf7ZCJjEbjZXikivKOsvJ
Erik Satie 3 Gymnopédies: No. 1 Lent et douloureux spotify:track:7kTVe6XhIveidvkt8nb7jK
Don McLean Vincent (Starry, Starry Night) spotify:track:2YDyH60Vro33KkDtNZCXIk
Wolfgang Amadeus Mozart Lacrimosa spotify:track:1UvaZaHkh3D9AkmBrrnbFg
Louis Armstrong It's Wonderful - Single Version spotify:track:7tmOKoxLQFLvQWwxzYaodT
Henry Mancini Moon River(Vocal Audrey Hepburn) spotify:track:5iGleL7HpEThuuYQ3us2jh
The Cover Crew Bittersweet Symphony (Acoustic Version) [The Verve Cover] spotify:track:714doH50K9qbrE9py6izzV
The Beatles Golden Slumbers - Remastered 2009 spotify:track:01SfTM5nfCou5gQL70r6gs
Louis Armstrong La vie en rose - Single Version spotify:track:3yYfoYGVpriV4fG9L1ogsD
Johann Sebastian Bach Herz und Mund und Tat und Leben, Cantata BWV 147: Jesu, Joy of Man’s Desiring (Transcr. Hess for Piano) spotify:track:2zl2AqEdHVLhjzStuRulGY
Giacomo Puccini Madama Butterfly, SC 74, Act II Pt. 1: No. 17, Un bel di vedremo (But

In [144]:
addlist

[['Claude Debussy', 'Claire de lune', 'spotify:track:6kf7ZCJjEbjZXikivKOsvJ'],
 ['Erik Satie',
  '3 Gymnopédies: No. 1 Lent et douloureux',
  'spotify:track:7kTVe6XhIveidvkt8nb7jK'],
 ['Don McLean',
  'Vincent (Starry, Starry Night)',
  'spotify:track:2YDyH60Vro33KkDtNZCXIk'],
 ['Wolfgang Amadeus Mozart',
  'Lacrimosa',
  'spotify:track:1UvaZaHkh3D9AkmBrrnbFg'],
 ['Louis Armstrong',
  "It's Wonderful - Single Version",
  'spotify:track:7tmOKoxLQFLvQWwxzYaodT'],
 ['Henry Mancini',
  'Moon River(Vocal Audrey Hepburn)',
  'spotify:track:5iGleL7HpEThuuYQ3us2jh'],
 ['The Cover Crew',
  'Bittersweet Symphony (Acoustic Version) [The Verve Cover]',
  'spotify:track:714doH50K9qbrE9py6izzV'],
 ['The Beatles',
  'Golden Slumbers - Remastered 2009',
  'spotify:track:01SfTM5nfCou5gQL70r6gs'],
 ['Louis Armstrong',
  'La vie en rose - Single Version',
  'spotify:track:3yYfoYGVpriV4fG9L1ogsD'],
 ['Johann Sebastian Bach',
  'Herz und Mund und Tat und Leben, Cantata BWV 147: Jesu, Joy of Man’s Desiring 

In [None]:
addlist = [['ABBA', 'One Of Us', 'spotify:track:6zgtBUEkAfilJ2YEOvNexR'],
 ['Gregorio Allegri',
  'Miserere mei, Deus',
  'spotify:track:6es7DmrhnDoKj5rsFvh3XU'],
 ['Amy Winehouse',
  'Love Is A Losing Game',
  'spotify:track:3uliGwmB52ZA7brgpZMzyH'],
 ['Barbara',
  "Ma plus belle histoire d'amour",
  'spotify:track:0qBVET4VkHsQAoboWlQ2pJ'],
 ['Ludwig van Beethoven',
  'Symphony No. 5 in C Minor, Op. 67: I. Allegro con brio',
  'spotify:track:2ygeBLTP9uu3OW3VTulD8N'],
 ['Benny Goodman', 'Sing, Sing, Sing', 'spotify:track:5L8ta4ECl5zeA6bGqY7G38'],
 ['Bill Withers', 'Lean on Me', 'spotify:track:3M8FzayQWtkvOhqMn2V4T2'],
 ['Billy Joel', 'Piano Man', 'spotify:track:70C4NyhjD5OZUMzvWZ3njJ'],
 ['Bob Dylan', 'Ballad of a Thin Man', 'spotify:track:0f5N14nB8xi0p3o4BlVvbx'],
 ['Bob Dylan', "Blowin' in the Wind", 'spotify:track:18GiV1BaXzPVYpp9rmOg0E'],
 ['Bob Dylan', 'Desolation Row', 'spotify:track:4n1ZGm3TxYmoYe1YR8cMus'],
 ['Bob Dylan', 'Duquesne Whistle', 'spotify:track:5kKW4bszhKSCYVPDO0sMbX'],
 ['Bob Dylan',
  'Forever Young - Slow Version',
  'spotify:track:4yWl0tnEanf3zmZzl9kbQn'],
 ['Bob Dylan', 'Gotta Serve Somebody', 'spotify:track:760420tYNmNjFgi8bWvbop'],
 ['Bob Dylan', 'Highway 61 Revisited', 'spotify:track:6os5B6xjuke9YfBKH3tu1e'],
 ['Bob Dylan',
  'I Shall Be Released - Studio Outtake - 1971',
  'spotify:track:5vyw005QQ42hrzrLxb3xEX'],
 ['Bob Dylan', 'I Want You', 'spotify:track:7tJQ4Ekp2vN3NlI3vJJW3v'],
 ['Bob Dylan', "It Ain't Me Babe", 'spotify:track:5nbNWAfT1S6V1vqj3snHxS'],
 ['Bob Dylan', 'Jokerman', 'spotify:track:6cuHkcRUqtQhtJ4sWCkd1q'],
 ['Bob Dylan',
  "Knockin' On Heaven's Door",
  'spotify:track:6HSXNV0b4M4cLJ7ljgVVeh'],
 ['Bob Dylan', 'Lay, Lady, Lay', 'spotify:track:4uYwlMp841PLJmj1gJJwIq'],
 ['Bob Dylan', 'Like a Rolling Stone', 'spotify:track:3AhXZa8sUQht0UEdBJgpGc'],
 ['Bob Dylan', 'Love Sick', 'spotify:track:3O1hpSOaJDW4SelgUG2XT3'],
 ['Bob Dylan', "Maggie's Farm", 'spotify:track:5rGD8FFgHw74cp3RPhucyg'],
 ['Bob Dylan',
  'Make You Feel My Love',
  'spotify:track:6rfGPGghQL7SJmZPXprXIc'],
 ['Bob Dylan',
  'Mississippi - Version 2',
  'spotify:track:6JWHNd8QMxTvojYkmZtKGI'],
 ['Bob Dylan', 'Mr. Tambourine Man', 'spotify:track:3RkQ3UwOyPqpIiIvGVewuU'],
 ['Bob Dylan', 'Murder Most Foul', 'spotify:track:1LfTvT9JPYuuZanwxLtZCr'],
 ['Bob Dylan', 'Not Dark Yet', 'spotify:track:1qbn6QrHG8XfnqVFKgNzKP'],
 ['Bob Dylan',
  'Rainy Day Women #12 & 35',
  'spotify:track:7BkAlVpGwXXl3sYNn5OoJ7'],
 ['Bob Dylan',
  'Sad-Eyed Lady of the Lowlands',
  'spotify:track:4jdtLLyEL7wY0TlCdMKhxq'],
 ['Bob Dylan', 'She Belongs to Me', 'spotify:track:2itBkHBUxGl4VfDj4HNyoD'],
 ['Bob Dylan',
  'Stuck Inside of Mobile with the Memphis Blues Again',
  'spotify:track:1NYTj6JEw3IOh4ggiBh82h'],
 ['Bob Dylan',
  'Subterranean Homesick Blues',
  'spotify:track:6k9DUKMJpWvu6eFG3O64Lg'],
 ['Bob Dylan', 'Tangled up in Blue', 'spotify:track:6Vcwr9tb3ZLO63F8DL8cqu'],
 ['Bob Dylan', 'Tempest', 'spotify:track:19scNzd4ogVsHrNWsms8Rg'],
 ['Bob Dylan',
  "The Times They Are A-Changin'",
  'spotify:track:52vA3CYKZqZVdQnzRrdZt6'],
 ['Bob Dylan',
  'Things Have Changed - Single Version',
  'spotify:track:5KOi77ameCimkAdw0DMNoy'],
 ['Bob Dylan',
  'Thunder on the Mountain',
  'spotify:track:4wo2eRp6aHcAlmhmfwiTAH'],
 ['Bob Dylan', 'Visions of Johanna', 'spotify:track:2rslQV48gNv3r9pPrQFPW1'],
 ['Brian Wilson', 'God Only Knows', 'spotify:track:2SznAUigFh6rMdGpcS5d7e'],
 ['Bright Eyes',
  'First Day of My Life',
  'spotify:track:0eBryM7ePQH3Klt3jz8xZd'],
 ['Crowded House',
  'Don’t Dream It’s Over - Home Demo',
  'spotify:track:0fiSpF9mvRFQWy0ca64d1g'],
 ['Léo Delibes', 'Flower Duet', 'spotify:track:5K8jqeLAxZIqHR6e5w5so1'],
 ['Dire Straits', 'Brothers In Arms', 'spotify:track:6XYBbVpu455ZdGWZNRLGbG'],
 ['Don McLean',
  'Vincent (Starry, Starry Night)',
  'spotify:track:2YDyH60Vro33KkDtNZCXIk'],
 ['Ed Sheeran', 'Photograph', 'spotify:track:41xNsY82OWtWbIfnRMK2ky'],
 ['Elvis Presley',
  'Can’t Help Falling in Love - Acoustic Cover',
  'spotify:track:0ghQkNDYLSl4GsqfkjTjWx'],
 ['Enya', 'Amarantine', 'spotify:track:0VmzazQQ0Mo1vJldr5NxTW'],
 ['Evan Rachel Wood', 'If I Fell', 'spotify:track:0gd3hRBQAEAw096YOcUrmR'],
 ['Fleetwood Mac', 'Rhiannon', 'spotify:track:05oETzWbd4SI33qK2gbJfR'],
 ['George Harrison',
  'All Things Must Pass - 2014 Remaster',
  'spotify:track:16OwZQuzMqnwn3FZsCBZly'],
 ['George Harrison',
  'Apple Scruffs - 2014 Remaster',
  'spotify:track:2K7WhpfZX3TCCMiwebp0W7'],
 ['George Harrison',
  'Art of Dying - 2014 Remaster',
  'spotify:track:6Jod7qrtYBhU3HcUmKk4hX'],
 ['George Harrison',
  'Awaiting on You All - 2014 Remaster',
  'spotify:track:0b65WkrBrg2qOkzQeDtQ9d'],
 ['George Harrison',
  'Ballad of Sir Frankie Crisp (Let It Roll) - 2014 Remaster',
  'spotify:track:0FWeRrB8T5R6maHbWQw4Kk'],
 ['George Harrison',
  'Behind That Locked Door',
  'spotify:track:2VVbLn8nMcWJzjcL1tZsUr'],
 ['George Harrison',
  'Beware of Darkness - 2014 Remaster',
  'spotify:track:606MCyZFMBlc52Ojnn1nvU'],
 ['George Harrison',
  'Give Me Love (Give Me Peace on Earth)',
  'spotify:track:71fXxvXqo1zxWDtBmjoEVk'],
 ['George Harrison',
  'Hear Me Lord - 2014 Remaster',
  'spotify:track:3kopbNyRj10XO1actGZexP'],
 ['George Harrison',
  'I Dig Love - 2014 Remaster',
  'spotify:track:42yK1Wy62c7malKSRwy0Qk'],
 ['George Harrison',
  'I Remember Jeep - 2014 Remaster',
  'spotify:track:058AE5M3ifbCh8VWOV7903'],
 ['George Harrison',
  "It's Johnny's Birthday - 2014 Remaster",
  'spotify:track:6Cv05rcW8HWwCC6wyEp1fC'],
 ['George Harrison',
  'Let It Down - 2014 Remaster',
  'spotify:track:5FFruMKbVg8AhwHnX4xBov'],
 ['George Harrison',
  'My Sweet Lord - 2014 Remaster',
  'spotify:track:6vE90mi4yKsQGY3YD2OOv1'],
 ['George Harrison',
  'Out of the Blue - 2014 Remaster',
  'spotify:track:1KHMyFaGvwVQ7ax4yjq4BZ'],
 ['George Harrison',
  'Plug Me In - 2014 Remaster',
  'spotify:track:0tyk2xHVjBd3nk16cGktTG'],
 ['George Harrison',
  'Run of the Mill - 2014 Remaster',
  'spotify:track:4uSlUBg3NVOA77E7wwKFTO'],
 ['George Harrison',
  'Thanks for the Pepperoni - 2014 Remaster',
  'spotify:track:3smkwfPqFsTmwfnBztMXaM'],
 ['George Harrison',
  'The Inner Light (Alternative Take) - Instrumental',
  'spotify:track:7gWPnvhaBFMlQsTBWEGcSC'],
 ['George Harrison',
  'Wah-Wah - 2014 Remaster',
  'spotify:track:5j3aqkMO2fl0s5eaSuVnQ8'],
 ['George Harrison',
  'What Is Life - 2014 Remaster',
  'spotify:track:44fw7RulJyj7dGIi9qR86N'],
 ['George Harrison',
  'While My Guitar Gently Weeps - Live At Madison Square Garden; 2009 Remaster',
  'spotify:track:4Egi6XuC0rbLlXfqmQeuFa'],
 ['Glenn Miller', 'In the Mood', 'spotify:track:1xsY8IFXUrxeet1Fcmk4oC'],
 ['Hans Zimmer', 'Cornfield Chase', 'spotify:track:6pWgRkpqVfxnj3WuIcJ7WP'],
 ['Hans Zimmer',
  'Day One (Interstellar Theme)',
  'spotify:track:4WmB04GBqS4xPMYN9dHgBw'],
 ["Israel Kamakawiwo'ole",
  'Maui Medley',
  'spotify:track:6TSJ3L9pBQsYIlCD5pk7ju'],
 ['James Taylor',
  'You’ve Got a Friend',
  'spotify:track:3nK4hWsTEr7fVXziI5bTmh'],
 ['Jay Ungar', 'Ashoken Farewell', 'spotify:track:2s6pqLeVialgt5l5TTSeas'],
 ['Jeff Buckley',
  'If You Knew - Live at Sin-é, New York, NY - July/August 1993',
  'spotify:track:1nd2JEHXbUuQFDiQzCBpsv'],
 ['Jimi Hendrix', 'One Rainy Wish', 'spotify:track:5Zyv0v4rPcrXjkaeImuodv'],
 ['Jimi Hendrix',
  'Spanish Castle Magic',
  'spotify:track:2KFE98Iw0X23sf4vJYcbLH'],
 ['Jimi Hendrix',
  'Wait Until Tomorrow',
  'spotify:track:2YtVzmZzew1ILUdNueyWd7'],
 ['John Lennon',
  'Imagine - Remastered 2010',
  'spotify:track:7pKfPomDEeI4TPT6EOYjn9'],
 ['John Mayer', 'Queen of California', 'spotify:track:0CETmgFGt8Ne8vLnaLcduU'],
 ['Johnny Cash',
  'I Walk The Line - Single Version',
  'spotify:track:1TKPfF2fvn6gVLVfp3iG4j'],
 ['Joni Mitchell',
  'Mitchell: Urge for Going (Instrumental Arrangement of the B-Side Track of the Joni Mitchell Single "You Turn Me on I\'m a Radio")',
  'spotify:track:1I1u9aTdxxQ7SDLgBB3V7b'],
 ['Kanye West', 'Come to Life', 'spotify:track:5xvXeuxISyXJDRbZZf4uzd'],
 ['Leonard Cohen', 'Chelsea Hotel #2', 'spotify:track:4krhCfJg0znykZoyjeMXRe'],
 ['Leonard Cohen', 'Dear Heather', 'spotify:track:3MTKMphPprAcBFG1uIhzPZ'],
 ['Leonard Cohen',
  "Death of a Ladies' Man",
  'spotify:track:5wrylUGwZugelovhryPYg2'],
 ['Leonard Cohen', 'The Future', 'spotify:track:5l8lYrnPEM1ln3J4XaTcy5'],
 ['Leonard Cohen',
  'You Want It Darker',
  'spotify:track:5zb7npjQqoJ7Kcpq4yD9qn'],
 ['Lingers.On', 'In Lingerie', 'spotify:track:6FH3kGlJbFVJDCG9RcERf7'],
 ['Louis Armstrong',
  'La vie en rose - Single Version',
  'spotify:track:3yYfoYGVpriV4fG9L1ogsD'],
 ['The Lovecats', 'The Lovecats', 'spotify:track:7iJUiiTfnuY5cTIeEBnqHr'],
 ['Ludovico Einaudi', 'Primavera', 'spotify:track:4BMHp3DkI8VLsuB9Kr0pzu'],
 ['Mazzy Star', 'Flowers In December', 'spotify:track:0G6Ws8Gbdt0S7pZeuYmkmm'],
 ['Metallica',
  'Fade To Black (Remastered)',
  'spotify:track:0dqGfCMAGyDgpUAgLNOjWd'],
 ['Wolfgang Amadeus Mozart',
  'Requiem in D Minor, K. 626: III. Sequenz No. 6, Lacrimosa dies illa',
  'spotify:track:4bvzJZXpkI3bkjxMCWOSu1'],
 ['My Chemical Romance',
  'The Light Behind Your Eyes',
  'spotify:track:3HyDpKAuR3e4l6QB7hSB2l'],
 ['Paul McCartney',
  'Here Today - Remixed 2015',
  'spotify:track:0QtnwXDziZN1K55fXuLN6q'],
 ['Paul McCartney',
  'I’ll Follow The Sun - Live At Amoeba 2007',
  'spotify:track:3xT59EeQdq0TPGtOlXXI8t'],
 ['Puscifer', 'The Humbling River', 'spotify:track:69GE6yPZZldvqtgBHrKXxg'],
 ['Ray LaMontagne',
  'Such A Simple Thing',
  'spotify:track:4PuUa8e5s7P3Zv1IdCGIsa'],
 ['Ray Manzarek',
  'Riders on the Storm',
  'spotify:track:3FvYcTXO2QtDY7kZQHku2d'],
 ['Red Hot Chili Peppers', 'Dosed', 'spotify:track:1iFIZUVDBCCkWe705FLXto'],
 ['Sky Cries Mary',
  "Don't Forget The Sky",
  'spotify:track:4sVpjCJRClVetRrdxVBolP'],
 ['Stevie Nicks', 'Landslide', 'spotify:track:5fprEY6WEN1wvFXkgfb22C'],
 ['Stevie Wonder', 'Isn’t She Lovely', 'spotify:track:6wGlAaMfyhKdEPr2zycAnN'],
 ['Taylor Swift',
  'Fearless (Taylor’s Version)',
  'spotify:track:77sMIMlNaSURUAXq5coCxE'],
 ['Taylor Swift',
  'the lakes - bonus track',
  'spotify:track:0eFQWVz0qIxDOvhLpZ40P7'],
 ['The Band',
  'When I Paint My Masterpiece - Remastered',
  'spotify:track:76WChUuOPeIK027IeUgr0l'],
 ['The Beach Boys',
  "I Just Wasn't Made For These Times - Mono",
  'spotify:track:4CuO8TINNqM3D7aUdNQ3zG'],
 ['The Beach Boys',
  "Let's Go Away For A While - Mono",
  'spotify:track:3GsgJI1aBrvUtqX8f3MhKT'],
 ['The Beatles',
  "Don't Let Me Down - Naked Version / Remastered 2013",
  'spotify:track:5BhMoGrz5KzG2fA5uzHjZ1'],
 ['The Beatles',
  'Love Me Do - Remastered 2009',
  'spotify:track:3VbGCXWRiouAq8VyMYN2MI'],
 ['The Chemical Brothers',
  'The Boxer',
  'spotify:track:1EUeDFq2zNP784GPaRs9aH'],
 ['The Cure',
  'A Night like This - 2006 Remaster',
  'spotify:track:7cKCz7gG84i1XLvDeM3ByT'],
 ['The Cure',
  'Disintegration - 2010 Remaster',
  'spotify:track:0zY8t5dC1KQXcPUKByWMJM'],
 ['The Cure',
  'From the Edge of the Deep Green Sea',
  'spotify:track:2vwBL9RVyr0vA4Og5VH0i3'],
 ['The Cure',
  'In Between Days - 2006 Remaster',
  'spotify:track:07CyrZF9eVd02zzIse7tZA'],
 ['The Cure', 'A Letter to Elise', 'spotify:track:4DdXOLc1VMAY34ourCn1Xa'],
 ['The Cure',
  'Lullaby - 2010 Remaster',
  'spotify:track:4d4oXk7O2lEhZ83ivV93li'],
 ['The Cure', 'Underneath The Stars', 'spotify:track:0PKVjYlKw7z3IvKAoxrYTR'],
 ['The Eagles', 'The Desperadoes', 'spotify:track:10ppF835WJMYI5v65gFLZ3'],
 ['The Helio Sequence',
  'Keep Your Eyes Ahead',
  'spotify:track:3yatRBsGMJ7wMoUIgDBzzo'],
 ['The Moldy Peaches',
  'Anyone Else But You',
  'spotify:track:2pKi1lRvXNASy7ybeQIDTy'],
 ['The Strokes', 'Someday', 'spotify:track:7hm4HTk9encxT0LYC0J6oI'],
 ['Traditional',
  'Scarborough Fair (Arr. Parkin)',
  'spotify:track:4wlNPczIullwvmwb4x0ltz'],
 ['Van Morrison',
  'Madame George - 1999 Remaster',
  'spotify:track:1N4MKISvC1ddfRCRQDXDd2'],
 ['Various Artists',
  'The Girl From Ipanema',
  'spotify:track:0JgH7g0kwsIs1THEVqhlUS'],
 ['Víg Mihály',
  'Öreg - From "Werckmeister Harmóniák"',
  'spotify:track:63wMgkXQuomlkW4an4O9b4'],
 ['Willie Nelson', 'Crazy', 'spotify:track:0xqtcLB45iKNfHroi5y1em']]


In [None]:
len(addlist)

In [None]:
addlist2 = [a[2] for a in addlist]

print (len(addlist2), 'items')

while(addlist2):
    sp.user_playlist_add_tracks(os.getenv('SPOTIFY_USERNAME'), 
                                playlist_id=playlist_id, 
                                tracks=addlist2[-100:])
    addlist2 = addlist2[:-100]
    print("added items, remaining ", len(addlist2))
