In [2]:
# this version reads a chunk of posts with the score embedded
# ranks by summing scores

import os
import glob
import pickle
from datetime import datetime
import time
import dotenv
import re
from tqdm import tqdm
from schema import Schema
import csv

import pandas as pd
import pandas_dedupe

import requests
import requests.auth

import praw

import openai
import tiktoken

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# load secrets from .env into environment variables
dotenv.load_dotenv()

print(f"{'Praw:':<20} {praw.__version__ :>10}")
print(f"{'OpenAI:':<20} {openai.version.VERSION :>10}")


Praw:                     7.7.0
OpenAI:                  0.27.4


See README.md
 - objective is to use OpenAI for named entity extraction to extract all the songs form [this reddit thread](https://www.reddit.com/r/AskReddit/comments/12viv4v/what_is_the_prettiest_song_you_ever_heard_in_your/) and make Spotify playlist
 - use Reddit PRAW API to download all the comments (get [Reddit API key](https://www.reddit.com/prefs/apps))
 - use OpenAI API with a prompt like, extract all the songs from this text to CSV get ([OpenAI API key](https://platform.openai.com/account/api-keys))
 - use Spotify API to make a playlist (get [Spotify API key](https://developer.spotify.com/documentation/web-api/tutorials/getting-started))
 - works, needed a lot of scrubbing, but about 1 day of work, wouldn't have been possible to do a 700-song playlist manually without a team of Mechanical Turks or something
 - If I wanted to go nuts, would process comments individually, save a file for each comment's extracted songs, would make it easier to track down what OpenAI gets wrong, have a resumable, retryable, repeatable process and 
 - Spotify playist is [here](https://open.spotify.com/playlist/08YFkbtTV6GBfNtjJ4PHDu?si=f4761d983ac84091) 
 
 needs a .env file per dot-env-template
 

# Configs

In [2]:
# model
gptmodel = 'gpt-3.5-turbo'

# a thread 
submission = "12viv4v"

# minimum karma to process a reply 
minkarma = 1

# an output file to accumulate all the responses
savefile = 'bronze.txt'

# main prompt 
prompt_prefix1="""You will act as a research assistant extracting all the artists and track titles mentioned in a series of posts about music.
Your goal is to extract structured information from a series of posts in the form below and return them in a structured CSV format.
Define a post delimited below by ===
===
post_id: "abcdefg"
post_score: "6996"
I love Yesterday by the Beatles. Also Hotel California from The Eagles. And Bruce Springsteen's Born To Run!
===

When extracting information please make sure it matches the CSV format below exactly. Do not add any attributes that do not appear in the schema below delimited by ---
---
"post_id","post_score","artist","track"
"abcdefg","6996","The Beatles","Yesterday"
"abcdefg","6996","The Eagles","Hotel California"
"abcdefg","6996","Bruce Springsteen","Born To Run"
---

You will extract all artists and tracks from each post below delimited by ~~~ .
You will return a list of records containing the artist and track extracted from the input, and the post_id and post_score of the post the artist and track is mentioned in.
You will return the records in a CSV format.
The header row should contain `"post_id","post_score","artist","track"`. 
The input is:
"""

# an output file to accumulate all the responses
outdir = 'out'
logdir = 'logs'

savefile = 'bronze.txt'

# to speed things we'll cumulate posts til we get to nposts posts or maxchars total chars, whichever comes first
max_post_size=300  # redditor needs to put any songs in 1st couple hundred chars
maxtokens = 1024   # max tokens to send to get_response (with room for response)
# maxchars = 6000  # max tokens (words/fragments) is 4096 but I think stuffing the prompt maybe reduces quality?
nposts = 1000 # max posts to combine into a chunk



# Get all comments from a reddit posting

In [None]:
def getPraw():
    return praw.Reddit(user_agent="prettiest_song/0.001", 
                       client_id=os.getenv('CLIENT_ID'), 
                       client_secret=os.getenv('CLIENT_SECRET'))


def getAll(r, submissionId, verbose=True):
    submission = r.submission(submissionId)
    submission.comments.replace_more(limit=None)
    commentsList=submission.comments.list()
    return commentsList


In [None]:
# print(datetime.now())
# r = getPraw()
# res = getAll(r, submission)
# print(datetime.now())

# print("retrieved ", len(res), 'comments')


In [None]:
# # we have a list of comment objects
# # filter comments with at least some karma
# res3 = [r for r in res if r.score >= minkarma]
# print('filtered to ', len(res3), 'comments')
# res3[0].body, res3[0].score


In [3]:
# save so we can reload it later without downloading

# with open('reddit_full.pkl', 'wb') as f:
#     pickle.dump(res3, f)
    
with open('reddit_full.pkl', 'rb') as f:
    res3 = pickle.load(f)


# Extract artists and song titles using OpenAI

In [4]:
# check lengths of posts
shorties = []
big_ones = []
for i in range(len(res3)):
    if len(res3[i].body) <3:
        print (i, res3[i].body)
        shorties.append(i)
    if len(res3[i].body) > 1024:
        print(i, len(res3[i].body))
        big_ones.append(i)
        

812 26
11565 Up
21388 W
21557 -🤓
21562 W
21907 :)
22168 t
23326 <3
23401 ✨️
24352 Ye


In [5]:
# avg length
sum([len(r.body) for r in res3]) / len(res3)

70.90953465668727

In [6]:
[i for i in range(len(res3)) if res3[i].score <= 0]

[]

In [7]:
# already truncated
print (res3[big_ones[0]].body[:500])

IndexError: list index out of range

In [8]:
csv_validate_re = re.compile(r'''
    \s*                # Any whitespace.
    (                  # Start capturing here.
      [^,"']+?         # Either a series of non-comma non-quote characters.
      |                # OR
      "(?:             # A double-quote followed by a string of characters...
          [^"\\]|\\.   # That are either non-quotes or escaped...
       )*              # ...repeated any number of times.
      "                # Followed by a closing double-quote.
      |                # OR
      '(?:[^'\\]|\\.)*'# Same as above, for single quotes.
    )                  # Done capturing.
    \s*                # Allow arbitrary space before the comma.
    (?:,|$)            # Followed by a comma or the end of a string.
    ''', re.VERBOSE)


In [9]:
# use tokenizer to get accurate token count

# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model(gptmodel)
assert enc.decode(enc.encode("hello world")) == "hello world"

def count_tokens(s):
    return len(enc.encode(s))

count_tokens('four score and 7 years go our forefathers brought forth')

13

In [86]:
openai.api_key = os.getenv('OPENAI_API_KEY')

models = openai.Model.list()
print([(i, m.id,) for i, m in enumerate(models["data"])])
models['data'][2]

[(0, 'whisper-1'), (1, 'babbage'), (2, 'davinci'), (3, 'text-davinci-edit-001'), (4, 'text-davinci-003'), (5, 'babbage-code-search-code'), (6, 'text-similarity-babbage-001'), (7, 'code-davinci-edit-001'), (8, 'text-davinci-001'), (9, 'ada'), (10, 'babbage-code-search-text'), (11, 'babbage-similarity'), (12, 'code-search-babbage-text-001'), (13, 'text-curie-001'), (14, 'code-search-babbage-code-001'), (15, 'text-ada-001'), (16, 'text-embedding-ada-002'), (17, 'text-similarity-ada-001'), (18, 'curie-instruct-beta'), (19, 'ada-code-search-code'), (20, 'ada-similarity'), (21, 'code-search-ada-text-001'), (22, 'text-search-ada-query-001'), (23, 'davinci-search-document'), (24, 'ada-code-search-text'), (25, 'text-search-ada-doc-001'), (26, 'davinci-instruct-beta'), (27, 'text-similarity-curie-001'), (28, 'code-search-ada-code-001'), (29, 'ada-search-query'), (30, 'text-search-davinci-query-001'), (31, 'curie-search-query'), (32, 'davinci-search-query'), (33, 'babbage-search-document'), (34, 

<Model model id=davinci at 0x7fca34a3d400> JSON: {
  "created": 1649359874,
  "id": "davinci",
  "object": "model",
  "owned_by": "openai",
  "parent": null,
  "permission": [
    {
      "allow_create_engine": false,
      "allow_fine_tuning": false,
      "allow_logprobs": true,
      "allow_sampling": true,
      "allow_search_indices": false,
      "allow_view": true,
      "created": 1669066355,
      "group": null,
      "id": "modelperm-U6ZwlyAd0LyMk4rcMdz33Yc3",
      "is_blocking": false,
      "object": "model_permission",
      "organization": "*"
    }
  ],
  "root": "davinci"
}

In [11]:
MAX_TOKENS = 4096   # https://platform.openai.com/docs/models

def get_response(messages, prompt_prefix="", verbose=False):

    prompt = prompt_prefix
    
    if type(messages) == list:
        for msg in messages:
            prompt += f"""
~~~
{msg}
~~~
"""
    else:
        prompt += messages
        
    if verbose:
        print(prompt)
        
    # retry loop, have received untrapped 502 error
    if count_tokens(prompt) > MAX_TOKENS:
        print("WARNING: %d tokens > %d" % (count_tokens(prompt), MAX_TOKENS))
        
    RETRIES = 3
    success = False    
    for i in range(RETRIES):
        try:
            response = openai.ChatCompletion.create(
                model=gptmodel,
                messages=[{"role":"user", 
                           "content": prompt}],
                temperature=0,
            )
            # no exception thrown
            success=True
            break   
        except Exception as error:
            print("An exception occurred:", error)
            print("Retrying get_response...")
            time.sleep(5)
            continue  # try again
    if success:
        # check response payload for any error message?
        response_msg = response['choices'][0]['message']
        if len(response_msg['content'])==0:
            print("there was a problem, content is empty, full payload follows:")
            print(response)
        if verbose:
            print(response_msg)
        return response_msg['content']
    else:
        return None



In [12]:
def file_validate(response):
    header_array = []
    lines = response.split("\n")
    if header_array:
        inp_array = csv_validate_re.findall(lines[0])
        if len(header_array) != len(inp_array) \
            or any(validation != inp.strip() for (validation, inp) in zip(header_array, inp_array)):
                print("bad header: ")
                print("got:    ", lines[0])
                print("expected:", ",".join(header_array))
                return False
    return True

def row_validate(row, header_array=None, schema=None):
    csv_values = csv_validate_re.findall(row)
    if header_array:
        if len(csv_values) != len(header_array):
            return False
    if schema:
        try:
            schema.validate([row])
        except Exception as error:
            print(row)
            print(error)
            return False
    return True

def get_csv_from_chat_gpt(message, header_array):
    
    # maybe make more general by passing 2 validation functions, file_validate, row_validate 
    # could use csv module and pydantic to validate, pass only pydantic class, construct expected header from pydantic
    for i in range(3):
        if i > 0:
            print(f"attempt {i}")
        response = get_response(messages, prompt_prefix1, verbose=False)
        if response is None:   # FAIL - retries exhausted
            print('retrying get_csv')
            continue

        # do basic validation and cleanup
        # should check first line is valid header and doesn't reorder columns
        csv_valid, csv_err = [], []
        lines = response.split("\n")
        if header_array:
            inp_array = csv_validate_re.findall(lines[0])
            if len(header_array) != len(inp_array) \
                or any(validation != inp.strip() for (validation, inp) in zip(header_array, inp_array)):
                print("bad header: ")
                print("got:    ", lines[0])
                print("expected:", ",".join(header_array))
                continue

        for line in lines:
            try:
                # csv_values = csv_validate_re.findall(line)
                if True: # better workflow to put everything in one file and schema validate file later, I think
                    # at cost of getting some lines like 'nothing found'
                    # if len(csv_values) == len(header_array):
                    csv_valid.append(line)
                else:
                    csv_err.append(line)
            except:
                csv_err.append(line)
        return csv_valid, csv_err, response


In [13]:
# for each comment object we will extract the body 
# then submit as part of a prompt to chatgpt
print(datetime.now())

nposts = 1000
slist = res3.copy()
total_posts = len(slist)
print("processing %d posts" % total_posts)

# make sure out and logs are empty
for f in glob.glob('%s/*' % outdir):
    os.remove(f)
for f in glob.glob('%s/*' % logdir):
    os.remove(f)
file_index = 0
maxtokens=2048

while(slist):  # still comments to process
    tokens_to_date = count_tokens(prompt_prefix1)
    reply_ids = []
    messages = []
    for _ in range(nposts):  # add up to this many posts to the prompt
        if slist:
            # make sure no single post > max_post_size, truncate in place as nec 
            slist[0].body = slist[0].body[:max_post_size]
            if tokens_to_date + count_tokens(slist[0].body) < maxtokens:
            # total post content < maxchars
            # if chars_to_date + len(slist[0].body) < maxchars:
                reply = slist.pop(0)
                reply_ids.append(reply.id)
                body = reply.body
                
                messages.append(f"""
post_id: "{reply.id}"
post_score: "{reply.score}"
{body}
"""
                )
                tokens_to_date += count_tokens(messages[-1])
                # chars_to_date += len(messages[-1])
            
    expected_header = ['"post_id"', '"post_score"', '"artist"', '"track"']
    csv_valid, csv_err, response = get_csv_from_chat_gpt(messages, header_array=expected_header)
    csv_output = "\n".join(csv_valid)

    with open("%s/%04d.csv" % (outdir, file_index), 'w') as outfile:
        outfile.write(csv_output)
    
    if csv_err:
        with open("%s/%04d.err" % (outdir, file_index), 'w') as outfile:
            outfile.write("\n".join(csv_err))
        
    with open("%s/%04d.log" % (logdir, file_index), 'w') as logfile:
        logfile.write(str(reply_ids))
        logfile.write('\n\n===== raw prompt =====\n\n')        
        logfile.write("\n=====\n".join(messages))
        logfile.write('\n\n===== raw response =====\n\n')
        logfile.write(response)
        logfile.write('\n\n===== failed validation =====\n\n')
        logfile.write("\n".join(csv_err))
 
    file_index += 1
    outcount = total_posts-len(slist)
    print(outcount, end=' ')
    
    
print()
print(datetime.now())



2023-05-23 11:44:11.661680
processing 24584 posts
59 107 167 216 280 332 384 427 481 534 578 633 689 An exception occurred: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Retrying get_response...
733 779 831 887 938 991 1050 1101 1150 1203 1247 An exception occurred: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID aaa3f98bbc69686ccc7c3295d9e5b6ef in your message.)
Retrying get_response...
1304 1352 An exception occurred: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 11be0f417e05934f083a91e28cd31977 in your message.)
Retrying get_response...
1404 1454 1501 1547 1602 1659 1705 1758 1816 1873 1924 1971 2023 2081 2129 2179 2227 2273 2329 2

In [None]:
## concatenate outputs as bronze.txt
# may still have to tweak the files to get them to load


In [None]:
# filelist = glob.glob('%s/*.csv' % outdir)

# output_df = None
# count = 0
# for f in sorted(filelist):
#     print(f)
#     try:
#         tempdf = pd.read_csv("%s" % (f), header=None)
#     except Exception as exc:
#         print(str(exc))
#         continue
#     colcount = len(tempdf.columns)
#     if len(tempdf.columns) != 4:
#         print('%s has %d columns, skipped' % (f, colcount))
#         continue
        
#     # ok
#     # truncate header row if it looks like a header
#     if tempdf.iloc[0][0]=='post_id':
#         tempdf = tempdf[1:]
#     # set the header explicitly
#     tempdf.columns=["post_id","post_score","artist","track"]

#     if output_df is not None:        
#         output_df = pd.concat([output_df, tempdf], axis=0)
#     else:
#         output_df = tempdf
#     count += 1
#     if count % 10 == 0:
#         print(count, end=' ')

        
        
        

In [3]:
def valid_post_id(s):
    s = s.strip()
    valid = 3 < len(s) < 10
    return valid
# validator.add_record_check(check_post_id)

def valid_post_score(s):
    s = s.strip()
    valid = all([c.isdigit() for c in s]) and int(s) < 99999
    return valid
    
schema = Schema([{'post_id': valid_post_id,
                  'post_score': valid_post_score, 
                  'artist': str,
                  'track': str,
                 }])

filelist = glob.glob('%s/*.csv' % outdir)

objlist = []
with open(savefile, 'w') as outfile:
    for f in tqdm(filelist, desc = 'File concat'):
        with open(f, 'r') as csvfile:
            reader = csv.DictReader(csvfile, delimiter=",", fieldnames=[
                "post_id",
                "post_score",
                "artist",
                "track"
            ])
            data=[row for row in reader]
                                   
        data = data[1:]
        
        try:
            objlist.extend(schema.validate(data))
        except Exception as error:
            print(f)
            print(error)
            break



File concat: 100%|██████████| 480/480 [00:05<00:00, 93.94it/s] 


In [5]:
tempdf = pd.DataFrame.from_dict(objlist)
tempdf.columns=['post_id','post_score','artist','track']

with open('tempdf.pkl', 'wb') as f:
    pickle.dump(tempdf, f)


tempdf


Unnamed: 0,post_id,post_score,artist,track
0,jhg01tn,1,Claire de lune,
1,jhg023n,1,Lauren Daigle,You Say
2,jhg02kp,1,Neil Young,Old Man
3,jhg02z0,1,Alice In Chains,Junkhead
4,jhg03e4,1,Oscar Anton and Clementine,Reflet
...,...,...,...,...
26610,jhfwn7i,1,Brian Culbertson,The Look
26611,jhfwnfz,1,Young Thug,420
26612,jhfwnfz,1,Young Thug,Yes Indeed
26613,jhfwnfz,1,Young Thug,Dub Shit


In [6]:
def fix_leading_trailing(s):
    """Fix where it encloses in quotes etc."""
    # regex prob better if re.match('^\W+(.*)\W+$',playerName): 
    closers={'(': ')', # rest prob no factor but anyway
             '“':'”',
             '‘':'’',
             '{': '}',
             '[': ']',
             '<': '>'}
    s = str(s).strip()
    while len(s) >= 2 and (not s[0].isalnum()) and (s[0] == s[-1] or closers.get(s[0])==s[-1]):
        s = s[1:-1]
        s = s.strip()
            
    return s


In [7]:
tempdf = tempdf.drop_duplicates() \
    .sort_values(["post_score", "artist", "track"], ascending=False)
# drop header row
tempdf = tempdf.loc[~(tempdf['post_id'].str.strip()=='post_id')]
# na to ""
tempdf.loc[tempdf['post_id'].isna(), 'post_id'] = ''
tempdf.loc[tempdf['post_score'].isna(), 'post_score'] = ''
tempdf.loc[tempdf['artist'].isna(), 'artist'] = ''
tempdf.loc[tempdf['track'].isna(), 'track'] = ''
# strip spaces
tempdf['post_id'] = tempdf['post_id'].apply(fix_leading_trailing)
tempdf['post_score'] = tempdf['post_score'].apply(fix_leading_trailing)
tempdf['artist'] = tempdf['artist'].apply(fix_leading_trailing)
tempdf['track'] = tempdf['track'].apply(fix_leading_trailing)
# clean up post_score to valid int
tempdf['post_score'] = tempdf['post_score'].apply(lambda s: "".join([c for c in s if c.isdigit()]))
tempdf['post_score'] = tempdf['post_score'].apply(lambda x: x[-5:])
tempdf['post_score'] = tempdf['post_score'].apply(lambda s: int(s) if s else 1)
# drop missing tracks, cleanup track
tempdf = tempdf.drop(tempdf.loc[tempdf['track']==''].index)
tempdf = tempdf.drop(tempdf.loc[tempdf['track'].str.lower()=='unknown'].index)
tempdf = tempdf.drop(tempdf.loc[tempdf['track'].str.lower()=='n/a'].index)
tempdf = tempdf.drop(tempdf.loc[tempdf['track'].str.lower()=='track'].index)
tempdf = tempdf.sort_values(["post_score", "artist", "track"], ascending=False)
tempdf.loc[tempdf['post_score']==0, 'post_score'] = 1
# any test examples
tempdf = tempdf.loc[~(tempdf['post_id']=='abcdefg')]
tempdf


Unnamed: 0,post_id,post_score,artist,track
20023,jhbktrn,13941,Claude Debussy,Claire de Lune
20009,jhc2dyv,6996,Erik Satie,Gymnopédies
20014,jhc6oud,6144,Simon & Garfunkel,Scarborough Fair
20030,jhcc2q8,5014,Israel Kamakawiwoʻole,Over the Rainbow
20042,jhbjlwj,4332,The Cranberries,Dreams
...,...,...,...,...
933,jheh9kn,1,,7 bridges road
14587,jhh5yvh,1,,5:15 AM
2717,jhef1ci,1,,12 Stout Street
917,jheh4mx,1,,10000 emerald pools


In [8]:
df = tempdf
df.loc[df['artist']=='N/A', 'artist']=''
df.loc[df['artist'].str.lower().str.startswith('unknown'), 'artist']=''
df.loc[df['artist'].str.lower().str.startswith('various'), 'artist']=''

artist_df = df[['artist','track']].groupby('artist') \
    .count() \
    .sort_values(["artist", "track"], ascending=False) \
    .reset_index()

artist_df


Unnamed: 0,artist,track
0,大鱼海棠,1
1,دلکش,1
2,увулв,1
3,мураками,1
4,Отава Ё,1
...,...,...
6886,070 Shake,1
6887,*NSYNC,1
6888,(G)-IDLE,1
6889,$not,1


In [9]:
tempdf.to_csv('bronze.csv', index=False)

tempdf.to_csv('silver.csv', index=False)
with open('silver.pkl', 'wb') as f:
    pickle.dump(tempdf, f)

len(tempdf)



22928

# Impute missing artists
if someone just says 'Clair de Lune', or 'Let it be', without specifying the artist, maybe we can impute the artist

In [10]:
missing_map = {}
try:
    artist_map = pd.read_csv("missing_artists.csv")
    missing_map = dict(zip(artist_map['track'],artist_map['artist']))
except:   # doesn't exist
    pass

missing_map

{'what comes to mind is either simple and clean, or dear sunshine': 'Utada Hikaru',
 '23': 'Jimmy Eat World',
 '26': 'Paramore',
 '3 little birds': 'Bob Marley',
 '74-75': 'The Connells',
 '86d - no escort': 'Mitski',
 "Don't Break My Heart": 'UB40',
 "Don't Know Much": 'Linda Ronstadt and Aaron Neville',
 "Don't Think Twice, It's All Right": 'Bob Dylan',
 'Don’t L': 'Missy Elliott',
 'Don’t Let Me Down': 'The Beatles',
 'Don’t Look Back': 'Boston',
 'Don’t Talk': 'The Beach Boys',
 'Doschitaii': 'Tatu',
 'Down in a Hole': 'Alice in Chains',
 'Down to You': 'Joni Mitchell',
 'Down to the River to Pray': 'Alison Krauss',
 'Dream Sweet in Sea Major': 'Miracle Musical',
 'Dream a Little Dream': 'The Mamas & The Papas',
 'Dreaming Again': 'Jim Croce',
 'Dreaming My Dreams': 'Waylon Jennings',
 'Dreams': 'Fleetwood Mac',
 'Drips//Auntie’s Harp': 'Flying Lotus',
 'Dry Hands': 'C418',
 'Duo des Fluers': 'Léo Delibes',
 'Dust in the Wind': 'Kansas',
 'Duvet': 'Boa',
 'Dylan Version': 'The Avet

In [11]:
df.loc[df['artist']=='Claire de lune', 'track']='Clair de Lune'
df.loc[df['track']=='Claire de Lune', 'track']='Clair de Lune'
df.loc[df['track']=='Clair de Lune', 'artist']='Claude Debussy'


In [12]:
df['artist2'] = df.apply(lambda row: missing_map[row.track.lower().strip()] if row.artist=="" and row.track.lower().strip() in missing_map else row.artist, axis=1)
df.loc[df['artist'] != df['artist2']]



Unnamed: 0,post_id,post_score,artist,track,artist2
18889,jhc33vf,1584,,Linger,The Cranberries
11381,jhdrv5t,731,,Avril 14th,Aphex Twin
12913,jhc1ef3,619,,staralfur,Sigur Rós
14746,jhc97pn,574,,Bridge Over Troubled Water,Simon & Garfunkel
16170,jhcej4u,484,,In My Life,The Beatles
...,...,...,...,...,...
933,jheh9kn,1,,7 bridges road,The Eagles
14587,jhh5yvh,1,,5:15 AM,Patrick Watson
2717,jhef1ci,1,,12 Stout Street,The Weakerthans
917,jheh4mx,1,,10000 emerald pools,BØRNS


In [13]:
df['artist'] = df.apply(lambda row: missing_map[row.track.lower().strip()] if row.artist=="" and row.track.lower().strip() in missing_map else row.artist, axis=1)


In [14]:
missing_artist_df = df.loc[(df['artist']=='')]
missing_artist_df


Unnamed: 0,post_id,post_score,artist,track,artist2
20043,jhc94a8,495,,Someone make a Spotify playlist please!,
18518,jhc8s3i,113,,Songs my daughter would make up,
12782,jhceaue,17,,And my axe!,
15149,jhcknj0,14,,deleted,
18873,jhdofyl,9,,All Thru the Night,
...,...,...,...,...,...
22341,jhcz8tw,1,,Amor Eterno- literally any version of it,
20487,jhhylzo,1,,Abendlied,
20930,jhcw4vm,1,,ABC mixed with Isty Bisty Spider,
7933,jhdohp3,1,,A song I've seen best described as making you ...,


In [86]:
prompt_prefix3 = """I will provide a list of well-known recordings. For each recording, you will review and provide the name of the artist most closely associated with the recording. You will provide the results in CSV format, one record per line in the following order: recording, artist. Enclose each field in double-quotes.

The input is:

"""
missing_artist_df = df.loc[(df['artist']=='')]

def missing_artists(missing_artist_df):
    
    missing_track_map = {}
    
    slist = missing_artist_df['track'] \
        .dropna() \
        .str.lower() \
        .str.strip() \
        .drop_duplicates() \
        .tolist()

    slist.sort()
    n_missing = len(slist)

    while(slist):  # still artists to process
        print(datetime.now(), end=" ")

        tokens_to_date = count_tokens(prompt_prefix3)
        prompt = ''
        rows = 0
        for _ in range(nposts):  # add up to nposts posts to the prompt
            if slist and tokens_to_date + count_tokens(slist[0]) < 1024:
                track = f'"{slist.pop(0)}"\n'
                prompt += track
                tokens_to_date += count_tokens(track)
                rows += 1
            else:
                break
        print(f"sending {rows} rows...", end=" ")
        response = get_response(prompt, prompt_prefix3, verbose=False)

        if response is None:   # FAIL - retries exhausted
            print('Bailing to next chunk')
            continue

        if not response:
            print("nothing returned ... check returned dict for errors")

        lines = response.split("\n")
        print(f"received {len(lines)} lines...")
        c=0        
        for line in lines:
            try:
                csv_values = csv_validate_re.findall(line)
                if len(csv_values) != 2:
                    print(f"{len(csv_values)} values found: ", line)
                    continue
                track_input, artist_correct = csv_values[0], csv_values[1]
                # fix artist enclosed in quotes, parens, etc.
                while len(artist_correct) >=2 and (not artist_correct[0].isalnum()) and artist_correct[0] == artist_correct[-1]:
                    artist_correct = artist_correct[1:-1]
                while len(track_input) >=2 and (not track_input[0].isalnum()) and track_input[0] == track_input[-1]:
                    track_input = track_input[1:-1]
                # if it wasn't found then skip
                if len(artist_correct) == 0:
                    print("empty artist returned")
                    continue
                if artist_correct.lower().startswith('unknown'):
                    continue
                if artist_correct.lower() == 'n/a':
                    continue
                # store in dict to update df
                c += 1
                missing_track_map[track_input]=artist_correct
                print(f'{track_input}: {artist_correct}')                    
            except Exception as error:
                print('error', line)
                print(error)
                continue
                
        print(f"{c} lines processed, total {n_missing-len(slist)}, {len(slist)} of {n_missing} remaining")
        
    return missing_track_map
                
missing_track_map = missing_artists(missing_artist_df)



2023-05-24 08:48:57.549211 sending 75 rows... received 72 lines...
832 hz music: [deleted]
a beautiful song: [deleted]
a day in the: [deleted]
a playlist would be cool: [deleted]
a random piano from limewire: [deleted]
a song i've seen best described as making you feel nostalgic for a time you don't remember: [deleted]
a song they made for me: [deleted]
abc mixed with isty bisty spider: [deleted]
abendleid: [deleted]
abendlied: [deleted]
adoro te devote: [deleted]
advanced falcorny: [deleted]
aidelweiss: [deleted]
all thru the night: [deleted]
always makes me cry: [deleted]
amazing grace in bagpipes: [deleted]
amazing grace on a pipe organ: [deleted]
amazing grace with bagpipes: [deleted]
among us: [deleted]
amor eterno- literally any version of it: [deleted]
an unwavering heart: [deleted]
and my axe!: [deleted]
anfonaf angel: [deleted]
angel on ruskin live: [deleted]
another hopeful tomorrow: [deleted]
another record: [deleted]
answers: [deleted]
any own city song: [deleted]
any song 

In [87]:
missing_track_map 


{'832 hz music': '[deleted]',
 'a beautiful song': '[deleted]',
 'a day in the': '[deleted]',
 'a playlist would be cool': '[deleted]',
 'a random piano from limewire': '[deleted]',
 "a song i've seen best described as making you feel nostalgic for a time you don't remember": '[deleted]',
 'a song they made for me': '[deleted]',
 'abc mixed with isty bisty spider': '[deleted]',
 'abendleid': '[deleted]',
 'abendlied': '[deleted]',
 'adoro te devote': '[deleted]',
 'advanced falcorny': '[deleted]',
 'aidelweiss': '[deleted]',
 'all thru the night': '[deleted]',
 'always makes me cry': '[deleted]',
 'amazing grace in bagpipes': '[deleted]',
 'amazing grace on a pipe organ': '[deleted]',
 'amazing grace with bagpipes': '[deleted]',
 'among us': '[deleted]',
 'amor eterno- literally any version of it': '[deleted]',
 'an unwavering heart': '[deleted]',
 'and my axe!': '[deleted]',
 'anfonaf angel': '[deleted]',
 'angel on ruskin live': '[deleted]',
 'another hopeful tomorrow': '[deleted]',


In [89]:
# check for reasonableness, clean up and apply
df['track']=df['track'].astype(str)
df['artist2'] = df.apply(lambda row: missing_track_map[row.track.lower().strip()] if row.artist=="" and row.track.lower().strip() in missing_track_map else row.artist, axis=1)
df.loc[df['artist'] != df['artist2']].head(20)



Unnamed: 0,post_id,post_score,artist,track,artist2
18518,jhc8s3i,113,,Songs my daughter would make up,Regina Spektor
12782,jhceaue,17,,And my axe!,[deleted]
18873,jhdofyl,9,,All Thru the Night,[deleted]
7449,jhcbkzr,7,,"Fave or it’s version, very sad song",[deleted]
13539,jhcnzdy,6,,Anything my boyfriend sings,[deleted]
1990,jhd0jp7,5,,"That's rough, buddy",Avatar: The Last Airbender Soundtrack
9506,jhdpfir,4,,Just grace as an album,Grace VanderWaal
3698,jhdvza2,4,,A Day In The,[deleted]
23709,jhe5i9n,3,,steak night,Jonathan Coulton
1554,jhd28j7,3,,got some songs mixed up,[deleted]


In [47]:
df['artist'] = df.apply(lambda row: missing_track_map[row.track.lower()] if row.artist=="" and row.track.lower() in missing_track_map else row.artist, axis=1)



In [65]:
uniques = {k:v for k,v in missing_track_map.items() if k not in missing_map}
dupes = {k:v for k,v in missing_track_map.items() if k in missing_map}

# these should be equal since we already applied missing_map
len(missing_track_map), len(uniques), len(dupes)

(1097, 1097, 0)

In [68]:
# add new ones to missing_artists.csv
temp = pd.DataFrame({'track': uniques.keys(),
              'artist': uniques.values()}) \
    .sort_values(["artist", "track"])

temp.to_csv('missing_artists_new.csv',index=False)

# Fix typos, abbreviations, etc. using ChatGPT

In [15]:
artist_map = {}
try:
    artist_map = pd.read_csv("artist_map.csv")
    artist_map = dict(zip(artist_map['artist_orig'],artist_map['artist_corrected']))
except:
    pass
artist_map

{'(G)-IDLE': '(G)I-DLE',
 '10000 Maniacs': '10,000 Maniacs',
 '10CC': '10cc',
 '2 Cellos': '2Cellos',
 '2 cellos': '2Cellos',
 '2pac': '2Pac',
 '2wei': '2WEI',
 '3 doors down': '3 Doors Down',
 '3rd secret': '3rd Secret',
 '42 dougg and lil baby': '42 Dugg and Lil Baby',
 '4Hero': '4hero',
 '5sos': '5 Seconds of Summer',
 '5 seconds of summer': '5 Seconds of Summer',
 '8485': '88rising',
 '88lien': '88rising',
 'a band': 'A Band',
 'A flock of seagulls': 'A Flock of Seagulls',
 'a perfect circle': 'A Perfect Circle',
 'perfect circle': 'A Perfect Circle',
 'A perfect circle': 'A Perfect Circle',
 'APC/Tool': 'A Perfect Circle',
 'APC': 'A Perfect Circle',
 'a Swarm of the Sun': 'A Swarm of the Sun',
 'A Touch of Class aka ATC': 'A Touch of Class',
 'A Voz Do Violão': 'A Voz Do Violao',
 'Asap Rocky': 'A$AP Rocky',
 'A$AP Rocky & Lost Boy Ruth B': 'A$AP Rocky',
 'A$AP Rocky': 'A$AP Rocky',
 'ASAP Rocky': 'A$AP Rocky',
 'A R Rahman': 'A. R. Rahman',
 'AR Rehman': 'A. R. Rahman',
 'Ar Rah

In [16]:
df['artist2'] = df['artist'].apply(lambda s: artist_map[s] if s in artist_map else s)
df.loc[df['artist'] != df['artist2']]


Unnamed: 0,post_id,post_score,artist,track,artist2
20030,jhcc2q8,5014,Israel Kamakawiwoʻole,Over the Rainbow,Israel Kamakawiwoole
20021,jhc1e1u,3675,The Beatles,In my Life,Beatles
20032,jhc74qz,2978,Edith Piaf,La Vie en Rose,Édith Piaf
20019,jhbm8ne,2461,Sigur Ros,Hoppipolla,Sigur Rós
20025,jhbf5ng,2314,The Beatles,Blackbird,Beatles
...,...,...,...,...,...
4747,jhed9g7,1,The Dear Hunter,A bitter sweet genesis for him and her,Dear Hunter
24771,jhd0m3x,1,João Gilberto,A Voz Do Violão,Joao Gilberto
3466,jhd1kxy,1,The Beatles,A Day in the Life,Beatles
10235,jhe51jw,1,The Connells,74-75,Connells


In [17]:
# apply the map
df['artist'] = df['artist'].apply(lambda s: artist_map[s] if s in artist_map else s)


In [None]:
prompt_prefix2 = """You will act as a proofreader. I will provide you a list of recording artists or composers.
You will review each input artist for any spelling errors or abbreviations and provide the corrected full artist without abbreviation. 
You will provide them in CSV format, one record per line in the following order: input_artist, corrected_artist. Enclose each field in double-quotes.
The input is:

"""


In [None]:
# proofread / dedupe artists
# may want to run this whole sequence a couple of times and update df, silver.csv

def dedupe_artists(artist_df):
    
    nposts = 1000
    artist_map

    slist = sorted(artist_df['artist'].tolist())
    n_artists=len(slist)

    while(slist):  # still artists to process
        print(datetime.now(), end=" ")

        prompt = ""
        tokens_to_date = count_tokens(prompt_prefix2)
        rows = 0
        for _ in range(nposts):  # add up to 100 posts to the prompt
            if slist:
                if tokens_to_date + count_tokens(slist[0]) < 1024:
                    artist = f'{slist.pop(0)}\n'
                    prompt += artist
                    tokens_to_date += count_tokens(artist)
                    rows += 1
            else:
                break
        print(f"sending {rows} rows...", end=" ")

        response = get_response(prompt, prompt_prefix2, verbose=False)
        if response is None:   # FAIL - retries exhausted
            print('Bailing to next chunk')
            continue

        if not response:
            print("there was a problem, check the payload")


        lines = response.split("\n")
        print(f"received {len(lines)} lines...")
        # sometimes doesn't match, chatgpt monkeys skip some

        c=0
        for line in lines:
            try:
                csv_values = csv_validate_re.findall(line) 
                if len(csv_values) != 2:
                    print('%d values found' % len(csv_values), line)
                    continue
                artist_input, artist_correct = csv_values[0].strip(), csv_values[1].strip()
                # fix artist enclosed in quotes, parens, etc.
                while len(artist_input) >= 2 and (not artist_input[0].isalnum()) and artist_input[0] == artist_input[-1]:
                    artist_input = artist_input[1:-1]
                while len(artist_correct) and (not artist_correct[0].isalnum()) and artist_correct[0] == artist_correct[-1]:
                    artist_correct = artist_correct[1:-1]
                # if it matches modulo case then skip
                if artist_input.lower() == artist_correct.lower():
                    continue
                # if it wasn't found then skip
                if len(artist_correct) == 0:
                    print("empty artist returned")
                    continue
                if artist_correct.lower().startswith('unknown'):
                    continue
                if artist_correct.lower() == 'n/a':
                    continue
                if artist_correct.lower() == "no correction needed":
                    continue
                # store in dict to update df
                c+=1
                artist_map[artist_input]=artist_correct
                print(f'"{artist_input}", "{artist_correct}"')
            except Exception as error:
                print('error', line)
                print(error)
                continue
        print(f"{c} lines processed, total {n_artists-len(slist)}, {len(slist)} of {n_artists} remaining")
        
    return artist_map

artist_map=dedupe_artists(artist_df)

print(datetime.now())


In [None]:
print(artist_map)


In [None]:
len(old_artist_map)

In [None]:
len(artist_map)

In [None]:
# save in artist_map.csv but no dupes
old_artist_map = pd.read_csv("artist_map.csv")
old_artist_map = dict(zip(old_artist_map['artist'],old_artist_map['map']))
not_dupes = {k: artist_map[k] for k in artist_map.keys() if k not in old_artist_map}
not_dupes
len(not_dupes)


In [None]:
pd.DataFrame({'artist': not_dupes.keys(), 'map': not_dupes.values()}).to_csv('artist_map_new.csv', index=False)

In [None]:
# check the map for reasonableness
# it does pretty smart stuff like map nin to Nine Inch Nails 
# but if it screws up that artist probably won't show up in spotify
df['artist2'] = df['artist'].apply(lambda s: artist_map[s] if s in artist_map else s)
df.loc[df['artist'] != df['artist2']]


In [None]:
# run again if desired

In [18]:
df.loc[df['artist'].isna(), 'artist']=""
df = df.drop(df.loc[df['artist'].str.lower().str.startswith('unknown')].index)
df = df.drop(df.loc[df['artist'].str.lower().str.startswith('various')].index)
df = df.drop(df.loc[df['artist'].str.lower().str.startswith('no correction needed')].index)
df = df.drop(df.loc[df['artist']==''].index)

artist_df = df[['artist','track']].groupby('artist') \
    .count() \
    .sort_values(["artist", "track"]) \
    .reset_index()

artist_df.head(20)


Unnamed: 0,artist,track
0,(G)I-DLE,1
1,*NSYNC,1
2,070 Shake,1
3,"10,000 Maniacs",5
4,100 gecs,1
5,10cc,2
6,1975,1
7,2 Live Crew,3
8,2 live crew,1
9,200 Stab Wounds,1


# Dedupe with pandas_dedupe

In [18]:
df['artist'] = df['artist'].apply(fix_leading_trailing)
df['artist_dedupe'] = df['artist'].str.lower()
df = df.drop(df.loc[df['artist_dedupe'].str.startswith('unknown')].index)
df = df.drop(df.loc[df['artist_dedupe'].str.startswith('various')].index)
df = df.drop(df.loc[df['artist_dedupe'].str.startswith('no artist found')].index)
df = df.drop(df.loc[df['artist_dedupe']=='null'].index)
df = df.drop(df.loc[df['artist_dedupe']=='none'].index)
df = df.drop(df.loc[df['artist_dedupe']==''].index)
df = df.drop(df.loc[df['artist_dedupe']=='post_score'].index)



In [19]:
df['artist_dedupe'] = df['artist_dedupe'].apply(lambda s: s[4:] if s[:4].lower()=='the ' else s)

df.loc[df['artist_dedupe']=='band', 'artist_dedupe']='the band'



In [20]:
dedupe_df = df[['artist', 'artist_dedupe', 'post_score']] \
    .groupby(['artist', 'artist_dedupe']) \
    .count() \
    .sort_values('post_score', ascending=False) \
    .reset_index() \
    .reset_index() 

dedupe_df


Unnamed: 0,index,artist,artist_dedupe,post_score
0,0,Beatles,beatles,364
1,1,Radiohead,radiohead,238
2,2,Simon & Garfunkel,simon & garfunkel,208
3,3,Fleetwood Mac,fleetwood mac,183
4,4,Jeff Buckley,jeff buckley,168
...,...,...,...,...
6222,6222,Jake Thackray,jake thackray,1
6223,6223,Jake Shimabukuro,jake shimabukuro,1
6224,6224,Jake Runestad,jake runestad,1
6225,6225,Jake,jake,1


In [21]:
# reset dedupe learned settings
!rm dedupe_dataframe_learned_settings 
!rm dedupe_dataframe_training.json   
dedupe_df2 = pandas_dedupe.dedupe_dataframe(dedupe_df, ['artist_dedupe'])



Importing data ...


  dedupe_df2 = pandas_dedupe.dedupe_dataframe(dedupe_df, ['artist_dedupe'])


KeyboardInterrupt: 

In [23]:
dedupe_df2


Unnamed: 0,index,artist,artist_dedupe,post_score,cluster id,confidence
0,0,beatles,beatles,364,0,0.436605
1,1,radiohead,radiohead,238,488,1.000000
2,2,simon garfunkel,simon garfunkel,208,1,0.431659
3,3,fleetwood mac,fleetwood mac,183,489,1.000000
4,4,jeff buckley,jeff buckley,168,2,0.581508
...,...,...,...,...,...,...
6226,6226,jake wesley rogers,jake wesley rogers,1,5664,1.000000
6227,6227,jake thackray,jake thackray,1,84,0.418634
6228,6228,jake shimabukuro,jake shimabukuro,1,5665,1.000000
6229,6229,jake runestad,jake runestad,1,5666,1.000000


In [24]:
dedupe_df['cluster id'] = dedupe_df2['cluster id']
name2i = {a: i for i, a in zip(dedupe_df['cluster id'].tolist(), dedupe_df['artist_dedupe'].tolist())}
df['artist_index'] = df['artist_dedupe'].apply(lambda s: name2i[s])
df



Unnamed: 0,post_id,post_score,artist,track,artist2,artist_dedupe,artist_index
20023,jhbktrn,13941,Claude Debussy,Clair de Lune,Claude Debussy,claude debussy,490
20009,jhc2dyv,6996,Erik Satie,Gymnopédies,Erik Satie,erik satie,511
20014,jhc6oud,6144,Simon & Garfunkel,Scarborough Fair,Simon & Garfunkel,simon & garfunkel,1
20030,jhcc2q8,5014,Israel Kamakawiwoole,Over the Rainbow,Israel Kamakawiwoole,israel kamakawiwoole,15
20042,jhbjlwj,4332,The Cranberries,Dreams,The Cranberries,cranberries,16
...,...,...,...,...,...,...,...
933,jheh9kn,1,Eagles,7 bridges road,Eagles,eagles,554
14587,jhh5yvh,1,Patrick Watson,5:15 AM,Patrick Watson,patrick watson,74
2717,jhef1ci,1,The Weakerthans,12 Stout Street,The Weakerthans,weakerthans,2804
917,jheh4mx,1,BØRNS,10000 emerald pools,BØRNS,børns,1922


In [25]:
tempdf1 = df[['artist', 'artist_index', 'post_score']] \
    .groupby(['artist', 'artist_index']) \
    .agg(post_score=('post_score', 'sum'),
        count=('post_score', 'count')) \
    .reset_index() \
    .reset_index(drop=True) \
    .reset_index() \
    .sort_values('post_score', ascending=False)

tempdf1 



Unnamed: 0,index,artist,artist_index,post_score,count
1114,1114,Claude Debussy,490,22150,165
4931,4931,Simon & Garfunkel,1,11123,208
521,521,Beatles,0,8900,364
5341,5341,The Cranberries,16,8272,79
1722,1722,Erik Satie,511,7568,50
...,...,...,...,...,...
2615,2615,Jens Lekman,5433,1,1
2614,2614,Jenny Lewis,5440,1,1
2613,2613,Jenny Hval,5441,1,1
2612,2612,Jennifer Lopez,5442,1,1


In [85]:
with pd.option_context("display.max_rows", 9999, "display.max_cols", 999):
        display(tempdf1.loc[tempdf1['count']>1])

        

NameError: name 'tempdf1' is not defined

In [56]:
i2name = {}
for i, a, ai, s in tempdf1[['artist','artist_index', 'post_score']].sort_values('post_score').itertuples():
    i2name[ai]=a
len(i2name)

5668

In [59]:
i2name[3]

'Pink Floyd'

In [68]:
df['artist2'] = df.apply(lambda r: i2name[r.artist_index], axis=1)
(df.loc[df['artist'] !=  df['artist2']])[['artist', 'artist2']] \
    .drop_duplicates() \
    .sort_values('artist2').to_csv('z.csv', index=False)


In [74]:
artist_map2=pd.read_csv('artist_map2.csv')
artist_map2 = dict(zip(artist_map2['artist'], artist_map2['artist2']))
df['artist2'] = df.apply(lambda r: artist_map2[r['artist']] if r['artist'] in artist_map2 else r['artist'], axis=1)
df.loc[df['artist'] !=  df['artist2']]


Unnamed: 0,post_id,post_score,artist,track,artist2,artist_dedupe,artist_index
18081,jhc80qd,184,Cranberries,Linger,The Cranberries,cranberries,16
19524,jhc9x14,63,Mamas & The Papas,Dream a little dream,The Mamas & The Papas,mamas & the papas,167
11122,jhc6fg7,53,The Jimi Hendrix Experience,Axis: Bold as Love,Jimi Hendrix,jimi hendrix experience,40
18478,jhckqh6,40,Sundays,Wild Horses,The Sundays,sundays,137
18037,jhcpyx8,31,Carpenters,We've only just begun,The Carpenters,carpenters,57
...,...,...,...,...,...,...,...
5117,jhceka0,1,Johann Sebastian Bach/Charles Gounod,Ave Maria,Johann Sebastian Bach,johann sebastian bach/charles gounod,19
5121,jhcelfa,1,Johann Sebastian Bach/Charles Gounod,Ave Maria,Johann Sebastian Bach,johann sebastian bach/charles gounod,19
9849,jhd9sxl,1,Johann Sebastian Bach/Charles Gounod,Ave Maria,Johann Sebastian Bach,johann sebastian bach/charles gounod,19
10236,jhe51p5,1,Johann Sebastian Bach/Charles Gounod,Ave Maria,Johann Sebastian Bach,johann sebastian bach/charles gounod,19


In [75]:
df['artist'] = df.apply(lambda r: artist_map2[r['artist']] if r['artist'] in artist_map2 else r['artist'], axis=1)


In [241]:
df.loc[(df['artist_index'].isna())]
df.loc[(df['artist_index']==0)]

Unnamed: 0,post_id,post_score,artist,track,artist2,artist_dedupe,artist_index
20021,jhc1e1u,3675,Beatles,In my Life,Beatles,beatles,0
20025,jhbf5ng,2314,Beatles,Blackbird,Beatles,beatles,0
16170,jhcej4u,484,Beatles,In My Life,Beatles,beatles,0
18070,jhc2oi2,347,Beatles,Let it be,Beatles,beatles,0
19506,jhc8851,218,Beatles,Let it Be,Beatles,beatles,0
...,...,...,...,...,...,...,...
17602,jhenyhj,1,Beatles,All You Need Is Love,Beatles,beatles,0
26233,jhcteuo,1,Beatles,All I have to do,Beatles,beatles,0
4142,jhcypop,1,Beatles,Across the Universe,Beatles,beatles,0
20935,jhcw5cu,1,Beatles,Across the Universe,Beatles,beatles,0


In [242]:
dedupe_df

Unnamed: 0,index,artist,artist_dedupe,post_score,cluster id
0,0,Beatles,beatles,364,0
1,1,Radiohead,radiohead,238,488
2,2,Simon & Garfunkel,simon & garfunkel,206,1
3,3,Fleetwood Mac,fleetwood mac,183,489
4,4,Jeff Buckley,jeff buckley,168,2
...,...,...,...,...,...
6240,6240,Jai-Jagdeesh,jai-jagdeesh,1,5677
6241,6241,Jai Paul,jai paul,1,5678
6242,6242,Jades Goudreault,jades goudreault,1,5679
6243,6243,Jaden,jaden,1,5680


In [277]:
dedupe_df2.loc[dedupe_df2['cluster id'].isin(z['cluster id'])].sort_values(['cluster id', 'post_score']).head(20)

Unnamed: 0,index,artist,artist_dedupe,post_score,cluster id,confidence
3279,3279,simon,simon,1,1,0.431659
1975,1975,simon garfunkel,simon garfunkel,2,1,0.500927
2,2,simon garfunkel,simon garfunkel,206,1,0.500927
2908,2908,jeff buckley,jeff buckley,1,2,0.581508
4,4,jeff buckley,jeff buckley,168,2,0.581508
3011,3011,pink floyd,pink floyd,1,3,0.500927
351,351,pink,pink,11,3,0.431659
6,6,pink floyd,pink floyd,164,3,0.500927
3038,3038,sigur ros,sigur ros,1,4,0.581508
9,9,sigur ros,sigur ros,129,4,0.581508


In [264]:
dedupe_df2.loc[dedupe_df2['cluster id'].isin(z)].sort_values(["confidence","cluster id"]).head(20)

Unnamed: 0,index,artist,artist_dedupe,post_score,cluster id,confidence


In [272]:
tempdf=dedupe_df2.groupby(['cluster id', 'artist_dedupe']).count().reset_index()
z = tempdf.loc[tempdf['index'] > 1]
z

Unnamed: 0,cluster id,artist_dedupe,index,artist,post_score,confidence
3,1,simon garfunkel,2,2,2,2
4,2,jeff buckley,2,2,2,2
6,3,pink floyd,2,2,2,2
7,4,sigur ros,2,2,2,2
8,5,bon iver,4,4,4,4
...,...,...,...,...,...,...
757,474,michelle tumes,2,2,2,2
758,475,luca turilli,2,2,2,2
765,479,honne,2,2,2,2
772,483,isis,2,2,2,2


In [311]:
df

Unnamed: 0,post_id,post_score,artist,track,artist2,artist_dedupe,artist_index
20023,jhbktrn,13941,Claude Debussy,Clair de Lune,Claude Debussy,claude debussy,490
20009,jhc2dyv,6996,Erik Satie,Gymnopédies,Erik Satie,erik satie,511
20014,jhc6oud,6144,Simon & Garfunkel,Scarborough Fair,Simon,simon & garfunkel,1
20030,jhcc2q8,5014,Israel Kamakawiwoole,Over the Rainbow,Israel,israel kamakawiwoole,15
20042,jhbjlwj,4332,The Cranberries,Dreams,The Cranberries,cranberries,16
...,...,...,...,...,...,...,...
933,jheh9kn,1,Eagles,7 bridges road,Eagles,eagles,558
14587,jhh5yvh,1,Patrick Watson,5:15 AM,Patrick,patrick watson,75
2717,jhef1ci,1,The Weakerthans,12 Stout Street,The Weakerthans,weakerthans,2806
917,jheh4mx,1,BØRNS,10000 emerald pools,BØRNS,børns,1886


In [243]:
# map to artist 
tempdf = dedupe_df[['artist_dedupe', 'artist', 'cluster id', 'post_score']] \
    .groupby(['artist_dedupe', 'cluster id']) \
    .agg( \
         count=('post_score', 'count'), \
         artist=('artist', 'first') \
        ) \
    .reset_index() \
    .sort_values('count', ascending=False) 
with pd.option_context("display.max_rows", 9999):
    display(tempdf.head(100))

Unnamed: 0,artist_dedupe,cluster id,count,artist
756,bon iver,5,4,Bon Iver
1246,cranberries,16,4,The Cranberries
4075,national,136,4,The National
1973,frank ocean,11,3,Frank Ocean
5314,style council,291,3,Style Council
5420,tears for fears,147,3,Tears for Fears
363,arctic monkeys,160,3,Arctic Monkeys
1130,cinematic orchestra,48,3,The Cinematic Orchestra
3302,leonard cohen,37,3,Leonard Cohen
5245,staves,382,3,The Staves


In [251]:
tempdf.loc[tempdf['cluster id']==6]

Unnamed: 0,artist_dedupe,cluster id,count,artist
5416,taylor swift,6,2,Taylor Swift
5417,taylor swift ft. bon iver,6,1,Taylor Swift ft. Bon Iver


In [244]:
i2name = {i: a for i, a in zip(tempdf['cluster id'].tolist(), tempdf['artist'].tolist())}
df['artist2'] = df.apply(lambda r: i2name[r.artist_index], axis=1)
df.loc[df['artist'] !=  df['artist2']]

Unnamed: 0,post_id,post_score,artist,track,artist2,artist_dedupe,artist_index
20014,jhc6oud,6144,Simon & Garfunkel,Scarborough Fair,Simon,simon & garfunkel,1
20030,jhcc2q8,5014,Israel Kamakawiwoole,Over the Rainbow,Israel,israel kamakawiwoole,15
20036,jhbt66b,1552,Simon & Garfunkel,April come she will,Simon,simon & garfunkel,1
20037,jhccfwe,1349,Massive Attack,Teardrop,"Massive Attack, Burial and Hope Sandoval",massive attack,80
18078,jhbxeim,924,leonard cohen,Hallelujah,Leonard Cohen,leonard cohen,37
...,...,...,...,...,...,...,...
10236,jhe51p5,1,Johann Sebastian Bach/Charles Gounod,Ave Maria,Johann Sebastian Bach,johann sebastian bach/charles gounod,19
18270,jhe739n,1,Johann Sebastian Bach/Charles Gounod,Ave Maria,Johann Sebastian Bach,johann sebastian bach/charles gounod,19
2673,jhehtqd,1,The Cinematic Orchestra,Arrival of the birds,Cinematic Orchestra and Patrick Watson,cinematic orchestra,48
23904,jhd0yp5,1,John Williams,Across the stars,John,john williams,104


In [76]:
df.loc[df['artist'].str.lower().str.find('carp') >=0]

Unnamed: 0,post_id,post_score,artist,track,artist2,artist_dedupe,artist_index
19483,jhc74vp,360,Karen Carpenter,(They Long to Be) Close to You,Karen Carpenter,karen carpenter,1007
20511,jhcqg55,31,Karen Carpenter,Superstar,Karen Carpenter,karen carpenter,1007
18037,jhcpyx8,31,The Carpenters,We've only just begun,The Carpenters,carpenters,57
15117,jhcptrv,14,The Carpenters,Close to You,The Carpenters,carpenters,57
20515,jhd2fwe,11,The Carpenters,Superstar,The Carpenters,carpenters,57
...,...,...,...,...,...,...,...
22336,jhcz73j,1,The Carpenters,Close to you,The Carpenters,carpenters,57
3242,jhct0dp,1,The Carpenters,Close to You,The Carpenters,carpenters,57
10402,jhe2fmn,1,The Carpenters,Close to You,The Carpenters,carpenters,57
25454,jhcx2si,1,The Carpenters,Close To You,The Carpenters,carpenters,57


In [77]:
df.groupby('track') \
    .count() \
    .reset_index() \
    .sort_values('artist', ascending=False) \
    .head(20)



Unnamed: 0,track,post_id,post_score,artist,artist2,artist_dedupe,artist_index
4068,Hallelujah,160,160,160,160,160,160
2020,Clair de Lune,70,70,70,70,70,70
2814,Dreams,59,59,59,59,59,59
981,Ave Maria,58,58,58,58,58,58
9374,Songbird,54,54,54,54,54,54
5690,Landslide,53,53,53,53,53,53
8697,Saturn,52,52,52,52,52,52
11483,Vincent,47,47,47,47,47,47
6757,Moonlight Sonata,44,44,44,44,44,44
1331,Blackbird,41,41,41,41,41,41


In [79]:
df['track2'] = df['track'].str.lower()


In [80]:
df = df.drop(df.loc[df['track2'].str.startswith('unknown')].index)
df = df.drop(df.loc[df['track2'].str.startswith('no track')].index)
df = df.drop(df.loc[df['track2']=='cover'].index)
df = df.drop(df.loc[df['track2']=='version'].index)
df = df.drop(df.loc[df['track2']=='anything'].index)
df = df.drop(df.loc[df['track2']=='none'].index)
df = df.drop(df.loc[df['track2'].str.startswith('no artist')].index)
df = df.drop(df.loc[df['track2'].str.startswith('various')].index)
df = df.drop(df.loc[df['track2']==''].index)
len(df)

22605

In [81]:
df[['artist', 'track', 'post_score', 'track2']] \
    .groupby(['artist', 'track2']) \
    .agg(sum= ('post_score', 'sum'), \
         track= ('track', 'first') \
        ) \
    .reset_index() \
    .sort_values('sum', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'sum': 'score'})



Unnamed: 0,artist,track2,score,track
0,Claude Debussy,clair de lune,21989,Clair de Lune
1,Erik Satie,gymnopédies,7003,Gymnopédies
2,Simon & Garfunkel,scarborough fair,6189,Scarborough Fair
3,The Cranberries,dreams,5723,Dreams
4,Neil Young,harvest moon,5326,Harvest Moon
...,...,...,...,...
13398,Indigo Girls,all that we let in,1,All That We Let In
13399,Indila,dernier danse,1,Dernier Danse
13400,Indila,derniere danse,1,derniere danse
13401,Industries of the Blind,i just wanted to make you something beautiful,1,I Just Wanted to Make You Something Beautiful


In [82]:
dedupe_track_df = df[['artist', 'track', 'track2', 'post_score']] \
    .groupby(['artist', 'track', 'track2']) \
    .count() \
    .sort_values('post_score', ascending=False) \
    .reset_index() \
    .reset_index() \
    .rename(columns={'post_score': 'count'})


dedupe_track_df


Unnamed: 0,index,artist,track,track2,count
0,0,Jeff Buckley,Hallelujah,hallelujah,96
1,1,Claude Debussy,Clair de Lune,clair de lune,70
2,2,Sleeping at Last,Saturn,saturn,51
3,3,Don McLean,Vincent,vincent,46
4,4,Ludwig van Beethoven,Moonlight Sonata,moonlight sonata,43
...,...,...,...,...,...
14900,14900,Guy Farley,Drawing,drawing,1
14901,14901,Gwar,Meat Sandwich,meat sandwich,1
14902,14902,Gym Class Heroes,Stereo Hearts,stereo hearts,1
14903,14903,Gym Class Heroes and Adam Levine,Stereo Hearts,stereo hearts,1


In [83]:
!rm dedupe_dataframe_learned_settings 
!rm dedupe_dataframe_training.json   
dedupe_track_df2 = pandas_dedupe.dedupe_dataframe(dedupe_track_df[['artist', 'track2']], 
                                                  ['artist','track2'],
                                                  canonicalize=True,)y
y


Importing data ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dedupe_track_df2 = pandas_dedupe.dedupe_dataframe(dedupe_track_df[['artist', 'track2']],
  dedupe_track_df2 = pandas_dedupe.dedupe_dataframe(dedupe_track_df[['artist', 'track2']],
artist : elton john
track2 : mona lisas and mad hatters

artist : elton john
track2 : mona lisas and mad hatters

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


Starting active labeling...
y


artist : roberta flack
track2 : the first time ever i saw your face

artist : roberta flack
track2 : the first time ever i saw your face

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : simon garfunkel
track2 : bridge over troubled water

artist : simon garfunkel
track2 : bridge over troubled waters

2/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : simon garfunkel
track2 : bridge over troubled water

artist : simon garfunkel
track2 : bridge over troubled waters

3/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : beatles
track2 : golden slumbers/carry that weight

artist : beatles
track2 : golden slumbers/carry that weight/the end

4/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : maurice ravel
track2 : daphnis et chloe 2

artist : maurice ravel
track2 : daphnis et chloe

5/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : john denver
track2 : annies song

artist : john denver
track2 : annies song

6/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : yoko shimomura
track2 : dearly beloved

artist : yoko shimomura
track2 : dearly beloved

7/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : portishead
track2 : roads

artist : portishead
track2 : roads

8/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : joni mitchell
track2 : river

artist : joni mitchell
track2 : river

9/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : metallica
track2 : nothing else matters

artist : metallica
track2 : nothing else matter

10/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : john denver
track2 : country roads

artist : john denver
track2 : country road

11/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : ennio morricone
track2 : cinema paradiso theme

artist : ennio morricone
track2 : cinema paradiso

12/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : yoko shimomura
track2 : dearly beloved

artist : yoko shimomura
track2 : dearly beloved reprise

13/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : kishi bashi
track2 : this must be the place

artist : kishi bashi
track2 : this must be the place cover

14/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : elvis presley
track2 : i cant help falling in love with you

artist : elvis presley
track2 : i cant help falling in love

15/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : ellie goulding
track2 : mirror

artist : ellie goulding
track2 : mirrors

16/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : pink floyd
track2 : shine in you crazy diamond

artist : pink floyd
track2 : shine on you crazy diamond

17/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : nikolai rimsky-korsakov
track2 : scheherazade

artist : nikolai rimsky-korsakov
track2 : scheherazade movement 3

18/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : crosby stills nash
track2 : guinevere

artist : crosby stills nash
track2 : guinevere (live)

19/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : beabadoobee
track2 : glue

artist : beabadoobee
track2 : glue song

20/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : elvis presley
track2 : i cant help falling in love with you

artist : elvis presley
track2 : cant help falling in love with you

21/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : edith piaf
track2 : le via en rose

artist : edith piaf
track2 : le vie en rose

22/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : frederic chopin
track2 : nocture op.9 no.2

artist : frederic chopin
track2 : nocturne op.9 no.2 in eb minor

23/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : cardi b ft. megan thee stallion
track2 : wap

artist : cardi b
track2 : wap feat. megan thee stallion

24/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : nightwish
track2 : while your lips are still red

artist : nightwish
track2 : kiss while your lips are still red

25/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : yiruma
track2 : river flows in you

artist : yiruma
track2 : river flows in

26/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : ween
track2 : a tear for eddy

artist : ween
track2 : a tear for eddie

27/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : muse
track2 : exogenesis symphony 1-3

artist : muse
track2 : exogenesis symphony part 3

28/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : orbital
track2 : halcyon on on

artist : orbital
track2 : halcyon on and on

29/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : jvke
track2 : golden hour

artist : jvke
track2 : golden houre

30/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : extreme
track2 : more than words

artist : extreme
track2 : more then words

31/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : extreme
track2 : more than words

artist : extreme
track2 : more then words

32/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : coldplay
track2 : o

artist : coldplay
track2 : prospekts march/poppyfields

33/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : aurora
track2 : potion of love

artist : aurora
track2 : potion for love

33/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : him
track2 : sleepwalking past hope

artist : chain gang of 1974
track2 : sleepwalking

34/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : disturbed
track2 : sound of silence

artist : disturbed
track2 : sounds of silence

34/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : disturbed
track2 : sound of silence

artist : disturbed
track2 : sounds of silence

35/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : queen
track2 : 39

artist : queen
track2 : good old fashioned lover boy

36/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : queen
track2 : who wants to live forever

artist : queen
track2 : 39

36/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : coldplay
track2 : the scientist

artist : coldplay
track2 : o

36/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : desree
track2 : kissing you

artist : desree
track2 : im kissing you

36/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : coldplay
track2 : viva la vida

artist : coldplay
track2 : o

37/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : coldplay
track2 : o

artist : coldplay
track2 : in my place

37/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : shinedown
track2 : 45

artist : shinedown
track2 : ill follow you down

37/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : bob marley
track2 : three little birds

artist : bob marley
track2 : war

37/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : hayde bluegrass orchestra
track2 : all my tears

artist : julie miller
track2 : all my tears

37/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : null
track2 : moon river

artist : jacob collier
track2 : moon river

37/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : null
track2 : moon river

artist : breakfast at tiffanys
track2 : moon river

37/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : frederic chopin
track2 : nocturnes, op.9, no 1 on b-flat minor

artist : frederic chopin
track2 : nocturne op. 9 no. 1

37/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : israel kamakawiwoole
track2 : somewhere over the rainbow

artist : izzy
track2 : somewhere over the rainbow

38/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : erik satie
track2 : gymnopedie no. 1

artist : erik satie
track2 : gymnopedie movement

39/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : sergei rachmaninoff
track2 : rachmaninoffs piano concerto no. 2

artist : sergei rachmaninoff
track2 : rachmaninov 2nd piano concerto

40/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : frederic chopin
track2 : nocturne 72 no. 1

artist : frederic chopin
track2 : nocturne op.9 no.2 in eb minor

41/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : don mclean
track2 : starry starry night

artist : don mclean
track2 : vincent(starry starry night)

41/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : the weeknd
track2 : as you are

artist : the weeknd
track2 : as u are

42/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : israel kamakawiwoole
track2 : over the rainbow

artist : kamakawiwoole
track2 : over the rainbow

43/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : bon iver
track2 : auatc

artist : bon iver
track2 : everything that bon iver has ever written

44/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : melissa benoist
track2 : moon river

artist : null
track2 : moon river

44/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : moby
track2 : porcelain

artist : marianas trench
track2 : porcelain

44/10 positive, 15/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : yann tiersen
track2 : comptine dun autre ete - lapres-midi

artist : yann tiersen
track2 : comptine d un autre ete (piano song from the film amelie)

44/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : joni mitchell
track2 : carey

artist : joni mitchell
track2 : chinese cafe/unchained melody

45/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : ludwig van beethoven
track2 : 9th symphony

artist : ludwig van beethoven
track2 : symphony no. 7 in a major, op. 92: ii. allegretto

45/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : bon iver
track2 : the wolves (acts i and ii)

artist : bon iver
track2 : 666

45/10 positive, 18/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : elvis presley
track2 : cant help falling in love with you

artist : elvis presley
track2 : falling in love with you

45/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious





(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : erik satie
track2 : gymnopedie no.1

artist : erik satie
track2 : gymnopedies, lent et douloureux

46/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : elvis presley
track2 : cant help falling in love with you

artist : elvis presley
track2 : falling in love with you

47/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : taylor swift
track2 : invisible string

artist : taylor swift
track2 : ivy

48/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : hans zimmer
track2 : time

artist : hans zimmer
track2 : interstellar piano cover

48/10 positive, 20/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : soen
track2 : lotus

artist : jhene aiko
track2 : lotus

48/10 positive, 21/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : lana del rey
track2 : mariners apartment complex

artist : lana del rey
track2 : yayo

48/10 positive, 22/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : annie lennox
track2 : cold

artist : annie lennox
track2 : dont let it bring you down

48/10 positive, 23/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : israel kamakawiwoole
track2 : somewhere over the rainbow

artist : izzy
track2 : somewhere over the rainbow

48/10 positive, 24/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : sergei rachmaninoff
track2 : rachmaninoff piano concerto 2

artist : sergei rachmaninoff
track2 : rachmaninov 2nd piano concerto

49/10 positive, 24/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : pearl jam
track2 : sirens

artist : pearl jam
track2 : elderly woman behind the counter in a small town

50/10 positive, 24/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : tpau
track2 : heart and soul

artist : hoagy carmichael
track2 : heart and soul

50/10 positive, 25/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : frederic chopin
track2 : nocturnes, op.9, no 1 on b-flat minor

artist : frederic chopin
track2 : nocturne no. 2

50/10 positive, 26/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : johann sebastian bach
track2 : tocatta and fugue transposed to harp or guitar is a much better timbre than on organ imo

artist : johann sebastian bach
track2 : cello suite

50/10 positive, 27/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : sergei rachmaninoff
track2 : rachmaninoff piano concerto 2

artist : sergei rachmaninoff
track2 : rachmaninoffs 3rd

50/10 positive, 28/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : frederic chopin
track2 : nocturne in c minor

artist : frederic chopin
track2 : nocturne in b flat minor

50/10 positive, 29/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : joe satriani
track2 : always with me, always with you

artist : joe satriani
track2 : always with you, always with me

50/10 positive, 30/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : joe satriani
track2 : always with you, always with me

artist : joe satriani
track2 : always with me, always with you

51/10 positive, 30/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : bjork
track2 : joga

artist : bjork
track2 : yoga

52/10 positive, 30/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : ludwig van beethoven
track2 : heiliger dankgesang, the 3rd movement of his string quartet 15, opus 132

artist : ludwig van beethoven
track2 : moonlight

53/10 positive, 30/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : bon iver
track2 : 666

artist : bon iver
track2 : a song for a lover of long ago

53/10 positive, 31/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : beyonce
track2 : halo

artist : martin odonnell and michael salvatori
track2 : halo

53/10 positive, 32/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : tears for fears
track2 : everybody wants to rule the world

artist : tears for fears
track2 : stay

53/10 positive, 33/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


artist : erik satie
track2 : gymnopedie no. 1

artist : erik satie
track2 : gymnopedies, lent et douloureux

53/10 positive, 34/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


artist : maxwell
track2 : this womans work

artist : maxwell
track2 : pretty this womans work

54/10 positive, 34/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


Clustering...
# duplicate sets 10026


In [84]:
dedupe_track_df2


Unnamed: 0,artist,track2,cluster id,confidence,canonical_artist,canonical_track2
0,jeff buckley,hallelujah,1,0.401786,jeff buckley,hallelujah
1,claude debussy,clair de lune,2,0.684708,claude debussy,clair de lune
2,sleeping at last,saturn,3,0.246748,sleeping at last,something about saturn
3,don mclean,vincent,4,0.533008,don mclean,vincent (starry starry night)
4,ludwig van beethoven,moonlight sonata,5,0.275113,ludwig van beethoven,piano concerto no.5 in e flat major op.73 - em...
...,...,...,...,...,...,...
14900,guy farley,drawing,10023,1.000000,guy farley,drawing
14901,gwar,meat sandwich,10024,1.000000,gwar,meat sandwich
14902,gym class heroes,stereo hearts,1804,0.729249,gym class heroes,stereo hearts
14903,gym class heroes and adam levine,stereo hearts,1804,0.729249,gym class heroes,stereo hearts


In [85]:
dedupe_track_df['track_id']=dedupe_track_df2['cluster id']
dedupe_track_df['confidence']=dedupe_track_df2['confidence']
dedupe_track_df

Unnamed: 0,index,artist,track,track2,count,track_id,confidence
0,0,Jeff Buckley,Hallelujah,hallelujah,96,1,0.401786
1,1,Claude Debussy,Clair de Lune,clair de lune,70,2,0.684708
2,2,Sleeping at Last,Saturn,saturn,51,3,0.246748
3,3,Don McLean,Vincent,vincent,46,4,0.533008
4,4,Ludwig van Beethoven,Moonlight Sonata,moonlight sonata,43,5,0.275113
...,...,...,...,...,...,...,...
14900,14900,Guy Farley,Drawing,drawing,1,10023,1.000000
14901,14901,Gwar,Meat Sandwich,meat sandwich,1,10024,1.000000
14902,14902,Gym Class Heroes,Stereo Hearts,stereo hearts,1,1804,0.729249
14903,14903,Gym Class Heroes and Adam Levine,Stereo Hearts,stereo hearts,1,1804,0.729249


In [86]:
dedupe_track_df3 = dedupe_track_df.loc[dedupe_track_df['confidence'] > 0.4][['artist', 'track', 'track_id']]
dedupe_track_df3


Unnamed: 0,artist,track,track_id
0,Jeff Buckley,Hallelujah,1
1,Claude Debussy,Clair de Lune,2
3,Don McLean,Vincent,4
6,Johann Pachelbel,Canon in D,7
10,The Beach Boys,God Only Knows,10
...,...,...,...
14900,Guy Farley,Drawing,10023
14901,Gwar,Meat Sandwich,10024
14902,Gym Class Heroes,Stereo Hearts,1804
14903,Gym Class Heroes and Adam Levine,Stereo Hearts,1804


In [87]:
names2i = {(artist, track): id for artist, track, id in zip(dedupe_track_df3['artist'], 
                               dedupe_track_df3['track'],
                               dedupe_track_df3['track_id'])
         }
names2i

{('Jeff Buckley', 'Hallelujah'): 1,
 ('Claude Debussy', 'Clair de Lune'): 2,
 ('Don McLean', 'Vincent'): 4,
 ('Johann Pachelbel', 'Canon in D'): 7,
 ('The Beach Boys', 'God Only Knows'): 10,
 ('Neil Young', 'Harvest Moon'): 11,
 ('leonard cohen', 'Hallelujah'): 1805,
 ('Beatles', 'Yesterday'): 1806,
 ('Led Zeppelin', 'The Rain Song'): 13,
 ('The Cranberries', 'Linger'): 15,
 ('Louis Armstrong', 'What a Wonderful World'): 16,
 ('Beatles', 'Blackbird'): 17,
 ('JVKE', 'Golden Hour'): 18,
 ('Radiohead', 'Nude'): 19,
 ('Goo Goo Dolls', 'Iris'): 1807,
 ('Yiruma', 'River flows in you'): 0,
 ('Radiohead', 'Reckoner'): 1808,
 ('Jimi Hendrix', 'Little Wing'): 24,
 ('The Righteous Brothers', 'Unchained Melody'): 26,
 ('Enya', 'Only Time'): 27,
 ('Beatles', 'Something'): 1809,
 ('Radiohead', 'Let Down'): 30,
 ('Eric Clapton', 'Tears in Heaven'): 31,
 ('The Moody Blues', 'Nights in White Satin'): 33,
 ('null', 'null'): 34,
 ('Chris Isaak', 'Wicked Game'): 35,
 ('John Butler Trio', 'Ocean'): 36,
 ('

In [88]:
df['track_index']= df.apply(lambda r: str(names2i[(r['artist'], r['track'])]) if (r['artist'], r['track']) in names2i else r['track'], axis=1)

In [89]:
df

Unnamed: 0,post_id,post_score,artist,track,artist2,artist_dedupe,artist_index,track2,track_index
20023,jhbktrn,13941,Claude Debussy,Clair de Lune,Claude Debussy,claude debussy,490,clair de lune,2
20009,jhc2dyv,6996,Erik Satie,Gymnopédies,Erik Satie,erik satie,511,gymnopédies,265
20014,jhc6oud,6144,Simon & Garfunkel,Scarborough Fair,Simon & Garfunkel,simon & garfunkel,1,scarborough fair,Scarborough Fair
20030,jhcc2q8,5014,Israel Kamakawiwoole,Over the Rainbow,Israel Kamakawiwoole,israel kamakawiwoole,15,over the rainbow,40
20042,jhbjlwj,4332,The Cranberries,Dreams,The Cranberries,cranberries,16,dreams,Dreams
...,...,...,...,...,...,...,...,...,...
933,jheh9kn,1,Eagles,7 bridges road,Eagles,eagles,554,7 bridges road,1709
14587,jhh5yvh,1,Patrick Watson,5:15 AM,Patrick Watson,patrick watson,74,5:15 am,2592
2717,jhef1ci,1,The Weakerthans,12 Stout Street,The Weakerthans,weakerthans,2804,12 stout street,4451
917,jheh4mx,1,BØRNS,10000 emerald pools,BØRNS,børns,1922,10000 emerald pools,1698


In [24]:
tempdf = df[['artist', 'track', 'post_score']] \
    .groupby(['artist']) \
    .agg(sum= ('post_score', 'sum'), \
         track= ('track', 'first') \
        ) \
    .reset_index() \
    .sort_values(['sum'], ascending=False) \
    .rename(columns={'sum': 'score'}) \
    .reset_index(drop=True)
tempdf

Unnamed: 0,artist,score,track
0,Claude Debussy,22150,Clair de Lune
1,Simon & Garfunkel,11123,Scarborough Fair
2,Beatles,8900,In my Life
3,The Cranberries,8272,Dreams
4,Erik Satie,7568,Gymnopédies
...,...,...,...
6222,Jephte Guillaume,1,The Prayer
6223,Jens Lekman,1,Night Falls Over Kortedala/The Linden Trees Ar...
6224,Jenny Lewis,1,Love you forever
6225,Jenny Hval,1,Die


In [25]:
df

Unnamed: 0,post_id,post_score,artist,track,artist2,artist_dedupe
20023,jhbktrn,13941,Claude Debussy,Clair de Lune,Claude Debussy,claude debussy
20009,jhc2dyv,6996,Erik Satie,Gymnopédies,Erik Satie,erik satie
20014,jhc6oud,6144,Simon & Garfunkel,Scarborough Fair,Simon & Garfunkel,simon & garfunkel
20030,jhcc2q8,5014,Israel Kamakawiwoole,Over the Rainbow,Israel Kamakawiwoole,israel kamakawiwoole
20042,jhbjlwj,4332,The Cranberries,Dreams,The Cranberries,cranberries
...,...,...,...,...,...,...
933,jheh9kn,1,Eagles,7 bridges road,Eagles,eagles
14587,jhh5yvh,1,Patrick Watson,5:15 AM,Patrick Watson,patrick watson
2717,jhef1ci,1,The Weakerthans,12 Stout Street,The Weakerthans,weakerthans
917,jheh4mx,1,BØRNS,10000 emerald pools,BØRNS,børns


In [92]:
tempdf = tempdf[['artist', 'track', 'score']]
display(tempdf.loc[tempdf['score'] > 4].head(20))
display(tempdf.loc[tempdf['score'] > 4].tail(20))


Unnamed: 0,artist,track,score
0,Claude Debussy,Clair de Lune,22051
1,Erik Satie,Gymnopédies,7242
2,Israel Kamakawiwoole,Over the Rainbow,6958
3,Simon & Garfunkel,Scarborough Fair,6189
4,The Cranberries,Dreams,5723
5,Neil Young,Harvest Moon,5326
6,Beatles,In my Life,4268
7,Elton John,Your Song,3907
8,Don McLean,"Vincent (Starry, Starry Night)",3832
9,Mazzy Star,Fade into you,3821


Unnamed: 0,artist,track,score
1517,The Shins,Simple song,5
1518,Black Sabbath,orchid,5
1519,Stephen Sanchez,until I found you,5
1520,Franz Liszt,Un sospiro,5
1521,Ronan Keating,When You Say Nothing at All,5
1522,Simon & Garfunkel,59th street bridge song,5
1523,Tycho,A walk,5
1524,James Blake,Into the Red,5
1525,Death Cab for Cutie,Tiny Vessels,5
1526,Tyler Childers,Lady May,5


In [26]:
df = tempdf.loc[tempdf['score'] > 4]
df.to_csv('silver.csv', index=False)
with open('silver.pkl', 'wb') as f:
    pickle.dump(df, f)


In [None]:
# I think dedupe may not be matching most popular form of artist
z = {'Hans Zimmer Radiohead':'Hans Zimmer',
'Fleetwood Mac / Peter Green':'Fleetwood Mac',
'Jeremy Soule featuring Asja':'Jeremy Soule',
'Cardi B featuring Megan Thee Stallion':'Cardi B',
'FWM':'Fleetwood Mac',
'Eric Whitacre ft. Voces8':'Eric Whitacre',
'Gorillaz feat. Little Dragon':'Gorillaz',
'RAC (feat. Katie Herzig)':'RAC',
'Jamie XX featuring Romy':'Jamie XX',
'Spell Songs featuring Julie Fowlis':'Spell Songs',
'Kenny G featuring Aaron Neville':'Kenny G',
'Charlie Haden Quartet featuring Norah Jones':'Charlie Haden Quartet',
'Wiz Khalifa feat. Charlie Puth':'Wiz Khalifa',
'Marshmello feat. Khalid':'Marshmello',
'YOSHIKI feat. HYDE':'YOSHIKI',
'Charlie Hunter Quartet featuring Norah Jones':'Charlie Hunter Quartet',
'Alison Krauss featuring Natalie MacMaster':'Alison Krauss',
'Hodson featuring Jay-Z':'Hodson',
'Nujabes feat. Cise Starr':'Nujabes',
'Flume feat. Tove Lo':'Flume',
'Drake feat. Yebba':'Drake',
'Radwimps featuring Toaka':'Radwimps',
'Weezer featuring Hayley Williams':'Weezer',
'Post Malone featuring Swae Lee':'Post Malone',
'Polyphia featuring Ichika':'Polyphia',
'Eminem featuring Dido':'Eminem',
'PJ Morton featuring Yebba':'PJ Morton',
'Aurora featuring Pomme':'Aurora',
'Kaskade featuring Haley':'Kaskade',
'XXXTENTACION featuring Scott James':'XXXTENTACION',
'Snow Patrol feat. Martha Wainwright':'Snow Patrol',
'Direct featuring Danyka Nadeau':'Direct',
'Erra feat. Courtney LaPlante from Spirit Box':'Erra',
'Ursine Vulpine featuring Annaca':'Ursine Vulpine',
'Howard Shore featuring Sir James Galway':'Howard Shore',
'Daft Punk feat. Paul Williams':'Daft Punk',
'Wooli featuring Delaney Kai':'Wooli',
'UNKLE featuring Thom Yorke':'UNKLE',
'Bastille feat. The Chamber Orchestra of London':'Bastille',
"Marty O'Donnell, Stan LePard, & Michael Salvatori":"Marty O'Donnell",
"Stan Getz, João Gilberto":"Stan Getz",
'RAC (feat. Katie Herzig)':'RAC',
'Taylor Swift ft. Bon Iver':'Taylor Swift',
'Calvin Harris ft. Florence Welch':'Calvin Harris',
'Brad Paisley ft. Alison Krauss':'Brad Paisley',
'Delirium ft. Sarah McLachlan':'Delirium',
'T-Pain ft. Akon, Mary J. Blige':'T-Pain',
'Manchester Orchestra ft. Daniel Radcliffe and Paul Dano':'Manchester Orchestra',
'Nujabes ft. MINMI':'Nujabes',
'Samurai Champloo ft. MINMI & Nujabes':'Samurai Champloo',
'MINMI ft. Nujabes':'MINMI',
'ODESZA ft. MARO':'ODESZA',
'Sharon Jones & The Dap-Kings ft. Lee Fields':'Sharon Jones & The Dap-Kings',
'Sarah Barrios ft. Eric Nam':'Sarah Barrios',
'David Arkenstone ft. Charlee Brooks':'David Arkenstone',
'XXYYXX ft. Anneka':'XXYYXX',
'Leonard Cohen, Brandi Carlile':'Leonard Cohen',
'Harry Waters Jr., Marvin Berry, and the Starlighters':'Harry Waters Jr.',
'Roberta Flack, Donny Hathaway':'Roberta Flack',
'Zedd, Maren Morris, Grey':'Zedd',
'Frank Ocean, James Blake':'Frank Ocean',
'Susan Suh, Robert Koch':'Susan Suh',
'Black Country, New Road':'Black Country',
'Khruangbin, Leon Bridges':'Khruangbin',
'Friendship, Emily Warren':'Friendship',
'Jonsi, Alex':'Jonsi',
'Khalid, Future':'Khalid',
'Duke Ellington, John Coltrane':'Duke Ellington',
'Clams Casino, Imogen Heap':'Clams Casino',
'Burial, Sacred Tapestry':'Burial',
'foudeqush, Ludwig Goransson':'foudeqush',
'Conjure One, Poe':'Conjure One',
'Cyril Giroux, Chloé Lacan':'Cyril Giroux',
'Carrie Underwood, Travis Cottrell, Debby Boone':'Carrie Underwood',
'Aska, Chage':'Aska',
'T-Pain ft. Akon, Mary J. Blige':'T-Pain',
'Dan Balan, Katerina Begu':'Dan Balan',
'Jose Padilla, Seal':'Jose Padilla',
'Ratso, Nick Cave':'Ratso',
'Slaughter Beach, Dog':'Slaughter Beach',
'Paganini, Liszt':'Paganini',
'May Erlewine, Woody Goss':'May Erlewine',
'Cyua, Hiroyuki Sawano':'Cyua',
'Dan Zanes, Natalie Merchant':'Dan Zanes',
'Sting, Ray Chen':'Sting',
'Coco and Clair Clair, Okthxbb':'Coco and Clair Clair',
'Edgar Meyer, Mike Marshall, Bela Fleck':'Edgar Meyer',
'Ray Charles, Willie Nelson':'Ray Charles',
'Wildlight, The Polish Ambassador and Ayla Nereo':'Wildlight',
'Max Richter, Dinah Washington':'Max Richter',
'Steve Martin, Dolly Parton, Vince Gill':'Steve Martin',
'Elis Regina, Antonio Carlos Jobim':'Elis Regina',
'James Blunt, The Righteous Brothers, Brad Paisley':'James Blunt',
'Leprous, Dream Theater, Periphery':'Leprous',
'Frank Sinatra, Glenn Miller, Van Morrison':'Frank Sinatra',
'Pink, Willow Sage Hart':'Pink',
'Jessye Norman, Stephen Adams, Christopher Bowers-Broadbent':'Jessye Norman',
'Nujabes, MINMI, and Samurai Champloo':'Nujabes',
'Nujabes, Samurai Champloo, and MINMI':'Nujabes',
'Steve Martin, Steep Canyon Rangers':'Steve Martin',
'Celine Dion, Barbra Streisand':'Celine Dion',
'Bryan Adams, Luciano Pavarotti':'Bryan Adams',
'MUZZ (Mat Zo, Olan and A&B)':'MUZZ',
'Pink, Sage (The Gemini':'Pink',
'Moby, Sinead Oconnor':'Moby',
'I Vow to Thee, My Country':'I Vow to Thee',
'Jose Padilla, Kirsty Keach':'Jose Padilla',
'Don Francisco, Wendy Francisco, Jerry Palmer':'Don Francisco',
'Dave Grohl, Josh Homme, & Trent Reznor':'Dave Grohl',
'Steve Conte, Maaya Sakamoto':'Steve Conte',
'Sting, Stevie Wonder':'Sting',
'Jacob Collier, Lizzy McAlpine, John Mayer':'Jacob Collier',
'Joseph Shabason, Nicholas Krgovich, Shabason & Krgovich':'Joseph Shabason',
'Khalid, Benny Blanco, Halsey':'Khalid',
'Ed Sheeran, Andrea Bocelli':'Ed Sheeran',
'Kim Petras, Nicki Minaj':'Kim Petras',
'Bryce Dessner, James McAlister, Nico Muhly, Sufjan Stevens':'Bryce Dessner',
'Appleseed, YouSeeBigGirl, T:T':'Appleseed',
'Solarstone, Andy Bury':'Solarstone',
'Carti, Summertime Sadness':'Carti',
'Dolly Parton, Linda Ronstadt, and Emmylou Harris':'Dolly Parton',
'LMM, Hwasa':'LMM',
'Snowgoons, Viro the Virus':'Snowgoons',
'Sarah Class, Cantamus Choir':'Sarah Class',
'Boy meets Girl, Brian McKnight, Vanessa Williams, Bonnie Tyler, Jax':'Boy meets Girl',
'Jeff Buckley, The Righteous Brothers, Johann Pachelbel':'Jeff Buckley',
'Nate J, Nate Traveller':'Traveller',
'Debussy, Flight Facilities':'Flight Facilities',
'Death Cab for Cutie/The Postal Service':'Death Cab for Cutie',
'Sting (musician)':'Sting',
'Mick Hucknall Simply Red':'Simply Red',
'Johnny Cash and Bob Dylan':'Johnny Cash',
'John Prine and Bonnie Raitt':'Bonnie Raitt',
'Andrea Bocelli and Celine Dion':'Andrea Bocelli',
'Bob Dylan & Johnny Cash':'Bob Dylan',
'Minnie Riperton and Richard Rudolph':'Minnie Riperton',
'Eric Whitacre featuring Voces8':'Eric Whitacre',
'Hans Zimmer and Benjamin Wallfisch':'Hans Zimmer',
'Grover Washington Jr. & Bill Withers':'Grover Washington Jr.',
'Barry DeVorzon and Perry Botkin Jr.':'Barry DeVorzon',
'Porter Robinson and Madeon':'Porter Robinson',
'Ed Sheeran ft Yebba':'Ed Sheeran',
'Sara Bareilles & Josh Groban':'Sara Bareilles',
'St. Vincent (musician)':'St. Vincent',
'Glen Hansard and Marketa Irglova':'Glen Hansard',
'Willie Nelson & Ray Charles':'Willie Nelson',
'Alina Baraz Galimatias':'Alina Baraz',
'Hans Zimmer & Lisa Gerrard':'Hans Zimmer' ,
'Louis Armstrong Jr.': 'Louis Armstrong',
'Ludovico Einaudi ft. Greta Svabo Bech': 'Ludovico Einaudi',
'Henry Mancini & Audrey Hepburn': 'Henri Mancini',
'Bon Iver and St. Vincent': 'Bon Iver',
'Coldplay Avicii': 'Coldplay',
'Bruce Springsteen, Melissa Etheridge': 'Bruce Springsteen',
'Billie Eilish ft. Khalid': 'Billie eilish',
'Andrea Bocelli and Josh Groban': 'Andrea Bocelli',
'Norah Jones and Danger Mouse': 'Norah Jones',
'Ennio Morricone & Joan Baez': 'Ennio Morricone',
"Des'ree": 'Desree',
'Porter Robinson, Madeon': 'Porter Robinson',
'Ray Charles & Willie Nelson': 'Ray Charles',
'Ludvig Forssell and Jenny Plant': 'Ludvig Forssell',
'Nicole Kidman & Ewan McGregor': 'Ewan McGregor',
'Deadmau5 and Kaskade': 'Deadmau5',
'Beabadoobee feat. Clairo': 'Beabadoobee',
'John Coltrane, Duke Ellington': 'John COltrane',
'Nicholas Britell and Spring 1 - Max Richter': 'Nicholas Britell',
'Dave Matthews & Tim Reynolds': 'Dave Matthews',
'MINMI & Nujabes': 'MINMI',
'Nu Deco Ensemble Kishi Bashi': 'Kishi Bashi',
'Ana Carolina Seu Jorge': 'Ana Carolina',
'Skillet (band': 'Skillet',
'Soccer Mommy (Sophie Allison': 'Soccer Mommy',

}

In [None]:
for k,v in z.items(): 
    tdf = df.loc[df['artist']==k]
    if len(tdf) > 0:
        print(k, len(tdf))        
        df.loc[df['artist']==k, 'artist']=v


In [None]:
# tempdf = df[['artist', 'post_score']] \
#     .groupby('artist') \
#     .sum() \
#     .reset_index() 

# tempdf.loc[tempdf['post_score']> 2].to_csv('x.csv', index=False)

In [94]:
df

Unnamed: 0,artist,track,score
0,Claude Debussy,Clair de Lune,22051
1,Erik Satie,Gymnopédies,7242
2,Israel Kamakawiwoole,Over the Rainbow,6958
3,Simon & Garfunkel,Scarborough Fair,6189
4,The Cranberries,Dreams,5723
...,...,...,...
1532,lil Yachty,Drive me crazy,5
1533,Ludwig van Beethoven,moonlight sonata,5
1534,Claude Debussy,Girl with the Flaxen Hair,5
1535,Joni Mitchell,A Case of you,5


In [95]:
df = df[['artist', 'track', 'score']].groupby(["artist", "track"]) \
    .sum() \
    .reset_index() \
    .sort_values(["score", "artist", "track"], ascending=False)

df.head(20)



Unnamed: 0,artist,track,score
263,Claude Debussy,Clair de Lune,22051
418,Erik Satie,Gymnopédies,7242
634,Israel Kamakawiwoole,Over the Rainbow,6958
1230,Simon & Garfunkel,Scarborough Fair,6189
1348,The Cranberries,Dreams,5723
964,Neil Young,Harvest Moon,5326
122,Beatles,In my Life,4268
387,Elton John,Your Song,3907
356,Don McLean,"Vincent (Starry, Starry Night)",3832
910,Mazzy Star,Fade into you,3821


## Filter by minimum score


In [27]:
df = df.loc[df['score'] >4]
df

Unnamed: 0,artist,score,track
0,Claude Debussy,22150,Clair de Lune
1,Simon & Garfunkel,11123,Scarborough Fair
2,Beatles,8900,In my Life
3,The Cranberries,8272,Dreams
4,Erik Satie,7568,Gymnopédies
...,...,...,...
1054,Punch Brothers,5,Julep
1055,Koffee,5,W
1056,Kenny Loggins,5,Return to Pooh Corner
1057,IU,5,Peach


In [28]:
df.to_csv('silver.csv', index=False)
with open('silver.pkl', 'wb') as f:
    pickle.dump(df, f)


In [29]:
with open('silver.pkl', 'rb') as f:
    df = pickle.load(f)
df.head(20)

Unnamed: 0,artist,score,track
0,Claude Debussy,22150,Clair de Lune
1,Simon & Garfunkel,11123,Scarborough Fair
2,Beatles,8900,In my Life
3,The Cranberries,8272,Dreams
4,Erik Satie,7568,Gymnopédies
5,Israel Kamakawiwoole,7025,Over the Rainbow
6,Neil Young,5379,Harvest Moon
7,Jim Croce,4871,Time in a Bottle
8,Elton John,4241,Your Song
9,Mazzy Star,4208,Fade into you


# Load into a Spotify playlist


In [3]:
# log in
client_credentials_manager = SpotifyClientCredentials(client_id=os.getenv('SPOTIFY_CLIENT_ID'), 
                                                      client_secret=os.getenv('SPOTIFY_CLIENT_SECRET'),
                                                      )

sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


In [31]:
df = pd.read_csv("silver.csv")
df = df.groupby(['artist','track']).sum().reset_index().sort_values(["score", "artist", "track"], ascending=False)
df.to_csv('silver.csv', index=False)


In [32]:
# check artists
# update to spotify canonical name as necessary

df = pd.read_csv("silver.csv")
df = df.groupby(['artist','track']).sum().reset_index().sort_values(["score", "artist", "track"], ascending=False)

dedupe = {}
fail_list = []
artist_map = {}
for index, artist, title, score in df.itertuples():
    artist = str(artist)
    if artist in dedupe:
        continue
    dedupe[artist]=1
    query_str = 'artist:%s' % (artist)
    artist_results = sp.search(q=query_str, type='artist', limit=3, offset=0, market='US')
    artist_names = [artist['name'] for artist in artist_results['artists']['items']]
    if artist_names:
        if artist.lower() != artist_names[0].lower():
            artist_map[artist] = artist_names[0]
            print(artist, '->', artist_names[0])
    else:
        fail_list.append((artist, title))
        print("not found:", artist, "-", title)

# then clean up manually as appropriate

Beatles -> The Beatles
Israel Kamakawiwoole -> Israel Kamakawiwo'ole
Ben E King -> Ben E. King
Goo Goo Dolls -> The Goo Goo Dolls
Sinead O'connor -> Sinéad O'Connor
Dream Academy -> The Dream Academy
The Band -> The Band CAMINO
America -> The All-American Rejects
Crosby, Stills, Nash, Young -> Crosby, Stills, Nash & Young
Hollies -> The Hollies
Cat Stevens -> Yusuf / Cat Stevens
not found: Harry Waters Jr., Marvin Berry, and the Starlighters - Earth Angel (Will You Be Mine)
Zep -> Led Zeppelin
Cranberries -> The Cranberries
Seal -> Seals and Crofts
Heart -> Tom Petty and the Heartbreakers
not found: Giacomo Puccini & Giovacchino Forzano - Un Bel Di Vedremo
not found: John Prine and Bonnie Raitt - Angel from Montgomery
Flaming Lips -> The Flaming Lips
Selena -> Selena Gomez
Mamas & The Papas -> The Mamas & The Papas
Loggins Messina -> Loggins & Messina
José Gonzalez -> José González
Dixie Chicks -> Karaoke - Dixie Chicks
not found: MC Snub Nose - My name is Preeti and am very pretty
The

In [33]:
[f[0] for f in fail_list]

['Harry Waters Jr., Marvin Berry, and the Starlighters',
 'Giacomo Puccini & Giovacchino Forzano',
 'John Prine and Bonnie Raitt',
 'MC Snub Nose',
 'Ella Fitzgerald and Louis Armstrong',
 'Kozue Takada',
 'Louis Armstrong Jr.',
 'Lord Franklin',
 'Cardi B ft. Megan Thee Stallion',
 'Martin David Robinson',
 'Pedro Barker',
 'Steven Universe (soundtrack)',
 'Dave Matthews & Tim Reynolds',
 'Amazon Wiretap',
 'Michael Bublé and Blake Shelton',
 'Guitars and Dragons',
 'Joshua Ritter',
 'Barry DeVorzon and Perry Botkin Jr.',
 'Aurora featuring Pomme',
 'Terrence Jay',
 'Misheard Lyrics',
 'Traditional Welsh Lullaby',
 'Taylor Swift ft. Bon Iver',
 'Max Richter: Sunrise Mass',
 'Khruangbin, Leon Bridges',
 'Katamari Damacy Original Soundtrack',
 'Howl s Moving Castle',
 'Amadeus soundtrack']

In [None]:
artist_map

In [None]:
ignore_list = [ 

'1',     #One Direction',
 'Hem', #'Natalie Hemby',
    'Priscilla',     #: 'Priscilla Chan',
     'Drake',     #: 'Nick Drake',
 'William Ackerman',     #: 'Mark Ackerman, William James Ross',
 'Jason',     #: 'Jason Mraz',
 'Juice',     #: 'Juice WRLD',
 'Origa',     #: 'Origami Angel',
 'Nico',     #: 'Nico & Vinz',
 'Mako',     #: 'Mako Road',
 'Low',     #: 'All Time Low',
 'La La Land Soundtrack',     #: 'LAND Soundtrack',
 'Flamingos',     #: 'Flamingosis',
 'BoA',     #: 'Boards of Canada',
 'Traditional',     #: 'Chinese Traditional',
 'Future',     #: 'Future Islands',
'ASAP Rocky',     # -> Seth Narley feat. ASAP Rocky
'Acoustic',     # -> Acoustic Alchemy
'Adeem',     #Adeem the Artist',
'Al Stewart',     #Alexander Stewart',
'Alpine' ,     #-> Alpine Universe
'America',     #The All-American Rejects',
'Arrow',     # -> Arrows in Action
'Berlin',     #Berliner Philharmoniker',
'Brian Wilson',     # -> Brian Courtney Wilson
'CSNY',     #Csnyee_',
'Choir Choir Choir!',     #Mav City Gospel Choir',
'Dallas Green',     # -> Jimmy Carter and Dallas County Green
'Death',     #Five Finger Death Punch',
'Dixie Chicks',     # -> Karaoke - Dixie Chicks
'Eileen',     #Eileen Walker',
'Eric Johnson',     #Eric D. Johnson',
'Frente',     #Frente Cumbiero',
'IZ',     #Izzamuzzic',
'Japanese House',     # -> The Japanese House
'Jewel',     # -> Run The Jewels
'LP',     #LP Giobbi',
'La La Land Soundtrack' ,     #-> LAND Soundtrack
'Live',     # -> DPR LIVE
'MCR',     #Tate McRae',
'Meatloaf',     #meatloafi',
'Múm',     #Mumford & Sons',
'Nico',     #Nicki Nicole',
'One',     # -> One Direction
'Phil',     #Phil Collins',
'Pink',     #PinkPantheress',
'Priscilla',     # -> Priscilla Block
'Rainbow',     #Rainbow Kitten Surprise',
'Seal',     #Seals and Crofts',
'South Park',     #South Park Mexican',
'The Band',     #The Band CAMINO',
'The La’s',     #The Kid LAROI',
'The Philadelphia Orchestra',     #The Philadelphia Virtuosi Chamber Orchestra',
'The Promise',     #Lukas Nelson and Promise of the Real',
'Train',     #Meghan Trainor',
'Vince',     #Vince Staples',
'a-ha',     #Daryl Hall & John Oates',

]

for k in ignore_list:
    try:
        print(k, artist_map.get(k))
        artist_map.pop(k)
    except:
        print('error', k)
        pass





In [None]:
artist_map.get('Train')

In [None]:
df['artist'] = df['artist'].apply(lambda s: artist_map[s] if s in artist_map else s)
df.head(20)


In [None]:
df = df.groupby(['artist','track']).sum().reset_index().sort_values(["score", "artist", "track"], ascending=False)
df.to_csv('silver.csv', index=False)


In [72]:
# check tracks
# possibly update tracks to spotify canonical name

df = pd.read_csv("silver-0524.csv")

dedupe = {}
mylist = []
fail_list = []
artist_list, track_list, uri_list, album_list, score_list = [], [], [], [], []
orig_artist, orig_track = [], []

for index, artist, title, score in df.itertuples():
    query_str = 'artist:%s track:%s' % (artist, title)
    track_results = sp.search(q=query_str, type='track', limit=1, offset=0, market='US')
    results = track_results['tracks']['items']
    
    if results:
        r = results[0]
        # failsafe to never put same track twice
        if dedupe.get(r['id']):
            continue
        dedupe[r['id']]=True
        if title.lower() != r['name'].lower():
            print ("%04d %s|%s : %s|%s" % (index, artist, title, r['artists'][0]['name'], r['name']))
        uri_list.append(r['uri'])
        artist_list.append(r['artists'][0]['name'])
        track_list.append(r['name'])
        album_list.append(r['album']['name'])
        orig_artist.append(artist)
        orig_track.append(title)
        score_list.append(score)
#         print('  ',
#               r['artists'][0]['name'],'|',
#               r['name'], '|',
#               r['album']['name'],'|',
#               r['album']['release_date'],'|',
#               r['popularity'])
    else:
        fail_list.append((artist, title))
        print("not found:", artist, "-", title)
        

0000 Claude Debussy|Clair de Lune : Claude Debussy|Suite bergamasque, L. 75: III. Clair de lune
0001 Simon & Garfunkel|Scarborough Fair : Simon & Garfunkel|Scarborough Fair / Canticle
0002 The Beatles|In my Life : The Beatles|In My Life - Remastered 2009
0003 Erik Satie|Gymnopédies : Erik Satie|3 Gymnopédies: No. 1 Lent et douloureux
not found: Neil Young - Harvest Moon
0010 The Beach Boys|God Only Knows : The Beach Boys|God Only Knows - Mono
0011 Sigur Rós|Hoppipolla : Sigur Rós|Hoppípolla
0019 John Denver|Annie s Song : John Denver|Annie's Song
0025 Otis Redding|(Sittin  On) The Dock of the Bay : Otis Redding|(Sittin' On) the Dock of the Bay
0033 Peter Gabriel|In Your Eyes : Peter Gabriel|In Your Eyes - 2012 Remaster
0035 Samuel Barber|Adagio for Strings : Samuel Barber|Barber: Adagio for Strings
0041 Grateful Dead|Standing on the Moon : Grateful Dead|Standing on the Moon - 2013 remaster
0042 The Smiths|There is a light that never goes out : The Smiths|There Is a Light That Never Goe

0256 Maurice Ravel|Pavane pour une infante (pavane for a dead princess) : Maurice Ravel|Pavane pour une infante défunte (Pavane for a Dead Princess)
0260 Paul Simon|The Boxer : Paul Simon|The Boxer - Live at Central Park, New York, NY - August 15, 1991
0262 Franz Liszt|Liebestraum no 3 : Franz Liszt|Liebestraume, S541/R211 : No. 3: Nocturne in A-Flat Major
0264 Nick Cave & The Bad Seeds|Into My Arms : Nick Cave & The Bad Seeds|Into My Arms - 2011 Remastered Version
0266 Yes|And you and I : Yes|And You and I - 2003 Remaster
0268 Tracy Chapman|Baby, Can I Hold You? : Tracy Chapman|Baby Can I Hold You
0269 The Everly Brothers|Cathy s Clown : The Everly Brothers|Cathy's Clown - 2007 Remaster
0271 John Mayer|Gravity. : John Mayer|Gravity
0274 Yann Tiersen|Summer 78 : Yann Tiersen|Summer 78 (1)
0275 Nina Simone|Stars : Nina Simone|Stars (Live at Montreux)
0277 Grimes|Symphonia IX : Grimes|Symphonia IX (My Wait Is U)
0278 The Killers|Sawdust : The Killers|Glamorous Indie Rock And Roll - Sawdu

0563 Blue Rodeo|Try : Blue Rodeo|Try - 2012 Remaster
0564 Billie Holiday|I ll Be Seeing You : Billie Holiday|I'll Be Seeing You
not found: Young Thug - Who Do You Love
not found: Trey Parker - Jacking it in San Diego
0572 Todd Rundgren|Hello It s Me : Todd Rundgren|Hello It's Me
0573 The White Stripes|Do : The White Stripes|My Doorbell
0574 The Temptations|Just My Imagination : The Temptations|Just My Imagination (Running Away With Me)
0581 Mum|Green Grass of Tunnel : múm|Green Green Grass Of Tunnel
0582 Ms. Lauryn Hill|Ex factor : Ms. Lauryn Hill|Ex-Factor
not found: Mogwai - Mogwai Fear Satan (Kevin Shields remix)
0584 Modest Mouse|White Teeth : Modest Mouse|White Lies, Yellow Teeth
0589 Julie Andrews|The Sound of Music : Julie Andrews|The Lonely Goatherd
0591 John Frusciante|Song to Sing When I m Lonely : John Frusciante|Song To Sing When I'm Lonely
0594 Harry Chapin|Cats in the cradle : Harry Chapin|Cat's in the Cradle
not found: Emmylou Harris with Mark Knopfler & His Band - Till 

0883 The Style Council|You re the Best Thing : The Style Council|You're The Best Thing
not found: The Pogues - 12 stout street
0887 The Monkees|Me and Magdalena : The Monkees|Me & Magdalena
0890 Taylor Swift, Bon Iver|Exile : Taylor Swift|exile (feat. Bon Iver)
0894 Steve Reich|Duet for two violins : Steve Reich|Duet for two Solo Violins and String Orchestra (Dedicated to and written for Yehudi Menuhin)
0895 Stephen Sanchez|until I found you : Stephen Sanchez|Until I Found You (with Em Beihold) - Em Beihold Version
0896 Steely Dan|Any Major Dude : Steely Dan|Any Major Dude Will Tell You
not found: Star Trek: The Next Generation - The Inner Light
0901 Solange|Don t touch my hair : Solange|Don't Touch My Hair (feat. Sampha)
0905 Silver Mt. Zion|13 angels standing guard round the side of your bed : Silver Mt. Zion|13 Angels Standing Guard 'Round The Side Of Your Bed
not found: Shinedown - I ll follow you down
0911 Robert Schumann|Träumerei : Robert Schumann|Kinderszenen, Op. 15: Träumerei

In [73]:
print(len(fail_list))
fail_list



40


[('Neil Young', 'Harvest Moon'),
 ('Neil Young', 'Words'),
 ('Joanna Newsom', 'Sawdust and Diamonds'),
 ('Yuki Kajiura', 'Key of the Twilight'),
 ('Mako', 'his version'),
 ('Keiichi Okabe', 'Peaceful Sleep from the Neir Automata Soundtrack'),
 ('Dean Bowser', 'Peaches'),
 ('Young Thug', 'Who Do You Love'),
 ('Trey Parker', 'Jacking it in San Diego'),
 ('Mogwai', 'Mogwai Fear Satan (Kevin Shields remix)'),
 ('Emmylou Harris with Mark Knopfler & His Band', 'Till I Gain Control Again'),
 ('Dallas Green', 'Rain When i Die'),
 ('Skyrim', 'Skyrim soundtrack'),
 ('Original Broadway Cast of Moulin Rouge! The Musical', 'music'),
 ('Lucy Wainwright Roche', 'Amy Ray'),
 ('Hiroyuki Sawano', 'EriOne$'),
 ('Barbra Streisand', 'Avinu Malkenu'),
 ('The Hold Steady', 'Put that summer song on again'),
 ('Adeem the Artist', 'White Trash Revelry'),
 ('Danny Elfman', 'Summer Fields from the Fable soundtrack'),
 ('The Piano Guys', 'O Come, O Come, Emmanuel!'),
 ('Meredith Godreau', 'Boats & Birds'),
 ('Marc

## Save gold.csv


In [74]:

gold_df = pd.DataFrame({'score': score_list,
                        'input_artist': orig_artist,
                        'artist': artist_list,
                        'input_track': orig_track,
                        'track': track_list,
                        'album': album_list,
                        'uri': uri_list})

with pd.option_context("display.max_rows", 9999):
    display(gold_df)



Unnamed: 0,score,input_artist,artist,input_track,track,album,uri
0,22099,Claude Debussy,Claude Debussy,Clair de Lune,"Suite bergamasque, L. 75: III. Clair de lune","Debussy: Suite bergamasque, L. 75, 3. Clair de...",spotify:track:1cmigB9I6IRpFqjIbzvSQB
1,11922,Simon & Garfunkel,Simon & Garfunkel,Scarborough Fair,Scarborough Fair / Canticle,"Parsley, Sage, Rosemary And Thyme",spotify:track:3g2fYZW5v2od8KIF7VktT0
2,9049,The Beatles,The Beatles,In my Life,In My Life - Remastered 2009,Rubber Soul (Remastered),spotify:track:3KfbEIOC7YIv90FIfNSZpo
3,7620,Erik Satie,Erik Satie,Gymnopédies,3 Gymnopédies: No. 1 Lent et douloureux,Satie: The Magic of Satie,spotify:track:7kTVe6XhIveidvkt8nb7jK
4,7305,The Cranberries,The Cranberries,Dreams,Dreams,"Everybody Else Is Doing It, So Why Can't We?",spotify:track:4JGKZS7h4Qa16gOU3oNETV
5,6681,Israel Kamakawiwo ole,Israel Kamakawiwo'ole,Over the Rainbow,Over the Rainbow,Alone In Iz World,spotify:track:3oQomOPRNQ5NVFUmLJHbAV
6,5285,Jim Croce,Jim Croce,Time in a Bottle,Time in a Bottle,You Don't Mess Around With Jim,spotify:track:561F1zqRwGPCTMRsLsXVtL
7,4357,Mazzy Star,Mazzy Star,Fade into you,Fade Into You,So Tonight That I Might See,spotify:track:1LzNfuep1bnAUR9skqdHCK
8,3989,Don McLean,Don McLean,"Vincent (Starry, Starry Night)","Vincent (Starry, Starry Night)",Rearview Mirror: An American Musical Journey,spotify:track:2YDyH60Vro33KkDtNZCXIk
9,3882,The Beach Boys,The Beach Boys,God Only Knows,God Only Knows - Mono,Pet Sounds (Original Mono & Stereo Mix),spotify:track:6iGU74CwXuT4XVepjc9Emf


In [75]:
# inspect where the track name differs
with pd.option_context("display.max_rows", 999):
    display(gold_df.loc[gold_df['input_artist'].str.lower().str[:8] != gold_df['artist'].str.lower().str[:8]])
    

Unnamed: 0,score,input_artist,artist,input_track,track,album,uri
52,668,Queen,Queensrÿche,Silent lucidity,Silent Lucidity - Remastered 2003,Empire - 20th Anniversary Edition,spotify:track:6OSyCAmXT4Gkd3OQ2aPOaF
201,53,"Schonberg, Claude-Michel",Claude-Michel Schönberg,I dreamed a dream,I Dreamed A Dream - Original Broadway Cast/1987,More Broadway Love Songs,spotify:track:5iFwVrTDpkT3CEnejnyZae
234,42,Desree,Des'ree,Kissing You,I'm Kissing You,Supernatural,spotify:track:1Xp8MKmfoFDib6dHM6JF53
262,36,Guns N Roses,Guns N' Roses,November Rain,November Rain,Use Your Illusion I,spotify:track:3YRCqOhFifThpSRFJ1VWFM
282,31,BoA,bôa,Duvet,Duvet,Twilight,spotify:track:42qNWdLKCI41S4uzfamhFM
300,28,Loreen,Loreena McKennitt,Dante s Prayer,Dante's Prayer,The Book Of Secrets,spotify:track:02kYCQFBEHKpArFdOyJVxt
303,28,Franz Schubert,Franz Liszt,Ave Maria,"Ave Maria, S. 558 (after Schubert, D. 839)","Liszt: Ave Maria, S. 558 (after Schubert, D. 839)",spotify:track:6mfmrVSe5Cm7cvsNwW6uJk
312,27,Flamingos,The Flamingos,I Only Have Eyes For You,I Only Have Eyes for You,Flamingo Serenade,spotify:track:3YdKJzcoMZMacISlpY4QoP
360,22,"Air, Beth Hirsch",Air,All I Need,All I Need (feat. Beth Hirsch),Moon Safari,spotify:track:7jOM0KIKgIppqIEvEjixaj
364,21,Low,Low Roar,Give Up,Give Up,Low Roar,spotify:track:4L321mZADfO4S04ICSIrwR


In [None]:
# these are songs that look like covers or otherwise not the expected response from spotify search 
# (which is a bit wonky, doesn't like quotes and such)
# remove from df and add manually
bad_lookups = [
421,
494,
557,
598,
669,
823,
]

for i in bad_lookups:
    print(gold_df.iloc[i])
    
# add manually, plus 'not found'


In [None]:
gold_df = gold_df.drop(
    axis='index',
    labels=bad_lookups)


In [None]:
gold_df

In [None]:
# this you could upload and make a new playlist
# existing playlist is result of multiple iterations

gold_df[['artist', 'track', 'score']].to_csv('gold.csv', index=False)

with pd.option_context("display.max_rows", 999):
    display(gold_df)

# Get Spotify playlist and add songs

In [7]:
# must follow an oauth workflow to write a playlist in Spotify
# running this cell should request a spotify login and then redirect to an url
# paste whole url with id into form to authenticate

scope = "playlist-modify-public"

sp = spotipy.Spotify(auth_manager=spotipy.SpotifyOAuth(scope=scope,
                                                       client_id=os.getenv('SPOTIFY_CLIENT_ID'),
                                                       client_secret=os.getenv('SPOTIFY_CLIENT_SECRET'),
                                                       redirect_uri="https://druce.ai"
                                                      ))


In [40]:
# get playlist id
# first create a playlist in Spotify UI to load songs
def get_playlist_id(playlist_name, verbose=False):
    playlists = sp.user_playlists(os.getenv('SPOTIFY_USERNAME'))
    while playlists:
        for i, playlist in enumerate(playlists['items']):
            if playlist['name'] == playlist_name:
                if verbose:
                    print('"%s": offset %d, URI %s' % (playlist['name'], i + 1 + playlists['offset'], playlist['uri']))
                return playlist['id']

        # not found yet, get next page if there is one
        if playlists['next']:
            playlists = sp.next(playlists)
        else:
            return None

playlist_id = get_playlist_id("RPS2")
print(playlist_id)


7rY5fhzbW7wCTXGMhEbZwk


In [39]:
# add songs to playlist 

addlist = gold_df['uri'].to_list()
print (len(addlist))

# while(addlist):
#     sp.user_playlist_add_tracks(os.getenv('SPOTIFY_USERNAME'), 
#                                 playlist_id=playlist_id, 
#                                 tracks=addlist[-100:])
#     addlist = addlist[:-100]
#     print("added items, remaining ", len(addlist))


874


In [52]:
# compare to playlist

def get_playlist_df(playlist_name):
    
    results = sp.user_playlist(os.getenv('SPOTIFY_USERNAME'), 
                               get_playlist_id(playlist_name),
                               fields='tracks,next,name')
    tracks = results['tracks']
    
    # get tracks, paging as needed
    track_list = []
    while tracks:
        for track_item in tracks['items']:
            track_list.append(track_item['track'])
        # more pages?
        tracks = sp.next(tracks) if tracks['next'] else None
                    
    return pd.DataFrame({'artist': [track['artists'][0]['name'] for track in track_list],
                         'track': [track['name'] for track in track_list],
                         'uri': [track['uri'] for track in track_list],
                         'id': [track['id'] for track in track_list],
                         'popularity': [track['popularity'] for track in track_list],
                        })

rps2_df = get_playlist_df("RPS2")
rps2_df


Unnamed: 0,artist,track,uri,id,popularity
0,The Stranglers,Golden Brown,spotify:track:2AX5E86cn9n2dgioZEjirI,2AX5E86cn9n2dgioZEjirI,71
1,The Sundays,Here's Where The Story Ends,spotify:track:4SPi5Pl7aAtauFsH9Lk5LB,4SPi5Pl7aAtauFsH9Lk5LB,57
2,The Sundays,Wild Horses,spotify:track:418gyIJdAZSZisVdzDXLNc,418gyIJdAZSZisVdzDXLNc,53
3,The Temper Trap,Sweet Disposition,spotify:track:5RoIXwyTCdyUjpMMkk4uPd,5RoIXwyTCdyUjpMMkk4uPd,68
4,The Temptations,Just My Imagination (Running Away With Me),spotify:track:39Bd345OWEhRNyfayhp9gv,39Bd345OWEhRNyfayhp9gv,65
...,...,...,...,...,...
1800,Mindy Gledhill,Anchor,spotify:track:2Rxt9SEQ8rZYl0wsbGS3ag,2Rxt9SEQ8rZYl0wsbGS3ag,40
1801,Emancipator,First Snow,spotify:track:4hGXwJZJASH1U7JGKnhJEg,4hGXwJZJASH1U7JGKnhJEg,44
1802,Deniece Williams,Silly,spotify:track:3XrzApq8R10O6WwNwMw8t4,3XrzApq8R10O6WwNwMw8t4,45
1803,David Wise,Aquatic Ambience,spotify:track:4PnqMtmOmvOdkPL6XKV6kD,4PnqMtmOmvOdkPL6XKV6kD,32


In [76]:
set(gold_df['uri'].to_list()).difference(set(rps2_df['uri'].to_list()))

{'spotify:track:02vw0tjLamMJAzMlCSiNH3',
 'spotify:track:03jTyonSwsadnZdhrc9Qya',
 'spotify:track:07pliZ8zz9Zu0UAq9U41Ji',
 'spotify:track:07rZchoGQYZLT4ejEThjEY',
 'spotify:track:0870QNicMawQH2cnzBVZ3P',
 'spotify:track:09uV1Sli9wapcKQmmyaG4E',
 'spotify:track:0BX3ysoHJvxmLEhPMAfb2z',
 'spotify:track:0C7AblcyeKsOYLBT69PTe3',
 'spotify:track:0FVuyC9RP5MACjp4lgU3qZ',
 'spotify:track:0HLBLRkkTOlAmjonShicD0',
 'spotify:track:0Mf6WhWLfOpzOxacvbcboj',
 'spotify:track:0OavtQSojULqejmC4Qbstr',
 'spotify:track:0SCkN2QprWk02svK0HCZ2n',
 'spotify:track:0WtDGnWL2KrMCk0mI1Gpwz',
 'spotify:track:0dRhSF9LV0HR8Jwd3MMMKJ',
 'spotify:track:0fzCtVM9D5UEwiLqcY8Ouq',
 'spotify:track:0gcjc7Vt5xtcfmJgf6g2IO',
 'spotify:track:0hdDPaUbhi1OkzhyicPSBb',
 'spotify:track:0mvkwaZMP2gAy2ApQLtZRv',
 'spotify:track:0nYDnOeXWsAuV0dqOxXXO5',
 'spotify:track:0pzjcWYkyynxhONSm5C8Cu',
 'spotify:track:0vwRHdpyDLRchRBtMjjjBH',
 'spotify:track:11Er3qlX9yXQjhvHkmvcBD',
 'spotify:track:1HAbsTTwgBsA4uJCoUNT8c',
 'spotify:track:

In [77]:
set(rps2_df['uri'].to_list()).difference(set(gold_df['uri'].to_list()))

{'spotify:track:7H3ojI1BsVy0dEJENqMt1k',
 'spotify:track:5cWSpmTfItDuppACMrE3I9',
 'spotify:track:70aUjWZmd9F3bRSsR4DwAJ',
 'spotify:track:3xKsf9qdS1CyvXSMEid6g8',
 'spotify:track:36AlMHDBFwSsD3FQOy1R81',
 'spotify:track:7LuyD4CA7XR4c6y3YaG5nD',
 'spotify:track:6nWneWHV6S5FhXkVBeMoLE',
 'spotify:track:7lNmcW2wW6ciV9Rhh8YOMe',
 'spotify:track:0RstfX9nRY1Lfuy1808MoT',
 'spotify:track:3DRK95roA4UH43eYpyjziA',
 'spotify:track:1JbUMtVkQuaAPdHOvncVTm',
 'spotify:track:0lYMgvKnTa5Kh5hyIIR5wO',
 'spotify:track:1biohJSHVPIJKJV717jip4',
 'spotify:track:5NCX1f3TdiQCuxCLIORAe7',
 'spotify:track:0S9w3jx4cO9rSBcGwB16in',
 'spotify:track:1WOxGNutRwrbik8BA5f2AY',
 'spotify:track:3YQVYTkonczhXyj31HH4AK',
 'spotify:track:6cr6UDpkjEaMQ80OjWqEBQ',
 'spotify:track:7CI0VS6ETno5mgHi4PoCrY',
 'spotify:track:11XR0tRT4g5ov4u8M92wbF',
 'spotify:track:4heCCxMwu4IH4bMaHLNRdg',
 'spotify:track:1MgxDe2vrgoaro1aSISKm2',
 'spotify:track:1tQy2AqgFc0AdddmIboIfX',
 'spotify:track:6IRA4KOVbtiGiTdYoEThJN',
 'spotify:track:

In [81]:
clean up stuff that is in 'reddits prettiest but not in rps2'
clean up stuff that is in rps but not rps2
add remaining stuff from rps3 to rp2
add rps2 to rps
add rps to reddits prettiest
delete rps3
with pd.option_context("display.max_rows", 9999):

    display(gold_df.loc[~gold_df['uri'].isin(rps2_df['uri'])])

Unnamed: 0,score,input_artist,artist,input_track,track,album,uri
17,2810,Joni Mitchell,Joni Mitchell,Both Sides Now,Both Sides Now,Feathers.,spotify:track:5EsPLgSs1UQIDJG0U00RuJ
92,251,George Harrison,George Harrison,While My Guitar Gently Weeps,While My Guitar Gently Weeps - Live At Madison...,Let It Roll - Songs of George Harrison,spotify:track:4Egi6XuC0rbLlXfqmQeuFa
98,228,Fleetwood Mac,Fleetwood Mac,Songbird,Songbird - 2004 Remaster,walking down the aisle,spotify:track:5PhHlgTqTz8rYIW3msIxdO
122,142,Elvis Presley,Elvis Presley,Unchained Melody,"Unchained Melody - Live at Ann Arbor, MI",Moody Blue,spotify:track:0OavtQSojULqejmC4Qbstr
138,105,Giacomo Puccini,Giacomo Puccini,Un Bel Di Vedremo,"Madama Butterfly / Act II: ""Un bel dì vedremo""","The Puccini Album: La bohème, Madama Butterfly...",spotify:track:3kOppsI3pPRJimOU4DdpFu
152,78,Pyotr Ilyich Tchaikovsky,Pyotr Ilyich Tchaikovsky,"Symphony No. 5 in E Minor, Op. 64: II","Symphony No. 5 in E Minor, Op. 64: II. Andante...",Tchaikovsky: Symphony No. 5,spotify:track:7MrRFi6swb5Jwef9esaJIF
157,75,James Taylor,James Taylor,You ve got a friend,You've Got a Friend - 2019 Remaster,Mud Slide Slim and the Blue Horizon (2019 Rema...,spotify:track:6zV8IpLvw0tkRSVCFQJB1y
193,57,Linkin Park,Linkin Park,One more light,One More Light,One More Light,spotify:track:3xXBsjrbG1xQIm1xv1cKOt
201,53,"Schonberg, Claude-Michel",Claude-Michel Schönberg,I dreamed a dream,I Dreamed A Dream - Original Broadway Cast/1987,More Broadway Love Songs,spotify:track:5iFwVrTDpkT3CEnejnyZae
219,45,Slipknot,Slipknot,Disasterpiece,Disasterpiece,Iowa,spotify:track:47VSmPTydr0saGjbQGwCeg


In [56]:
gold_df.loc[~gold_df['uri'].isin(rps2_df['uri'])]

Unnamed: 0,artist,track,uri,id,popularity
0,The Stranglers,Golden Brown,spotify:track:2AX5E86cn9n2dgioZEjirI,2AX5E86cn9n2dgioZEjirI,71
1,The Sundays,Here's Where The Story Ends,spotify:track:4SPi5Pl7aAtauFsH9Lk5LB,4SPi5Pl7aAtauFsH9Lk5LB,57
2,The Sundays,Wild Horses,spotify:track:418gyIJdAZSZisVdzDXLNc,418gyIJdAZSZisVdzDXLNc,53
3,The Temper Trap,Sweet Disposition,spotify:track:5RoIXwyTCdyUjpMMkk4uPd,5RoIXwyTCdyUjpMMkk4uPd,68
4,The Temptations,Just My Imagination (Running Away With Me),spotify:track:39Bd345OWEhRNyfayhp9gv,39Bd345OWEhRNyfayhp9gv,65
...,...,...,...,...,...
1800,Mindy Gledhill,Anchor,spotify:track:2Rxt9SEQ8rZYl0wsbGS3ag,2Rxt9SEQ8rZYl0wsbGS3ag,40
1801,Emancipator,First Snow,spotify:track:4hGXwJZJASH1U7JGKnhJEg,4hGXwJZJASH1U7JGKnhJEg,44
1802,Deniece Williams,Silly,spotify:track:3XrzApq8R10O6WwNwMw8t4,3XrzApq8R10O6WwNwMw8t4,45
1803,David Wise,Aquatic Ambience,spotify:track:4PnqMtmOmvOdkPL6XKV6kD,4PnqMtmOmvOdkPL6XKV6kD,32


In [61]:
rps3_df.loc[rps3_df['uri'].isin(rps2_df['uri'])]

Unnamed: 0,artist,track,uri,id,popularity


In [63]:
len(rps3_df)

105

In [40]:
# manually add the ones that weren't found for some reason


# Compare Spotify playlist to gold data
after initial population, we may want to run again and add new songs 


In [41]:
# compare to existing playlist
# can run again and add any new tracks, either because OpenAI is a bit random, or new replies in thread
results = sp.user_playlist(os.getenv('SPOTIFY_USERNAME'), playlist_id,
                                fields='tracks,next,name')
tracks = results['tracks']

playlist_dict_by_uri = {}
playlist_dict_by_str = {}

artist_list = []
track_list = []
uri_list = []
popularity_list = []
album_list=[]

while True:
    for track_item in tracks['items']:
        track_dict = track_item['track']
        track_str = track_dict['artists'][0]['name']  + ' | ' + track_dict['name'][:15]
        uri = track_dict['uri']
        if track_str in playlist_dict_by_str:
            print(track_str)
        playlist_dict_by_str[track_str] = uri
        playlist_dict_by_uri[uri] = track_str
        
        uri_list.append(uri)
        artist_list.append(track_dict['artists'][0]['name'])
        track_list.append(track_dict['name'])
        album_list.append(track_dict['album']['name'])
        popularity_list.append(track_dict['popularity'])
        
    # check if there are more pages
    if tracks['next']:
        tracks = sp.next(tracks)
    else:
        break

print (len(list(playlist_dict_by_str.keys())))
print (len(list(playlist_dict_by_uri.keys())))


Enter the URL you were redirected to: https://druce.ai/?code=AQC45lY2ARFLDOzULMSY9LrKO3s56wVrgVOkOkFh4jVD8IwqKn4YkrvK3OCDX5N0ven6jGxAcueaCNBtL5mL1qL_8EPGuZKVJ7E52jD8gAxtlmTnZ1QXz1-HrNIR8EvrBPt5seoQs4rCazX5vAiRr4ZREbIgZcc2CE_0xfQ0MA82fzM4HjHKzCvlUHQ
229
229


In [42]:
with pd.option_context("display.max_rows", 9999):
    display(gold_df.loc[~gold_df['uri'].isin(playlist_dict_by_uri.keys())])
    

Unnamed: 0,score,input_artist,artist,input_track,track,album,uri
0,22150,Claude Debussy,Claude Debussy,Clair de Lune,"Suite bergamasque, L. 75: III. Clair de lune","Debussy: Suite bergamasque, L. 75, 3. Clair de...",spotify:track:1cmigB9I6IRpFqjIbzvSQB
1,11123,Simon & Garfunkel,Simon & Garfunkel,Scarborough Fair,Scarborough Fair / Canticle,"Parsley, Sage, Rosemary And Thyme",spotify:track:3g2fYZW5v2od8KIF7VktT0
2,8900,Beatles,The Beatles,In my Life,In My Life - Remastered 2009,Rubber Soul (Remastered),spotify:track:3KfbEIOC7YIv90FIfNSZpo
3,8272,The Cranberries,The Cranberries,Dreams,Dreams,"Everybody Else Is Doing It, So Why Can't We?",spotify:track:4JGKZS7h4Qa16gOU3oNETV
4,7568,Erik Satie,Erik Satie,Gymnopédies,3 Gymnopédies: No. 1 Lent et douloureux,Satie: The Magic of Satie,spotify:track:7kTVe6XhIveidvkt8nb7jK
5,7025,Israel Kamakawiwoole,Israel Kamakawiwo'ole,Over the Rainbow,Over the Rainbow,Alone In Iz World,spotify:track:3oQomOPRNQ5NVFUmLJHbAV
6,4871,Jim Croce,Jim Croce,Time in a Bottle,Time in a Bottle,You Don't Mess Around With Jim,spotify:track:561F1zqRwGPCTMRsLsXVtL
7,4241,Elton John,Elton John,Your Song,Your Song,Elton John,spotify:track:38zsOOcu31XbbYj9BIPUF1
8,4208,Mazzy Star,Mazzy Star,Fade into you,Fade Into You,So Tonight That I Might See,spotify:track:1LzNfuep1bnAUR9skqdHCK
9,4202,Sigur Rós,Sigur Rós,Hoppipolla,Hoppípolla,Takk...,spotify:track:0yQPpUq5BJyqah5m2Q5Stt


In [120]:
gold_df2 = gold_df.loc[~gold_df['uri'].isin(playlist_dict_by_uri.keys())]
dfz = gold_df2.loc[gold_df2.index> 500].copy()

In [126]:
addlist = dfz['uri'].to_list()
print (len(addlist))
while(addlist):
    sp.user_playlist_add_tracks(os.getenv('SPOTIFY_USERNAME'), 
                                playlist_id=playlist_id, 
                                tracks=addlist[-100:])
    addlist = addlist[:-100]
    print("added items, remaining ", len(addlist))

250
added items, remaining  150
added items, remaining  50
added items, remaining  0


In [127]:
dfz

Unnamed: 0,score,input_artist,artist,input_track,track,album,uri
502,12,Mike Oldfield,Mike Oldfield,Moonlight shadow,Moonlight Shadow,The Mike Oldfield Collection,spotify:track:4WpmwEdc4RBos8W8icSSxW
511,12,Devin Townsend,Devin Townsend Project,Grace,Grace,Epicloud,spotify:track:6zic4tNd5P9HQZyLvQDCtR
512,12,Depeche Mode,Depeche Mode,Black Celebration,Black Celebration,Black Celebration,spotify:track:2mW2jTui4J1kCujDrhKioG
513,12,Commodores,Commodores,Jesus is love,Jesus Is Love,The Best Of Lionel Richie 20th Century Masters...,spotify:track:52i3VeUFbCRk3z8YgBFM7N
514,12,Carla Bruni,Carla Bruni,Quelqu un m a Dit,Quelqu'un M'a Dit,Amor en Rose,spotify:track:30SJXdse9nsV9Pu4ACfTWZ
...,...,...,...,...,...,...,...
964,5,Amos Lee,Amos Lee,violin,Violin,Mission Bell,spotify:track:7eRDTkDUO5UVWgla9Max4h
965,5,Alison Krauss,Alison Krauss,I Will,I Will,Now That I've Found You: A Collection,spotify:track:2ALrRRCZQ0ZDTBEs11CkNV
966,5,Alabama Shakes,Alabama Shakes,Sound & Color,Sound & Color,Sound & Color (Deluxe),spotify:track:26HaY4D7AV0gVxmkz11bIz
968,5,AJR,AJR,Dear winter,Dear Winter,Neotheater,spotify:track:4Dd0v2LBhMpqOwhmeU4yBJ


In [None]:
playlist_df = pd.DataFrame({'artist': artist_list,
                           'track': track_list,
                           'album': album_list,
                           'popularity': popularity_list,
                           })



In [None]:
with pd.option_context("display.max_rows", 9999):
    display(playlist_df.sort_values('popularity'))
    

In [None]:
gold_dict_by_uri = {}
gold_dict_by_str = {}
addlist = []
c = 0
for i, artist, track, uri in gold_df[['artist', 'track', 'uri']].itertuples():
    # print(artist, track, uri)
    track_str = artist + ' | ' + track[:15]
    if track_str not in playlist_dict_by_str:
        addlist.append([artist, track, uri])
        print(artist, track, uri)
    gold_dict_by_uri[uri]=track_str
    gold_dict_by_str['track_str']= uri
#     if track_str not in playlist_dict_by_str:
#         c += 1
#         print (c, track_str)
        
print(len(gold_dict_by_str.items()))
print(len(gold_dict_by_uri.items()))

In [None]:
addlist

In [None]:
addlist = [['ABBA', 'One Of Us', 'spotify:track:6zgtBUEkAfilJ2YEOvNexR'],
 ['Gregorio Allegri',
  'Miserere mei, Deus',
  'spotify:track:6es7DmrhnDoKj5rsFvh3XU'],
 ['Amy Winehouse',
  'Love Is A Losing Game',
  'spotify:track:3uliGwmB52ZA7brgpZMzyH'],
 ['Barbara',
  "Ma plus belle histoire d'amour",
  'spotify:track:0qBVET4VkHsQAoboWlQ2pJ'],
 ['Ludwig van Beethoven',
  'Symphony No. 5 in C Minor, Op. 67: I. Allegro con brio',
  'spotify:track:2ygeBLTP9uu3OW3VTulD8N'],
 ['Benny Goodman', 'Sing, Sing, Sing', 'spotify:track:5L8ta4ECl5zeA6bGqY7G38'],
 ['Bill Withers', 'Lean on Me', 'spotify:track:3M8FzayQWtkvOhqMn2V4T2'],
 ['Billy Joel', 'Piano Man', 'spotify:track:70C4NyhjD5OZUMzvWZ3njJ'],
 ['Bob Dylan', 'Ballad of a Thin Man', 'spotify:track:0f5N14nB8xi0p3o4BlVvbx'],
 ['Bob Dylan', "Blowin' in the Wind", 'spotify:track:18GiV1BaXzPVYpp9rmOg0E'],
 ['Bob Dylan', 'Desolation Row', 'spotify:track:4n1ZGm3TxYmoYe1YR8cMus'],
 ['Bob Dylan', 'Duquesne Whistle', 'spotify:track:5kKW4bszhKSCYVPDO0sMbX'],
 ['Bob Dylan',
  'Forever Young - Slow Version',
  'spotify:track:4yWl0tnEanf3zmZzl9kbQn'],
 ['Bob Dylan', 'Gotta Serve Somebody', 'spotify:track:760420tYNmNjFgi8bWvbop'],
 ['Bob Dylan', 'Highway 61 Revisited', 'spotify:track:6os5B6xjuke9YfBKH3tu1e'],
 ['Bob Dylan',
  'I Shall Be Released - Studio Outtake - 1971',
  'spotify:track:5vyw005QQ42hrzrLxb3xEX'],
 ['Bob Dylan', 'I Want You', 'spotify:track:7tJQ4Ekp2vN3NlI3vJJW3v'],
 ['Bob Dylan', "It Ain't Me Babe", 'spotify:track:5nbNWAfT1S6V1vqj3snHxS'],
 ['Bob Dylan', 'Jokerman', 'spotify:track:6cuHkcRUqtQhtJ4sWCkd1q'],
 ['Bob Dylan',
  "Knockin' On Heaven's Door",
  'spotify:track:6HSXNV0b4M4cLJ7ljgVVeh'],
 ['Bob Dylan', 'Lay, Lady, Lay', 'spotify:track:4uYwlMp841PLJmj1gJJwIq'],
 ['Bob Dylan', 'Like a Rolling Stone', 'spotify:track:3AhXZa8sUQht0UEdBJgpGc'],
 ['Bob Dylan', 'Love Sick', 'spotify:track:3O1hpSOaJDW4SelgUG2XT3'],
 ['Bob Dylan', "Maggie's Farm", 'spotify:track:5rGD8FFgHw74cp3RPhucyg'],
 ['Bob Dylan',
  'Make You Feel My Love',
  'spotify:track:6rfGPGghQL7SJmZPXprXIc'],
 ['Bob Dylan',
  'Mississippi - Version 2',
  'spotify:track:6JWHNd8QMxTvojYkmZtKGI'],
 ['Bob Dylan', 'Mr. Tambourine Man', 'spotify:track:3RkQ3UwOyPqpIiIvGVewuU'],
 ['Bob Dylan', 'Murder Most Foul', 'spotify:track:1LfTvT9JPYuuZanwxLtZCr'],
 ['Bob Dylan', 'Not Dark Yet', 'spotify:track:1qbn6QrHG8XfnqVFKgNzKP'],
 ['Bob Dylan',
  'Rainy Day Women #12 & 35',
  'spotify:track:7BkAlVpGwXXl3sYNn5OoJ7'],
 ['Bob Dylan',
  'Sad-Eyed Lady of the Lowlands',
  'spotify:track:4jdtLLyEL7wY0TlCdMKhxq'],
 ['Bob Dylan', 'She Belongs to Me', 'spotify:track:2itBkHBUxGl4VfDj4HNyoD'],
 ['Bob Dylan',
  'Stuck Inside of Mobile with the Memphis Blues Again',
  'spotify:track:1NYTj6JEw3IOh4ggiBh82h'],
 ['Bob Dylan',
  'Subterranean Homesick Blues',
  'spotify:track:6k9DUKMJpWvu6eFG3O64Lg'],
 ['Bob Dylan', 'Tangled up in Blue', 'spotify:track:6Vcwr9tb3ZLO63F8DL8cqu'],
 ['Bob Dylan', 'Tempest', 'spotify:track:19scNzd4ogVsHrNWsms8Rg'],
 ['Bob Dylan',
  "The Times They Are A-Changin'",
  'spotify:track:52vA3CYKZqZVdQnzRrdZt6'],
 ['Bob Dylan',
  'Things Have Changed - Single Version',
  'spotify:track:5KOi77ameCimkAdw0DMNoy'],
 ['Bob Dylan',
  'Thunder on the Mountain',
  'spotify:track:4wo2eRp6aHcAlmhmfwiTAH'],
 ['Bob Dylan', 'Visions of Johanna', 'spotify:track:2rslQV48gNv3r9pPrQFPW1'],
 ['Brian Wilson', 'God Only Knows', 'spotify:track:2SznAUigFh6rMdGpcS5d7e'],
 ['Bright Eyes',
  'First Day of My Life',
  'spotify:track:0eBryM7ePQH3Klt3jz8xZd'],
 ['Crowded House',
  'Don’t Dream It’s Over - Home Demo',
  'spotify:track:0fiSpF9mvRFQWy0ca64d1g'],
 ['Léo Delibes', 'Flower Duet', 'spotify:track:5K8jqeLAxZIqHR6e5w5so1'],
 ['Dire Straits', 'Brothers In Arms', 'spotify:track:6XYBbVpu455ZdGWZNRLGbG'],
 ['Don McLean',
  'Vincent (Starry, Starry Night)',
  'spotify:track:2YDyH60Vro33KkDtNZCXIk'],
 ['Ed Sheeran', 'Photograph', 'spotify:track:41xNsY82OWtWbIfnRMK2ky'],
 ['Elvis Presley',
  'Can’t Help Falling in Love - Acoustic Cover',
  'spotify:track:0ghQkNDYLSl4GsqfkjTjWx'],
 ['Enya', 'Amarantine', 'spotify:track:0VmzazQQ0Mo1vJldr5NxTW'],
 ['Evan Rachel Wood', 'If I Fell', 'spotify:track:0gd3hRBQAEAw096YOcUrmR'],
 ['Fleetwood Mac', 'Rhiannon', 'spotify:track:05oETzWbd4SI33qK2gbJfR'],
 ['George Harrison',
  'All Things Must Pass - 2014 Remaster',
  'spotify:track:16OwZQuzMqnwn3FZsCBZly'],
 ['George Harrison',
  'Apple Scruffs - 2014 Remaster',
  'spotify:track:2K7WhpfZX3TCCMiwebp0W7'],
 ['George Harrison',
  'Art of Dying - 2014 Remaster',
  'spotify:track:6Jod7qrtYBhU3HcUmKk4hX'],
 ['George Harrison',
  'Awaiting on You All - 2014 Remaster',
  'spotify:track:0b65WkrBrg2qOkzQeDtQ9d'],
 ['George Harrison',
  'Ballad of Sir Frankie Crisp (Let It Roll) - 2014 Remaster',
  'spotify:track:0FWeRrB8T5R6maHbWQw4Kk'],
 ['George Harrison',
  'Behind That Locked Door',
  'spotify:track:2VVbLn8nMcWJzjcL1tZsUr'],
 ['George Harrison',
  'Beware of Darkness - 2014 Remaster',
  'spotify:track:606MCyZFMBlc52Ojnn1nvU'],
 ['George Harrison',
  'Give Me Love (Give Me Peace on Earth)',
  'spotify:track:71fXxvXqo1zxWDtBmjoEVk'],
 ['George Harrison',
  'Hear Me Lord - 2014 Remaster',
  'spotify:track:3kopbNyRj10XO1actGZexP'],
 ['George Harrison',
  'I Dig Love - 2014 Remaster',
  'spotify:track:42yK1Wy62c7malKSRwy0Qk'],
 ['George Harrison',
  'I Remember Jeep - 2014 Remaster',
  'spotify:track:058AE5M3ifbCh8VWOV7903'],
 ['George Harrison',
  "It's Johnny's Birthday - 2014 Remaster",
  'spotify:track:6Cv05rcW8HWwCC6wyEp1fC'],
 ['George Harrison',
  'Let It Down - 2014 Remaster',
  'spotify:track:5FFruMKbVg8AhwHnX4xBov'],
 ['George Harrison',
  'My Sweet Lord - 2014 Remaster',
  'spotify:track:6vE90mi4yKsQGY3YD2OOv1'],
 ['George Harrison',
  'Out of the Blue - 2014 Remaster',
  'spotify:track:1KHMyFaGvwVQ7ax4yjq4BZ'],
 ['George Harrison',
  'Plug Me In - 2014 Remaster',
  'spotify:track:0tyk2xHVjBd3nk16cGktTG'],
 ['George Harrison',
  'Run of the Mill - 2014 Remaster',
  'spotify:track:4uSlUBg3NVOA77E7wwKFTO'],
 ['George Harrison',
  'Thanks for the Pepperoni - 2014 Remaster',
  'spotify:track:3smkwfPqFsTmwfnBztMXaM'],
 ['George Harrison',
  'The Inner Light (Alternative Take) - Instrumental',
  'spotify:track:7gWPnvhaBFMlQsTBWEGcSC'],
 ['George Harrison',
  'Wah-Wah - 2014 Remaster',
  'spotify:track:5j3aqkMO2fl0s5eaSuVnQ8'],
 ['George Harrison',
  'What Is Life - 2014 Remaster',
  'spotify:track:44fw7RulJyj7dGIi9qR86N'],
 ['George Harrison',
  'While My Guitar Gently Weeps - Live At Madison Square Garden; 2009 Remaster',
  'spotify:track:4Egi6XuC0rbLlXfqmQeuFa'],
 ['Glenn Miller', 'In the Mood', 'spotify:track:1xsY8IFXUrxeet1Fcmk4oC'],
 ['Hans Zimmer', 'Cornfield Chase', 'spotify:track:6pWgRkpqVfxnj3WuIcJ7WP'],
 ['Hans Zimmer',
  'Day One (Interstellar Theme)',
  'spotify:track:4WmB04GBqS4xPMYN9dHgBw'],
 ["Israel Kamakawiwo'ole",
  'Maui Medley',
  'spotify:track:6TSJ3L9pBQsYIlCD5pk7ju'],
 ['James Taylor',
  'You’ve Got a Friend',
  'spotify:track:3nK4hWsTEr7fVXziI5bTmh'],
 ['Jay Ungar', 'Ashoken Farewell', 'spotify:track:2s6pqLeVialgt5l5TTSeas'],
 ['Jeff Buckley',
  'If You Knew - Live at Sin-é, New York, NY - July/August 1993',
  'spotify:track:1nd2JEHXbUuQFDiQzCBpsv'],
 ['Jimi Hendrix', 'One Rainy Wish', 'spotify:track:5Zyv0v4rPcrXjkaeImuodv'],
 ['Jimi Hendrix',
  'Spanish Castle Magic',
  'spotify:track:2KFE98Iw0X23sf4vJYcbLH'],
 ['Jimi Hendrix',
  'Wait Until Tomorrow',
  'spotify:track:2YtVzmZzew1ILUdNueyWd7'],
 ['John Lennon',
  'Imagine - Remastered 2010',
  'spotify:track:7pKfPomDEeI4TPT6EOYjn9'],
 ['John Mayer', 'Queen of California', 'spotify:track:0CETmgFGt8Ne8vLnaLcduU'],
 ['Johnny Cash',
  'I Walk The Line - Single Version',
  'spotify:track:1TKPfF2fvn6gVLVfp3iG4j'],
 ['Joni Mitchell',
  'Mitchell: Urge for Going (Instrumental Arrangement of the B-Side Track of the Joni Mitchell Single "You Turn Me on I\'m a Radio")',
  'spotify:track:1I1u9aTdxxQ7SDLgBB3V7b'],
 ['Kanye West', 'Come to Life', 'spotify:track:5xvXeuxISyXJDRbZZf4uzd'],
 ['Leonard Cohen', 'Chelsea Hotel #2', 'spotify:track:4krhCfJg0znykZoyjeMXRe'],
 ['Leonard Cohen', 'Dear Heather', 'spotify:track:3MTKMphPprAcBFG1uIhzPZ'],
 ['Leonard Cohen',
  "Death of a Ladies' Man",
  'spotify:track:5wrylUGwZugelovhryPYg2'],
 ['Leonard Cohen', 'The Future', 'spotify:track:5l8lYrnPEM1ln3J4XaTcy5'],
 ['Leonard Cohen',
  'You Want It Darker',
  'spotify:track:5zb7npjQqoJ7Kcpq4yD9qn'],
 ['Lingers.On', 'In Lingerie', 'spotify:track:6FH3kGlJbFVJDCG9RcERf7'],
 ['Louis Armstrong',
  'La vie en rose - Single Version',
  'spotify:track:3yYfoYGVpriV4fG9L1ogsD'],
 ['The Lovecats', 'The Lovecats', 'spotify:track:7iJUiiTfnuY5cTIeEBnqHr'],
 ['Ludovico Einaudi', 'Primavera', 'spotify:track:4BMHp3DkI8VLsuB9Kr0pzu'],
 ['Mazzy Star', 'Flowers In December', 'spotify:track:0G6Ws8Gbdt0S7pZeuYmkmm'],
 ['Metallica',
  'Fade To Black (Remastered)',
  'spotify:track:0dqGfCMAGyDgpUAgLNOjWd'],
 ['Wolfgang Amadeus Mozart',
  'Requiem in D Minor, K. 626: III. Sequenz No. 6, Lacrimosa dies illa',
  'spotify:track:4bvzJZXpkI3bkjxMCWOSu1'],
 ['My Chemical Romance',
  'The Light Behind Your Eyes',
  'spotify:track:3HyDpKAuR3e4l6QB7hSB2l'],
 ['Paul McCartney',
  'Here Today - Remixed 2015',
  'spotify:track:0QtnwXDziZN1K55fXuLN6q'],
 ['Paul McCartney',
  'I’ll Follow The Sun - Live At Amoeba 2007',
  'spotify:track:3xT59EeQdq0TPGtOlXXI8t'],
 ['Puscifer', 'The Humbling River', 'spotify:track:69GE6yPZZldvqtgBHrKXxg'],
 ['Ray LaMontagne',
  'Such A Simple Thing',
  'spotify:track:4PuUa8e5s7P3Zv1IdCGIsa'],
 ['Ray Manzarek',
  'Riders on the Storm',
  'spotify:track:3FvYcTXO2QtDY7kZQHku2d'],
 ['Red Hot Chili Peppers', 'Dosed', 'spotify:track:1iFIZUVDBCCkWe705FLXto'],
 ['Sky Cries Mary',
  "Don't Forget The Sky",
  'spotify:track:4sVpjCJRClVetRrdxVBolP'],
 ['Stevie Nicks', 'Landslide', 'spotify:track:5fprEY6WEN1wvFXkgfb22C'],
 ['Stevie Wonder', 'Isn’t She Lovely', 'spotify:track:6wGlAaMfyhKdEPr2zycAnN'],
 ['Taylor Swift',
  'Fearless (Taylor’s Version)',
  'spotify:track:77sMIMlNaSURUAXq5coCxE'],
 ['Taylor Swift',
  'the lakes - bonus track',
  'spotify:track:0eFQWVz0qIxDOvhLpZ40P7'],
 ['The Band',
  'When I Paint My Masterpiece - Remastered',
  'spotify:track:76WChUuOPeIK027IeUgr0l'],
 ['The Beach Boys',
  "I Just Wasn't Made For These Times - Mono",
  'spotify:track:4CuO8TINNqM3D7aUdNQ3zG'],
 ['The Beach Boys',
  "Let's Go Away For A While - Mono",
  'spotify:track:3GsgJI1aBrvUtqX8f3MhKT'],
 ['The Beatles',
  "Don't Let Me Down - Naked Version / Remastered 2013",
  'spotify:track:5BhMoGrz5KzG2fA5uzHjZ1'],
 ['The Beatles',
  'Love Me Do - Remastered 2009',
  'spotify:track:3VbGCXWRiouAq8VyMYN2MI'],
 ['The Chemical Brothers',
  'The Boxer',
  'spotify:track:1EUeDFq2zNP784GPaRs9aH'],
 ['The Cure',
  'A Night like This - 2006 Remaster',
  'spotify:track:7cKCz7gG84i1XLvDeM3ByT'],
 ['The Cure',
  'Disintegration - 2010 Remaster',
  'spotify:track:0zY8t5dC1KQXcPUKByWMJM'],
 ['The Cure',
  'From the Edge of the Deep Green Sea',
  'spotify:track:2vwBL9RVyr0vA4Og5VH0i3'],
 ['The Cure',
  'In Between Days - 2006 Remaster',
  'spotify:track:07CyrZF9eVd02zzIse7tZA'],
 ['The Cure', 'A Letter to Elise', 'spotify:track:4DdXOLc1VMAY34ourCn1Xa'],
 ['The Cure',
  'Lullaby - 2010 Remaster',
  'spotify:track:4d4oXk7O2lEhZ83ivV93li'],
 ['The Cure', 'Underneath The Stars', 'spotify:track:0PKVjYlKw7z3IvKAoxrYTR'],
 ['The Eagles', 'The Desperadoes', 'spotify:track:10ppF835WJMYI5v65gFLZ3'],
 ['The Helio Sequence',
  'Keep Your Eyes Ahead',
  'spotify:track:3yatRBsGMJ7wMoUIgDBzzo'],
 ['The Moldy Peaches',
  'Anyone Else But You',
  'spotify:track:2pKi1lRvXNASy7ybeQIDTy'],
 ['The Strokes', 'Someday', 'spotify:track:7hm4HTk9encxT0LYC0J6oI'],
 ['Traditional',
  'Scarborough Fair (Arr. Parkin)',
  'spotify:track:4wlNPczIullwvmwb4x0ltz'],
 ['Van Morrison',
  'Madame George - 1999 Remaster',
  'spotify:track:1N4MKISvC1ddfRCRQDXDd2'],
 ['Various Artists',
  'The Girl From Ipanema',
  'spotify:track:0JgH7g0kwsIs1THEVqhlUS'],
 ['Víg Mihály',
  'Öreg - From "Werckmeister Harmóniák"',
  'spotify:track:63wMgkXQuomlkW4an4O9b4'],
 ['Willie Nelson', 'Crazy', 'spotify:track:0xqtcLB45iKNfHroi5y1em']]


In [None]:
len(addlist)

In [None]:
addlist2 = [a[2] for a in addlist]

print (len(addlist2), 'items')

while(addlist2):
    sp.user_playlist_add_tracks(os.getenv('SPOTIFY_USERNAME'), 
                                playlist_id=playlist_id, 
                                tracks=addlist2[-100:])
    addlist2 = addlist2[:-100]
    print("added items, remaining ", len(addlist2))


In [128]:
z1=pd.read_csv('artist_map.csv')
z2=pd.read_csv('artist_map2.csv')



Unnamed: 0,artist_orig,artist_corrected
0,(G)-IDLE,(G)I-DLE
1,10000 Maniacs,"10,000 Maniacs"
2,10CC,10cc
3,2 Cellos,2Cellos
4,2 cellos,2Cellos
...,...,...
2967,Olifur Arnalds,Ólafur Arnalds
2968,"Ólafur Arnalds, Alice Sara Ott",Ólafur Arnalds
2969,Ēriks Esenvalds,Ēriks Ešenvalds
2970,Zeljko Joksimovic,Željko Joksimović


In [136]:
zmap = dict(zip(z2['artist'], z2['artist2']))
zmap


{'The 2 Live Crew': '2 Live Crew',
 '2 live crew': '2 Live Crew',
 'A Great Big World & Christina Aguilera': 'A Great Big World',
 'A$AP Rocky & Ruth B': 'A$AP Rocky',
 'Aaron Neville and Linda Ronstadt': 'Aaron Neville',
 'AFROMAN': 'Afroman',
 'Agustin Barrios Mangore': 'Agustín Barrios Mangoré',
 'The Alan Parsons Project': 'Alan Parsons Project',
 'Alan Gogoll': 'Alan gogoll',
 'Alanis Morissette, Sarah McLachlan, P!nk': 'Alanis Morissette',
 'Ali Sethi and Shae Gill': 'Ali Sethi',
 'Alina Baraz and Khalid': 'Alina Baraz',
 'Alison Krauss and Union Station': 'Alison Krauss',
 'All Time Low': 'All time low',
 'Allman brothers band': 'Allman Brothers',
 'The Allman Brothers Band': 'Allman Brothers',
 'Alpha': 'Alpha 9',
 'alt-J': 'Alt-J',
 'amos lee': 'Amos Lee',
 'Anastasia Elliot': 'Anastasia',
 'Andrew Lloyd Webber and Tim Rice': 'Andrew Lloyd Webber',
 'Angel': 'Angel Olsen',
 'Animal collective': 'Animal Collective',
 'Animals As Leaders': 'Animals as Leaders',
 'The Animals': '

In [142]:
z1['3'] = z1['artist_orig'].apply(lambda s: zmap[s] if s in zmap else s)


In [148]:
z1[['artist_orig', 'artist_corrected']].to_csv('z1.csv', index=False)

In [140]:
z1['artist_corrected']=z1['3']

In [150]:
!diff z1.csv artist_map.csv

81c81
< Ali sethi and shae gill,Ali Sethi
---
> Ali sethi and shae gill,Ali Sethi and Shae Gill
139,149c139,149
< Bocelli,andrea bocelli
< Andrea bocelli,andrea bocelli
< Andrea Bocelli and Celine Dion,andrea bocelli
< Andrea Bocelli and Eros Ramazzotti,andrea bocelli
< Andre Bocelli and Sarah Brightman,andrea bocelli
< Andrea Bocelli and Sarah Brightman,andrea bocelli
< Andrea Boccelli,andrea bocelli
< Andrea Bocelli & Josh Groban,andrea bocelli
< Andrea Bocceli,andrea bocelli
< Andrea Bocelli & Sarah Brightman,andrea bocelli
< "Andrea Bocelli, Sarah Brightman",andrea bocelli
---
> Bocelli,Andrea Bocelli
> Andrea bocelli,Andrea Bocelli
> Andrea Bocelli and Celine Dion,Andrea Bocelli
> Andrea Bocelli and Eros Ramazzotti,Andrea Bocelli
> Andre Bocelli and Sarah Brightman,Andrea Bocelli
> Andrea Bocelli and Sarah Brightman,Andrea Bocelli
> Andrea Boccelli,Andrea Bocelli
> Andrea Bocelli & Josh Groban,Andrea Bocelli
> Andrea Bocceli,Andrea Bocelli
> Andrea Bocell