In [1]:
import pandas as pd
import numpy as np
from IPython.display import Markdown as md
import re
np.__version__

'1.19.5'

# Load in the Hot 100 data
Basically, Sean Miller (who made this data.world dataset I used for the prototype) keeps [a super up-do-date list of Hot 100 on GitHub](https://github.com/HipsterVizNinja/random-data/tree/main/Music/hot-100) through much of 2022, last I checked.

In [2]:
USEFUL_COLUMNS = ['chart_position', 'chart_debut', 'song', 'performer', 'song_id']
hot_100_raw = pd.read_csv('./data/Hot 100 through Oct 2022.csv')[USEFUL_COLUMNS]

In [3]:
hot_100_raw.head()

Unnamed: 0,chart_position,chart_debut,song,performer,song_id
0,84,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"
1,78,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"
2,68,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"
3,60,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"
4,58,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"


In [4]:
md(f"## Unique songs available in full dataset: {len(hot_100_raw.drop_duplicates(subset=['song_id']))}")

## Unique songs available in full dataset: 30314

In [5]:
SONG_RANKING_FILTER_THRESHOLD = 10

In [6]:
md(f"# 1. Filter to unique songs that breached the top {SONG_RANKING_FILTER_THRESHOLD}")

# 1. Filter to unique songs that breached the top 10

In [7]:
temp = hot_100_raw.sort_values('chart_position').groupby(['song_id']).first()
songs_that_made_it_above_x = temp[
    temp['chart_position'] <= SONG_RANKING_FILTER_THRESHOLD
].reset_index()

songs_that_made_it_above_x

Unnamed: 0,song_id,chart_position,chart_debut,song,performer
0,#9 DreamJohn Lennon,9,1974-12-21,#9 Dream,John Lennon
1,'03 Bonnie & ClydeJay-Z Featuring Beyonce Knowles,4,2002-10-26,'03 Bonnie & Clyde,Jay-Z Featuring Beyonce Knowles
2,'65 Love AffairPaul Davis,6,1982-02-27,'65 Love Affair,Paul Davis
3,('til) I Kissed YouThe Everly Brothers,4,1959-08-15,('til) I Kissed You,The Everly Brothers
4,(Can't Live Without Your) Love And AffectionNe...,1,1990-07-07,(Can't Live Without Your) Love And Affection,Nelson
...,...,...,...,...,...
5061,everything i wantedBillie Eilish,8,2019-11-23,everything i wanted,Billie Eilish
5062,iSpyKYLE Featuring Lil Yachty,4,2017-01-14,iSpy,KYLE Featuring Lil Yachty
5063,interludeJ. Cole,8,2021-05-22,interlude,J. Cole
5064,"my.lifeJ. Cole, 21 Savage & Morray",2,2021-05-29,my.life,"J. Cole, 21 Savage & Morray"


# 2. Pull genre for songs

Ok, so at this point I think just *using an artist's Spotify genres as a proxy* is best.

Thinking:
- Spotify has high quality & rich data: the artist's top 5 genres ranked in order
- Given we're looking at popular hits, it's likely song is in their most well-known genre

## A. get a list of unique artists in the dataset

Challenge: we need to deal with things `J. Cole & Lil Baby`. They won't always have identical genres. Likely they have overlap cuz they worked together.

In [8]:
def split_at(splitter, a):
    b = []
    for x in a:
        b = [
            *b,
            *x.split(splitter)
        ]
    return b

def get_array_of_performers(performer_str):
    a = split_at('&', performer_str.split(','))    
    b = split_at('&', a)
    c = split_at('Featuring', b)
    d = split_at('/', c)
    return [performer.strip() for performer in d]

get_array_of_performers('Jay-Z Featuring Beyonce Knowles')

['Jay-Z', 'Beyonce Knowles']

In [9]:
songs_that_made_it_above_x[
    songs_that_made_it_above_x.performer.str.contains('/')
]

Unnamed: 0,song_id,chart_position,chart_debut,song,performer
183,All For LoveBryan Adams/Rod Stewart/Sting,1,1993-11-27,All For Love,Bryan Adams/Rod Stewart/Sting
982,"Don't Call Us, We'll Call YouSugarloaf/Jerry C...",9,1974-12-07,"Don't Call Us, We'll Call You",Sugarloaf/Jerry Corbetta
1012,Don't Let The Sun Go Down On MeGeorge Michael/...,1,1991-12-07,Don't Let The Sun Go Down On Me,George Michael/Elton John
1101,Dueling BanjosDeliverance/Eric Weissberg & Ste...,2,1973-01-13,Dueling Banjos,Deliverance/Eric Weissberg & Steve Mandell
1648,Here We GoC+C Music Factory/F. Williams,3,1991-03-02,Here We Go,C+C Music Factory/F. Williams
1838,I Don't Wanna Live Forever (Fifty Shades Darke...,2,2016-12-31,I Don't Wanna Live Forever (Fifty Shades Darker),Zayn / Taylor Swift
2618,"Love Is (From ""Beverly Hills, 90210"")Vanessa W...",3,1993-01-23,"Love Is (From ""Beverly Hills, 90210"")",Vanessa Williams/Brian McKnight
3019,No More Tears (Enough Is Enough)Barbra Streisa...,1,1979-10-20,No More Tears (Enough Is Enough),Barbra Streisand/Donna Summer
3747,Smoke From A Distant FireThe Sanford/Townsend ...,9,1977-06-18,Smoke From A Distant Fire,The Sanford/Townsend Band
4155,"The EntertainerMarvin Hamlisch/""The Sting""",3,1974-03-23,The Entertainer,"Marvin Hamlisch/""The Sting"""


In [10]:
artist_list = []
def f(s):
    artist_list.extend(get_array_of_performers(s))
    
songs_that_made_it_above_x.performer.apply(lambda s: f(s))
all_artists = list(set(artist_list))
all_artists


['Jordan Knight',
 'Luther Vandross And Janet Jackson With BBD And Ralph Tresvant',
 'Eddie Money',
 "Cam'Ron",
 'The Hollies',
 'Imagine Dragons',
 'Jon Bon Jovi',
 'Loon',
 'Michelle Branch',
 'Rome',
 'Yanou',
 'George McCrae',
 'Marvin Hamlisch',
 'Dr. Dre)',
 'The Floaters',
 'LeAnn Rimes',
 'Quavo',
 "'N Sync",
 'Lynyrd Skynyrd',
 'Jessie J',
 'Paul McCoy',
 'LFO',
 'Frank Sinatra',
 'Suzanne Vega',
 'The New Vaudeville Band',
 'Toto',
 'The Jaggerz',
 'Sammy Davis',
 'Billy Ocean',
 'Lesley Gore',
 'H-Town',
 'Steve Perry',
 'Joji',
 'Roberta Flack With Maxi Priest',
 'Timex Social Club',
 'Richard Marx',
 'Vertical Horizon',
 'Carly Rae Jepsen',
 'Mobb Deep',
 'The Monkees',
 'Samantha Fox',
 'Nicolette Larson',
 'Kool Mo Dee',
 'Puddle Of Mudd',
 'Brenton Wood',
 'Harry Styles',
 'Buzz Clifford',
 'Billy Joe',
 'French Montana',
 'The Everly Brothers',
 'Nena',
 'The Cyrkle',
 'Missy "Misdemeanor" Elliott',
 'Tyrone Davis',
 'Elle King',
 'Brandy',
 'Len Barry',
 'The Crazy Wo

In [68]:
[a for a in all_artists if ' And ' in a]

['Luther Vandross And Janet Jackson With BBD And Ralph Tresvant',
 'Rick Nelson And The Stone Canyon Band',
 'His Orchestra And Chorus',
 'Love And Rockets',
 'Bobby "Boris" Pickett And The Crypt-Kickers',
 'Jimmy Gilmer And The Fireballs',
 'Ian Whitcomb And Bluesville',
 'Gary Puckett And The Union Gap',
 'Mel And Tim',
 'Jay And The Techniques',
 'Prince And The N.P.G.',
 'Roy Orbison And The Candy Men',
 'Harold Melvin And The Blue Notes',
 'Brewer And Shipley',
 'Usher And Alicia Keys',
 'Freddie And The Dreamers',
 'Tom Petty And The Heartbreakers',
 'Eric Burdon And War',
 'Diana Ross And The Supremes',
 'Rosie And The Originals',
 'Paul McCartney And Michael Jackson',
 'James Brown And The Famous Flames',
 'Kenny Rogers And Sheena Easton',
 'Prince And The Revolution',
 'Mitch Ryder And The Detroit Wheels',
 'Ronny And The Daytonas',
 'Tones And I',
 'Celine Dion And Peabo Bryson',
 'Julio Iglesias And Willie Nelson',
 'Ruby And The Romantics',
 'KC And The Sunshine Band',
 'Sh

## B. pull from Spotify
Pull my own metadata from [Spotify](https://developer.spotify.com/documentation/web-api/), using [Spotipy](https://spotipy.readthedocs.io/en/2.19.0/).

### NOTE: unfortunately Spotify doesn't (no longer?) give you genre for a given song. The best you can get is genre for the *artist*. Darn...

In [13]:
import spotipy
import sys
from spotipy.oauth2 import SpotifyClientCredentials

spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id='8d5441abc3fc486fad784142cadfbeac'
))

In [72]:
def get_name_and_genres_from_spotify(name):
    try: 
        print(name)
        results = spotify.search(q='artist:' + name, type='artist')
        try:
            items = results['artists']['items']
            first_result = items[0]
        except:
            print('NO RESULTS')
            return
        return (
            name,
            first_result['name'],
            first_result['genres'],
        )
    except:
        print(F'API failed: {name}')

In [73]:
get_name_and_genres_from_spotify('Rick Nelson And The Stone Canyon Band')

Rick Nelson And The Stone Canyon Band
NO RESULTS


In [66]:
artist_metadata = []
for artist_name in all_artists[:10]:
    artist_metadata.extend(
        [get_name_and_genres_from_spotify(artist_name)]
    )

artist_metadata

Jordan Knight
Luther Vandross And Janet Jackson With BBD And Ralph Tresvant
OOPS: Luther Vandross And Janet Jackson With BBD And Ralph Tresvant
Eddie Money
Cam'Ron
The Hollies
Imagine Dragons
Jon Bon Jovi
Loon
Michelle Branch
Rome


[('Jordan Knight', 'Jordan Knight', []),
 None,
 ('Eddie Money',
  'Eddie Money',
  ['album rock',
   'classic rock',
   'country rock',
   'folk rock',
   'hard rock',
   'mellow gold',
   'pop rock',
   'rock',
   'singer-songwriter',
   'soft rock']),
 ("Cam'Ron",
  'Cam’ron',
  ['battle rap',
   'east coast hip hop',
   'gangster rap',
   'hardcore hip hop',
   'harlem hip hop',
   'hip hop',
   'hip pop',
   'pop rap',
   'rap',
   'southern hip hop',
   'trap']),
 ('The Hollies',
  'The Hollies',
  ['album rock',
   'brill building pop',
   'british invasion',
   'bubblegum pop',
   'classic rock',
   'classic uk pop',
   'country rock',
   'folk rock',
   'mellow gold',
   'merseybeat',
   'psychedelic rock',
   'rock',
   'rock-and-roll',
   'soft rock']),
 ('Imagine Dragons', 'Imagine Dragons', ['modern rock', 'rock']),
 ('Jon Bon Jovi', 'Jon Bon Jovi', ['glam metal', 'hard rock']),
 ('Loon', 'LOONA', ['k-pop', 'k-pop girl group']),
 ('Michelle Branch',
  'Michelle Branch',
  

# 3. TODO: Add genre to the dataset

# 4. Export

In [16]:
ready_for_export = songs_that_made_it_above_x

In [18]:
ready_for_export.to_csv('./data/1 DONE RIGHT OUTPUT unique songs.csv', index=False)