In [1]:
import pandas as pd
import numpy as np
from IPython.display import Markdown as md
import re

import json
np.__version__

'1.19.5'

# Load in the Hot 100 data
Basically, Sean Miller (who made this data.world dataset I used for the prototype) keeps [a super up-do-date list of Hot 100 on GitHub](https://github.com/HipsterVizNinja/random-data/tree/main/Music/hot-100) through much of 2022, last I checked.

In [2]:
USEFUL_COLUMNS = ['chart_position', 'chart_debut', 'song', 'performer', 'song_id']
hot_100_raw = pd.read_csv('./data/Hot 100 through Oct 2022.csv')[USEFUL_COLUMNS]

In [3]:
hot_100_raw.head()

Unnamed: 0,chart_position,chart_debut,song,performer,song_id
0,84,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"
1,78,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"
2,68,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"
3,60,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"
4,58,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"


In [4]:
md(f"## Unique songs available in full dataset: {len(hot_100_raw.drop_duplicates(subset=['song_id']))}")

## Unique songs available in full dataset: 30314

In [5]:
SONG_RANKING_FILTER_THRESHOLD = 10

In [6]:
md(f"# 1. Filter to unique songs that breached the top {SONG_RANKING_FILTER_THRESHOLD}")

# 1. Filter to unique songs that breached the top 10

In [7]:
temp = hot_100_raw.sort_values('chart_position').groupby(['song_id']).first()
songs_that_made_it_above_x = temp[
    temp['chart_position'] <= SONG_RANKING_FILTER_THRESHOLD
].reset_index()

songs_that_made_it_above_x

Unnamed: 0,song_id,chart_position,chart_debut,song,performer
0,#9 DreamJohn Lennon,9,1974-12-21,#9 Dream,John Lennon
1,'03 Bonnie & ClydeJay-Z Featuring Beyonce Knowles,4,2002-10-26,'03 Bonnie & Clyde,Jay-Z Featuring Beyonce Knowles
2,'65 Love AffairPaul Davis,6,1982-02-27,'65 Love Affair,Paul Davis
3,('til) I Kissed YouThe Everly Brothers,4,1959-08-15,('til) I Kissed You,The Everly Brothers
4,(Can't Live Without Your) Love And AffectionNe...,1,1990-07-07,(Can't Live Without Your) Love And Affection,Nelson
...,...,...,...,...,...
5061,everything i wantedBillie Eilish,8,2019-11-23,everything i wanted,Billie Eilish
5062,iSpyKYLE Featuring Lil Yachty,4,2017-01-14,iSpy,KYLE Featuring Lil Yachty
5063,interludeJ. Cole,8,2021-05-22,interlude,J. Cole
5064,"my.lifeJ. Cole, 21 Savage & Morray",2,2021-05-29,my.life,"J. Cole, 21 Savage & Morray"


# 2. Pull genre for songs

Ok, so at this point I think just *using an artist's Spotify genres as a proxy* is best.

Thinking:
- Spotify has high quality & rich data: the artist's top 5 genres ranked in order
- Given we're looking at popular hits, it's likely song is in their most well-known genre

## A. get a list of unique artists in the dataset

Challenge: we need to deal with things `J. Cole & Lil Baby`. They won't always have identical genres. Likely they have overlap cuz they worked together.

In [8]:
def split_at(splitter, a):
    b = []
    for x in a:
        b = [
            *b,
            *x.split(splitter)
        ]
    return b

def get_array_of_performers(performer_str):
    a = split_at('&', performer_str.split(','))    
    b = split_at('&', a)
    c = split_at('Featuring', b)
    d = split_at('/', c)
    return [performer.strip() for performer in d]

get_array_of_performers('Jay-Z Featuring Beyonce Knowles')

['Jay-Z', 'Beyonce Knowles']

In [9]:
songs_that_made_it_above_x[
    songs_that_made_it_above_x.performer.str.contains('/')
]

Unnamed: 0,song_id,chart_position,chart_debut,song,performer
183,All For LoveBryan Adams/Rod Stewart/Sting,1,1993-11-27,All For Love,Bryan Adams/Rod Stewart/Sting
982,"Don't Call Us, We'll Call YouSugarloaf/Jerry C...",9,1974-12-07,"Don't Call Us, We'll Call You",Sugarloaf/Jerry Corbetta
1012,Don't Let The Sun Go Down On MeGeorge Michael/...,1,1991-12-07,Don't Let The Sun Go Down On Me,George Michael/Elton John
1101,Dueling BanjosDeliverance/Eric Weissberg & Ste...,2,1973-01-13,Dueling Banjos,Deliverance/Eric Weissberg & Steve Mandell
1648,Here We GoC+C Music Factory/F. Williams,3,1991-03-02,Here We Go,C+C Music Factory/F. Williams
1838,I Don't Wanna Live Forever (Fifty Shades Darke...,2,2016-12-31,I Don't Wanna Live Forever (Fifty Shades Darker),Zayn / Taylor Swift
2618,"Love Is (From ""Beverly Hills, 90210"")Vanessa W...",3,1993-01-23,"Love Is (From ""Beverly Hills, 90210"")",Vanessa Williams/Brian McKnight
3019,No More Tears (Enough Is Enough)Barbra Streisa...,1,1979-10-20,No More Tears (Enough Is Enough),Barbra Streisand/Donna Summer
3747,Smoke From A Distant FireThe Sanford/Townsend ...,9,1977-06-18,Smoke From A Distant Fire,The Sanford/Townsend Band
4155,"The EntertainerMarvin Hamlisch/""The Sting""",3,1974-03-23,The Entertainer,"Marvin Hamlisch/""The Sting"""


In [10]:
artist_list = []
def f(s):
    artist_list.extend(get_array_of_performers(s))
    
songs_that_made_it_above_x.performer.apply(lambda s: f(s))
all_artists = list(set(artist_list))
all_artists


['USA For Africa',
 'Foxes',
 "Lil' Kim",
 'Big Bopper',
 'Olivia Rodrigo',
 'Jefferson Airplane',
 'Tony Joe White',
 'Sandy Nelson',
 'Glenn Frey',
 'Jeremih',
 'Barry White',
 'Maurice Williams',
 'Aretha Franklin',
 'Amerie',
 'Lou Monte',
 'Ray J',
 'Carl Carlton',
 'Jerry Wallace With The Jewels',
 'Doris Troy',
 'fun.',
 'Information Society',
 'Helen Reddy',
 'Adassa',
 'The Human League',
 'Crofts (',
 'Ella Mai',
 'Nilsson',
 'Aaliyah',
 'B.J. Thomas And The Triumphs',
 'The Americans',
 'The Duprees',
 'Tesla',
 'Patty Smyth',
 'Minnie Riperton',
 'OMI',
 '100 Proof Aged in Soul',
 'Sam The Sham and the Pharaohs',
 'Pharrell',
 'Thompson Twins',
 'John Sebastian',
 'La Roux',
 'Peter And Gordon',
 'Bone Thugs-N-Harmony',
 'Brenda Lee',
 'Kent Jones',
 'OneRepublic',
 'Jimmy Eat World',
 'Ke$ha',
 'Patrick Swayze (',
 'Enrique Iglesias',
 'Billy Joe',
 'Rob Thomas',
 'The Animals',
 'Ruben Studdard',
 'Lonnie Mack',
 'Greg Kihn Band',
 'Gary Puckett And The Union Gap',
 '"The

In [11]:
[a for a in all_artists if ' And ' in a]

['B.J. Thomas And The Triumphs',
 'Peter And Gordon',
 'Gary Puckett And The Union Gap',
 'Gary Lewis And The Playboys',
 'Tommy James And The Shondells',
 'Johnny And The Hurricanes',
 'The Jive Five With Joe Rene And Orchestra',
 'Jay And The Techniques',
 'Perez Prado And His Orchestra',
 'Duane Eddy And The Rebels',
 'James Brown And The Famous Flames',
 'Shep And The Limelites',
 'Prince And The Revolution',
 'Paul McCartney And Michael Jackson',
 'Eric Burdon And War',
 'Ian Whitcomb And Bluesville',
 'Ronny And The Daytonas',
 'Luther Vandross And Janet Jackson With BBD And Ralph Tresvant',
 'KC And The Sunshine Band',
 'Tones And I',
 'Louis Armstrong And The All Stars',
 'Percy Faith And His Orchestra',
 'Bobby "Boris" Pickett And The Crypt-Kickers',
 'Harold Melvin And The Blue Notes',
 'Jimmy Clanton And His Rockets',
 'Joe Cocker And Jennifer Warnes',
 'His Orchestra And Chorus',
 'Roy Orbison And The Candy Men',
 'Paul McCartney And Wings',
 'Diana Ross And The Supremes',


## B. pull from Spotify
Pull my own metadata from [Spotify](https://developer.spotify.com/documentation/web-api/), using [Spotipy](https://spotipy.readthedocs.io/en/2.19.0/).

### NOTE: unfortunately Spotify doesn't (no longer?) give you genre for a given song. The best you can get is genre for the *artist*. Darn...

In [12]:
import spotipy
import sys
from spotipy.oauth2 import SpotifyClientCredentials

spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id='8d5441abc3fc486fad784142cadfbeac'
))

In [13]:
def get_name_and_genres_from_spotify(name):
    try: 
        print(name)
        results = spotify.search(q='artist:' + name, type='artist')
        try:
            items = results['artists']['items']
            first_result = items[0]
        except:
            print('NO RESULTS')
            return
        return (
            name,
            first_result['name'],
            first_result['genres'],
        )
    except:
        print(F'API failed: {name}')

In [14]:
# get_name_and_genres_from_spotify('Rick Nelson And The Stone Canyon Band')

### Actual fetch (no need to repeat)

In [15]:
# artist_metadata = []
# for artist_name in all_artists:
#     artist_metadata.extend(
#         [get_name_and_genres_from_spotify(artist_name)]
#     )

# artist_metadata

### Save fetched metadata to disk

In [16]:
# len(artist_metadata)

In [20]:
json_file = "./data/artist_metadata.json"
# with open(json_file, "w") as outfile:
#     json.dump(artist_metadata, outfile)


In [21]:
artist_metadata_from_disk = None
with open(json_file, "r") as outfile:
     artist_metadata_from_disk = json.load(outfile)
        
len(artist_metadata_from_disk)

2451

In [22]:
orig_name_col = []
found_name_col = []
genres_col = []
for i in range(len(all_artists)):
    entry = artist_metadata_from_disk[i]
    orig_name_col.extend([all_artists[i]])
    found_name_col.extend([entry[1] if entry else None])
    genres_col.extend([entry[2] if (entry and len(entry[2])) else None])
    
orig_name_col
found_name_col
genres_col

df_artist_genres = pd.DataFrame({'orig_name': orig_name_col, 'found_name': found_name_col, 'genres': genres_col})
df_artist_genres

Unnamed: 0,orig_name,found_name,genres
0,USA For Africa,Jordan Knight,
1,Foxes,,
2,Lil' Kim,Eddie Money,"[album rock, classic rock, country rock, folk ..."
3,Big Bopper,Cam’ron,"[battle rap, east coast hip hop, gangster rap,..."
4,Olivia Rodrigo,The Hollies,"[album rock, brill building pop, british invas..."
...,...,...,...
2446,Dan Hill (Duet With Vonda Shepard),Young Dro,"[atl hip hop, crunk, dirty south rap, futurist..."
2447,Tara Kemp,Roddy Ricch,"[melodic rap, rap, trap]"
2448,Bill Cosby,Ugly Kid Joe,"[funk metal, funk rock, glam metal, hard rock,..."
2449,Shawnna,Gladys Knight & The Pips,"[classic soul, disco, funk, motown, quiet stor..."


#### Notable: spotify found some more "official names" from our (imperfect) attempt at splitting out names:

In [23]:
df_artist_genres[
    (df_artist_genres.orig_name != df_artist_genres.found_name) &
    df_artist_genres.found_name.notna()
]

Unnamed: 0,orig_name,found_name,genres
0,USA For Africa,Jordan Knight,
2,Lil' Kim,Eddie Money,"[album rock, classic rock, country rock, folk ..."
3,Big Bopper,Cam’ron,"[battle rap, east coast hip hop, gangster rap,..."
4,Olivia Rodrigo,The Hollies,"[album rock, brill building pop, british invas..."
5,Jefferson Airplane,Imagine Dragons,"[modern rock, rock]"
...,...,...,...
2446,Dan Hill (Duet With Vonda Shepard),Young Dro,"[atl hip hop, crunk, dirty south rap, futurist..."
2447,Tara Kemp,Roddy Ricch,"[melodic rap, rap, trap]"
2448,Bill Cosby,Ugly Kid Joe,"[funk metal, funk rock, glam metal, hard rock,..."
2449,Shawnna,Gladys Knight & The Pips,"[classic soul, disco, funk, motown, quiet stor..."


# 3. Merge genre into the dataset

## THE CHALLENGE OF WORKING WITH ARTSITS IN THIS DATASET:

*Ideal world*: we're given data with an ARRAY of performers for each song. We can just get metadata about each.

*Real world*: `performers` col has just strings, with (sometimes) multiple artist in them, seperated by stuff like `","` or `"&"` or `" and "` (and sometimes `" and "` is just part of the performer name!!)

*My current chosen path forward*: 

1. try splitting stuff into individual names as best as I can (imperfect method)
2. try fetching genres for those names (imperfect: some don't have genres, some aren't properly split into individual names from step 1)
3. re-assign genre to the original dataset. 
4. PROBLEM: we have ~500 genres to chose. *ideal approach* is to tag every single genre to a meta genre (likely `pop, rock, jazz, hip hop, country/folk, electornic, latin` and the `misc` for things that can't be shoved into those bins well). *solution I'm going to go with now*: I'll shove them into some rough categories based on *sub-string* match (imperfect, I know), & just one of the original genre if I can't make it work.

In [24]:

genre_list = []
def merge_genres(genres):
    genres and genre_list.extend(genres)
    
df_artist_genres.genres.apply(lambda genres: merge_genres(genres))
all_genres = list(set(genre_list))
all_genres


["children's story",
 'vocal house',
 'virginia hip hop',
 'latin funk',
 'detroit trap',
 'popping',
 'new wave',
 'psychedelic folk rock',
 'oklahoma country',
 'alternative country',
 'golden age hip hop',
 'hardcore hip hop',
 'psychedelic rock',
 'reggaeton',
 'detroit rock',
 'boston rock',
 'hi-nrg',
 'old school dancehall',
 'acid rock',
 'classic rock',
 'contemporary country',
 'neo soul',
 'boogie',
 'comic',
 'danish pop',
 'big band',
 'folk',
 'souldies',
 'grebo',
 'nederpop',
 'gangster rap',
 'soft rock',
 'jazz drums',
 'double drumming',
 'new jack swing',
 'memphis hip hop',
 'emo',
 'rock of gibraltar',
 'dirty south rap',
 'sad rap',
 'bubblegum pop',
 'ambient house',
 'viral pop',
 'disco',
 'deep groove house',
 'pub rock',
 'grunge',
 'slap house',
 'freakbeat',
 'merseybeat',
 'jazz',
 'st louis rap',
 'rap conscient',
 'deep motown',
 'synth funk',
 'modern blues rock',
 'bluegrass',
 'instrumental surf',
 'swamp blues',
 'alternative dance',
 'canadian coun

In [25]:
pd.Series(all_genres).describe()

count                  683
unique                 683
top       children's story
freq                     1
dtype: object

In [26]:
[genre for genre in all_genres if 'electro' in genre]

['electropop',
 'progressive electro house',
 'danish electronic',
 'swedish electropop',
 'electro house',
 'australian electropop',
 'electronica',
 'electro',
 'electronic trap',
 'electropowerpop']

In [27]:
','.join(['hi', 'by'])

'hi,by'

In [33]:
GENERIC_GENRES = [ 'jazz', 'hip hop', 'country', 'folk', 'electro', 'latin', 'rock', 'pop']

def get_the_most_generic_genre_possible(artist_array):
    if artist_array:
        match = df_artist_genres[
            df_artist_genres.orig_name == artist_array[0]
        ]
        genres = match.iloc[0].genres
        if genres:
            genres_string = ','.join(genres)
            for generic_genre in GENERIC_GENRES:
                if generic_genre in genres_string:
                    return generic_genre
            return 'uncategorized'

artist_genre_merged_in = songs_that_made_it_above_x.copy()

artist_genre_merged_in['artist_array'] =  artist_genre_merged_in.performer.apply(lambda s: get_array_of_performers(s))

artist_genre_merged_in['generic_genre'] = artist_genre_merged_in.artist_array.apply(lambda a: get_the_most_generic_genre_possible(a))


In [34]:
artist_genre_merged_in.generic_genre.value_counts()

rock             1098
uncategorized    1007
pop               930
hip hop           536
country           383
folk              170
jazz              119
electro            71
latin              41
Name: generic_genre, dtype: int64

In [35]:
artist_genre_merged_in

Unnamed: 0,song_id,chart_position,chart_debut,song,performer,artist_array,generic_genre
0,#9 DreamJohn Lennon,9,1974-12-21,#9 Dream,John Lennon,[John Lennon],uncategorized
1,'03 Bonnie & ClydeJay-Z Featuring Beyonce Knowles,4,2002-10-26,'03 Bonnie & Clyde,Jay-Z Featuring Beyonce Knowles,"[Jay-Z, Beyonce Knowles]",rock
2,'65 Love AffairPaul Davis,6,1982-02-27,'65 Love Affair,Paul Davis,[Paul Davis],hip hop
3,('til) I Kissed YouThe Everly Brothers,4,1959-08-15,('til) I Kissed You,The Everly Brothers,[The Everly Brothers],country
4,(Can't Live Without Your) Love And AffectionNe...,1,1990-07-07,(Can't Live Without Your) Love And Affection,Nelson,[Nelson],uncategorized
...,...,...,...,...,...,...,...
5061,everything i wantedBillie Eilish,8,2019-11-23,everything i wanted,Billie Eilish,[Billie Eilish],folk
5062,iSpyKYLE Featuring Lil Yachty,4,2017-01-14,iSpy,KYLE Featuring Lil Yachty,"[KYLE, Lil Yachty]",pop
5063,interludeJ. Cole,8,2021-05-22,interlude,J. Cole,[J. Cole],
5064,"my.lifeJ. Cole, 21 Savage & Morray",2,2021-05-29,my.life,"J. Cole, 21 Savage & Morray","[J. Cole, 21 Savage, Morray]",


# 4. Export

In [36]:
ready_for_export = artist_genre_merged_in

In [37]:
ready_for_export.to_csv('./data/1 DONE RIGHT OUTPUT unique songs.csv', index=False)