In [1]:
import pandas as pd
import numpy as np
from IPython.display import Markdown as md
import re

import json
np.__version__

'1.19.5'

# Load in the Hot 100 data
Basically, Sean Miller (who made this data.world dataset I used for the prototype) keeps [a super up-do-date list of Hot 100 on GitHub](https://github.com/HipsterVizNinja/random-data/tree/main/Music/hot-100), downloaded through 2022.

In [8]:
USEFUL_COLUMNS = ['chart_position', 'chart_debut', 'song', 'performer', 'song_id']
hot_100_raw = pd.read_csv('./data/Hot 100 through 2022.csv')[USEFUL_COLUMNS]

In [43]:
hot_100_raw

Unnamed: 0,chart_position,chart_debut,song,performer,song_id
0,84,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"
1,78,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"
2,68,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"
3,60,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"
4,58,1990-05-05,"""B"" Girls",Young And Restless,"""B"" GirlsYoung And Restless"
...,...,...,...,...,...
336290,99,2000-11-04,www.memory,Alan Jackson,www.memoryAlan Jackson
336291,66,1958-12-13,¿Dònde Està Santa Claus? (Where Is Santa Claus?),Augie Rios,¿Dònde Està Santa Claus? (Where Is Santa Claus...
336292,53,1958-12-13,¿Dònde Està Santa Claus? (Where Is Santa Claus?),Augie Rios,¿Dònde Està Santa Claus? (Where Is Santa Claus...
336293,47,1958-12-13,¿Dònde Està Santa Claus? (Where Is Santa Claus?),Augie Rios,¿Dònde Està Santa Claus? (Where Is Santa Claus...


In [10]:
md(f"## Unique songs available in full dataset: {len(hot_100_raw.drop_duplicates(subset=['song_id']))}")

## Unique songs available in full dataset: 30444

In [11]:
SONG_RANKING_FILTER_THRESHOLD = 10

In [12]:
md(f"# 1. Filter to unique songs that breached the top {SONG_RANKING_FILTER_THRESHOLD}")

# 1. Filter to unique songs that breached the top 10

In [13]:
temp = hot_100_raw.sort_values('chart_position').groupby(['song_id']).first()
songs_that_made_it_above_x = temp[
    temp['chart_position'] <= SONG_RANKING_FILTER_THRESHOLD
].reset_index()

songs_that_made_it_above_x

Unnamed: 0,song_id,chart_position,chart_debut,song,performer
0,#9 DreamJohn Lennon,9,1974-12-21,#9 Dream,John Lennon
1,'03 Bonnie & ClydeJay-Z Featuring Beyonce Knowles,4,2002-10-26,'03 Bonnie & Clyde,Jay-Z Featuring Beyonce Knowles
2,'65 Love AffairPaul Davis,6,1982-02-27,'65 Love Affair,Paul Davis
3,('til) I Kissed YouThe Everly Brothers,4,1959-08-15,('til) I Kissed You,The Everly Brothers
4,(Can't Live Without Your) Love And AffectionNe...,1,1990-07-07,(Can't Live Without Your) Love And Affection,Nelson
...,...,...,...,...,...
5089,everything i wantedBillie Eilish,8,2019-11-23,everything i wanted,Billie Eilish
5090,iSpyKYLE Featuring Lil Yachty,4,2017-01-14,iSpy,KYLE Featuring Lil Yachty
5091,interludeJ. Cole,8,2021-05-22,interlude,J. Cole
5092,"my.lifeJ. Cole, 21 Savage & Morray",2,2021-05-29,my.life,"J. Cole, 21 Savage & Morray"


In [47]:
songs_that_made_it_above_x[
    songs_that_made_it_above_x.chart_debut.str.startswith('2022-11')
]

Unnamed: 0,song_id,chart_position,chart_debut,song,performer
275,Anti-HeroTaylor Swift,1,2022-11-05,Anti-Hero,Taylor Swift
347,BackOutsideBoyzDrake,9,2022-11-19,BackOutsideBoyz,Drake
429,BejeweledTaylor Swift,6,2022-11-05,Bejeweled,Taylor Swift
719,Circo LocoDrake & 21 Savage,8,2022-11-19,Circo Loco,Drake & 21 Savage
2340,KarmaTaylor Swift,9,2022-11-05,Karma,Taylor Swift
2429,Lavender HazeTaylor Swift,2,2022-11-05,Lavender Haze,Taylor Swift
2513,Lift Me UpRihanna,2,2022-11-12,Lift Me Up,Rihanna
2733,Major DistributionDrake & 21 Savage,3,2022-11-19,Major Distribution,Drake & 21 Savage
2766,MaroonTaylor Swift,3,2022-11-05,Maroon,Taylor Swift
2810,Midnight RainTaylor Swift,5,2022-11-05,Midnight Rain,Taylor Swift


# 2. Pull genre for songs

Ok, so at this point I think just *using an artist's Spotify genres as a proxy* is best.

Thinking:
- Spotify has high quality & rich data: the artist's top 5 genres ranked in order
- Given we're looking at popular hits, it's likely song is in their most well-known genre

## A. get a list of unique artists in the dataset

## 🚧 IMPROVEMENTS: 
- clean the names better (eg remove things like `(...)` or just `)` that might exist)
- add more splitting instances (especially ` And ` and ` With `), and just handle special cases of bands that use that splitter in their name (ie create a "safe list" of them)

Challenge: we need to deal with things `J. Cole & Lil Baby`. They won't always have identical genres. Likely they have overlap cuz they worked together.

In [14]:
def split_at(splitter, a):
    b = []
    for x in a:
        b = [
            *b,
            *x.split(splitter)
        ]
    return b

def get_array_of_performers(performer_str):
    a = split_at('&', performer_str.split(','))    
    b = split_at('&', a)
    c = split_at('Featuring', b)
    d = split_at('/', c)
    return [performer.strip() for performer in d]

get_array_of_performers('Jay-Z Featuring Beyonce Knowles')

['Jay-Z', 'Beyonce Knowles']

In [15]:
songs_that_made_it_above_x[
    songs_that_made_it_above_x.performer.str.contains('/')
]

Unnamed: 0,song_id,chart_position,chart_debut,song,performer
183,All For LoveBryan Adams/Rod Stewart/Sting,1,1993-11-27,All For Love,Bryan Adams/Rod Stewart/Sting
988,"Don't Call Us, We'll Call YouSugarloaf/Jerry C...",9,1974-12-07,"Don't Call Us, We'll Call You",Sugarloaf/Jerry Corbetta
1018,Don't Let The Sun Go Down On MeGeorge Michael/...,1,1991-12-07,Don't Let The Sun Go Down On Me,George Michael/Elton John
1107,Dueling BanjosDeliverance/Eric Weissberg & Ste...,2,1973-01-13,Dueling Banjos,Deliverance/Eric Weissberg & Steve Mandell
1655,Here We GoC+C Music Factory/F. Williams,3,1991-03-02,Here We Go,C+C Music Factory/F. Williams
1845,I Don't Wanna Live Forever (Fifty Shades Darke...,2,2016-12-31,I Don't Wanna Live Forever (Fifty Shades Darker),Zayn / Taylor Swift
2630,"Love Is (From ""Beverly Hills, 90210"")Vanessa W...",3,1993-01-23,"Love Is (From ""Beverly Hills, 90210"")",Vanessa Williams/Brian McKnight
3034,No More Tears (Enough Is Enough)Barbra Streisa...,1,1979-10-20,No More Tears (Enough Is Enough),Barbra Streisand/Donna Summer
3769,Smoke From A Distant FireThe Sanford/Townsend ...,9,1977-06-18,Smoke From A Distant Fire,The Sanford/Townsend Band
4181,"The EntertainerMarvin Hamlisch/""The Sting""",3,1974-03-23,The Entertainer,"Marvin Hamlisch/""The Sting"""


In [16]:
artist_list = []
def f(s):
    artist_list.extend(get_array_of_performers(s))
    
songs_that_made_it_above_x.performer.apply(lambda s: f(s))
all_artists = list(set(artist_list))
all_artists


['Rakim',
 'Toya',
 'Black Box',
 'Barry Mann',
 'Bill Withers',
 'John Martin',
 'A Taste Of Honey',
 'My Chemical Romance',
 'Edgar Winter Group',
 'Kelly Rowland',
 '3OH!3',
 'Lil Tjay',
 'Rickie Lee Jones',
 'Styles P.',
 'Barbra Streisand',
 'Natasha Bedingfield',
 'Pips',
 'Matthew Santos',
 'Shadows Of Knight',
 'Hanson',
 "Al (He's the King) Hirt",
 'The Newbeats',
 'Bell Biv DeVoe',
 'Juice Newton',
 'Ne-Yo',
 'Tom Petty',
 'Harry Chapin',
 'The Chakachas',
 'Ray Parker Jr.',
 'Jennifer Warnes',
 'Wanz',
 'Les Crane',
 'The New Seekers',
 'The First Edition',
 'Boz Scaggs',
 'Regina Belle',
 "The O'Jays",
 'Animotion',
 'Joe Cocker with Leon Russell',
 'Chicago',
 'Michael Bolton',
 'Van McCoy And The Soul City Symphony',
 'Mike + The Mechanics',
 'Tony! Toni! Tone!',
 'Gary Lewis And The Playboys',
 'Spin Doctors',
 'Major Lazer',
 'Bobby Sherman',
 "Michel'le",
 'Nicole Scherzinger',
 'Skylar Grey',
 'LL Cool J',
 'Divine',
 'Mongo Santamaria Band',
 'The Young Rascals',
 'D

In [17]:
[a for a in all_artists if ' And ' in a]

['Van McCoy And The Soul City Symphony',
 'Gary Lewis And The Playboys',
 'Mike Reno And Ann Wilson',
 'Hank Ballard And The Midnighters',
 'James Brown And The Famous Flames',
 'Kenny Rogers And Sheena Easton',
 'Michael Jackson And Paul McCartney',
 'Prince And The N.P.G.',
 'Peter And Gordon',
 'John Fred And The Playboys',
 'Usher And Alicia Keys',
 'Celine Dion And Peabo Bryson',
 'Mitch Ryder And The Detroit Wheels',
 'Ronny And The Daytonas',
 'Johnny And The Hurricanes',
 'Harold Melvin And The Blue Notes',
 'Bo Donaldson And The Heywoods',
 'Gladys Knight And The Pips',
 'Prince And The Revolution',
 'Paul McCartney And Michael Jackson',
 'Jay And The Techniques',
 'Ruby And The Romantics',
 'Lisa Lisa And Cult Jam With Full Force',
 'Percy Faith And His Orchestra',
 'Paul Mauriat And His Orchestra',
 'Shep And The Limelites',
 'Joe Cocker And Jennifer Warnes',
 'Dr. Hook And The Medicine Show',
 'Friend And Lover',
 'Roy Head And The Traits',
 'Duane Eddy And The Rebels',
 'L

## B. pull from Spotify

Pull my own metadata from [Spotify](https://developer.spotify.com/documentation/web-api/), using [Spotipy](https://spotipy.readthedocs.io/en/2.19.0/).

### NOTE: unfortunately Spotify doesn't (no longer?) give you genre for a given song. The best you can get is genre for the *artist*. Darn...

## 🚧 IMPROVEMENTS: 
- ⬆️ using updated artists list, refetch all, but be sure to save `artist ID` this time, in case we need it.
- inspect artist names that didn't return results, try again with tweaks





In [18]:
import spotipy
import sys
from spotipy.oauth2 import SpotifyClientCredentials

# spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
#     client_id='8d5441abc3fc486fad784142cadfbeac'
# ))

In [19]:
def get_name_and_genres_from_spotify(name):
    try: 
        print(name)
        results = spotify.search(q='artist:' + name, type='artist')
        try:
            items = results['artists']['items']
            first_result = items[0]
        except:
            print('NO RESULTS')
            return
        return (
            name,
            first_result['name'],
            first_result['genres'],
        )
    except:
        print(F'API failed: {name}')

In [20]:
# get_name_and_genres_from_spotify('Rick Nelson And The Stone Canyon Band')

### Actual fetch (no need to repeat)

In [21]:
# artist_metadata = []
# for artist_name in all_artists:
#     artist_metadata.extend(
#         [get_name_and_genres_from_spotify(artist_name)]
#     )

# artist_metadata

### Save fetched metadata to disk

In [22]:
# len(artist_metadata)

In [23]:
json_file = "./data/artist_metadata.json"
# with open(json_file, "w") as outfile:
#     json.dump(artist_metadata, outfile)


In [24]:
artist_metadata_from_disk = None
with open(json_file, "r") as outfile:
     artist_metadata_from_disk = json.load(outfile)
        
len(artist_metadata_from_disk)

2451

In [25]:
orig_name_col = []
found_name_col = []
genres_col = []
for i in range(len(artist_metadata_from_disk)):
    entry = artist_metadata_from_disk[i]
    if entry:
        orig_name_col.extend([entry[0]])
        found_name_col.extend([entry[1] if entry else None])
        genres_col.extend([entry[2] if (entry and len(entry[2])) else None])

df_artist_genres = pd.DataFrame({'orig_name': orig_name_col, 'found_name': found_name_col, 'genres': genres_col})
df_artist_genres

Unnamed: 0,orig_name,found_name,genres
0,Jordan Knight,Jordan Knight,
1,Eddie Money,Eddie Money,"[album rock, classic rock, country rock, folk ..."
2,Cam'Ron,Cam’ron,"[battle rap, east coast hip hop, gangster rap,..."
3,The Hollies,The Hollies,"[album rock, brill building pop, british invas..."
4,Imagine Dragons,Imagine Dragons,"[modern rock, rock]"
...,...,...,...
2301,Young Dro,Young Dro,"[atl hip hop, crunk, dirty south rap, futurist..."
2302,Roddy Ricch,Roddy Ricch,"[melodic rap, rap, trap]"
2303,Ugly Kid Joe,Ugly Kid Joe,"[funk metal, funk rock, glam metal, hard rock,..."
2304,Gladys Knight And The Pips,Gladys Knight & The Pips,"[classic soul, disco, funk, motown, quiet stor..."


In [26]:
df_artist_genres[
#     df_artist_genres.orig_name == 'Mariah Carey'
    df_artist_genres.orig_name == df_artist_genres.found_name
#     df_artist_genres.found_name == 'Mariah Carey'
]

Unnamed: 0,orig_name,found_name,genres
0,Jordan Knight,Jordan Knight,
1,Eddie Money,Eddie Money,"[album rock, classic rock, country rock, folk ..."
3,The Hollies,The Hollies,"[album rock, brill building pop, british invas..."
4,Imagine Dragons,Imagine Dragons,"[modern rock, rock]"
5,Jon Bon Jovi,Jon Bon Jovi,"[glam metal, hard rock]"
...,...,...,...
2299,Danny O'Keefe,Danny O'Keefe,"[country rock, singer-songwriter]"
2301,Young Dro,Young Dro,"[atl hip hop, crunk, dirty south rap, futurist..."
2302,Roddy Ricch,Roddy Ricch,"[melodic rap, rap, trap]"
2303,Ugly Kid Joe,Ugly Kid Joe,"[funk metal, funk rock, glam metal, hard rock,..."


#### Notable: spotify found some more "official names" from our (imperfect) attempt at splitting out names:

In [27]:
df_artist_genres[
    (df_artist_genres.orig_name.str.lower() != df_artist_genres.found_name.str.lower()) &
    df_artist_genres.found_name.notna()
]

Unnamed: 0,orig_name,found_name,genres
2,Cam'Ron,Cam’ron,"[battle rap, east coast hip hop, gangster rap,..."
6,Loon,LOONA,"[k-pop, k-pop girl group]"
8,Rome,Romeo Santos,[bachata]
12,Dr. Dre),Dr. Dre,"[g funk, gangster rap, hip hop, rap, west coas..."
16,'N Sync,*NSYNC,"[boy band, dance pop, post-teen pop]"
...,...,...,...
2287,Johnny And The Hurricanes,Johnny & The Hurricanes,"[brill building pop, merseybeat, rock-and-roll..."
2288,Climax,Climax Blues Band,"[blues rock, british blues, classic rock, coun..."
2292,Billy J. Kramer With The Dakotas,Billy J. Kramer & The Dakotas,[merseybeat]
2300,Sole,Solence,[post-screamo]


# 3. Merge genre into the dataset

## THE CHALLENGE OF WORKING WITH ARTSITS IN THIS DATASET:

*Ideal world*: we're given data with an ARRAY of performers for each song. We can just get metadata about each.

*Real world*: `performers` col has just strings, with (sometimes) multiple artist in them, seperated by stuff like `","` or `"&"` or `" and "` (and sometimes `" and "` is just part of the performer name!!)

*My current chosen path forward*: 

1. try splitting stuff into individual names as best as I can (imperfect method)
2. try fetching genres for those names (imperfect: some don't have genres, some aren't properly split into individual names from step 1)
3. re-assign genre to the original dataset. 
4. PROBLEM: we have ~500 genres to chose. *ideal approach* is to tag every single genre to a meta genre (likely `pop, rock, jazz, hip hop, country/folk, electornic, latin` and the `misc` for things that can't be shoved into those bins well). *solution I'm going to go with now*: I'll shove them into some rough categories based on *sub-string* match (imperfect, I know), & just one of the original genre if I can't make it work.

## 🚧  IMPROVEMENTS: tag every single genre to a meta genre (eg `pop, rock, jazz, hip hop, country/folk, electornic, latin`) by hand, with the help of basically trying to keyword search to make things faster.

In [28]:

genre_list = []
def merge_genres(genres):
    genres and genre_list.extend(genres)
    
df_artist_genres.genres.apply(lambda genres: merge_genres(genres))
all_genres = list(set(genre_list))
all_genres


['scottish new wave',
 'pub rock',
 'second line',
 'american folk revival',
 'tennessee hip hop',
 'country',
 'instrumental funk',
 'pixie',
 'pop emo',
 'sierreno',
 'acoustic blues',
 'italian adult pop',
 'indietronica',
 'riddim',
 'gauze pop',
 'quiet storm',
 'classic rock',
 'rockabilly',
 'philly rap',
 'truck-driving country',
 'alt z',
 'jazz drums',
 'chicago bop',
 'hip pop',
 'classical performance',
 'socal pop punk',
 'country rap',
 'steampunk',
 'modern blues rock',
 'rap metal',
 'west coast rap',
 'underground power pop',
 'german hard rock',
 'classic dubstep',
 'parody',
 'nederpop',
 'rap kreyol',
 'north carolina hip hop',
 'texas blues',
 'uk contemporary r&b',
 'screamo',
 'conscious hip hop',
 'retro soul',
 'pop house',
 'kiwi rock',
 'psychedelic folk',
 'post-disco',
 'texas country',
 'candy pop',
 'ohio hip hop',
 'christian indie',
 'musica mexicana',
 'sad sierreno',
 'classic italian pop',
 'irish rock',
 'classic canadian rock',
 'indie poptimism',


In [29]:
pd.Series(all_genres).describe()

count                   683
unique                  683
top       scottish new wave
freq                      1
dtype: object

In [33]:
POP = [
    'hip pop',
 'classic swedish pop',
 'chamber pop',
 'metropopolis',
 'post-teen pop',
 'dance pop',
 'bahamian pop',
 'psychedelic pop',
 'thai pop',
 'underground power pop',
 'pop romantico',
 'guyanese pop',
 'nz pop',
 'britpop',
 'mexican pop',
 'europop',
 'swamp pop',
 'german pop',
 'neo-synthpop',
 'italian adult pop',
 'sophisti-pop',
 'popping',
 'power pop',
 'synthpop',
 'deep dance pop',
 'transpop',
 'uk pop',
 'pop punk',
 'modern alternative pop',
 'nederpop',
 'classic uk pop',
 'swedish synthpop',
 'k-pop boy group',
 'yakut pop',
 'classic italian pop',
 'bow pop',
 'acoustic pop',
 'indie poptimism',
 'modern power pop',
 'scandipop',
 'socal pop punk',
 'brill building pop',
 'dream pop',
 'pop',
 'danish pop',
 'new wave pop',
 'indie pop',
 'baroque pop',
 'k-pop',
 'canadian pop',
 'sunshine pop',
 'jangle pop',
 'pop dance',
 'swedish pop',
 'puerto rican pop',
 'neon pop punk',
 'afropop',
 'etherpop',
 'candy pop',
 'pop r&b',
 'space age pop',
 'french indie pop',
 'viral pop',
 'pacific islands pop',
 'shiver pop',
 'pop soul',
 'pop emo',
 'art pop',
 'austropop',
 'gauze pop',
 'colombian pop',
 'k-pop girl group',
 'australian pop',
 'nigerian pop',
 'bubblegum pop',
 'barbadian pop',
]
ROCK = [
    'progressive rock',
 'christian alternative rock',
 'comedy rock',
 'rock',
 'classic garage rock',
 'psychedelic rock',
 'rock drums',
 'classic rock',
 'boston rock',
 'canadian rock',
 'detroit rock',
 'pub rock',
 'rock keyboard',
 'piano rock',
 'irish rock',
 'rock of gibraltar',
 'art rock',
 'pop rock',
 'roots rock',
 'australian rock',
 'alternative rock',
 'german rock',
 'yacht rock',
 'soft rock',
 'swedish hard rock',
 'swamp rock',
 'deep soft rock',
 'rock-and-roll',
 'hard rock',
 'mexican classic rock',
 'dance rock',
 'rockabilly',
 'swedish melodic rock',
 'garage rock',
 'modern alternative rock',
 'flute rock',
 'kindie rock',
 'rocksteady',
 'indie rock',
 'southern rock',
 'acid rock',
 'german hard rock',
 'sleaze rock',
 'album rock',
 'gothic rock',
 'modern rock',
 'deep classic garage rock',
 'lovers rock',
 'glam rock',
 'scottish rock',
 'dutch rock',
 'classic canadian rock',
 'heartland rock',
 'kiwi rock',
 'celtic rock',
 'symphonic rock',
 'australian alternative rock',
    # indie
 'michigan indie',
 'athens indie',
 'indiecoustica',
 'maine indie',
 'bergen indie',
 'nashville indie',
 'australian indie',
 'canadian indie',
 'eau claire indie',
 'christian indie',
 'tempe indie',
 'sacramento indie',
 'seattle indie',
    # punk
 'uk post-punk',
 'protopunk',
 'anarcho-punk',
 'post-punk',
 'cyberpunk',
 'steampunk',
 'art punk',
 'chicano punk',
 'glam punk',
 'punk',
    # metal
 'progressive metal',
 'alternative metal',
 'glam metal',
 'canadian metal',
 'neo classical metal',
 'birmingham metal',
 'nu metal',
 'us power metal',
 'thrash metal',
 'proto-metal',
 'german metal',
 'metal',
    
 'british invasion',
]
LATIN = [
   'latin worship',
 'latin christian',
 'latin pop',
 'canadian latin',
 'latin viral pop',
    # reggeaton
    'pop reggaeton',
 'reggaeton flow',
 'reggaeton',
    # salsa
    'salsa', 'modern salsa', 'salsa colombiana',
    # pop
     'pop venezolano',
    
 'nueva musica mexicana', 'banda', 'norteno', 
 'tierra caliente', 'ranchera', 'samba', 'bolero', 'bachata',
]
ELECTRO = [
    # electro
    'electropowerpop',
 'electronica',
 'electro',
 'progressive electro house',
 'australian electropop',
 'danish electronic',
 'electropop',
 'electro house',
 'swedish electropop',
 'electronic trap',
    'indietronica',
    'synthesizer',
    # house
    'disco house',
 'filter house',
 'pop house',
 'classic house',
 'tropical house',
 'house',
 'dutch house',
 'ambient house',
 'progressive house',
 'hip house',
 'vocal house',
 'deep groove house',
 'diva house',
 'italo house',
 'acid house',
 'deep euro house',
 'slap house',
 'deep house',
    # dance
    'eurodance',
 'australian dance',
 'german dance',
 'italo dance',

 'alternative dance',
 'uk dance',
 'bubblegum dance',
    # disco
    'italian disco', 'post-disco', 'disco',
    # dnb
    'french dnb', 'uk dnb', 'dancefloor dnb',
    # edm
    'edm', 'pop edm',
    'dub', 'classic dubstep',
    'brostep',
    # techno
    'german techno',
    'hard minimal techno',
]
COUNTRY = [
    #country
    'classic country pop',
 'country dawn',
 'canadian country',
 'country',
 'outlaw country',
 'country gospel',
 'truck-driving country',
 'arkansas country',
 'texas country',
 'country road',
 'alternative country',
 'country pop',
 'country rock',
 'classic australian country',
 'modern country rock',
 'contemporary country',
 'country rap',
 'australian country',
 'traditional country',
 'oklahoma country',
 
 'cowpunk',
]
FOLK = [
    'american folk revival',
 "children's folk",
 'ectofolk',
 'freak folk',
 'british folk',
 'folk rock',
 'psychedelic folk rock',
 'folk-pop',
 'folk',
 'ambient folk',
 'indie folk',
 'irish folk',
 'psychedelic folk',
    'bluegrass',
    'banjo',
    # americana
    'new americana', 'deep new americana', 'new england americana',
]
HIP_HOP = [
 'canadian hip hop',
 'oakland hip hop',
 'kentucky hip hop',
 'deep underground hip hop',
 'southern hip hop',
 'canadian old school hip hop',
 'detroit hip hop',
 'hardcore hip hop',
 'miami hip hop',
 'harlem hip hop',
 'virginia hip hop',
 'lgbtq+ hip hop',
 'memphis hip hop',
 'ohio hip hop',
 'underground hip hop',
 'minnesota hip hop',
 'bronx hip hop',
 'old school atlanta hip hop',
 'alternative hip hop',
 'hip hop',
 'boston hip hop',
 'asian american hip hop',
 'north carolina hip hop',
 'seattle hip hop',
 'indonesian hip hop',
 'tennessee hip hop',
 'atl hip hop',
 'south carolina hip hop',
 'portland hip hop',
 'korean old school hip hop',
 'experimental hip hop',
 'desi hip hop',
 'latin hip hop',
 'argentine hip hop',
 'queens hip hop',
 'mississippi hip hop',
 'australian hip hop',
 'nigerian hip hop',
 'conscious hip hop',
 'east coast hip hop',
 'golden age hip hop',
 'manchester hip hop',
 'political hip hop',
 'old school hip hop',
 'hindi hip hop',
 'jamaican hip hop',
 'pinoy hip hop',
    # rap
    'dmv rap',
 'comedy rap',
 'deep southern trap',
 'pittsburgh rap',
 'trap soul',
 'trap latino',
 'dirty south rap',
 'melodic rap',
 'rhode island rap',
 'alabama rap',
 'emo rap',
 'rap kreyol',
 'desi trap',
 'rap latina',
 'indie pop rap',
 'bass trap',
 'trap queen',
 'traprun',
 'houston rap',
 'canadian trap',
 'chicano rap',
 'viral rap',
 'trap argentino',
 'st louis rap',
 'vapor trap',
 'k-rap',
 'viral trap',
 'texas latin rap',
 'rap conscient',
 'gangster rap',
 'philly rap',
 'trap',
 'dfw rap',
 'cali rap',
 'west coast rap',
 'atl trap',
 'new jersey rap',
 'rap metal',
 'battle rap',
 'rap',
 'sad rap',
 'detroit trap',
 'baton rouge rap',
 'new orleans rap',
 'nyc rap',
 'toronto rap',
 'pop rap',
 'florida rap',
 'chicago rap',
 'rap rock',
 'london rap',
    'chicago bop',
    
    'afro dancehall',
]
JAZZ_INFLUENCED = [
    # jazz
    'cool jazz',
 'japanese jazz',
 'south african jazz',
 'jazz blues',
 'vocal jazz',
 'jazz trumpet',
 'jazz saxophone',
 'jazz organ',
 'latin jazz',
 'jazz funk',
 'contemporary vocal jazz',
 'samba-jazz',
 'jazz rap',
 'jazz piano',
 'new orleans jazz',
 'jazz',
 'jazz guitar',
 'brazilian jazz',
 'jazz fusion',
 'bossa nova jazz',
 'contemporary jazz',
 'jazz drums',
 'soul jazz',
 'smooth jazz',
 'free jazz',
 'jazz trombone',
 'nu jazz',
 'avant-garde jazz',
 'dixieland',
    'hard bop', 'bebop', 'contemporary post-bop',
    # blues
    'memphis blues',
 'harmonica blues',
 'electric blues',
 'piano blues',
 'canadian blues',
 'chicago blues',
 'blues',
 'swamp blues',
 'soul blues',
 'traditional blues',
 'new orleans blues',
 'blues rock',
 'modern blues rock',
 'psychedelic blues-rock',
 'punk blues',
 'louisiana blues',
 'modern blues',
 'acoustic blues',
 'jump blues',
 'delta blues',
 'texas blues',
 'british blues',
 'rhythm and blues',
 'deep motown', 
    # soul
 'philly soul',
 'soul',
 'retro soul',
 'bedroom soul',
 'classic soul',
 'memphis soul',
 'souldies',
 'instrumental soul',
 'new orleans soul',
 'indie soul',
 'chicago soul',
 'psychedelic soul',
 'northern soul',
 'british soul',
 'neo soul',
 'southern soul',
    # r&b
    'gospel r&b',
 'contemporary r&b',
 'canadian contemporary r&b',
 'r&b argentino',
 'r&b en espanol',
 'uk contemporary r&b',
 'afro r&b',
 'indie r&b',
 'r&b',
 'alternative r&b',
 'australian r&b',
    'afrofuturism', 'afrobeat',
    'gospel',
    # swing
    'new jack swing', 'swing revival', 'swing italiano', 'swing',
    'harlem renaissance',
    
    'bboy',
]
REGGAE = [
    # ska
    'ska',
    'jamaican ska',
    'ska revival',
    
    'modern reggae',
 'uk reggae',
 'reggae',
 'reggae fusion',
 'reggaeton colombiano',
 'early reggae',
 'roots reggae',
    # dancehall
    'old school dancehall', 'dancehall', 'dancehall queen',
    
]
FUNK = ['funk rock',
 'synth funk',
 'brit funk',
 'funk',
 'uk funky',
 'instrumental funk',
 'liquid funk',
 'latin funk',
 'g funk',
 'new orleans funk',
 'funk metal']


# MISC?
CLASSIFIED_GENRES = [*POP, *ROCK, *LATIN, *ELECTRO, *COUNTRY, *FOLK, *HIP_HOP, *JAZZ_INFLUENCED, *REGGAE, *FUNK]

unclassified_genres = [genre for genre in all_genres if genre not in CLASSIFIED_GENRES]

[genre for genre in unclassified_genres if '' in genre]
unclassified_genres



['scottish new wave',
 'second line',
 'pixie',
 'sierreno',
 'riddim',
 'quiet storm',
 'alt z',
 'classical performance',
 'parody',
 'screamo',
 'musica mexicana',
 'sad sierreno',
 'hi-nrg',
 'idol',
 'melbourne bounce international',
 'mellow gold',
 'neo-psychedelic',
 "preschool children's music",
 'world worship',
 'boy band',
 'minneapolis sound',
 'neoperreo',
 'deep talent show',
 'lo-fi brasileiro',
 'chicago drill',
 'american oi',
 'orchestral performance',
 'background piano',
 'big room',
 'christian a cappella',
 'speedrun',
 'nashville sound',
 'yodeling',
 'grebo',
 'emo',
 "children's music",
 'neo mellow',
 'redneck',
 'new wave',
 'cancion melodica',
 'honky tonk',
 'jawaiian',
 'instrumental surf',
 'classic girl group',
 'old school thrash',
 'lilith',
 "children's story",
 'drum and bass',
 'worship',
 'freestyle',
 'cosmic american',
 'neue deutsche welle',
 'new romantic',
 'rare groove',
 'melancholia',
 'skiffle',
 'shanty',
 'fourth world',
 'ccm',
 'chris

In [34]:
GENERIC_GENRES = {
'LATIN': LATIN, 
'ELECTRO': ELECTRO, 
'COUNTRY': COUNTRY, 
'FOLK': FOLK, 
'HIP_HOP': HIP_HOP, 
'JAZZ_INFLUENCED': JAZZ_INFLUENCED, 
'REGGAE': REGGAE, 
'FUNK': FUNK, 
'POP': POP, 
'ROCK': ROCK,
'UNCATEGORIZED': unclassified_genres
}

def get_genre_category(genre):
    for generic_genre in GENERIC_GENRES.keys():
        if genre in GENERIC_GENRES[generic_genre]:
            return generic_genre

get_genre_category('deep house')

'ELECTRO'

In [35]:
def get_the_most_generic_genre_possible(artist_array):
    if artist_array:
        match = df_artist_genres[
            df_artist_genres.orig_name == artist_array[0]
        ]
        try:
            genres = match.iloc[0].genres
            if genres:
                found_genre = get_genre_category(genres[0])
                if found_genre is 'UNCATEGORIZED':
                    found_genre = get_genre_category(genres[1])
                    if found_genre is 'UNCATEGORIZED':
                        found_genre = get_genre_category(genres[2])
                return found_genre or 'UNCATEGORIZED'
        except:
            pass
            
    return 'UNCATEGORIZED'

artist_genre_merged_in = songs_that_made_it_above_x.copy()

artist_genre_merged_in['artist_array'] =  artist_genre_merged_in.performer.apply(lambda s: get_array_of_performers(s))

artist_genre_merged_in['generic_genre'] = artist_genre_merged_in.artist_array.apply(lambda a: get_the_most_generic_genre_possible(a))


In [36]:
artist_genre_merged_in.generic_genre.value_counts()

POP                1879
ROCK               1087
UNCATEGORIZED       575
JAZZ_INFLUENCED     555
HIP_HOP             423
ELECTRO             260
COUNTRY             136
FUNK                 81
LATIN                45
FOLK                 42
REGGAE               11
Name: generic_genre, dtype: int64

In [37]:
artist_genre_merged_in[
    artist_genre_merged_in.generic_genre.isna()
]

Unnamed: 0,song_id,chart_position,chart_debut,song,performer,artist_array,generic_genre


In [38]:
artist_genre_merged_in

Unnamed: 0,song_id,chart_position,chart_debut,song,performer,artist_array,generic_genre
0,#9 DreamJohn Lennon,9,1974-12-21,#9 Dream,John Lennon,[John Lennon],ROCK
1,'03 Bonnie & ClydeJay-Z Featuring Beyonce Knowles,4,2002-10-26,'03 Bonnie & Clyde,Jay-Z Featuring Beyonce Knowles,"[Jay-Z, Beyonce Knowles]",HIP_HOP
2,'65 Love AffairPaul Davis,6,1982-02-27,'65 Love Affair,Paul Davis,[Paul Davis],ROCK
3,('til) I Kissed YouThe Everly Brothers,4,1959-08-15,('til) I Kissed You,The Everly Brothers,[The Everly Brothers],POP
4,(Can't Live Without Your) Love And AffectionNe...,1,1990-07-07,(Can't Live Without Your) Love And Affection,Nelson,[Nelson],COUNTRY
...,...,...,...,...,...,...,...
5089,everything i wantedBillie Eilish,8,2019-11-23,everything i wanted,Billie Eilish,[Billie Eilish],POP
5090,iSpyKYLE Featuring Lil Yachty,4,2017-01-14,iSpy,KYLE Featuring Lil Yachty,"[KYLE, Lil Yachty]",POP
5091,interludeJ. Cole,8,2021-05-22,interlude,J. Cole,[J. Cole],HIP_HOP
5092,"my.lifeJ. Cole, 21 Savage & Morray",2,2021-05-29,my.life,"J. Cole, 21 Savage & Morray","[J. Cole, 21 Savage, Morray]",HIP_HOP


# 4. Export

In [39]:
ready_for_export = artist_genre_merged_in

In [40]:
ready_for_export.to_csv('./data/1 DONE RIGHT OUTPUT unique songs.csv', index=False)