## Initial test for determining viability for long list of songs

In [89]:
# Import Dependencies
import requests
import lyricsgenius
import time
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
import re

# Import Access Tokens and Credentials
from config import genius_token, spotify_id, spotify_secret

In [137]:
# Import list of songs
test_df = pd.read_csv("song_files/combined.csv")

In [138]:
test_df.head()

Unnamed: 0,Song,Artist
0,Satisfaction,The Rolling Stones
1,Respect,Aretha Franklin
2,Stairway to Heaven,Led Zeppelin
3,Like a Rolling Stone,Bob Dylan
4,Born to Run,Bruce Springsteen


In [139]:
# Rename columns to match copdied code from "proof of concept"
test_df.columns = ["song", "artist"]

In [140]:
# Convert dataframe to list of dictionaries
input_list = test_df.to_dict('records')

### API Step 1: Get Lyrics from Genius

In [141]:
# Initialize lyricgenius Class
genius = lyricsgenius.Genius(genius_token)

In [142]:
# Remove section headers (e.g. [Chorus]) from lyrics when searching
genius.remove_section_headers = True 

In [146]:
# Initialize Results List and Error List
results = []
errors = []

# Retrieve Song Lyrics

start = time.time()

for item in input_list:
    try:
        song = genius.search_song(item["song"], artist = item["artist"])
        temp_dict = {"song":item["song"],
                     "artist": item["artist"],
                     "genius_artist": song.artist,
                     "spotify_artist": "",
                     "artist_id": "",
                     "album": "",
                     "artist_genre(s)":[], 
                     "lyrics": song.lyrics,
                     "audio_features": {}
                    } 
        results.append(temp_dict)
    except:
        temp_dict = {"song":item["song"],
                     "artist": item["artist"],
                     "lyrics": "null"
                    }
        errors.append(temp_dict)
print("finished")
duration = time.time() - start
print(f'Ran in {duration} seconds/{duration/60} minutes')
print(f'There were {len(errors)} errors')

Searching for "Satisfaction" by The Rolling Stones...
Done.
Searching for "Respect" by Aretha Franklin...
Done.
Searching for "Stairway to Heaven" by Led Zeppelin...
Done.
Searching for "Like a Rolling Stone" by Bob Dylan...
Done.
Searching for "Born to Run" by Bruce Springsteen...
Done.
Searching for "Hotel California" by The Eagles...
Done.
Searching for "Light My Fire" by The Doors...
Done.
Searching for "Good Vibrations" by The Beach Boys...
Done.
Searching for "Hey Jude" by The Beatles...
Done.
Searching for "Imagine" by John Lennon...
Done.
Searching for "Louie Louie" by The Kingsmen...
Done.
Searching for "Yesterday" by The Beatles...
Done.
Searching for "My Generation" by The Who...
Done.
Searching for "What's Going On" by Marvin Gaye...
Done.
Searching for "Johnny B. Goode" by Chuck Berry...
Done.
Searching for "Layla" by Derek & The Dominos...
Done.
Searching for "Won't Get Fooled Again" by The Who...
Done.
Searching for "Jailhouse Rock" by Elvis Presley...
Done.
Searching fo

Done.
Searching for "Banned in D.C" by Bad Brains...
Done.
Searching for "X-Offender" by Blondie...
Done.
Searching for "Roadrunner" by The Modern Lovers...
Done.
Searching for "Basket Case" by Green Day...
Done.
Searching for "Radio, Radio" by Elvis Costello...
Done.
Searching for "Nazi Punks Fuck Off" by Dead Kennedys...
Done.
Searching for "See No Evil" by Television...
Done.
Searching for "TV Party" by Black Flag...
Done.
Searching for "Hoochie Coochie Man" by Muddy Waters...
Done.
Searching for "The Thrill is Gone" by B.B. King...
Done.
Searching for "Me And The Devil Blues" by Robert Johnson...
Done.
Searching for "Stone Crazy" by Buddy Guy...
Done.
Searching for "I’d Rather Go Blind" by Etta James...
Done.
Searching for "I’m Tore Down" by Freddie King...
Done.
Searching for "Call It Stormy Monday" by T-Bone Walker...
Done.
Searching for "Boogie Chillen’" by John Lee Hooker...
Done.
Searching for "Red House" by Jimi Hendrix...
Done.
Searching for "Smokestack Lightning" by Howlin’

Done.
Searching for "Paid in Full" by Eric B. & Rakim...
Done.
Searching for "Killing Me Softly" by Fugees...
Done.
Searching for "U Can't Touch This" by MC Hammer...
Done.
Searching for "Hold It Now, Hit It" by Beastie Boys...
Done.
Searching for "It Was a Good Day" by Ice Cube...
Done.
Searching for "The Humpty Dance" by Digital Underground...
Done.
Searching for "Check the Rhime" by A Tribe Called Quest...
Done.
Searching for "How Ya Like Me Now" by Kool Moe Dee...
Done.
Searching for "It's All About the Benjamins" by Puff Daddy...
Done.
Searching for "Tha Crossroads" by Bone Thugs-n-Harmony...
Done.
Searching for "Insane in the Brain" by Cypress Hill...
Done.
Searching for "Ladies First" by Queen Latifah...
Done.
Searching for "Hot in Herre" by Nelly...
Done.
Searching for "It Takes Two" by Rob Base & DJ E-Z Rock...
Done.
Searching for "Gangsta's Paradise" by Coolio...
Done.
Searching for "Wild Thing" by Tone Lōc...
Done.
Searching for "It's Like That" by Run-D.M.C....
Done.
Search

Done.
Searching for "Kiss an Angel Good Mornin'" by Charley Pride...
Done.
Searching for "Family Tradition" by Hank Williams, Jr....
Done.
Searching for "Go Rest High on That Mountain" by Vince Gill...
Done.
Searching for "Lovesick Blues" by Hank Williams...
Done.
Searching for "Don't Rock the Jukebox" by Alan Jackson...
Done.
Searching for "Tennessee Waltz" by Patti Page...
Done.
Searching for "When You Say Nothing at All" by Alison Krauss...
Done.
Searching for "God Bless the Usa" by Lee Greenwood...
Done.
Searching for "Green, Green Grass of Home" by Porter Wagoner...
Done.
Searching for "It's Your Love" by Tim McGraw & Faith Hill...
Done.
Searching for "There Stands the Glass" by Webb Pierce...
Done.
Searching for "The Devil Went Down to Georgia" by The Charlie Daniels Band...
Done.
Searching for "Chiseled in Stone" by Vern Gosdin...
Done.
Searching for "Don't Toss Us Away" by Patty Loveless...
Done.
Searching for "A Boy Named Sue" by Johnny Cash...
Done.
Searching for "You Are My 

### API Step 2: Get Artist ID from Spotify

In [144]:
# Initialize Spotipy Class
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=spotify_id, client_secret=spotify_secret))

In [147]:
# Iterate through results add artist Spotify ID

start = time.time()

artist_search_errors = []

for item in results:
    
    try:
        sp_result = sp.search(item["artist"])

        artist_results = sp_result['tracks']['items']
        
        counter = 1
        for i in artist_results:
            spotify_artist = i["album"]["artists"][0]["name"]
            artist_id = i["album"]["artists"][0]["id"]
            
            # Creating different case/and .vs & artist strings; artist names not always consistant 
            genius_lower = item["genius_artist"].lower()
            genius_ampersand = re.sub('and', '&', genius_lower)
            genius_and = re.sub('&', 'and', genius_lower)           
            
            artist_lower = item["artist"].lower()
            artist_ampersand = re.sub('and', '&', artist_lower)
            artist_and = re.sub('&', 'and', artist_lower)
            
            artist_test_list = []
            artist_test_list.extend([genius_lower, genius_ampersand, genius_and, 
                                     artist_lower, artist_ampersand, artist_and])
            
#             if spotify_artist.lower() == artist_lower or spotify_artist.lower() == artist_ampersand or spotify_artist.lower() == artist_and:
            if spotify_artist.lower() in artist_test_list:
                item["artist_id"]=artist_id
                item["spotify_artist"]=spotify_artist
#                 print(spotify_artist, item["artist"])
                break
            
            elif counter == len(artist_results):
                artist_search_errors.append(item["artist"])
            
            else:
                print(f'Search Artist: {item["artist"]}, Spotify Artist: {spotify_artist}')
                counter += 1
    
    except:
        # print(f"yikes, an error happened with {item['artist']}")
        artist_search_errors.append(item["artist"])
    
print("finished")
duration = time.time() - start
print(f'Ran in {duration} seconds/{duration/60} minutes')
print(f'There were {len(artist_search_errors)} errors')

Search Artist: Bob Dylan, Spotify Artist: Fall Out Boy
Search Artist: The Eagles, Spotify Artist: Eagles
Search Artist: The Eagles, Spotify Artist: Eagles
Search Artist: The Eagles, Spotify Artist: Eagles
Search Artist: The Eagles, Spotify Artist: Eagles
Search Artist: The Eagles, Spotify Artist: Eagles
Search Artist: The Eagles, Spotify Artist: Eagles
Search Artist: The Eagles, Spotify Artist: Eagles
Search Artist: The Eagles, Spotify Artist: Eagles
Search Artist: The Eagles, Spotify Artist: Eagles
Search Artist: James Brown, Spotify Artist: SNBRN
Search Artist: Queen, Spotify Artist: Red Velvet
Search Artist: Queen, Spotify Artist: Megan Thee Stallion
Search Artist: Queen, Spotify Artist: Loren Gray
Search Artist: Ray Charles, Spotify Artist: Third World Don
Search Artist: Buddy Holly, Spotify Artist: Weezer
Search Artist: Buddy Holly, Spotify Artist: Various Artists
Search Artist: Bob Dylan, Spotify Artist: Fall Out Boy
Search Artist: David Bowie, Spotify Artist: Queen
Search Artist

Search Artist: Muddy Waters, Spotify Artist: LP
Search Artist: Susan Tedeschi, Spotify Artist: Eric Church
Search Artist: B.B. King, Spotify Artist: Lil Wayne
Search Artist: B.B. King, Spotify Artist: Sean Kingston
Search Artist: Howlin’ Wolf, Spotify Artist: Howlin' Wolf
Search Artist: Howlin’ Wolf, Spotify Artist: Howlin' Wolf
Search Artist: Howlin’ Wolf, Spotify Artist: Howlin' Wolf
Search Artist: Howlin’ Wolf, Spotify Artist: Howlin' Wolf
Search Artist: Howlin’ Wolf, Spotify Artist: Howlin' Wolf
Search Artist: Howlin’ Wolf, Spotify Artist: Howlin' Wolf
Search Artist: Howlin’ Wolf, Spotify Artist: Howlin' Wolf
Search Artist: Howlin’ Wolf, Spotify Artist: Guy Daniels
Search Artist: Howlin’ Wolf, Spotify Artist: Howlin' Wolf
Search Artist: John Mooney, Spotify Artist: Various Artists
Search Artist: Howlin’ Wolf, Spotify Artist: Howlin' Wolf
Search Artist: Howlin’ Wolf, Spotify Artist: Howlin' Wolf
Search Artist: Howlin’ Wolf, Spotify Artist: Howlin' Wolf
Search Artist: Howlin’ Wolf, S

Search Artist: Game, Spotify Artist: 24kGoldn
Search Artist: Game, Spotify Artist: Queen
Search Artist: Game, Spotify Artist: Sect Unit
Search Artist: Game, Spotify Artist: K CAMP
Search Artist: Game, Spotify Artist: Summer Walker
Search Artist: Game, Spotify Artist: PlayaPhonk
Search Artist: Game, Spotify Artist: Falling In Reverse
Search Artist: Game, Spotify Artist: The Weeknd
Search Artist: Game, Spotify Artist: Cold Ca$e
Search Artist: Method Man, Spotify Artist: Wu-Tang Clan
Search Artist: De La Soul, Spotify Artist: Tom Misch
Search Artist: De La Soul, Spotify Artist: A Tribe Called Quest
Search Artist: Nas, Spotify Artist: Ariana Grande
Search Artist: Nas, Spotify Artist: Lil Nas X
Search Artist: Lil Wayne, Spotify Artist: Drake
Search Artist: Lil Wayne, Spotify Artist: Lil Wayne R
Search Artist: Lil Wayne, Spotify Artist: Polo G
Search Artist: Lil Wayne, Spotify Artist: Tory Lanez
Search Artist: Doug E. Fresh, Spotify Artist: Slick Rick
Search Artist: Doug E. Fresh, Spotify Ar

Search Artist: Hank Williams, Spotify Artist: Hank Williams, Jr.
Search Artist: Hank Williams, Spotify Artist: David Allan Coe
Search Artist: Hank Williams, Spotify Artist: Hank Williams, Jr.
Search Artist: Hank Williams, Spotify Artist: Hank Williams, Jr.
Search Artist: Hank Williams, Spotify Artist: Hank Williams, Jr.
Search Artist: Willie Nelson, Spotify Artist: Toby Keith
Search Artist: Willie Nelson, Spotify Artist: Off The Record Karaoke
Search Artist: Willie Nelson, Spotify Artist: The Highwaymen
Search Artist: Willie Nelson, Spotify Artist: The Lacs
Search Artist: Willie Nelson, Spotify Artist: Waylon Jennings
Search Artist: Willie Nelson, Spotify Artist: Sturgill Simpson
Search Artist: Johnny Cash, Spotify Artist: Jason Aldean
Search Artist: Reba McEntire, Spotify Artist: Thomas Rhett
Search Artist: Hank Williams, Spotify Artist: Hank Williams, Jr.
Search Artist: Hank Williams, Spotify Artist: David Allan Coe
Search Artist: Hank Williams, Spotify Artist: Hank Williams, Jr.
Sea

In [151]:
results[3]

{'song': 'Like a Rolling Stone',
 'artist': 'Bob Dylan',
 'genius_artist': 'Bob Dylan',
 'spotify_artist': 'Bob Dylan',
 'artist_id': '74ASZWbe4lXaubB36ztrGX',
 'album': '',
 'artist_genre(s)': [],
 'lyrics': 'Once upon a time you dressed so fine\nThrew the bums a dime in your prime, didn\'t you?\nPeople call, say "Beware doll, you\'re bound to fall"\nYou thought they were all a-kiddin\' you\nYou used to laugh about\nEverybody that was hangin\' out\nNow you don\'t talk so loud\nNow you don\'t seem so proud\nAbout having to be scrounging your next meal\n\nHow does it feel?\nHow does it feel?\nTo be without a home?\nLike a complete unknown?\nLike a rolling stone?\n\nAw, you\'ve gone to the finest school all right, Miss Lonely\nBut ya know ya only used to get juiced in it\nNobody\'s ever taught ya how to live out on the street\nAnd now you’re gonna have to get used to it\nYou say you never compromise\nWith the mystery tramp, but now you realize\nHe\'s not selling any alibis\nAs you stare 

### API Step 3: Search by Artist ID to Retrieve Genres and Audo Features

Note: Spotify API cannot search using artist

In [152]:
# Initialize Spotify API v.2
AUTH_URL = 'https://accounts.spotify.com/api/token'

auth_response = requests.post(AUTH_URL, {
    'grant_type': 'client_credentials',
    'client_id': spotify_id,
    'client_secret': spotify_secret,
})

# convert the response to JSON
auth_response_data = auth_response.json()

# save the access token
access_token = auth_response_data['access_token']

headers = {
    'Authorization': 'Bearer {token}'.format(token=access_token)
}
BASE_URL = 'https://api.spotify.com/v1/'

In [153]:
# Iterate through results, search by artist ID
# Get genres, add to list
# If results song is in album, get album and audio features

start = time.time()

spotify_errors = []

for item in results:
    
    try:
#         print(item["artist"]) # uncomment when troubleshooting
        artist_id = item["artist_id"]
        artist_genres = []
        r = requests.get(BASE_URL + 'artists/' + artist_id + '/albums', 
                     headers=headers, 
                     params={'include_groups': 'album', 'limit': 50})
#         print(r) # uncomment when troubleshooting
        d = r.json()
        albums = d["items"]

        for x in range(len(albums)):
            album_name = albums[x]["name"]

            # Get genres associated with artist - not always consistent w/ album, so iterate though all albums
            artist = sp.artist(albums[x]["artists"][0]["external_urls"]["spotify"])
            genres = artist["genres"]
            for genre in genres:
                if genre not in artist_genres:
                    artist_genres.append(genre)
                    
#         print(item["artist"], artist_genres) # uncomment when troubleshooting
        item["artist_genre(s)"] = artist_genres
        
    except:
        spotify_errors.append(item["artist"])
    
    
print("finished")
duration = time.time() - start
print(f'Ran in {duration} seconds/{duration/60} minutes')
print(f'There were {len(spotify_errors)} errors')

finished
Ran in 1160.000637769699 seconds/19.33334396282832 minutes
There were 43 errors


### EDA and Cleaning

In [222]:
df = pd.DataFrame(results)

In [223]:
df.head(10)

Unnamed: 0,song,artist,genius_artist,spotify_artist,artist_id,album,artist_genre(s),lyrics,audio_features
0,Satisfaction,The Rolling Stones,The Rolling Stones,The Rolling Stones,22bE4uQ6baNwSHPVcDxLCe,,"[british invasion, classic rock, rock, blues, ...",I can't get no satisfaction\nI can't get no sa...,{}
1,Respect,Aretha Franklin,Aretha Franklin,Aretha Franklin,7nwUJBm0HE4ZxD3f5cy5ok,,"[classic soul, jazz blues, memphis soul, soul,...","What you want, baby, I got it\nWhat you need, ...",{}
2,Stairway to Heaven,Led Zeppelin,Led Zeppelin,Led Zeppelin,36QJpDe2go2KgaRleHCDTp,,"[album rock, classic rock, hard rock, rock]",There's a lady who's sure all that glitters is...,{}
3,Like a Rolling Stone,Bob Dylan,Bob Dylan,Bob Dylan,74ASZWbe4lXaubB36ztrGX,,"[album rock, classic rock, country rock, folk,...",Once upon a time you dressed so fine\nThrew th...,{}
4,Born to Run,Bruce Springsteen,Bruce Springsteen,Bruce Springsteen,3eqjTLE0HfPfh78zjh6TqT,,"[classic rock, heartland rock, mellow gold, pe...","In the day, we sweat it out on the streets\nOf...",{}
5,Hotel California,The Eagles,Genius Lists,,,,[],In alphabetical order by title:\nCan't find yo...,{}
6,Light My Fire,The Doors,The Doors,The Doors,22WZ7M8sxp5THdruNY3gXt,,"[acid rock, classic rock, psychedelic rock, rock]",You know that it would be untrue\nYou know tha...,{}
7,Good Vibrations,The Beach Boys,The Beach Boys,The Beach Boys,3oDbviiivRWhXwIE8hxkVV,,"[baroque pop, brill building pop, classic rock...",I-I love the colorful clothes she wears\nAnd t...,{}
8,Hey Jude,The Beatles,The Beatles,The Beatles,3WrFJ7ztbogyGnTHbHJFl2,,"[beatlesque, british invasion, classic rock, m...","Hey Jude, don't make it bad\nTake a sad song a...",{}
9,Imagine,John Lennon,John Lennon,John Lennon,4x1nvY2FN8jxqAFA0DA02H,,"[album rock, art rock, beatlesque, classic roc...",Imagine there's no heaven\nIt's easy if you tr...,{}


#### Analyze Genres

In [248]:
# Get list of lists of genres
genres_lol = df["artist_genre(s)"].to_list()

In [249]:
genres_lol

[['british invasion',
  'classic rock',
  'rock',
  'blues',
  'blues rock',
  'chicago blues',
  'electric blues',
  'soul',
  'traditional blues'],
 ['classic soul', 'jazz blues', 'memphis soul', 'soul', 'southern soul'],
 ['album rock', 'classic rock', 'hard rock', 'rock'],
 ['album rock',
  'classic rock',
  'country rock',
  'folk',
  'folk rock',
  'mellow gold',
  'rock',
  'roots rock',
  'singer-songwriter'],
 ['classic rock',
  'heartland rock',
  'mellow gold',
  'permanent wave',
  'rock',
  'singer-songwriter'],
 [],
 ['acid rock', 'classic rock', 'psychedelic rock', 'rock'],
 ['baroque pop',
  'brill building pop',
  'classic rock',
  'psychedelic rock',
  'rock',
  'sunshine pop'],
 ['beatlesque',
  'british invasion',
  'classic rock',
  'merseybeat',
  'psychedelic rock',
  'rock'],
 ['album rock',
  'art rock',
  'beatlesque',
  'classic rock',
  'folk rock',
  'mellow gold',
  'rock',
  'soft rock'],
 ['classic garage rock'],
 ['beatlesque',
  'british invasion',
  '

In [255]:
# Initialize genre list
genres = []

# Remove genres from individual lists and add to master genre list
for genre_list in genres_lol:
    for i in genre_list:
        genres.append(i)
print(f'There are {len(genres)} non-unique genres')

There are 2667 non-unique genres


In [256]:
# Get list of unique Genres

# Initialize genre list
unique_genres = []

for item in genres:
    if item not in unique_genres:
        unique_genres.append(item)
print(f'There are {len(unique_genres)} unique genres')

There are 267 unique genres


In [252]:
unique_genres.sort()
unique_genres

['acid rock',
 'acoustic blues',
 'adult standards',
 'afrofuturism',
 'album rock',
 'alternative hip hop',
 'alternative metal',
 'alternative pop',
 'alternative rock',
 'american folk revival',
 'american modern classical',
 'anti-folk',
 'appalachian folk',
 'arkansas country',
 'art punk',
 'art rock',
 'atl hip hop',
 'australian rock',
 'bakersfield sound',
 'baroque pop',
 'battle rap',
 'bboy',
 'beatboxing',
 'beatlesque',
 'big band',
 'black punk',
 'bluegrass',
 'bluegrass gospel',
 'blues',
 'blues rock',
 'boogie-woogie',
 'boston rock',
 'brill building pop',
 'british blues',
 'british invasion',
 'bronx hip hop',
 'bubblegum pop',
 'cali rap',
 'california hardcore',
 'canadian country',
 'canadian pop',
 'candy pop',
 'celtic',
 'celtic rock',
 'chicago blues',
 'chicago rap',
 'chicano rap',
 'chopped and screwed',
 'choral',
 'classic canadian rock',
 'classic country pop',
 'classic garage rock',
 'classic girl group',
 'classic rock',
 'classic soul',
 'classica

In [275]:
# Get counts of each genre and add to list of dictionaries for sorting

for_sorting = []

for i in unique_genres:
    counter = 0
    for x in genres:
        if x == i:
            counter +=1
    print(i, counter)
#     for_sorting[i] = counter
    for_sorting.append({"genre":i, "count":counter})

british invasion 21
classic rock 111
rock 101
blues 67
blues rock 71
chicago blues 32
electric blues 66
soul 48
traditional blues 49
classic soul 20
jazz blues 14
memphis soul 12
southern soul 15
album rock 41
hard rock 22
country rock 73
folk 28
folk rock 41
mellow gold 53
roots rock 22
singer-songwriter 22
heartland rock 6
permanent wave 25
acid rock 6
psychedelic rock 29
baroque pop 3
brill building pop 27
sunshine pop 4
beatlesque 10
merseybeat 11
art rock 16
soft rock 34
classic garage rock 3
funk 20
motown 14
northern soul 3
quiet storm 12
adult standards 42
disco 6
rock-and-roll 31
rockabilly 31
rhythm and blues 4
british blues 6
indie r&b 2
lounge 3
glam rock 6
protopunk 8
bubblegum pop 8
melancholia 1
soul blues 26
classic girl group 2
southern rock 5
swamp rock 4
classic canadian rock 1
grunge 1
new wave 10
piano blues 11
vocal jazz 11
freakbeat 1
pop 3
r&b 7
piano rock 4
nashville sound 29
punk 40
japanese jazz 1
supergroup 1
jazz funk 1
post-disco 1
australian rock 1
metal 

In [282]:
# Create list of genres sorted by occurrence

sorting_df = pd.DataFrame(for_sorting)
sorting_df.sort_values(["count"], ascending=False, inplace=True)
sorted_genres = sorting_df.genre.to_list()

In [283]:
sorted_genres

['classic rock',
 'rock',
 'hip hop',
 'rap',
 'country rock',
 'blues rock',
 'blues',
 'electric blues',
 'country',
 'mellow gold',
 'gangster rap',
 'traditional blues',
 'soul',
 'east coast hip hop',
 'adult standards',
 'hardcore hip hop',
 'folk rock',
 'album rock',
 'punk',
 'soft rock',
 'old school hip hop',
 'chicago blues',
 'rockabilly',
 'rock-and-roll',
 'nashville sound',
 'modern blues',
 'psychedelic rock',
 'folk',
 'pop rap',
 'brill building pop',
 'soul blues',
 'permanent wave',
 'country road',
 'country dawn',
 'classic country pop',
 'golden age hip hop',
 'singer-songwriter',
 'hard rock',
 'roots rock',
 'british invasion',
 'funk',
 'classic soul',
 'southern hip hop',
 'outlaw country',
 'electro',
 'hip pop',
 'contemporary country',
 'art rock',
 'hardcore punk',
 'hip house',
 'texas blues',
 'acoustic blues',
 'southern soul',
 'jazz blues',
 'motown',
 'alternative rock',
 'conscious hip hop',
 'traditional country',
 'honky tonk',
 'quiet storm',
 

In [285]:
# Create finction to get most general genre for song/artist
def get_general_genre(x):
    for genre in sorted_genres:
        for artist_genre in x:
            if artist_genre == genre:
                return(artist_genre)

In [287]:
# Create column for most general genre

df["general_genre"] = df["artist_genre(s)"].apply(get_general_genre)

#### Clean Data, Tokenize, and Remove Stop Words

In [224]:
# Create fucntion to remove errors and "\n" from lyric strings

def remove_lyric_errors(x):
    x = re.sub('\n', ' ', x)
    x = re.sub(',', '', x)
    x = re.sub(';', '', x)
    x = x.lower()
    return(x)

In [225]:
# apply function
df["lyrics"] = df.lyrics.apply(remove_lyric_errors)

In [226]:
df.head(10)

Unnamed: 0,song,artist,genius_artist,spotify_artist,artist_id,album,artist_genre(s),lyrics,audio_features
0,Satisfaction,The Rolling Stones,The Rolling Stones,The Rolling Stones,22bE4uQ6baNwSHPVcDxLCe,,"[british invasion, classic rock, rock, blues, ...",i can't get no satisfaction i can't get no sat...,{}
1,Respect,Aretha Franklin,Aretha Franklin,Aretha Franklin,7nwUJBm0HE4ZxD3f5cy5ok,,"[classic soul, jazz blues, memphis soul, soul,...",what you want baby i got it what you need do y...,{}
2,Stairway to Heaven,Led Zeppelin,Led Zeppelin,Led Zeppelin,36QJpDe2go2KgaRleHCDTp,,"[album rock, classic rock, hard rock, rock]",there's a lady who's sure all that glitters is...,{}
3,Like a Rolling Stone,Bob Dylan,Bob Dylan,Bob Dylan,74ASZWbe4lXaubB36ztrGX,,"[album rock, classic rock, country rock, folk,...",once upon a time you dressed so fine threw the...,{}
4,Born to Run,Bruce Springsteen,Bruce Springsteen,Bruce Springsteen,3eqjTLE0HfPfh78zjh6TqT,,"[classic rock, heartland rock, mellow gold, pe...",in the day we sweat it out on the streets of a...,{}
5,Hotel California,The Eagles,Genius Lists,,,,[],in alphabetical order by title: can't find you...,{}
6,Light My Fire,The Doors,The Doors,The Doors,22WZ7M8sxp5THdruNY3gXt,,"[acid rock, classic rock, psychedelic rock, rock]",you know that it would be untrue you know that...,{}
7,Good Vibrations,The Beach Boys,The Beach Boys,The Beach Boys,3oDbviiivRWhXwIE8hxkVV,,"[baroque pop, brill building pop, classic rock...",i-i love the colorful clothes she wears and th...,{}
8,Hey Jude,The Beatles,The Beatles,The Beatles,3WrFJ7ztbogyGnTHbHJFl2,,"[beatlesque, british invasion, classic rock, m...",hey jude don't make it bad take a sad song and...,{}
9,Imagine,John Lennon,John Lennon,John Lennon,4x1nvY2FN8jxqAFA0DA02H,,"[album rock, art rock, beatlesque, classic roc...",imagine there's no heaven it's easy if you try...,{}


In [206]:
# Create copy for manipulation
df_tokens = df.copy()

In [212]:
# Tokenize lyrics
# df_tokens["lyrics_tokenized"] = df_tokens.lyrics.apply(lambda x: word_tokenize(x))
df_tokens["lyrics_tokenized"] = df_tokens.lyrics.apply(lambda x: x.split())

In [257]:
df_tokens.head(10)

Unnamed: 0,song,artist,genius_artist,spotify_artist,artist_id,album,artist_genre(s),lyrics,audio_features,lyrics_tokenized,lyrics_wo_stopwords
0,Satisfaction,The Rolling Stones,The Rolling Stones,The Rolling Stones,22bE4uQ6baNwSHPVcDxLCe,,"[british invasion, classic rock, rock, blues, ...",I can't get no satisfaction I can't get no sat...,{},"[I, can't, get, no, satisfaction, I, can't, ge...","[I, can't, get, satisfaction, I, can't, get, s..."
1,Respect,Aretha Franklin,Aretha Franklin,Aretha Franklin,7nwUJBm0HE4ZxD3f5cy5ok,,"[classic soul, jazz blues, memphis soul, soul,...",What you want baby I got it What you need do y...,{},"[What, you, want, baby, I, got, it, What, you,...","[What, want, baby, I, got, What, need, know, I..."
2,Stairway to Heaven,Led Zeppelin,Led Zeppelin,Led Zeppelin,36QJpDe2go2KgaRleHCDTp,,"[album rock, classic rock, hard rock, rock]",There's a lady who's sure all that glitters is...,{},"[There's, a, lady, who's, sure, all, that, gli...","[There's, lady, who's, sure, glitters, gold, A..."
3,Like a Rolling Stone,Bob Dylan,Bob Dylan,Bob Dylan,74ASZWbe4lXaubB36ztrGX,,"[album rock, classic rock, country rock, folk,...",Once upon a time you dressed so fine Threw the...,{},"[Once, upon, a, time, you, dressed, so, fine, ...","[Once, upon, time, dressed, fine, Threw, bums,..."
4,Born to Run,Bruce Springsteen,Bruce Springsteen,Bruce Springsteen,3eqjTLE0HfPfh78zjh6TqT,,"[classic rock, heartland rock, mellow gold, pe...",In the day we sweat it out on the streets Of a...,{},"[In, the, day, we, sweat, it, out, on, the, st...","[In, day, sweat, streets, Of, runaway, America..."
5,Hotel California,The Eagles,Genius Lists,,,,[],Can't find your song? Please add it and link ...,{},"[Can't, find, your, song?, Please, add, it, an...","[Can't, find, song?, Please, add, link, it!, A..."
6,Light My Fire,The Doors,The Doors,The Doors,22WZ7M8sxp5THdruNY3gXt,,"[acid rock, classic rock, psychedelic rock, rock]",You know that it would be untrue You know that...,{},"[You, know, that, it, would, be, untrue, You, ...","[You, know, would, untrue, You, know, I, would..."
7,Good Vibrations,The Beach Boys,The Beach Boys,The Beach Boys,3oDbviiivRWhXwIE8hxkVV,,"[baroque pop, brill building pop, classic rock...",I-I love the colorful clothes she wears And th...,{},"[I-I, love, the, colorful, clothes, she, wears...","[I-I, love, colorful, clothes, wears, And, way..."
8,Hey Jude,The Beatles,The Beatles,The Beatles,3WrFJ7ztbogyGnTHbHJFl2,,"[beatlesque, british invasion, classic rock, m...",Hey Jude don't make it bad Take a sad song and...,{},"[Hey, Jude, don't, make, it, bad, Take, a, sad...","[Hey, Jude, make, bad, Take, sad, song, make, ..."
9,Imagine,John Lennon,John Lennon,John Lennon,4x1nvY2FN8jxqAFA0DA02H,,"[album rock, art rock, beatlesque, classic roc...",Imagine there's no heaven It's easy if you try...,{},"[Imagine, there's, no, heaven, It's, easy, if,...","[Imagine, there's, heaven, It's, easy, try, No..."


In [214]:
def remove_stopwords(list_of_words):
    stop_words = set(stopwords.words('english'))
    tokens_without_sw = [word for word in list_of_words if not word in stop_words]
    return tokens_without_sw

In [215]:
# Remove stopwords
df_tokens["lyrics_wo_stopwords"] = df_tokens.lyrics_tokenized.apply(remove_stopwords)

In [216]:
df_tokens

Unnamed: 0,song,artist,genius_artist,spotify_artist,artist_id,album,artist_genre(s),lyrics,audio_features,lyrics_tokenized,lyrics_wo_stopwords
0,Satisfaction,The Rolling Stones,The Rolling Stones,The Rolling Stones,22bE4uQ6baNwSHPVcDxLCe,,"[british invasion, classic rock, rock, blues, ...",I can't get no satisfaction I can't get no sat...,{},"[I, can't, get, no, satisfaction, I, can't, ge...","[I, can't, get, satisfaction, I, can't, get, s..."
1,Respect,Aretha Franklin,Aretha Franklin,Aretha Franklin,7nwUJBm0HE4ZxD3f5cy5ok,,"[classic soul, jazz blues, memphis soul, soul,...",What you want baby I got it What you need do y...,{},"[What, you, want, baby, I, got, it, What, you,...","[What, want, baby, I, got, What, need, know, I..."
2,Stairway to Heaven,Led Zeppelin,Led Zeppelin,Led Zeppelin,36QJpDe2go2KgaRleHCDTp,,"[album rock, classic rock, hard rock, rock]",There's a lady who's sure all that glitters is...,{},"[There's, a, lady, who's, sure, all, that, gli...","[There's, lady, who's, sure, glitters, gold, A..."
3,Like a Rolling Stone,Bob Dylan,Bob Dylan,Bob Dylan,74ASZWbe4lXaubB36ztrGX,,"[album rock, classic rock, country rock, folk,...",Once upon a time you dressed so fine Threw the...,{},"[Once, upon, a, time, you, dressed, so, fine, ...","[Once, upon, time, dressed, fine, Threw, bums,..."
4,Born to Run,Bruce Springsteen,Bruce Springsteen,Bruce Springsteen,3eqjTLE0HfPfh78zjh6TqT,,"[classic rock, heartland rock, mellow gold, pe...",In the day we sweat it out on the streets Of a...,{},"[In, the, day, we, sweat, it, out, on, the, st...","[In, day, sweat, streets, Of, runaway, America..."
...,...,...,...,...,...,...,...,...,...,...,...
433,I'D Be Better Off (In a Pine Box),Doug Stone,Doug Stone,Doug Stone,4epBFW4UHEmgjIK5xOrBhk,,"[country, country road, country rock]",Well I said the night you left me nothin' wors...,{},"[Well, I, said, the, night, you, left, me, not...","[Well, I, said, night, left, nothin', worse, c..."
434,Amazed,Lonestar,Lonestar,Lonestar,3qbnxnvUqR14MJ9g8QwZJK,,"[contemporary country, country, country road]",Every time our eyes meet This feeling inside m...,{},"[Every, time, our, eyes, meet, This, feeling, ...","[Every, time, eyes, meet, This, feeling, insid..."
435,Faded Love,Bob Wills or Patsy Cline,Country Genius,,,,[],The Nashville Basin and Country MusicMusic has...,{},"[The, Nashville, Basin, and, Country, MusicMus...","[The, Nashville, Basin, Country, MusicMusic, b..."
436,Back in the Saddle Again,Gene Autry,Gene Autry,Gene Autry,5ixB75BQR3ADoWQkcHQJTs,,"[adult standards, cowboy western, oklahoma cou...",I'm back in the saddle again Out where a frien...,{},"[I'm, back, in, the, saddle, again, Out, where...","[I'm, back, saddle, Out, friend, friend, Where..."
