In [5]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50 )
import matplotlib.pyplot as plt
import seaborn as sns
import re

sns.set(font_scale=1.5)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import pprint
pp = pprint.PrettyPrinter(width=41, compact=True)

from tqdm import tqdm_notebook as tqdm


import warnings
warnings.simplefilter('ignore')

# part 1 - billboard

In [132]:
#https://github.com/guoguo12/billboard-charts
data = pd.read_csv('billboard_original.csv')

In [None]:
billboard = data[['Performer', 'Song', 'WeekID', 'Week Position', 'Instance', 'Weeks on Chart']]
billboard.columns = ['artist', 'track_name','weekid','week_position', 'instance', 'weeks_on_chart']

In [199]:
billboard.head()

Unnamed: 0,artist,track_name,weekid,week_position,instance,weeks_on_chart,artist_clean
0,jane child,don't wanna fall in love,2/10/1990,75,1,1,jane child
1,jane child,don't wanna fall in love,2/17/1990,53,1,2,jane child
2,jane child,don't wanna fall in love,2/24/1990,43,1,3,jane child
3,jane child,don't wanna fall in love,3/3/1990,37,1,4,jane child
4,jane child,don't wanna fall in love,3/10/1990,27,1,5,jane child


In [203]:
billboard['weekid'] = pd.to_datetime(billboard['weekid'],format='%m/%d/%Y')
pd.DataFrame(billboard['weekid'].value_counts())[pd.DataFrame(billboard['weekid'].value_counts()).weekid != 100]
print(billboard.weekid.min())
print(billboard.weekid.max())

1958-08-02 00:00:00
2019-06-22 00:00:00


In [152]:
billboard.shape

(317795, 8)

### clean billboard

In [142]:
billboard.artist = billboard.artist.apply(lambda x: x.lower())
billboard.track_name = billboard.track_name.apply(lambda x: x.lower())

In [149]:
billboard['artist_clean'] = billboard.loc[:, 'artist']

In [150]:
#billboard['artist_clean'] = billboard['artist_clean'].apply(lambda x: re.sub(r'(\(feat.*)','', x))
billboard.artist_clean =  billboard.artist_clean.apply(lambda x: x.replace('(','').replace(')','')) 
billboard.artist_clean =  billboard.artist_clean.apply(lambda x: x.replace('[','').replace(']',''))

### featuring

In [None]:
# I could say with certainty that artists listed after 'featuring' would in fact be featuring

In [151]:
billboard.artist_clean = billboard.artist_clean.apply(lambda x:  x.split(' featuring ')[0]) 
billboard.artist_clean = billboard.artist_clean.apply(lambda x:  x.split(' featuring: ')[0]) 
billboard.artist_clean = billboard.artist_clean.apply(lambda x:  x.split(' feauring ')[0]) 
billboard.artist_clean = billboard.artist_clean.apply(lambda x:  x.split(' feat. ')[0]) 
billboard.artist_clean = billboard.artist_clean.apply(lambda x:  x.split(' with ')[0]) 
billboard.artist_clean = billboard.artist_clean.apply(lambda x:  x.split(' x ')[0]) 
billboard.artist_clean = billboard.artist_clean.apply(lambda x:  x.split(' starring ')[0]) 
billboard.artist_clean = billboard.artist_clean.apply(lambda x:  x.split(' co-starring ')[0])

### comma

In [None]:
# However there were several ways of writing feauturing. As we can see in the table below, the comma can be part of the name 
# (earth, wind & fire) or it can mean featuring (diplo, french montana & lil pump)

In [156]:
billboard[(billboard.artist_clean == 'diplo, french montana & lil pump')|(billboard.artist_clean == 
                                        'earth, wind & fire')].drop_duplicates(subset='artist')

Unnamed: 0,artist,track_name,weekid,week_position,instance,weeks_on_chart,artist_clean
823,"earth, wind & fire",mighty mighty,3/16/1974,86,1,2,"earth, wind & fire"
15098,"earth, wind & fire with the emotions",boogie wonderland,5/12/1979,69,1,1,"earth, wind & fire"
252376,"diplo, french montana & lil pump featuring zha...",welcome to the party,6/2/2018,84,1,1,"diplo, french montana & lil pump"


In [None]:
# so i made some lists of when the comma is part of the name and when it means featuring

In [28]:
meaningful_comma = ['earth, wind & fire', 'ray, goodman & brown', 'crosby, stills & nash', 'peter, paul & mary', 
'mcguinn, clark & hillman',  'emerson, lake & palmer',  'blood, sweat & tears',  'ray, goodman & brown',
'crosby, stills, nash & young',  'artists of then, now & forever', 'hamilton, joe frank & reynolds', 
'tyler, the creator', 'isley, jasper, isley', 'lipps, inc.', 'dino, desi & billy', 'the checkmates, ltd.',
'cotton, lloyd & christian', 'emerson, lake & powell', 'bobby pedrick, jr.', 'dave dee, dozy, beaky, mick and tich']

feat_comma = ['benny blanco, halsey & khalid',  'benny blanco, tainy, selena gomez & j balvin', 
'casper magico, nio garcia, darell, nicky jam, ozuna & bad bunny', 'diplo, french montana & lil pump',
 'farruko, nicki minaj, bad bunny, 21 savage & rvssian', 'jay rock, kendrick lamar, future & james blake',
'joe sherman, his orchestra and chorus',  'kristen bell, agatha lee monn & katie lopez',
 'lunay, daddy yankee & bad bunny', 'ross lynch, grace phipps and jason evigan',
     'sofia carson, cameron boyce, booboo stewart & mitchell hope']

miscellaneous_comma = {'g. dep, p. diddy & black rob':'g. dep', 'ramsey lewis and earth, wind & fire':'ramsey lewis',
'hugh jackman, keala settle, zac efron, zendaya & the greatest showman ensemble':'the greatest showman ensemble',  
'barry devorzon and perry botkin, jr.':'barry devorzon',  "r.l., snoop dogg & lil' kim":'snoop dogg', 
'ramsey lewis and earth, wind & fire':'ramsey lewis', 'tragedy, capone, infinite':'capone'}


sorted_comma = []
main_appears_elsewhere_comma = []
wrong_check_comma = []

for artist in billboard['artist_clean']:
    main_artist = artist.split(',')[0] 
    if ',' in artist:
        
        # comma part of name (from wrong_check_comma)
        if artist in meaningful_comma:
            sorted_comma.append(artist)
            
        # [0] is a legit solo artist aka comma means feat. (from wrong_check_comma)
        elif artist in feat_comma: 
            sorted_comma.append(main_artist)
            
        elif artist in list(miscellaneous_comma.keys()):
            sorted_comma.append(miscellaneous_comma[artist])
        
        # if [0] appears in artists then append [0]
        # check: [0] might not be a legit solo artist
        elif main_artist in billboard.artist_clean.unique(): 
            sorted_comma.append(main_artist)
            main_appears_elsewhere_comma.append(main_artist)
        
        # append the rest
        # check: [0] could be a legit solo artist (when [0] doesnt appear in artists)
        else:
            sorted_comma.append(artist)
            wrong_check_comma.append(artist)
            
    else:
        sorted_comma.append(artist)     

In [30]:
billboard.artist_clean = sorted_comma

### slash

In [31]:
# 'm' was found in main_appears elsewhere which is m/a/r/r/s

In [32]:
# if slash is meaningful
meaningful_slash = ['ac/dc', 'rufus/chaka khan', 'm/a/r/r/s', 'hughes/thrall']

after_slash_real_artist = ['john & yoko/the plastic ono band', 'genius/gza', 'jay-z/linkin park', 
                              "michael zager's moon band/peabo bryson"]

actually_featuring = [ 'marvin hamlisch/"the sting"',  'the raelets/ray charles orchestra',
                      'zayn / taylor swift']

sorted_slash = []
main_appears_elsewhere_slash = []
wrong_check_slash = []

for artist in billboard['artist_clean']:
    main_artist = artist.split('/')[0] 
    if '/' in artist:
        
        if artist in meaningful_slash:
            sorted_slash.append(artist)
            
        elif artist in after_slash_real_artist:
            sorted_slash.append(artist.split('/')[1])
            
        elif artist in actually_featuring:
            sorted_slash.append(artist.split('/')[0])
        
        elif main_artist in billboard.artist_clean.unique():
            sorted_slash.append(main_artist)
        
        else:
            sorted_slash.append(artist)
            
    else:
        sorted_slash.append(artist)   

In [33]:
billboard.artist_clean = sorted_slash

#### for reference: example of checking whether certain artist has genre tag

In [58]:
results = sp.search(q='artist:paul simon', type='artist', limit=1)

for i, t in enumerate(results['artists']['items']):

    print(t['genres'])

['classic rock', 'folk', 'folk rock', 'folk-pop', 'mellow gold', 'permanent wave', 'rock', 'roots rock', 'singer-songwriter', 'soft rock']


### &

In [41]:
actually_feat = ['chris thompson & night', 'paul humphrey & his cool aid chemists', 'jorgen ingmann & his guitar'
'mos def & pharoahe monch', "al greene & the soul mate's", 'mantovani & his orch.', 'cliff nobles & co.',
'martin solveig & dragonette', 'mc nas-d & dj fred', 'doc box & b. fresh', 'buster poindexter & his banshees of blue',
'gesaffelstein & the weeknd', 'adam levine & javier colon', 'the swell season glen hansard & marketa irglova',
'ike & tina turner & the ikettes', 'anuel aa & romeo santos', 'a r rahman & the pussycat dolls', 
'redhead kingpin & the f.b.i.', 'jonell & method man', 'lucas grabeel & ashley tisdale', 'anuel aa & karol g',
'edward byrnes & connie stevens', 'dillon francis & dj snake', 'dj sammy & yanou', 'mustard & migos',
'doctor dre & ed lover', 'mos def & kweli are black star', "opetaia foa'i & lin-manuel miranda"]

miscellaneous_ampersand = {'gaylord & holiday':'gaylords', 'sylvia r&b':'sylvia', 'dionne & friends':
'dionne warwick', 'dionne':'dionne warwick', 'deliverance/eric weissberg & steve mandell': 
'deliverance', 'lillywood & robin schulz': 'lilly wood', 'delaney & bonnie & friends/eric clapton'
:'delaney & bonnie', 'paul & linda mccartney':'paul mccartney', 'jon & robin and the in crowd': 
'jon & robin', 'keala settle & the greatest showman ensemble':'the greatest showman ensemble', 
'delaney & bonnie & friends': 'delaney & bonnie', 'a-trak + milo & otis':'a-trak',
                               'sonny & cher': 'cher'}

main_appears_elsewhere_wrong = ['Rufus & Carla']

sorted_ampersand = []

for artist in billboard['artist_clean']:
    main_artist = artist.split(' & ')[0] 
    if ' & ' in artist:
        
        if artist in list(miscellaneous_ampersand.keys()):
            sorted_ampersand.append(miscellaneous_ampersand[artist])
        
        elif artist in actually_feat:
            sorted_ampersand.append(main_artist)
            
        elif artist in main_appears_elsewhere_wrong:
            sorted_ampersand.append(artist)
            
        elif main_artist in billboard.artist_clean.unique():
            sorted_ampersand.append(main_artist)
        
        else:
            sorted_ampersand.append(artist)
            
    else:
        sorted_ampersand.append(artist) 

In [42]:
billboard.artist_clean = sorted_ampersand

### and

In [44]:
actually_feat_and = [ 'grand master melle mel and the furious five',  'the used and my chemical romance']

sorted_and = []

for artist in billboard['artist_clean']:
    main_artist = artist.split(' and ')[0] 
    if ' and ' in artist:
            
        if artist in actually_feat_and:
            sorted_and.append(artist.split(' & ')[0])
        
        elif main_artist in billboard.artist_clean.unique() and 'love' not in main_artist:
            sorted_and.append(main_artist)
        
        else:
            sorted_and.append(artist)
            
    else:
        sorted_and.append(artist) 
        

In [45]:
billboard.artist_clean = sorted_ampersand

#### check special characters

In [380]:
for x in billboard[billboard.artist_clean.str.contains(pat = '[^\w\s.]+', regex = True)].artist_clean.unique()[:8]:  
    print(x)

yarbrough & peoples
huey "piano" smith
49-ers
bone thugs-n-harmony
jay-z
ke$ha
p!nk
eagle-eye cherry


#### miscelleanous incorrect entries 

In [51]:
billboard.artist_clean.replace({'m:g':'m.g', 'stars on 54: ultra nate':'ultra nate',
                               'delaney & bonnie & friends/eric clapton':'delaney & bonnie',
                                "fu-schnickens w/ shaquille o'neal":'fu-schnickens',
                                'esther phillips "little esther"':'esther phillips',
                                'Jackson 5': 'The Jackson 5', 'Beatles': 'The Beatles'
                                'wham! u.k.':'wham'},inplace=True)

# *

### save billboard

In [180]:
billboard.to_csv('billboard_final', encoding='utf-8', index=False)

### load billboard

In [181]:
billboard = pd.read_csv('billboard_final')

# part 2

# get artist genre/artist popularity spotipi

### authenticate

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util

client_id = 'c70a3fc87cf541198f93adf301838ee5'
client_secret = '744cfea7c14448ed98d9ac8c80052241'
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager) #spotify object to access API

### scrape


In [None]:
artist_name = []
genre = []
popularity = []
followers = []
artist_uri = []
billboard_artist = []

import time
start_time = time.time()
request_count = 0

for artist in billboard.artist_clean.unique():
    
    billboard_artist.append(artist)
    
    results = sp.search(q='artist:' + artist, type='artist', limit=1)
    
    if len(results['artists']['items']):

        for i, t in enumerate(results['artists']['items']): 

            try:
                artist_name.append(t['name'])
            except:
                artist_name.append(np.nan)

            try:
                genre.append(t['genres'])
            except:
                genre.append(np.nan)

            try:
                popularity.append(t['popularity'])
            except:
                popularity.append(np.nan)

            try:
                followers.append(t['followers']['total'])
            except:
                followers.append(np.nan)

            try:
                artist_uri.append(t['uri'])
            except:
                artist_uri.append(np.nan)
                
    else:
        artist_name.append(np.nan)
        genre.append(np.nan)
        popularity.append(np.nan)
        followers.append(np.nan)
        artist_uri.append(np.nan)

        request_count+=1
        if request_count % 50 == 0:
            print(str(request_count) + " artists completed")
            print('elapsed: {} seconds'.format(time.time() - start_time))


### create artist_info dataframe

In [319]:
artist_info = pd.DataFrame({'spotify_artist':artist_name, 'billboard_artist':billboard_artist , 
                            'artist_uri':artist_uri, 'genre' : genre, 'artist_popularity' : popularity, 
                            'followers' : followers})

In [166]:
artist_info.head()

Unnamed: 0,spotify_artist,billboard_artist,artist_uri,genre,artist_popularity,followers
0,Jane Child,jane child,spotify:artist:5JBif5ahOKFFVuEpHhrp8Y,['new wave pop'],36.0,4884.0
1,Jason Derulo,jason derulo,spotify:artist:07YZf4WDAMNwqr4jfgOZ8y,"['dance pop', 'pop', 'pop rap', 'post-teen pop...",83.0,7195281.0
2,Maroon 5,maroon 5,spotify:artist:04gDigrS5kc9YWfZHwBETP,"['pop', 'pop rock']",93.0,20940000.0
3,Bee Gees,bee gees,spotify:artist:1LZEQNv7sE11VDY3SdxQeN,"['adult standards', 'disco', 'mellow gold', 's...",78.0,2636380.0
4,Gloria Estefan,gloria estefan,spotify:artist:5IFCkqu9J6xdWeYMk5I889,"['dance pop', 'latin pop', 'new wave pop', 'so...",68.0,347934.0


### save 

In [321]:
artist_info.to_csv('artist_info_final', encoding='utf-8', index=False)

### load artist_info

In [158]:
artist_info = pd.read_csv('artist_info_final')

# ------------------------------------------------------------------------------------------------------------

# part 3

# merge to get billboard + artist_info

In [682]:
billboard_merged = pd.merge(billboard.drop('artist', axis=1) , artist_info , 
                                   how='inner', left_on='artist_clean', right_on='billboard_artist')
billboard_merged.head(2)

Unnamed: 0,track_name,WeekID,Week Position,Instance,Weeks on Chart,Peak Position,artist_clean,spotify_artist,billboard_artist,artist_uri,genre,artist_popularity,followers
0,don't wanna fall in love,2/10/1990,75,1,1,75,jane child,Jane Child,jane child,spotify:artist:5JBif5ahOKFFVuEpHhrp8Y,['new wave pop'],36.0,4884.0
1,don't wanna fall in love,2/17/1990,53,1,2,53,jane child,Jane Child,jane child,spotify:artist:5JBif5ahOKFFVuEpHhrp8Y,['new wave pop'],36.0,4884.0


### check that the merged columns are the same

In [511]:
incorrect = billboard_merged.loc[(billboard_merged['artist_clean'] != [x.lower() for x in 
billboard_merged['spotify_artist']])].drop_duplicates(subset='artist_clean', 
keep='first')[['artist_clean', 'spotify_artist']].head(10)
print(incorrect.to_string())

                        artist_clean                   spotify_artist
2996                     john mayall  John Mayall & The Bluesbreakers
3111                 vincent edwards               J. Vincent Edwards
4833                          linear                      Linear Labs
5198                           touch                 A Touch Of Class
11221  perez prado and his orchestra      Perez Prado & his Orchestra
17590                          jay-z                            JAY Z
20906                    celine dion                      Céline Dion
21514                          ke$ha                      Ha Ke Kgomo
25208                     puff daddy                Mase & Puff Daddy
26577                        beyonce                          Beyoncé


#### create a dictionary of entries where billboard doesnt match up with spotify results

In [436]:
incorrect_dict = {}
for artist_name in incorrect.artist_clean:
    incorrect_dict[artist_name]=  list(incorrect.groupby('artist_clean')['track_name'].get_group(artist_name).values)

#### scrape again including track name to try get more accurate scrape

In [479]:
billboard_artist = []
improved_artist = []

for artist_name in incorrect_dict:
    for track in set(incorrect_dict[artist_name]):
        
        billboard_artist.append(artist_name)
        
        results = sp.search(q='artist:' + artist_name + ' track:' + track, type='track', limit=1)

        if len(results['tracks']['items']):

            for i, t in enumerate(results['tracks']['items']):

                improved_artist.append(t['album']['artists'][0]['name'])
                
        else:
            improved_artist.append(np.nan)           

retrying ...1secs
retrying ...1secs


#### put into dataframe

In [536]:
improved = pd.DataFrame({'improved_artist':improved_artist, 'billboard_artist':billboard_artist })
improved = improved[pd.notnull(improved.improved_artist)]
add_to_original = improved.loc[(improved.billboard_artist == [x.lower() for x in improved.improved_artist])]
add_to_original 

Unnamed: 0,improved_artist,billboard_artist
24,Blaque,blaque
27,Prince,prince
29,Free,free
30,Holly Knight,holly knight
32,The Originals,the originals
...,...,...
1135,Notorious,notorious
1137,Tag,tag
1149,Lake,lake
1156,the Triplets,the triplets


In [564]:
check_these = improved.loc[(improved.billboard_artist != [x.lower() 
                            for x in improved.improved_artist])][check_these.improved_artist != 'Various Artists' ]
check_these.head()

Unnamed: 0,improved_artist,billboard_artist
6,Céline Dion,celine dion
9,Beyoncé,beyonce
10,Janet Jackson,janet
13,John Mellencamp,john cougar mellencamp
15,Herb Alpert & The Tijuana Brass,herb alpert


In [None]:
check_these['ratio'] = check_these.apply(lambda x: fuzz.ratio(x['improved_artist'], x['billboard_artist']), axis=1)
check_these = check_these[check_these.ratio > 85].drop('ratio', axis=1)
add_to_original.append(check_these)

In [683]:
billboard_merged = billboard_merged[~billboard_merged.billboard_artist.isin(list(add_to_original.billboard_artist))]

## null values 

In [354]:
billboard_merged.to_csv('billboard_artist_merge_including_nans', encoding='utf-8', index=False)

In [684]:
print(billboard_merged.shape)
billboard_merged.dropna(inplace=True)
print(billboard_merged.shape)

(311213, 13)
(301721, 13)


#### duplicates check

In [678]:
print(len(artist_info[artist_info.duplicated(keep='first')]))

0


#### nice presentation

In [688]:
billboard_merged.columns = [c.lower().replace(' ', '_') for c in billboard_merged.columns]

In [690]:
billboard_merged = billboard_merged [['spotify_artist', 'track_name', 'billboard_artist', 'artist_uri', 'genre', 'artist_popularity',
                    'followers', 'weekid', 'week_position', 'instance', 'weeks_on_chart', 'peak_position']]

In [681]:
billboard_merged.head(2)

Unnamed: 0,spotify_artist,track_name,artist_uri,genre,artist_popularity,followers,weekid,week_position,instance,weeks_on_chart,peak_position
0,Jane Child,don't wanna fall in love,spotify:artist:5JBif5ahOKFFVuEpHhrp8Y,['new wave pop'],36.0,4884.0,2/10/1990,75,1,1,75
1,Jane Child,don't wanna fall in love,spotify:artist:5JBif5ahOKFFVuEpHhrp8Y,['new wave pop'],36.0,4884.0,2/17/1990,53,1,2,53


### save 

In [691]:
billboard_merged.to_csv('billboard_merged_final', encoding='utf-8', index=False)

### load

In [204]:
billboard_merged = pd.read_csv('billboard_merged_final')

# ------------------------------------------------------------------------------------------------------------

# part 4 - individual track info

## clean track name

In [598]:
for x in billboard_merged[billboard_merged.track_name.str.contains(pat = '[^\w\s.]+', regex = True)].track_name.unique()[:10]:  
    print(x)

don't wanna fall in love
don't wanna go home
ridin' solo
if it ain't love
don't wanna know
won't go home without you
this summer's gonna hurt...
don't wanna live inside myself
stayin' alive
new york mining disaster 1941 (have you seen my wife, mr. jones)


In [599]:
billboard_merged.track_name.replace({'new york mining disaster 1941 (have you seen my wife, mr. jones)':'new york mining disaster 1941', 
    "ev'rybody's twistin'": "everybody's twistin'"},inplace=True)

## track id (to get musical components)/ track popularity

In [600]:
artist_dict={}
for artist_name in billboard_merged.spotify_artist.unique():
    artist_dict[artist_name]=list(billboard_merged.groupby('spotify_artist')['track_name'
                                                                            ].get_group(artist_name).values)

In [None]:
for artist_name in artist_dict:
    for track_name in set(artist_dict[artist_name]):
            print(artist_name, track_name )

In [None]:
original_track = []
original_artist = []

track_name = []
artist_name = []
track_popularity = []
track_uri = []

import time
start_time = time.time()
request_count = 0
    
for artist in artist_dict:
    for track in set(artist_dict[artist]):
        
        original_track.append(track)
        original_artist.append(artist)
        
        track_results = sp.search(q='artist:' + artist + ' track:' + track, type='track', limit=1)
        
        if len(track_results['tracks']['items']):

            for i, t in enumerate(track_results['tracks']['items']):

                try:
                    track_name.append(t['name'])
                except:
                    track_name.append(np.nan)

                try:
                    artist_name.append(t['album']['artists'][0]['name'])
                except:
                    artist_name.append(np.nan)

                try:
                    track_popularity.append(t['popularity'])
                except:
                    track_popularity.append(np.nan)

                try:
                    track_uri.append(t['uri'])
                except:
                    track_uri.append(np.nan)
                    
        else:
            track_name.append(np.nan)
            artist_name.append(np.nan)
            track_popularity.append(np.nan)
            track_uri.append(np.nan)
                    
        request_count+=1
        if request_count % 50 == 0:
            print(str(request_count) + " track completed")
            print('elapsed: {} seconds'.format(time.time() - start_time))


#### track name / track popularity / track ID dataframe

In [608]:
track_info = pd.DataFrame({'original_track':original_track, 'original_artist':original_artist, 'artistor':artist_name, 
                           'track_name':track_name, 'track_popularity':track_popularity, 'track_uri': track_uri })

In [609]:
track_info.head()

Unnamed: 0,original_track,original_artist,artist,track_name,track_popularity,track_uri
0,don't wanna fall in love,Jane Child,,,,
1,welcome to the real world,Jane Child,Jane Child,Welcome to the Real World,19.0,spotify:track:37o0UBywazf2Xcy3297Que
2,it girl,Jason Derulo,Jason Derulo,It Girl,72.0,spotify:track:4fINc8dnfcz7AdhFYVA4i7
3,if it ain't love,Jason Derulo,Jason Derulo,If It Ain't Love,62.0,spotify:track:5YHR8wBR5jXfGtP766NiAT
4,in my head,Jason Derulo,Jason Derulo,In My Head,65.0,spotify:track:5BoIP8Eha5hwmRVURkC2Us


In [610]:
len(track_info.artist.unique())

4511

In [611]:
len(track_info.track_name.unique())

18937

In [612]:
track_info.isnull().sum()

original_track         0
original_artist        0
artist              5419
track_name          5419
track_popularity    5419
track_uri           5419
dtype: int64

In [613]:
print(track_info.shape)
track_info.dropna(inplace=True)
print(track_info.shape)

(26517, 6)
(21098, 6)


In [615]:
print(len(track_info[track_info.duplicated(keep='first')]))

0


# *

## musical components

In [None]:
acousticness = []
danceability = []
energy = []
instrumentalness = []
liveness = []
loudness = []
speechiness = []
tempo = []
valence = []
track_uri = []

start_time = time.time()
request_count = 0   

for track in track_info.track_uri:

    feature_results = sp.audio_features(track)
    
    for i, t in enumerate(feature_results):
        
        try:
            track_uri.append(t['uri'])
        except:
            track_uri.append(np.nan)
        
        try:
            acousticness.append(t['acousticness'])
        except:
            acousticness.append(np.nan)
 
        try:
            danceability.append(t['danceability'])
        except:
            danceability.append(np.nan)
       
        try:
            energy.append(t['energy'])
        except:
            energy.append(np.nan)
        
        try:
            instrumentalness.append(t['instrumentalness'])
        except:
            instrumentalness.append(np.nan)
            
        try:
            liveness.append(t['liveness'])
        except:
            liveness.append(np.nan)
            
        try:
            loudness.append(t['loudness'])
        except:
            loudness.append(np.nan)
             
        try:
            speechiness.append(t['speechiness'])
        except:
            speechiness.append(np.nan)
             
        try:
            tempo.append(t['tempo'])
        except:
            tempo.append(np.nan)
        
        try:
            valence.append(t['valence'])
        except:
            valence.append(np.nan)
            
        try:
            track_uri.append(t['uri'])
        except:
            track_uri.append(np.nan)

        try:
            duration.append(t['duration_ms'])
        except:
            duration.append(np.nan)

        try:
            key.append(t['key'])
        except:
            key.append(np.nan)

        try:
            mode.append(t['mode'])
        except:
            mode.append(np.nan)

        try:
            time_signature.append(t['time_signature'])
        except:
            time_signature.append(np.nan)
        
        
    request_count+=1
    if request_count % 100 == 0:
        print(str(request_count) + " musical components completed")
        print('elapsed: {} seconds'.format(time.time() - start_time))


### musical components df

In [617]:
musical_components = pd.DataFrame({'track_uri':track_uri, 'acousticness':acousticness, 'danceability':danceability, 
                                   'energy' : energy, 'instrumentalness' : instrumentalness, 'liveness' : liveness, 
                                   'loudness' : loudness, 'speechiness' : speechiness, 
                                    'tempo' : tempo, 'valence' : valence})

new_features = pd.DataFrame({'track_uri':track_uri,'duration':duration, 'key':key, 'mode':mode,
                             'time_signature':time_signature})

In [628]:
print(len(musical_components[musical_components.duplicated(keep='first')]))
musical_components.drop_duplicates(inplace=True)

75


# merge track_all

In [663]:
track_all = pd.merge(track_info, musical_components, left_on="track_uri", right_on="track_uri", how='inner')

In [75]:
track_all.shape

(21094, 15)

In [667]:
print(len(track_all[track_all.duplicated(keep='first')]))

0


In [669]:
track_all.isnull().sum().sum()

0

#### save tracks to csv

In [72]:
track_all.to_csv('track_all_old', encoding='utf-8', index=False)
track_all_new.to_csv('track_all_new', encoding='utf-8', index=False)

#### load tracks from csv

In [170]:
track_all_new = pd.read_csv('track_all_new')

# *

# part 5 - merge track_all + billboard_merged

In [97]:
everything = pd.merge(billboard_merged.drop('billboard_artist', axis=1), track_all_new.drop(['original_artist'],  axis=1), 
          left_on=['spotify_artist', 'track_name'],
          right_on=['artist', 'original_track'], how ='inner')

In [98]:
everything.shape

(236237, 29)

In [99]:
everything.columns

Index(['spotify_artist', 'track_name_x', 'artist_uri', 'genre',
       'artist_popularity', 'followers', 'weekid', 'week_position', 'instance',
       'weeks_on_chart', 'peak_position', 'original_track', 'artist',
       'track_name_y', 'track_popularity', 'track_uri', 'acousticness',
       'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness',
       'speechiness', 'tempo', 'valence', 'duration', 'key', 'mode',
       'time_signature'],
      dtype='object')

#### nice presentation

In [101]:
everything = everything[['spotify_artist', 'track_name_y', 'artist_popularity', 'followers', 'artist_uri', 'genre', 
                        'weekid', 'week_position', 'instance', 'weeks_on_chart','peak_position', 
                        'track_popularity', 'track_uri', 'acousticness', 'danceability',
                        'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
                        'tempo', 'valence', 'duration', 'key', 'mode','time_signature']]
everything.rename(columns={'track_name_y': 'track_name', 'spotify_artist':'artist'}, inplace=True)
everything['weekid'] = pd.to_datetime(everything['weekid'],format='%m/%d/%Y')

#### make sure artists / tracks matched up 

In [183]:
print(len(everything.track_name.unique()))
print(len(everything.artist.unique()))

17693
4323


In [125]:
everything.drop_duplicates(keep='first', inplace=True)

#### save tracks to csv

In [127]:
everything.to_csv('everything_scraped_new', encoding='utf-8', index=False)

#### load tracks from csv

In [3]:
everything = pd.read_csv('everything_scraped_new')

# ------------------------------------------------------------------------------------------------------------

In [6]:
everything.head(4)

Unnamed: 0,artist,track_name,artist_popularity,followers,artist_uri,genre,weekid,week_position,instance,weeks_on_chart,peak_position,track_popularity,track_uri,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,duration,key,mode,time_signature
0,Jane Child,Welcome to the Real World,36.0,4884.0,spotify:artist:5JBif5ahOKFFVuEpHhrp8Y,['new wave pop'],1990-06-02,90,1,1,90,19.0,spotify:track:37o0UBywazf2Xcy3297Que,0.00703,0.625,0.721,1e-06,0.317,-12.512,0.0325,104.651,0.851,299640,8,0,4
1,Jane Child,Welcome to the Real World,36.0,4884.0,spotify:artist:5JBif5ahOKFFVuEpHhrp8Y,['new wave pop'],1990-06-09,77,1,2,77,19.0,spotify:track:37o0UBywazf2Xcy3297Que,0.00703,0.625,0.721,1e-06,0.317,-12.512,0.0325,104.651,0.851,299640,8,0,4
2,Jane Child,Welcome to the Real World,36.0,4884.0,spotify:artist:5JBif5ahOKFFVuEpHhrp8Y,['new wave pop'],1990-06-16,60,1,3,60,19.0,spotify:track:37o0UBywazf2Xcy3297Que,0.00703,0.625,0.721,1e-06,0.317,-12.512,0.0325,104.651,0.851,299640,8,0,4
3,Jane Child,Welcome to the Real World,36.0,4884.0,spotify:artist:5JBif5ahOKFFVuEpHhrp8Y,['new wave pop'],1990-06-23,56,1,4,56,19.0,spotify:track:37o0UBywazf2Xcy3297Que,0.00703,0.625,0.721,1e-06,0.317,-12.512,0.0325,104.651,0.851,299640,8,0,4
