# Music Analysis

In [7]:
import re
import sys
import time
import json
import math
import random
import spotipy
import requests
import itertools
import collections
import pandas as pd
from tqdm import tqdm
import spotipy.util as util
from bs4 import BeautifulSoup
from collections import defaultdict
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
def get_track_features(client_id = None, client_secret = None, start_year = 1960, end_year = 2019):
    """Download track metadata and audio features from Spotify into ``audio_features.csv``.

    For every year in ``start_year..end_year`` (inclusive) the Spotify search
    endpoint is queried with ``year:<year>`` and paged through, collecting the
    first artist's name, the track name, id and popularity of every hit.
    Audio features are then requested in batches of 50 track ids and the
    combined table is written to ``audio_features.csv``.

    Args:
        client_id: Spotify application client id.
        client_secret: Spotify application client secret.
        start_year: First release year to search (default 1960).
        end_year: Last release year to search, inclusive (default 2019).

    Returns:
        None. When either credential is missing only a hint is printed and
        nothing is fetched or written.
    """
    if not client_id or not client_secret:
        print("Please add your client_id and client_secret")
        print("For more information, please visit ")
        print("https://developer.spotify.com/my-applications/#!/applications")
        return
    credentials = SpotifyClientCredentials(client_id=client_id,
                                           client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=credentials)

    # Get track info
    artist_name, track_name, track_id, popularity, years = [], [], [], [], []

    for year in tqdm(range(start_year, end_year + 1)):
        try:
            track_results = sp.search(q='year:{0}'.format(year), type='track', limit=50)
        except Exception as err:
            # `except Exception` (not a bare except) lets Ctrl-C / SystemExit
            # propagate instead of being mistaken for an API failure.
            print("Search for year {0} failed ({1}); stopping the crawl".format(year, err))
            break
        while track_results:
            page = track_results['tracks']
            for t in page['items']:
                artist_name.append(t['artists'][0]['name'])
                track_name.append(t['name'])
                track_id.append(t['id'])
                popularity.append(t['popularity'])
                years.append(str(year))
            if not page['next']:
                break
            try:
                track_results = sp.next(page)
            except Exception:
                # Best effort: keep what was collected for this year and
                # continue with the next one.
                break
    print("Get {0} songs from spotify".format(len(track_id)))

    # Get features
    feature_keys = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
                    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']

    audio_features = {key: [] for key in feature_keys}
    audio_features['artist_name'] = artist_name
    audio_features['track_name'] = track_name
    audio_features['track_id'] = track_id
    audio_features['popularity'] = popularity
    audio_features['year'] = years

    for i in tqdm(range(0, len(track_id), 50)):
        batch = track_id[i: i + 50]
        # Pad with None if nothing comes back so every column keeps one entry per track.
        features = sp.audio_features(batch) or [None] * len(batch)
        for feature in features:
            for key in feature_keys:
                if feature and key in feature:
                    audio_features[key].append(feature[key])
                else:
                    audio_features[key].append(None)

    # Save to csv file
    df_tracks = pd.DataFrame(audio_features)
    df_tracks.to_csv(r'audio_features.csv')

In [3]:
# The tracks and their features were fetched earlier and stored in
# "./audio_features.csv", so this download does not need to run again.
# Both credentials are left as None on purpose.
client_id, client_secret = None, None
get_track_features(client_id, client_secret)

Please add your client_id and client_secret
For more information, please visit 
https://developer.spotify.com/my-applications/#!/applications


In [4]:
# Load the cached track table; the first CSV column is used as the row index.
df_tracks = pd.read_csv('audio_features.csv', index_col=0)
df_tracks.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artist_name,track_name,track_id,popularity,year
0,0.171,0.33,5.0,-9.699,1.0,0.0329,0.707,0.00381,0.302,0.315,174.431,182400.0,3.0,Etta James,At Last,4Hhv2vrOTy89HFRcjU3QOx,75,1960
1,0.508,0.287,1.0,-12.472,1.0,0.0523,0.764,0.0,0.153,0.644,154.759,175987.0,4.0,Ella Fitzgerald,Sleigh Ride,4ukUoXLuFzMixyZyabSGc4,70,1960
2,0.579,0.502,8.0,-7.57,1.0,0.0513,0.733,0.0,0.281,0.836,76.816,131733.0,4.0,Ella Fitzgerald,Frosty The Snowman,65irrLqfCMRiO3p87P4C0D,69,1960
3,0.553,0.291,4.0,-10.426,0.0,0.0301,0.878,0.0,0.129,0.407,96.217,165560.0,4.0,Sam Cooke,You Send Me,0BFEyqJ9DJXS7gKg0Kj46R,66,1960
4,0.482,0.569,0.0,-5.897,1.0,0.0328,0.617,0.0,0.155,0.609,70.843,165280.0,4.0,Sam Cooke,Bring It On Home To Me,4yjz1aazw6R8ZURpGbCAkp,64,1960


In [5]:
# Average every numeric feature per release year. `numeric_only=True` is
# required on pandas >= 2.0, where averaging the string columns
# (artist_name, track_name, track_id) raises instead of dropping them.
df_by_year = df_tracks.dropna().groupby('year').mean(numeric_only=True).reset_index()
df_by_year.head()

Unnamed: 0,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,popularity
0,1960,0.464192,0.313453,5.0053,-14.693014,0.7152,0.100909,0.80923,0.253353,0.222327,0.494803,110.969244,215520.8149,3.7509,5.4481
1,1961,0.474276,0.320147,4.948585,-14.829432,0.720816,0.107911,0.799795,0.258537,0.218472,0.493421,110.856446,222618.542363,3.752826,5.115635
2,1962,0.463054,0.330539,5.049005,-14.832738,0.719772,0.10038,0.782238,0.254475,0.225328,0.484301,111.21002,214440.192719,3.749175,6.244124
3,1963,0.478339,0.35719,5.0907,-13.994034,0.7426,0.078978,0.758253,0.238693,0.220458,0.526543,112.315946,202682.4428,3.7692,7.717
4,1964,0.478219,0.37129,5.084208,-13.754991,0.747575,0.080756,0.746077,0.217783,0.228598,0.528866,113.116872,198727.756076,3.767377,8.243324


In [10]:
import plotly.graph_objects as go
from ipywidgets import widgets
from ipywidgets import interactive
import plotly.express as px

In [7]:
# Features that can be selected in the dropdown below.
virtualize_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'popularity',
]

feature_box = widgets.Dropdown(
    options=virtualize_features,
    value=virtualize_features[0],
    description='Feature: ',
)

# Initial trace: yearly mean danceability.
trace1 = go.Scatter(x=df_by_year['year'], y=df_by_year['danceability'])

g = go.FigureWidget(
    data=[trace1],
    layout=go.Layout(
        title={'text': 'Music Features Over Years'},
        xaxis={'title': "Year"},
        yaxis={'title': "danceability"},
    ),
)


def response(change):
    """Dropdown observer: relabel the y axis and plot the newly selected feature."""
    selected = change['new']
    g.layout.yaxis.title = selected
    g.data[0].y = df_by_year[selected]


feature_box.observe(response, names="value")

widgets.VBox([feature_box, g])

VBox(children=(Dropdown(description='Feature: ', options=('danceability', 'energy', 'key', 'loudness', 'mode',…

In [8]:
# Distinct artist names, in order of first appearance in the track table.
all_musicians = df_tracks['artist_name'].unique().tolist()
print(len(all_musicians))

53981


Fetch the birthplaces of the musicians using a Wikidata SPARQL query.

In [None]:
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/
# https://stackoverflow.com/questions/45171646/how-to-get-birth-location-using-wikipedia-api
def search_birthlocation(musician):
    """Look up a musician's place of birth on Wikidata.

    Runs a SPARQL query against ``https://query.wikidata.org/sparql`` for an
    item whose English label equals ``musician`` and reads its ``wdt:P19``
    value.

    Args:
        musician: Artist name; it is matched exactly against the English label.

    Returns:
        str: the English label of the first matching birth location;
        the string "NaN" when the query fails or matches nothing;
        the string "None" when the first match carries no label.
    """
    # Local import: SPARQLWrapper is only needed when the crawl actually runs.
    from SPARQLWrapper import SPARQLWrapper, JSON
    endpoint_url = "https://query.wikidata.org/sparql"
    # Escape backslashes and double quotes so a name cannot terminate the
    # SPARQL string literal early.
    safe_name = str(musician).replace('\\', '\\\\').replace('"', '\\"')
    query = """SELECT DISTINCT ?item ?itemLabel ?birthLocation ?birthLocationLabel WHERE {{
              ?item (wdt:P31|wdt:P101|wdt:P106)/wdt:P279* wd:Q639669 ;
            rdfs:label "{0}"@en ;wdt:P19 ?birthLocation
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
            }}""".format(safe_name)
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    try:
        query_result = sparql.query().convert()
    except Exception:
        # Best effort: a failed lookup is recorded like a missing birthplace.
        return "NaN"
    bindings = query_result["results"]["bindings"]
    if not bindings:
        return "NaN"
    first = bindings[0]
    if 'birthLocationLabel' in first:
        return first['birthLocationLabel']['value']
    return "None"


def get_musician_birth_by_wikisql(run = False):
    """Query Wikidata for every name in ``all_musicians`` and save ``musician_birth.csv``.

    Args:
        run: The crawl only happens when this is truthy; with the default
            ``False`` the function returns immediately.

    Returns:
        None. The result is written to ``musician_birth.csv`` with the columns
        ``name`` and ``birthplace`` plus the row index.
    """
    if not run:
        return
    musician_birth = {'name': [], 'birthplace': []}

    for musician in tqdm(all_musicians):
        time.sleep(0.1)  # pause between requests to the public endpoint
        musician_birth['name'].append(musician)
        # The lookup returns its result, so this function owns the table.
        # (The old code appended to `musician_birth` from inside the lookup,
        # where that name was not in scope.)
        musician_birth['birthplace'].append(search_birthlocation(musician))

    print("Collected {0} birthplace records".format(len(musician_birth['name'])))
    df_musician = pd.DataFrame(musician_birth)
    df_musician.to_csv(r'musician_birth.csv', index=True)
    

# We have already run this function and saved the result in "./musician_birth.csv".
# There is no need to run it again, because it takes several hours.

# get_musician_birth_by_wikisql()

In [10]:
# Load the saved Wikidata lookups; the first CSV column is used as the row index.
df_musician = pd.read_csv('musician_birth.csv', index_col=0)
df_musician.head()

Unnamed: 0,name,birthplace
0,Etta James,Los Angeles
1,Ella Fitzgerald,Newport News
2,Sam Cooke,
3,Neil Sedaka,Brooklyn
4,Dean Martin,Steubenville


Some musicians have no birthplace record in the Wikidata database, so for the remaining musicians we fetch the birthplace by scraping their Wikipedia pages directly.

In [11]:
def get_musician_birth_by_wiki(run = False, start_index = 34369, checkpoint_every = 1):
    """Fill missing birthplaces in ``df_musician`` from Wikipedia infobox text.

    For each row at position ``start_index`` or later whose ``birthplace`` is
    missing, the lead section of the Wikipedia page titled like the musician
    is requested and searched for a ``birth_place`` line, then for an
    ``origin`` line. The first hit is written into ``df_musician`` in place.
    Progress is checkpointed to ``musician_birth1.csv``.

    Note: a later cell in this notebook defines another function with the
    same name, which replaces this one in the kernel namespace.

    Args:
        run: The crawl only happens when this is truthy.
        start_index: Rows before this position are skipped. The default is the
            offset that was hard-coded in the original crawl.
        checkpoint_every: Write the CSV after this many lookups. The default
            of 1 saves after every lookup.

    Returns:
        None.
    """
    if not run:
        return
    api_url = "https://en.wikipedia.org/w/api.php"
    null = 0  # number of missing birthplaces looked up so far
    find = 0  # number of lookups that produced a birthplace
    unsaved = False
    for i, name in enumerate(df_musician.name):
        if i != 0 and i % 1000 == 0:
            print("row {0}: found {1} of {2} looked up".format(i, find, null))
        if i < start_index:
            continue
        if pd.notna(df_musician.loc[i, "birthplace"]):
            continue
        null += 1
        # Passing the title through `params` URL-encodes it, so names that
        # contain '&', '#', '+' or spaces are sent intact.
        params = {
            'action': 'query',
            'prop': 'revisions',
            'rvprop': 'content',
            'format': 'xmlfm',
            'titles': name,
            'rvsection': 0,
        }
        r = requests.get(api_url, params=params)
        page_text = BeautifulSoup(r.content, 'html.parser').prettify()
        all_matches = re.findall(r"birth_place [^\n]+\n\|", page_text)
        if len(all_matches) == 0:
            all_matches = re.findall(r"origin [^\n]+\n\|", page_text)
        if len(all_matches):
            parts = all_matches[0].strip('\n|').replace('[', '').replace(']', '').split('=')
            # A matched line without '=' carries no value; skip it instead of crashing.
            if len(parts) > 1:
                df_musician.loc[i, "birthplace"] = parts[1].strip()
                find += 1
        time.sleep(random.uniform(0.1, 0.4))
        unsaved = True
        if null % max(1, checkpoint_every) == 0:
            df_musician.to_csv(r'musician_birth1.csv', index=True)
            unsaved = False
    if unsaved:
        df_musician.to_csv(r'musician_birth1.csv', index=True)

# We have already run this function and saved the result in "./musician_birth1.csv".
# There is no need to run it again, because it takes several hours.

# get_musician_birth_by_wiki()

In [15]:
# Reload the table produced by the Wikipedia pass.
df_musician = pd.read_csv('musician_birth1.csv', index_col=0)

# Musicians whose birthplace is still unknown.
new_df = df_musician[df_musician['birthplace'].isnull()]
print(len(new_df))

# Count the tracks whose artist is NOT in that unknown set.
l = set(new_df.name)
print(len(df_tracks[df_tracks['artist_name'].apply(lambda x : x not in l)]))
# df_tracks[df_tracks['artist_name'].apply(lambda x : x not in l)].dropna().groupby('year').count()

33403
400205


There are still 33403 musicians without birthplace information. Because many names are ambiguous, we retry the Wikipedia lookup with "(singer)" or "(band)" appended to the musician's name.

To save time, we only query musicians with an unknown birthplace who have more than 10 songs; there are 3633 of them.

In [17]:
# Number of tracks per artist, restricted to artists in `l`, turned into a
# boolean "more than 10 tracks" series.
df = (
    df_tracks[df_tracks['artist_name'].apply(lambda x : x in l)]
    .groupby('artist_name')
    .size()
    .sort_values()
    > 10
)
# Keep only the artists for which the flag is True.
df = df[df]
names = list(df.index)
print(len(names))

3633


In [None]:
# Maps musician name -> birthplace text found by the disambiguated lookup.
res = {}

def get_musician_birth_by_wiki(run = False):
    """Retry the Wikipedia lookup for ``names`` with " (singer)" / " (band)" titles.

    For every name in ``names`` the lead section of the page
    ``"<name> (singer)"`` and then ``"<name> (band)"`` is requested and
    searched for a ``birth_place`` line, then for an ``origin`` line. Hits are
    stored in the module-level ``res`` dict. Afterwards ``musician_birth1.csv``
    is reloaded, updated with those hits and saved as
    ``musician_birth_final.csv``.

    Args:
        run: The crawl only happens when this is truthy.

    Returns:
        None.
    """
    if not run:
        return
    api_url = "https://en.wikipedia.org/w/api.php"
    for name in tqdm(names):
        for suffix in (" (singer)", " (band)"):
            # The suffix must be part of the page title. Appending it to the
            # finished URL would attach it to `rvsection`, the last query
            # parameter, instead. `params` also URL-encodes the title.
            params = {
                'action': 'query',
                'prop': 'revisions',
                'rvprop': 'content',
                'format': 'xmlfm',
                'titles': name + suffix,
                'rvsection': 0,
            }
            r = requests.get(api_url, params=params)
            page_text = BeautifulSoup(r.content, 'html.parser').prettify()
            all_matches = re.findall(r"birth_place [^\n]+\n\|", page_text)
            if len(all_matches) == 0:
                all_matches = re.findall(r"origin [^\n]+\n\|", page_text)
            if len(all_matches):
                parts = all_matches[0].strip('\n|').replace('[', '').replace(']', '').split('=')
                # A matched line without '=' carries no value; try the next title.
                if len(parts) > 1:
                    res[name] = parts[1].strip()
                    break
        time.sleep(random.uniform(0.1, 0.4))
    print(len(res))

    df_musician = pd.read_csv('musician_birth1.csv', index_col=0)
    # Update by name rather than by row position, so the result does not
    # depend on the index being 0..n-1.
    found = df_musician['name'].isin(list(res))
    df_musician.loc[found, 'birthplace'] = df_musician.loc[found, 'name'].map(res)
    df_musician.to_csv(r'musician_birth_final.csv', index=True)

# We have already run this function and saved the result in "./musician_birth_final.csv".
# There is no need to run it again, because it takes several hours.
get_musician_birth_by_wiki(run = False)

After that, we have successfully fetched the birthplaces of 709 more musicians and updated the dataframe with this information. The number of tracks by artists with a known birthplace has increased to about 430K.

In [20]:
# Reload the final musician table.
df_musician = pd.read_csv('musician_birth_final.csv', index_col=0)
# Drop a stray `birth_place` column if the file has one. No code shown in this
# notebook creates it, so a freshly generated CSV will not contain it;
# `errors='ignore'` keeps the cell runnable in both cases.
df_musician = df_musician.drop(columns=['birth_place'], errors='ignore')

# Musicians whose birthplace is still unknown.
new_df = df_musician[df_musician['birthplace'].isnull()]
print(len(new_df))

# Count the tracks whose artist is NOT in that unknown set.
l = set(new_df.name)
print(len(df_tracks[df_tracks['artist_name'].apply(lambda x : x not in l)]))

32719
430865


We now have the birthplace information of the musicians. The next step is to fetch the latitude and longitude of these birthplaces, because we want to draw them on a map.

We used the APIs of Nominatim and LocationIQ to maximize the number of successful lookups and to balance the load so that we avoid rate limiting (10 requests per second for Nominatim, 1 request per second for LocationIQ).

In [2]:
def fetch_lat_lon_Nominatim(location, api_key = None):
    """Geocode ``location`` with the MapQuest-hosted Nominatim search API.

    Args:
        location: Free-text place name sent as the ``q`` parameter.
        api_key: MapQuest API key. Without it no request is made.

    Returns:
        tuple: ``(lat, lon)`` exactly as found in the first result of the JSON
        response, or ``(None, None)`` when the key is missing or the response
        is an empty list.

    Raises:
        requests.HTTPError: when the service answers with an error status.
    """
    if not api_key:
        print("Please add your api_key of Nominatim API")
        print("For more information, please visit ")
        print("https://developer.mapquest.com/documentation/open/nominatim-search/")
        # This return belongs inside the guard. It used to sit one level out,
        # which made the function return before sending any request.
        return None, None
    url = "http://open.mapquestapi.com/nominatim/v1/search.php"

    data = {
        'key': api_key,
        'q': location,
        'format': 'json',
        'limit': 1
    }

    # The timeout keeps a stalled connection from hanging a multi-hour crawl.
    response = requests.get(url, params=data, timeout=10)
    response.raise_for_status()

    response_json = json.loads(response.text)

    if len(response_json) < 1:
        return None, None
    return response_json[0]['lat'], response_json[0]['lon']

In [3]:
def fetch_lat_lon_locationiq(location, api_key = None):
    """Geocode ``location`` with the LocationIQ search API.

    Args:
        location: Free-text place name sent as the ``q`` parameter.
        api_key: LocationIQ API key. Without it no request is made.

    Returns:
        tuple: ``(lat, lon)`` exactly as found in the first result of the JSON
        response, or ``(None, None)`` when the key is missing or the response
        is an empty list.

    Raises:
        requests.HTTPError: when the service answers with an error status.
    """
    if not api_key:
        print("Please add your api_key of LocationIQ API")
        print("For more information, please visit ")
        print("https://locationiq.com/")
        # This return belongs inside the guard. It used to sit one level out,
        # which made the function return before sending any request.
        return None, None

    url = "https://us1.locationiq.com/v1/search.php"

    data = {
        'key': api_key,
        'q': location,
        'format': 'json',
    }

    # The timeout keeps a stalled connection from hanging a multi-hour crawl.
    response = requests.get(url, params=data, timeout=10)
    response.raise_for_status()

    response_json = json.loads(response.text)

    if len(response_json) < 1:
        return None, None
    return response_json[0]['lat'], response_json[0]['lon']

In [5]:
def fetch_all_lat_lon(run = False, nominatim_key = None, locationiq_key = None):
    """Geocode every birthplace in ``df_musician`` and dump the results as JSON.

    Each string birthplace is sent to ``fetch_lat_lon_Nominatim``. If that
    call raises, ``fetch_lat_lon_locationiq`` is tried after a one-second
    pause. Rows whose birthplace is not a string, and rows where both
    providers raise, get ``None`` coordinates. This keeps ``lat_res`` and
    ``lon_res`` at exactly one entry per row of ``df_musician``.

    Files written: ``lat_res.json``, ``lon_res.json`` and ``fail_dict.json``
    (row position -> birthplace for lookups where both providers raised).

    Args:
        run: The crawl only happens when this is truthy.
        nominatim_key: API key forwarded to ``fetch_lat_lon_Nominatim``.
        locationiq_key: API key forwarded to ``fetch_lat_lon_locationiq``.

    Returns:
        None.
    """
    if not run:
        return
    lat_res = []
    lon_res = []
    failed_location_dict = {}
    # Wrapping the column (not the enumerate) lets tqdm know the total.
    for i, location in enumerate(tqdm(df_musician['birthplace'])):
        lat = lon = None
        if isinstance(location, str):
            time.sleep(0.1)
            try:
                lat, lon = fetch_lat_lon_Nominatim(location, api_key=nominatim_key)
            except Exception:
                time.sleep(1)
                try:
                    lat, lon = fetch_lat_lon_locationiq(location, api_key=locationiq_key)
                except Exception:
                    # Record the failure but still append below, otherwise the
                    # result lists fall out of step with the dataframe rows.
                    failed_location_dict[i] = location
                    lat = lon = None
        lat_res.append(lat)
        lon_res.append(lon)

    with open('lat_res.json', 'w') as outfile:
        json.dump(lat_res, outfile)
    with open('lon_res.json', 'w') as outfile:
        json.dump(lon_res, outfile)
    with open('fail_dict.json', 'w') as outfile:
        json.dump(failed_location_dict, outfile)

# We have already run this function and saved the results in "./lat_res.json",
# "./lon_res.json" and "./fail_dict.json".
# There is no need to run it again, because it takes several hours.
fetch_all_lat_lon(run = False)

In [114]:
# Reload the final musician table.
df_musician = pd.read_csv('musician_birth_final.csv', index_col=0)
# Drop a stray `birth_place` column if the file has one. No code shown in this
# notebook creates it, so `errors='ignore'` keeps the cell runnable either way.
df_musician = df_musician.drop(columns=['birth_place'], errors='ignore')

print(len(df_musician[df_musician['birthplace'].isnull()]))

# Load the geocoding results saved by fetch_all_lat_lon.
with open('lat_res.json', 'r') as infile:
    lat_res = json.load(infile)
with open('lon_res.json', 'r') as infile:
    lon_res = json.load(infile)
with open('fail_dict.json', 'r') as infile:
    failed_location_dict = json.load(infile)

# The lists must hold one entry per musician row, otherwise pandas raises here.
df_musician['long'] = lon_res
df_musician['lat'] = lat_res
df_musician.to_csv(r'musician_birth_with_lat_long.csv', index=True)

df_tracks = pd.read_csv('audio_features.csv', index_col=0)

# Attach birthplace and coordinates to every track; tracks by artists that are
# not in df_musician keep missing values (left join).
df = pd.merge(df_tracks, df_musician, left_on='artist_name', right_on='name', how='left')

# Single .loc assignment instead of chained indexing, which may write to a
# temporary copy.
df.loc[df['birthplace'].isnull(), 'birthplace'] = None

# `index_col` is a read_csv argument and is rejected by to_csv. The index is
# written by default, and the next cell reads it back with index_col=0.
df.to_csv('feature_with_long_and_lat.csv')

32719


We now have all the information needed to draw the geographic distribution map.

In [8]:
# Load the merged track/birthplace table and keep only the rows that have a
# value in every column.
df = pd.read_csv('feature_with_long_and_lat.csv', index_col=0).dropna()
df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,time_signature,artist_name,track_name,track_id,popularity,year,name,birthplace,long,lat
0,0.171,0.33,5.0,-9.699,1.0,0.0329,0.707,0.00381,0.302,0.315,...,3.0,Etta James,At Last,4Hhv2vrOTy89HFRcjU3QOx,75,1960,Etta James,Los Angeles,-118.244476,34.054935
1,0.508,0.287,1.0,-12.472,1.0,0.0523,0.764,0.0,0.153,0.644,...,4.0,Ella Fitzgerald,Sleigh Ride,4ukUoXLuFzMixyZyabSGc4,70,1960,Ella Fitzgerald,Newport News,-76.432109,36.978645
2,0.579,0.502,8.0,-7.57,1.0,0.0513,0.733,0.0,0.281,0.836,...,4.0,Ella Fitzgerald,Frosty The Snowman,65irrLqfCMRiO3p87P4C0D,69,1960,Ella Fitzgerald,Newport News,-76.432109,36.978645
3,0.553,0.291,4.0,-10.426,0.0,0.0301,0.878,0.0,0.129,0.407,...,4.0,Sam Cooke,You Send Me,0BFEyqJ9DJXS7gKg0Kj46R,66,1960,Sam Cooke,"Clarksdale, Mississippi, U.S.",-90.570932,34.200109
4,0.482,0.569,0.0,-5.897,1.0,0.0328,0.617,0.0,0.155,0.609,...,4.0,Sam Cooke,Bring It On Home To Me,4yjz1aazw6R8ZURpGbCAkp,64,1960,Sam Cooke,"Clarksdale, Mississippi, U.S.",-90.570932,34.200109


In [11]:
import folium
from folium import plugins
from ipywidgets import interact
from folium.plugins import HeatMap

# Slider used to pick the year (the variable keeps its original name `radius`).
radius = widgets.IntSlider(value=1980, min=1960, max=2019, step=1, description='Year: ')


def response(year):
    """Return a folium map with a heat layer built from the rows of ``df`` for ``year``.

    The layer uses the ``lat`` and ``long`` columns of the rows whose ``year``
    equals the slider value.
    """
    heat_map = folium.Map(location=[48, -102], zoom_start=3)
    rows_of_year = df[df['year'] == year]
    heat_layer = plugins.HeatMap(rows_of_year[['lat', 'long']], radius=15)
    heat_map.add_child(heat_layer)
    return heat_map


interact(response, year=radius)

interactive(children=(IntSlider(value=1980, description='Year: ', max=2019, min=1960), Output()), _dom_classes…

<function __main__.response(year)>