# Music Analysis

In [42]:
import re
import sys
import time
import math
import random
import spotipy
import requests
import itertools
import collections
import pandas as pd
from tqdm import tqdm
import spotipy.util as util
from bs4 import BeautifulSoup
from collections import defaultdict
from spotipy.oauth2 import SpotifyClientCredentials

In [15]:
def get_track_features(client_id = None, client_secret = None):
    """Fetch Spotify tracks for the years 1960-2019 with their audio features and save them as CSV.

    Runs one ``year:<YYYY>`` track search per year and follows the result
    pages until there is no next page or a request fails. Audio features are
    then requested in batches of 50 track ids and everything is written to
    ``audio_features.csv`` in the current working directory.

    Args:
        client_id: Spotify application client id.
        client_secret: Spotify application client secret.

    Returns:
        None. When either credential is missing, a hint is printed and
        nothing is fetched or written.
    """
    if not client_id or not client_secret:
        print("Please add your client_id and client_secret")
        print("For more information, please visit ")
        print("https://developer.spotify.com/my-applications/#!/applications")
        return
    client_credentials_manager = SpotifyClientCredentials(client_id=client_id,
                                                          client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

    # Per-track metadata, collected as parallel lists (one entry per track).
    artist_name = []
    track_name = []
    track_id = []
    popularity = []
    years = []

    for year in tqdm(range(1960, 2020)):
        try:
            track_results = sp.search(q='year:{0}'.format(year), type='track', limit=50)
        except Exception as err:
            # Catch Exception (not a bare except) so Ctrl-C still interrupts the
            # crawl, and report why the remaining years are being skipped.
            print("Search for year {0} failed, stopping the crawl: {1}".format(year, err))
            break
        while track_results:
            page = track_results['tracks']
            for t in page['items']:
                # Only the first listed artist is recorded for each track.
                artist_name.append(t['artists'][0]['name'])
                track_name.append(t['name'])
                track_id.append(t['id'])
                popularity.append(t['popularity'])
                years.append(str(year))
            if not page['next']:
                break
            try:
                track_results = sp.next(page)
            except Exception as err:
                # Give up on the remaining pages of this year only; the outer
                # loop carries on with the next year.
                print("Paging for year {0} stopped early: {1}".format(year, err))
                break
    print("Get {0} songs from spotify".format(len(track_id)))

    # Get features
    feature_keys = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
                    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']

    audio_features = {key: [] for key in feature_keys}
    audio_features['artist_name'] = artist_name
    audio_features['track_name'] = track_name
    audio_features['track_id'] = track_id
    audio_features['popularity'] = popularity
    audio_features['year'] = years

    for i in tqdm(range(0, len(track_id), 50)):
        features = sp.audio_features(track_id[i: i + 50])
        for feature in features:
            for key in feature_keys:
                # A missing feature record (or a missing key) becomes None so
                # every column receives exactly one value per track.
                value = feature[key] if feature and key in feature else None
                audio_features[key].append(value)

    # Save to csv file
    df_tracks = pd.DataFrame(audio_features)
    df_tracks.to_csv(r'audio_features.csv')

In [16]:
# The tracks and their audio features have already been fetched and saved to
# "./audio_features.csv", so this function does not need to be run again.
# With both credentials left as None the call only prints a reminder.
client_id, client_secret = None, None
get_track_features(client_id=client_id, client_secret=client_secret)

Please add your client_id and client_secret
For more information, please visit 
https://developer.spotify.com/my-applications/#!/applications


In [17]:
# Load the saved track table; the first CSV column is used as the index.
df_tracks = pd.read_csv('audio_features.csv', index_col=0)
df_tracks.head(n=5)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artist_name,track_name,track_id,popularity,year
0,0.171,0.33,5.0,-9.699,1.0,0.0329,0.707,0.00381,0.302,0.315,174.431,182400.0,3.0,Etta James,At Last,4Hhv2vrOTy89HFRcjU3QOx,75,1960
1,0.508,0.287,1.0,-12.472,1.0,0.0523,0.764,0.0,0.153,0.644,154.759,175987.0,4.0,Ella Fitzgerald,Sleigh Ride,4ukUoXLuFzMixyZyabSGc4,70,1960
2,0.579,0.502,8.0,-7.57,1.0,0.0513,0.733,0.0,0.281,0.836,76.816,131733.0,4.0,Ella Fitzgerald,Frosty The Snowman,65irrLqfCMRiO3p87P4C0D,69,1960
3,0.553,0.291,4.0,-10.426,0.0,0.0301,0.878,0.0,0.129,0.407,96.217,165560.0,4.0,Sam Cooke,You Send Me,0BFEyqJ9DJXS7gKg0Kj46R,66,1960
4,0.482,0.569,0.0,-5.897,1.0,0.0328,0.617,0.0,0.155,0.609,70.843,165280.0,4.0,Sam Cooke,Bring It On Home To Me,4yjz1aazw6R8ZURpGbCAkp,64,1960


In [18]:
# Average every numeric column per year over rows that have no missing values.
# numeric_only=True keeps the string columns (artist_name, track_name,
# track_id) out of the mean; without it pandas >= 2.0 raises a TypeError.
df_by_year = (
    df_tracks
    .dropna()
    .groupby('year')
    .mean(numeric_only=True)
    .reset_index()
)
df_by_year.head()

Unnamed: 0,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,popularity
0,1960,0.464192,0.313453,5.0053,-14.693014,0.7152,0.100909,0.80923,0.253353,0.222327,0.494803,110.969244,215520.8149,3.7509,5.4481
1,1961,0.474276,0.320147,4.948585,-14.829432,0.720816,0.107911,0.799795,0.258537,0.218472,0.493421,110.856446,222618.542363,3.752826,5.115635
2,1962,0.463054,0.330539,5.049005,-14.832738,0.719772,0.10038,0.782238,0.254475,0.225328,0.484301,111.21002,214440.192719,3.749175,6.244124
3,1963,0.478339,0.35719,5.0907,-13.994034,0.7426,0.078978,0.758253,0.238693,0.220458,0.526543,112.315946,202682.4428,3.7692,7.717
4,1964,0.478219,0.37129,5.084208,-13.754991,0.747575,0.080756,0.746077,0.217783,0.228598,0.528866,113.116872,198727.756076,3.767377,8.243324


In [6]:
import plotly.graph_objects as go
from ipywidgets import widgets
from ipywidgets import interactive
import plotly.express as px

In [7]:
# Columns of df_by_year that can be selected in the dropdown.
virtualize_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'popularity',
]

feature_box = widgets.Dropdown(
    description='Feature: ',
    options=virtualize_features,
    value=virtualize_features[0],
)

# The figure starts out showing danceability against year.
trace1 = go.Scatter(x=df_by_year['year'], y=df_by_year['danceability'])

g = go.FigureWidget(
    data=[trace1],
    layout=go.Layout(
        title={'text': 'Music Features Over Years'},
        xaxis={'title': "Year"},
        yaxis={'title': "danceability"},
    ),
)


def response(change):
    """Show the newly selected feature: update the y-axis title and the plotted values."""
    selected = change['new']
    g.layout.yaxis.title = selected
    g.data[0].y = df_by_year[selected]


# Call response() whenever the dropdown's value changes.
feature_box.observe(response, names="value")

widgets.VBox([feature_box, g])

VBox(children=(Dropdown(description='Feature: ', options=('danceability', 'energy', 'key', 'loudness', 'mode',…

In [8]:
# Distinct artist names found in the track table, and how many there are.
all_musicians = df_tracks['artist_name'].unique().tolist()
print(len(all_musicians))

53981


In [9]:
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/
# https://stackoverflow.com/questions/45171646/how-to-get-birth-location-using-wikipedia-api

from SPARQLWrapper import SPARQLWrapper, JSON

def search_birthlocation(musician):
    """Look up a musician's place of birth on Wikidata and record the result.

    Exactly one entry is appended to the module-level
    ``musician_birth['birthplace']`` list per call: the label of the birth
    location from the first query result, or the string "NaN" when the
    request fails, returns no rows, or the first row carries no label.

    Args:
        musician: Artist name, matched against the English Wikidata label.

    Returns:
        None; the result is stored in ``musician_birth``.
    """
    endpoint_url = "https://query.wikidata.org/sparql"
    # Escape backslashes and double quotes so a name containing them cannot
    # end the SPARQL string literal early and invalidate the query.
    safe_name = str(musician).replace('\\', '\\\\').replace('"', '\\"')
    query = """SELECT DISTINCT ?item ?itemLabel ?birthLocation ?birthLocationLabel WHERE {{
              ?item (wdt:P31|wdt:P101|wdt:P106)/wdt:P279* wd:Q639669 ;
            rdfs:label "{0}"@en ;wdt:P19 ?birthLocation
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
            }}""".format(safe_name)
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    birthplace = "NaN"
    try:
        query_result = sparql.query().convert()
    except Exception:
        # Best effort: a failed lookup is recorded as missing. Catching
        # Exception instead of everything lets Ctrl-C stop a long crawl.
        query_result = None

    if query_result is not None:
        bindings = query_result["results"]["bindings"]
        # Only the first result row is considered.
        if bindings and 'birthLocationLabel' in bindings[0]:
            birthplace = bindings[0]['birthLocationLabel']['value']

    musician_birth['birthplace'].append(birthplace)


# One 'name' and one 'birthplace' entry per artist. search_birthlocation()
# appends exactly one birthplace per call, so the two lists stay aligned.
musician_birth = {key:[] for key in ['name', 'birthplace']}

for musician in tqdm(all_musicians):
    time.sleep(0.1)  # short pause between consecutive lookups
    musician_birth['name'].append(musician)
    search_birthlocation(musician)

# The full dictionary (one entry per artist) is no longer printed; the saved
# CSV is the record of the crawl.
df_musician = pd.DataFrame(musician_birth)
# `index` takes a bool; the previous `index=[0]` only worked because a
# non-empty list is truthy.
df_musician.to_csv(r'musician_birth.csv', index=True)

100%|██████████| 53981/53981 [7:33:43<00:00,  1.98it/s]    




In [64]:
# Load the saved birthplace table; the first CSV column is used as the index.
df_musician = pd.read_csv('musician_birth.csv', index_col=0)
df_musician.head(n=5)

Unnamed: 0,name,birthplace
0,Etta James,Los Angeles
1,Ella Fitzgerald,Newport News
2,Sam Cooke,
3,Neil Sedaka,Brooklyn
4,Dean Martin,Steubenville


In [79]:
# Fallback pass: for rows whose birthplace is still missing, fetch the lead
# section of the English Wikipedia page titled like the artist and take the
# `birth_place` (or, failing that, `origin`) field from it.
null = 0  # rows visited that had no birthplace
find = 0  # of those, rows filled in from Wikipedia
for i, name in enumerate(df_musician.name):
    # Progress report every 1000 rows: row number, filled so far, missing so far.
    if i != 0 and i % 1000 == 0:
        print(i)
        print(find)
        print(null)
    # NOTE(review): rows below 34369 are skipped -- presumably they were
    # handled in an earlier run; confirm before re-running from scratch.
    if i < 34369:
        continue
    if pd.notna(df_musician.loc[i, "birthplace"]):
        continue
    null += 1
    try:
        # Passing the title through `params` lets requests URL-encode it, so
        # names containing '&', '#', '+' or spaces no longer corrupt the query.
        r = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params={
                "action": "query",
                "prop": "revisions",
                "rvprop": "content",
                "format": "xmlfm",
                "titles": name,
                "rvsection": 0,
            },
            timeout=30,
        )
    except requests.RequestException as err:
        # One failed request should not abort a crawl of thousands of rows.
        print("Request for {0!r} failed: {1}".format(name, err))
        time.sleep(random.uniform(0.1, 0.4))
        continue
    page_text = BeautifulSoup(r.content, 'html.parser').prettify()
    all_matches = re.findall(r"birth_place [^\n]+\n\|", page_text)
    if len(all_matches) == 0:
        all_matches = re.findall(r"origin [^\n]+\n\|", page_text)
    if len(all_matches):
        cleaned = all_matches[0].strip('\n|').replace('[', '').replace(']', '')
        parts = cleaned.split('=')
        # A matched line without '=' has no value to extract; skip it instead
        # of raising IndexError part-way through the crawl.
        if len(parts) > 1:
            df_musician.loc[i, "birthplace"] = parts[1].strip()
            find += 1
    # Randomised pause between requests.
    time.sleep(random.uniform(0.1, 0.4))

1000
0
0
2000
0
0
3000
0
0
4000
0
0
5000
0
0
6000
0
0
7000
0
0
8000
0
0
9000
0
0
10000
0
0
11000
0
0
12000
0
0
13000
0
0
14000
0
0
15000
0
0
16000
0
0
17000
0
0
18000
0
0
19000
0
0
20000
0
0
21000
0
0
22000
0
0
23000
0
0
24000
0
0
25000
0
0
26000
0
0
27000
0
0
28000
0
0
29000
0
0
30000
0
0
31000
0
0
32000
0
0
33000
0
0
34000
0
0
35000
191
520
36000
486
1350
37000
778
2166
38000
1056
2996
39000
1309
3820
40000
1578
4674
41000
1869
5511
42000
2118
6341
43000
2353
7174
44000
2586
8048
45000
2823
8881
46000
3058
9714
47000
3230
10597
48000
3410
11449
49000
3559
12331
50000
3673
13237
51000
3779
14172
52000
3904
15103
53000
3989
16053


In [80]:
# Save the table including the Wikipedia fallback results. `index` takes a
# bool; the previous `index=[0]` only worked because a non-empty list is truthy.
df_musician.to_csv(r'musician_birth1.csv', index=True)


In [82]:
# Number of artists whose birthplace is still missing.
print(int(df_musician['birthplace'].isnull().sum()))

32793


In [83]:
# Show the first five rows of the birthplace table.
df_musician.head(n=5)

Unnamed: 0,name,birthplace
0,Etta James,Los Angeles
1,Ella Fitzgerald,Newport News
2,Sam Cooke,"Clarksdale, Mississippi, U.S."
3,Neil Sedaka,Brooklyn
4,Dean Martin,Steubenville


In [91]:
# Artists whose birthplace is missing, and the tracks credited to them.
new_df = df_musician[df_musician['birthplace'].isnull()]
l = set(new_df.name)
# Build the membership mask once and reuse it for both the count and the table.
by_unresolved_artist = df_tracks['artist_name'].apply(lambda artist: artist in l)
tracks_unresolved = df_tracks[by_unresolved_artist]
print(len(tracks_unresolved))
tracks_unresolved.dropna().groupby('year').count()

196177


Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,popularity
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1960,1653.4153,1187.062508,17033.0,-44083.882,2345.0,330.9747,2682.870589,638.256999,766.524,1900.2101,372826.894,626817858.0,12343.0,10031
1961,1787.7321,1254.92779,16884.0,-46949.244,2493.0,437.6259,2633.807698,779.675179,747.4705,1948.0515,386907.701,693145778.0,12994.0,10419
1962,1501.0382,1057.7751,14749.0,-41078.628,2104.0,358.1059,2301.364679,683.888097,645.8452,1608.5425,337302.699,580701034.0,11147.0,9675
1963,1518.37,1197.42726,15002.0,-37830.824,2182.0,271.9215,2198.09888,671.051746,656.6711,1743.6042,341114.01,557038637.0,11309.0,12933
1964,1549.8583,1212.210571,15784.0,-41577.545,2294.0,288.21,2315.289642,732.914871,671.1905,1751.89803,354711.78,562955548.0,11724.0,14361
1965,1467.7152,1210.746675,14697.0,-35991.276,2101.0,240.4271,1987.897157,644.122178,595.3204,1707.0461,333412.604,510277845.0,10963.0,15315
1966,1537.6059,1241.54909,14416.0,-34644.179,2092.0,416.239,1806.618652,534.899273,645.5409,1705.7628,327240.905,489666993.0,10826.0,18622
1967,1310.9986,1116.577403,12516.0,-30132.443,1825.0,339.7825,1504.298604,378.022515,540.3871,1496.3595,288728.788,456113412.0,9446.0,20325
1968,1414.9428,1348.763807,13935.0,-31834.614,2003.0,288.6427,1445.224815,438.509825,665.4721,1662.2386,319364.586,546906120.0,10533.0,28879
1969,1433.9651,1423.75369,13997.0,-30887.56,1947.0,323.5203,1392.106283,439.789947,611.8943,1621.45498,317430.803,589368654.0,10456.0,30073


In [88]:
# Check that a known artist appears in the birthplace table.
print(any(artist == 'Etta James' for artist in df_musician.name))

True
