# MusicBrainz artist lookup

Get artist information, including place name, for each artist that has a music brainz id in my data set generated by the [lastfm_downloader](lastfm_downloader.ipynb).

Documentation:
 - Web service: https://wiki.musicbrainz.org/Development/XML_Web_Service/Version_2/Search
 - Artist entities: https://musicbrainz.org/doc/Artist
 - Area entities: https://musicbrainz.org/doc/Area

Sample queries:
 - Artist: https://musicbrainz.org/ws/2/artist/d4659efb-b8eb-4f03-95e9-f69ce35967a9
 - Area: https://musicbrainz.org/ws/2/area/0a70f24b-1263-4341-8d70-17b8df84154f?inc=area-rels

In [1]:
import pandas as pd, requests, time, json

pause_standard = 0.1
pause_exceeded_rate = 19

In [2]:
# configure URLs and user-agent header
artist_name_url = u'https://musicbrainz.org/ws/2/artist/?query=artist:{}&fmt=json'
artist_id_url = u'https://musicbrainz.org/ws/2/artist/{}?fmt=json'
area_id_url = u'https://musicbrainz.org/ws/2/area/{}?inc=area-rels&fmt=json'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

## Define functions

In [3]:
def make_request(url, headers=headers):
    
    global pause_standard
    global pause_exceeded_rate
    
    time.sleep(pause_standard)
    #print url
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200: #if status OK
        return {'status_code':response.status_code, 'json':response.json()}
    
    elif response.status_code == 503: #if status error (server busy or rate limit exceeded)
        if 'exceeding the allowable rate limit' in response.json()['error']:
            #print '\n{}\n{}\npausing {} secs and re-try\n'.format(response.status_code, response.json(), pause_exceeded_rate)
            pause_standard = pause_standard + 0.1
            print 'pause_standard={}'.format(pause_standard),
            time.sleep(pause_exceeded_rate)
        return make_request(url)
    
    else: #if other status code, print info and return None for caller to handle
        print 'make_request error code {} {}'.format(response.status_code, response.json())
        return None

In [4]:
def geocode(address):
    time.sleep(pause) #pause for some duration before each request, to not hammer their server
    url = u'http://maps.googleapis.com/maps/api/geocode/json?address={}&sensor=false' #api url with placeholders
    request = url.format(address) #fill in the placeholder with a variable
    response = requests.get(request) #send the request to the server and get the response
    data = json.loads(response.text) #convert the response json string into a dict
    
    if len(data['results']) > 0: #if google was able to geolocate our address, extract lat-long from result
        latitude = data['results'][0]['geometry']['location']['lat']
        longitude = data['results'][0]['geometry']['location']['lng']
        return '{},{}'.format(latitude, longitude)

In [5]:
def get_artist_id_by_name(name):
    try:
        response = make_request(artist_name_url.format(name))
        if response is not None:
            result = response['json']
            artist_id = result['artists'][0]['id']
            return artist_id
    except:
        print 'get_artist_id_by_name error: {}'.format(response)

In [6]:
def get_artist_by_id(artist_id):
    try:
        response = make_request(artist_id_url.format(artist_id))
        if response is not None:
            result = response['json']
            data = {'id':artist_id,
                    'name':result['name'],            
                    'type':result['type'],
                    'gender':result['gender'],
                    'country':result['country'],
                    'begin_date':None,
                    'end_date':None,
                    'area_id':None,
                    'area_name':None,
                    'area_name_full':None,
                    'area_latlng':None,
                    'begin_area_id':None,
                    'begin_area_name':None,
                    'begin_area_name_full':None,
                    'begin_area_latlng':None,
                    'place':None,
                    'place_id':None,
                    'place_full':None,
                    'place_latlng':None}

            if result['life-span'] is not None and 'begin' in result['life-span'] and 'end' in result['life-span']:
                data['begin_date'] = result['life-span']['begin']
                data['end_date'] = result['life-span']['end']
            if result['area'] is not None and 'id' in result['area'] and 'name' in result['area']:
                data['area_id'] = result['area']['id']
                data['area_name'] = result['area']['name']
            if result['begin_area'] is not None and 'id' in result['begin_area'] and 'name' in result['begin_area']:
                data['begin_area_id'] = result['begin_area']['id']
                data['begin_area_name'] = result['begin_area']['name']
            
            # populate place with begin_area_name if it's not null, else area_name if it's not null, else None
            if data['begin_area_name'] is not None:
                data['place'] = data['begin_area_name']
                data['place_id'] = data['begin_area_id']
            elif data['area_name'] is not None:
                data['place'] = data['area_name']
                data['place_id'] = data['area_id']
            
            return data
    
    except:
        print 'get_artist_by_id error: {}'.format(response)

In [7]:
def get_area(area_id, area_str=''):
    try: 
        response = make_request(area_id_url.format(area_id))
        if response is not None:
            result = response['json']
            
            if area_str == '':
                area_str = result['name']
            
            if 'relations' in result:
                for relation in result['relations']:
                    if relation['direction']=='backward' and relation['type']=='part of':
                        area_id = relation['area']['id']
                        area_str = u'{}, {}'.format(area_str, relation['area']['name'])
                        return area_id, area_str
            else:
                print 'get_area no relations error: {}'.format(result)
            return None, area_str
    except:
        print 'get_area error: {}'.format(response)

In [8]:
def get_place_full_name_by_id(area_id):
    area_name=''
    while area_id is not None:
        area_id, area_name = get_area(area_id, area_name)
    return area_name

In [29]:
def make_artists_df(artist_ids, row_labels=None, df=None, status_interval=10):
    
    # create a list of row labels if caller didn't pass one in
    if row_labels is None:
        row_labels = range(len(artist_ids))
    
    # create a new dataframe if caller didn't pass an existing one in
    cols = ['id', 'name', 'type', 'gender', 'country', 'begin_date', 'end_date', 'begin_area_name', 
            'begin_area_name_full', 'begin_area_latlng', 'area_name', 'area_name_full', 'area_latlng',
            'place_id', 'place', 'place_full', 'place_latlng']
    if not isinstance(df, pd.DataFrame):
        df = pd.DataFrame(columns=cols)
    
    start_time = time.time()
    for artist_id, n in zip(artist_ids, row_labels):

        if n % status_interval == 0 :
            print n,
            df.to_csv('data/mb.csv', index=False, encoding='utf-8')

        try:
            # get the artist info object
            artist = get_artist_by_id(artist_id)

            # create (or update) a df row containing the data from this artist object
            df.loc[n] = [ artist[col] for col in cols ]

            # update the row's place_full cell with the full place name
            df.loc[n, 'place_full'] = get_place_full_name_by_id(artist['place_id'])

        except Exception as e:
            print '#{} failed'.format(n)
            print e
            pass

    df.to_csv('data/mb.csv', index=False, encoding='utf-8')
    finish_time = time.time()
    print '\nprocessed {:,} artists in {:,} seconds'.format(len(artist_ids), round(finish_time-start_time, 2))
    
    return df

## Test it with a demo

In [10]:
name = 'david bowie'
artist_id = get_artist_id_by_name(name)
artist = get_artist_by_id(artist_id)
artist['place_full'] = get_place_full_name_by_id(artist['place_id'])
artist['place_full']

u'Brixton, Lambeth, London, England, United Kingdom'

## Now run it

In [11]:
scrobbles = pd.read_csv('data/lastfm_scrobbles.csv', encoding='utf-8')
artist_ids = scrobbles['artist_mbid'].dropna().unique()
len(artist_ids)

12501

In [12]:
df = make_artists_df(artist_ids)

0 10 pause_standard=0.2 20 pause_standard=0.3 30 40 50 60 70 80 pause_standard=0.4 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 310 320 330 340 350 360 370 380 390 400 410 420 430 440 450 460 470 480 490 500 510 520 530 540 550 560 570 580 590 600 pause_standard=0.5 610 620 630 640 650 660 670 680 690 700 710 720 730 740 750 760 770 780 790 pause_standard=0.6 800 810 820 830 840 850 860 870 880 890 900 910 920 930 940 950 960 970 980 990 1000 1010 1020 1030 1040 1050 1060 1070 1080 1090 1100 1110 1120 1130 1140 1150 1160 1170 1180 1190 1200 1210 1220 1230 1240 1250 1260 1270 1280 1290 1300 1310 1320 1330 make_request error code 404 {u'error': u'Not Found'}
#1335 failed
1340 1350 1360 1370 1380 1390 1400 1410 1420 1430 1440 1450 1460 1470 1480 1490 1500 1510 1520 1530 1540 1550 1560 1570 1580 1590 1600 1610 1620 1630 1640 1650 1660 1670 1680 1690 1700 1710 1720 1730 1740 1750 1760 1770 1780 1790 1800 1810 1820 1830 1840 1850 1860 1870 1880 1890 

In [13]:
df.head()

Unnamed: 0,id,name,type,gender,country,begin_date,end_date,begin_area_name,begin_area_name_full,begin_area_latlng,area_name,area_name_full,area_latlng,place_id,place,place_full,place_latlng
0,4d550bdc-7c0b-4a50-bd36-18584ad5fb70,Slim Twig,Person,Male,CA,1988,,Toronto,,,Canada,,,74b24e62-d2fe-42d2-9d96-31f2da756c77,Toronto,"Toronto, Ontario, Canada",
1,469d6414-1f06-43de-80d5-17762d4a356a,Weyes Blood,Person,Female,US,,,Santa Monica,,,United States,,,dbacf2e3-7e3e-4cee-8804-999b109285fa,Santa Monica,"Santa Monica, Los Angeles County, California, ...",
2,3754dc74-381e-4237-bd44-65f5600a4d88,Wooden Shjips,Group,,US,2003,,San Francisco,,,United States,,,83f22bb6-4631-443c-bace-9fae8540362a,San Francisco,"San Francisco, California, United States",
3,8546949d-f46c-45ab-8391-85b26dda6b65,Tim Buckley,Person,Male,US,1947-02-14,1975-06-29,"Washington, D.C.",,,United States,,,af59135f-38b5-4ea4-b4e2-dd28c5f0bad7,"Washington, D.C.","Washington, D.C., United States",
4,65aac2dc-216c-4b1a-8501-45567c901c0e,Wipers,Group,,US,1977,2001,Portland,,,United States,,,2b748d6e-bc1c-4434-9f7b-ecd6332bc557,Portland,"Portland, Multnomah County, Oregon, United States",


## Re-try any failed rows one more time

In [27]:
# first get all the rows missing place_full (ie, row was created but couldn't get full place name)
rows_missing_place_full = list(df[pd.isnull(df['place_full'])].index)

# then get all the row labels missing in the df (due to errors that prevented row creation)
missing_row_labels = [ label for label in range(len(artist_ids)) if label not in df.index ]

# combine the two lists then get the artist mbid for each
row_labels_to_retry = sorted(rows_missing_place_full + missing_row_labels)
artist_ids_to_retry = [ artist_ids[label] for label in row_labels_to_retry ]

print '{} artists to retry'.format(len(artist_ids_to_retry))

119 artists to retry


In [30]:
df = make_artists_df(artist_ids_to_retry, row_labels_to_retry, df, status_interval=1)

1335 make_request error code 404 {u'error': u'Not Found'}
#1335 failed
'NoneType' object has no attribute '__getitem__'
4816 make_request error code 404 {u'error': u'Not Found'}
#4816 failed
'NoneType' object has no attribute '__getitem__'
8147 8148 8149 8150 8571 8572 10871 10873 10874 10899 10900 10905 10928 10929 10935 10951 10953 10954 10959 10960 10961 10962 10965 10966 10968 10972 11018 11019 11020 11024 11027 11028 11080 11092 11093 11094 11096 11113 11117 11120 11123 11129 11132 11146 11147 11150 11158 11166 11167 11172 11173 11180 11181 11183 11185 11187 11188 11189 11191 11194 11197 11205 11211 11212 11226 11227 11230 11231 11232 11238 11240 11241 11242 11244 11245 11246 11250 11260 11322 11327 11330 11332 11334 11335 11341 11346 11348 11352 11353 11363 11371 11372 11398 11399 11402 11403 11404 11405 11406 11407 11409 11412 11432 11436 11439 11441 11444 11446 11447 11448 11449 11450 11452 11453 11458 11861 make_request error code 404 {u'error': u'Not Found'}
#11861 failed
'No

In [31]:
df.head()

Unnamed: 0,id,name,type,gender,country,begin_date,end_date,begin_area_name,begin_area_name_full,begin_area_latlng,area_name,area_name_full,area_latlng,place_id,place,place_full,place_latlng
0,4d550bdc-7c0b-4a50-bd36-18584ad5fb70,Slim Twig,Person,Male,CA,1988,,Toronto,,,Canada,,,74b24e62-d2fe-42d2-9d96-31f2da756c77,Toronto,"Toronto, Ontario, Canada",
1,469d6414-1f06-43de-80d5-17762d4a356a,Weyes Blood,Person,Female,US,,,Santa Monica,,,United States,,,dbacf2e3-7e3e-4cee-8804-999b109285fa,Santa Monica,"Santa Monica, Los Angeles County, California, ...",
2,3754dc74-381e-4237-bd44-65f5600a4d88,Wooden Shjips,Group,,US,2003,,San Francisco,,,United States,,,83f22bb6-4631-443c-bace-9fae8540362a,San Francisco,"San Francisco, California, United States",
3,8546949d-f46c-45ab-8391-85b26dda6b65,Tim Buckley,Person,Male,US,1947-02-14,1975-06-29,"Washington, D.C.",,,United States,,,af59135f-38b5-4ea4-b4e2-dd28c5f0bad7,"Washington, D.C.","Washington, D.C., United States",
4,65aac2dc-216c-4b1a-8501-45567c901c0e,Wipers,Group,,US,1977,2001,Portland,,,United States,,,2b748d6e-bc1c-4434-9f7b-ecd6332bc557,Portland,"Portland, Multnomah County, Oregon, United States",


In [32]:
# OK, one final check - see how many artist ids did not make it into the final dataframe
# first get all the rows missing place_full (ie, row was created but couldn't get full place name)
rows_missing_place_full = list(df[pd.isnull(df['place_full'])].index)

# then get all the row labels missing in the df (due to errors that prevented row creation)
missing_row_labels = [ label for label in range(len(artist_ids)) if label not in df.index ]

print len(rows_missing_place_full)
print len(missing_row_labels)

0
4
