# MusicBrainz artist lookup

Documentation:
 - Web service: https://wiki.musicbrainz.org/Development/XML_Web_Service/Version_2/Search
 - Artist entities: https://musicbrainz.org/doc/Artist
 - Area entities: https://musicbrainz.org/doc/Area

Sample queries:
 - Artist: https://musicbrainz.org/ws/2/artist/d4659efb-b8eb-4f03-95e9-f69ce35967a9
 - Area: https://musicbrainz.org/ws/2/area/0a70f24b-1263-4341-8d70-17b8df84154f?inc=area-rels

In [1]:
import pandas as pd, requests, time, json

# seconds to sleep before every web service request; make_request() raises this
# by 0.1 each time the server reports that the rate limit was exceeded
pause_standard = 0.1
# seconds to sleep after a rate-limit (HTTP 503) response before re-trying
pause_exceeded_rate = 19

In [2]:
# configure URLs and user-agent header
# each URL template has a single {} placeholder that is filled in with str.format():
# an artist name to search for, an artist id, or an area id, respectively.
# fmt=json asks the web service for a JSON response (the callers decode it as JSON).
artist_name_url = u'https://musicbrainz.org/ws/2/artist/?query=artist:{}&fmt=json'
artist_id_url = u'https://musicbrainz.org/ws/2/artist/{}?fmt=json'
# inc=area-rels also requests the area's relations, which get_area() walks through
area_id_url = u'https://musicbrainz.org/ws/2/area/{}?inc=area-rels&fmt=json'
# NOTE(review): this User-Agent imitates a desktop browser; MusicBrainz's API
# guidelines appear to ask clients to send a User-Agent that identifies the
# application and a contact address -- confirm and replace if so
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

In [3]:
def make_request(url, headers=headers, max_retries=20):
    """
    Send a GET request and return the decoded JSON response.

    Sleeps `pause_standard` seconds before every attempt. An HTTP 503 response
    is re-tried up to `max_retries` times; when its body says the rate limit was
    exceeded, the global `pause_standard` is raised by 0.1 seconds and the
    function also sleeps `pause_exceeded_rate` seconds before re-trying.

    Parameters
    ----------
    url : the full URL to request
    headers : dict of HTTP headers, sent with every attempt including re-tries
    max_retries : how many times a 503 response is re-tried before giving up

    Returns
    -------
    a dict with keys 'status_code' and 'json' if the server answered 200,
    otherwise None (after printing what went wrong) for the caller to handle
    """
    global pause_standard

    # loop rather than recurse, so a long outage cannot exhaust the call stack
    for _ in range(max_retries + 1):
        time.sleep(pause_standard)
        response = requests.get(url, headers=headers)

        if response.status_code == 200: #if status OK
            return {'status_code': response.status_code, 'json': response.json()}

        error_text = _error_text(response)

        if response.status_code != 503: #if other status code, print info and return None for caller to handle
            print(u'\nmake_request error code {} {}\n'.format(response.status_code, error_text))
            return None

        # status 503: server busy or rate limit exceeded, so re-try
        if 'exceeding the allowable rate limit' in error_text:
            # slow down all future requests, then back off before the re-try;
            # rounding stops float error accumulating (0.30000000000000004 etc.)
            pause_standard = round(pause_standard + 0.1, 2)
            print('pause_standard={}'.format(pause_standard))
            time.sleep(pause_exceeded_rate)

    print(u'\nmake_request gave up on {} after {} re-tries\n'.format(url, max_retries))
    return None


def _error_text(response):
    """Return the 'error' message from a JSON error body, or the raw body text."""
    try:
        body = response.json()
    except ValueError:
        # error pages are not guaranteed to be JSON
        return response.text
    if isinstance(body, dict) and 'error' in body:
        return u'{}'.format(body['error'])
    return response.text

In [4]:
def geocode(address):
    """
    Geocode an address with the Google Maps geocoding web service.

    Parameters
    ----------
    address : the address or place name to look up

    Returns
    -------
    the first result's coordinates as a 'lat,lng' string, or None if the
    service returned no results
    """
    # NOTE(review): Google's geocoding service now appears to require an API key
    # and HTTPS -- confirm before relying on this function
    time.sleep(pause_standard) #pause for some duration before each request, to not hammer their server
    url = u'http://maps.googleapis.com/maps/api/geocode/json'
    # let requests percent-encode the address, so spaces, '&', '#' and non-ASCII
    # characters cannot corrupt the query string
    params = {'address': address, 'sensor': 'false'}
    response = requests.get(url, params=params) #send the request to the server and get the response
    data = response.json() #convert the response json string into a dict

    results = data.get('results') or []
    if not results:
        return None

    #google was able to geolocate our address, so extract lat-long from the first result
    location = results[0]['geometry']['location']
    return '{},{}'.format(location['lat'], location['lng'])

In [5]:
def get_artist_id_by_name(name):
    """
    Search the web service for an artist name and return the top match's id.

    Parameters
    ----------
    name : the artist name to search for

    Returns
    -------
    the 'id' of the first artist in the search results, or None if the request
    failed, raised an error, or found no artists
    """
    response = None  # bound up front so the error message below can always print it
    try:
        # percent-encode the name so characters such as '&', '#' or '/' cannot
        # break the query string; quote() is given UTF-8 bytes so that non-ASCII
        # names work on both Python 2 and Python 3
        if not isinstance(name, bytes):
            name = name.encode('utf-8')
        query = requests.utils.quote(name, safe='')

        response = make_request(artist_name_url.format(query))
        if response is None:
            return None

        artists = response['json'].get('artists') or []
        if not artists:
            print(u'get_artist_id_by_name: no artists found in {}'.format(response))
            return None
        return artists[0]['id']

    # Exception (not a bare except) so that Ctrl-C can still interrupt a long run
    except Exception as e:
        print(u'get_artist_id_by_name error: {} ({!r})'.format(response, e))
        return None

In [6]:
def get_artist_by_id(artist_id):
    """
    Fetch one artist from the web service and flatten it into a dict.

    Parameters
    ----------
    artist_id : the artist's id, as used in the artist lookup URL

    Returns
    -------
    a dict with keys id, name, type, gender, country, begin_date, end_date,
    area_id, area_name, begin_area_id, begin_area_name, place, place_id, plus
    area_name_full, area_latlng, begin_area_name_full, begin_area_latlng,
    place_full and place_latlng, which are always None here (left for the
    caller to fill in). Any value the response lacks is None.
    Returns None if the request failed or raised an error.
    """
    response = None  # bound up front so the error message below can always print it
    try:
        response = make_request(artist_id_url.format(artist_id))
        if response is None:
            return None
        result = response['json']

        # every field is optional: .get() keeps one absent key from discarding the artist
        life_span = result.get('life-span') or {}
        area_id, area_name = _area_id_and_name(result.get('area'))
        begin_area_id, begin_area_name = _area_id_and_name(result.get('begin_area'))

        # populate place with begin_area_name if it's not null, else area_name if it's not null, else None
        if begin_area_name is not None:
            place, place_id = begin_area_name, begin_area_id
        elif area_name is not None:
            place, place_id = area_name, area_id
        else:
            place, place_id = None, None

        return {'id': artist_id,
                'name': result.get('name'),
                'type': result.get('type'),
                'gender': result.get('gender'),
                'country': result.get('country'),
                'begin_date': life_span.get('begin'),
                'end_date': life_span.get('end'),
                'area_id': area_id,
                'area_name': area_name,
                'area_name_full': None,
                'area_latlng': None,
                'begin_area_id': begin_area_id,
                'begin_area_name': begin_area_name,
                'begin_area_name_full': None,
                'begin_area_latlng': None,
                'place': place,
                'place_id': place_id,
                'place_full': None,
                'place_latlng': None}

    # Exception (not a bare except) so that Ctrl-C can still interrupt a long run
    except Exception as e:
        print(u'get_artist_by_id error: {} ({!r})'.format(response, e))
        return None


def _area_id_and_name(area):
    """Return (id, name) of an area object, or (None, None) unless it has both keys."""
    if isinstance(area, dict) and 'id' in area and 'name' in area:
        return area['id'], area['name']
    return None, None

In [7]:
def get_area(area_id, area_str=''):
    """
    Look up one area and step to the area that it is part of.

    Parameters
    ----------
    area_id : the id of the area to look up
    area_str : the place name built so far; if empty, it starts as this area's name

    Returns
    -------
    a tuple (parent_area_id, area_str). If the area has a backward 'part of'
    relation, parent_area_id is that related area's id and its name has been
    appended to area_str after a comma. Otherwise parent_area_id is None and
    area_str is returned as built so far. A tuple is returned even when the
    request fails or raises, so callers can always unpack the result.
    """
    response = None  # bound up front so the error message below can always print it
    try:
        response = make_request(area_id_url.format(area_id))
        if response is None:
            return None, area_str
        result = response['json']

        if area_str == '':
            area_str = result['name']

        if 'relations' not in result:
            print('get_area no relations error: {}'.format(result))
            return None, area_str

        for relation in result['relations']:
            if relation.get('direction') == 'backward' and relation.get('type') == 'part of':
                parent = relation['area']
                return parent['id'], u'{}, {}'.format(area_str, parent['name'])

        # no containing area was found: this is the top of the chain
        return None, area_str

    # Exception (not a bare except) so that Ctrl-C can still interrupt a long run
    except Exception as e:
        print(u'get_area error: {} ({!r})'.format(response, e))
        return None, area_str

In [8]:
def get_place_full_name_by_id(area_id):
    """
    Build a full place name by following an area up through its parent areas.

    Calls get_area() repeatedly, passing along the name built so far, until it
    reports no further parent area.

    Parameters
    ----------
    area_id : the id of the area to start from, or None

    Returns
    -------
    the accumulated place name (e.g. a comma-separated chain of area names), or
    an empty string if area_id is None or the first lookup yields nothing
    """
    area_name = ''
    visited = set()

    # stop at the top of the chain, or if an area repeats (which would otherwise loop forever)
    while area_id is not None and area_id not in visited:
        visited.add(area_id)
        step = get_area(area_id, area_name)
        if step is None:
            # the lookup produced no result: keep what was built instead of
            # failing to unpack None
            break
        area_id, area_name = step

    return area_name

## Test it with a demo

In [9]:
# search for an artist by name, fetch that artist's details by id, then expand
# the artist's place id into a full place name and display it
name = 'david bowie'
artist_id = get_artist_id_by_name(name)
artist = get_artist_by_id(artist_id)
artist['place_full'] = get_place_full_name_by_id(artist['place_id'])
artist['place_full']

u'Brixton, Lambeth, London, England, United Kingdom'

## Now run it

In [10]:
# empty results table; its columns are grouped here by what they describe
artist_columns = ['id', 'name', 'type', 'gender', 'country', 'begin_date', 'end_date']
begin_area_columns = ['begin_area_name', 'begin_area_name_full', 'begin_area_latlng']
area_columns = ['area_name', 'area_name_full', 'area_latlng']
place_columns = ['place_id', 'place', 'place_full', 'place_latlng']

df = pd.DataFrame(columns=artist_columns + begin_area_columns + area_columns + place_columns)

In [None]:
# load the scrobbles file, then collect each distinct, non-null artist id once
scrobbles = pd.read_csv('data/lastfm_scrobbles.csv', encoding='utf-8')
known_mbids = scrobbles['artist_mbid'].dropna()
artist_ids = known_mbids.unique()
len(artist_ids)

12484

In [None]:
start_time = time.time()

# look up every artist id, adding one row to df per artist that could be retrieved
for n, artist_id in enumerate(artist_ids):

    # every 10th artist: report progress and checkpoint the results so far to disk
    if n % 10 == 0:
        print(n)
        df.to_csv('data/mb.csv', index=False, encoding='utf-8')

    try:
        # get the artist info object
        artist = get_artist_by_id(artist_id)
        if artist is None:
            print('#{} failed: no data retrieved for artist {}'.format(n, artist_id))
            continue

        # create a new df row containing the data from this artist object, in df's column order
        df.loc[n] = [artist[column] for column in df.columns]

        # update the row's place_full cell with the full place name; this is done
        # after the row is saved so a failed place lookup does not lose the artist
        df.loc[n, 'place_full'] = get_place_full_name_by_id(artist['place_id'])

    # Exception (not a bare except) so that Ctrl-C can still interrupt this long run
    except Exception as e:
        print('#{} failed: {!r}'.format(n, e))

df.to_csv('data/mb.csv', index=False, encoding='utf-8')
finish_time = time.time()

0 10 pause_standard=0.2 20 30 pause_standard=0.3 40 pause_standard=0.4 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 310 320 330 340 350 360 370 380 390 400 410 420 430 440 450 460 470 480 490 500 510 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660 670 680 690 700 710 720 730 pause_standard=0.5 740 750 760 770 780 790 800 810 820 830 840 850 860 870 880 890 900 910 920 930 940 950 960 970 980 990 1000 1010 1020 1030 1040 1050 1060 1070 1080 1090 1100 1110 1120 1130 1140 1150 1160

In [None]:
# report how many artist ids were looped over and how long the run took, then display the results
print('processed {:,} artists in {:,} seconds'.format(len(artist_ids), round(finish_time-start_time, 2)))
df