https://wiki.musicbrainz.org/Development/XML_Web_Service/Version_2/Search

https://musicbrainz.org/doc/Artist

https://musicbrainz.org/doc/Area

https://musicbrainz.org/ws/2/area/0a70f24b-1263-4341-8d70-17b8df84154f?inc=area-rels

https://musicbrainz.org/ws/2/artist/d4659efb-b8eb-4f03-95e9-f69ce35967a9

In [1]:
import pandas as pd, requests, time, json

In [2]:
# Seconds slept before each outbound HTTP request.
pause = 0.2

# MusicBrainz web-service (v2) URL templates; the `{}` slot is filled with str.format.
artist_name_url = (
    'https://musicbrainz.org/ws/2/artist/'
    '?query=artist:{}&fmt=json'
)
artist_id_url = (
    'https://musicbrainz.org/ws/2/artist/'
    '{}?fmt=json'
)
area_id_url = (
    'https://musicbrainz.org/ws/2/area/'
    '{}?inc=area-rels&fmt=json'
)

In [3]:
def make_request(url):
    """Sleep for `pause` seconds, then GET `url` and return its decoded JSON body.

    The sleep spaces out successive calls so the remote server is not hammered.

    Raises requests.exceptions.Timeout if the server does not answer within
    30 seconds, instead of blocking indefinitely.
    """
    time.sleep(pause)
    # Without an explicit timeout requests waits forever on a stalled connection.
    # NOTE(review): MusicBrainz's API guidelines ask clients to send an identifying
    # User-Agent header -- consider adding one here.
    response = requests.get(url, timeout=30)
    return response.json()

In [4]:
def geocode(address):
    """Geocode `address` with the Google Maps geocoding web service.

    Returns a 'lat,lng' string built from the first result, or None when
    `address` is empty/None or the service returns no results.
    """
    if not address:
        # Nothing to look up: skip the request (None would otherwise be sent as the text 'None')
        return None

    time.sleep(pause)  # pause for some duration before each request, to not hammer their server
    # NOTE(review): Google's current Geocoding API appears to expect HTTPS plus an API key -- confirm
    url = 'http://maps.googleapis.com/maps/api/geocode/json'
    # Let requests build the query string so spaces, commas, '&' and non-ASCII
    # characters in the address are percent-encoded instead of corrupting the URL
    query = {'address': address, 'sensor': 'false'}
    response = requests.get(url, params=query, timeout=30)
    data = json.loads(response.text)  # convert the response json string into a dict

    results = data.get('results')
    if not results:
        # the service could not geolocate the address
        return None

    location = results[0]['geometry']['location']
    return '{},{}'.format(location['lat'], location['lng'])

In [5]:
def get_area(area_id, area_str=''):
    """Look up the area this one is 'part of' and extend the area label with its name.

    Fetches the area record for `area_id` and scans its relations for the first
    one whose direction is 'backward' and whose type is 'part of'.

    Returns a tuple (related_area_id, '<area_str>, <related_area_name>') for
    that relation, or (None, area_str) when there is no such relation.
    """
    payload = make_request(area_id_url.format(area_id))
    if 'relations' not in payload:
        return None, area_str

    for rel in payload['relations']:
        if rel['direction'] != 'backward' or rel['type'] != 'part of':
            continue
        related = rel['area']
        return related['id'], '{}, {}'.format(area_str, related['name'])

    return None, area_str

In [6]:
def get_artist_id_by_name(name):
    """Search MusicBrainz for `name` and return the id of the first artist listed.

    Raises IndexError (with the searched name in the message) when the search
    returns no artists.
    """
    from urllib.parse import quote  # local import: only this function needs it

    # Percent-encode the name so characters such as '&', '#' or '+' cannot
    # truncate or corrupt the query string
    response = make_request(artist_name_url.format(quote(name, safe='')))
    artists = response.get('artists') or []
    if not artists:
        raise IndexError('no MusicBrainz artist found for {!r}'.format(name))
    return artists[0]['id']

In [7]:
def get_artist_by_id(artist_id):
    """Fetch one artist record and flatten it into a dict.

    The returned dict has the keys id, name, type, gender, country,
    begin_date, end_date, area_id, area_name, begin_area_id and
    begin_area_name copied from the response, plus area_name_full,
    area_latlng, begin_area_name_full and begin_area_latlng initialised to
    None for the caller to fill in.

    Fields that are null or absent in the response come back as None rather
    than raising.
    """
    response = make_request(artist_id_url.format(artist_id))

    # These nested objects may be null or missing; fall back to an empty dict
    # so the lookups below yield None instead of raising TypeError/KeyError.
    life_span = response.get('life-span') or {}
    area = response.get('area') or {}
    begin_area = response.get('begin_area') or {}

    data = {'id': artist_id,
            'name': response['name'],
            'type': response.get('type'),
            'gender': response.get('gender'),
            'country': response.get('country'),
            'begin_date': life_span.get('begin'),
            'end_date': life_span.get('end'),
            'area_id': area.get('id'),
            'area_name': area.get('name'),
            'area_name_full': None,
            'area_latlng': None,
            'begin_area_id': begin_area.get('id'),
            'begin_area_name': begin_area.get('name'),
            'begin_area_name_full': None,
            'begin_area_latlng': None}
    return data

In [8]:
# Empty results table: artist basics, then begin-area columns, then area columns.
df = pd.DataFrame(
    columns=(
        ['name', 'type', 'gender', 'country', 'begin_date', 'end_date']
        + ['begin_area_name', 'begin_area_name_full', 'begin_area_latlng']
        + ['area_name', 'area_name_full', 'area_latlng']
    )
)

In [9]:
# Artists to look up, given as free-text search terms.
artist_names = [
    'modest mouse',
    'david byrne',
    'pink floyd',
    'tinariwen',
    'david bowie',
]

In [10]:
# For every artist: fetch the record, expand both of its areas into a full
# "area, containing area, ..." label, geocode each label, and append one row to df.
for name in artist_names:
    artist = get_artist_by_id(get_artist_id_by_name(name))

    # The begin area is processed first, then the area, each in the same way.
    for prefix in ('begin_area', 'area'):
        current_id = artist[prefix + '_id']
        label = artist[prefix + '_name']
        # get_area returns a None id once no further containing area is found
        while current_id is not None:
            current_id, label = get_area(current_id, label)
        artist[prefix + '_name_full'] = label
        artist[prefix + '_latlng'] = geocode(label)
    # TODO: if artist['country'] is None, save the area's country element as its value?

    df.loc[len(df)] = [
        artist[field]
        for field in (
            'name', 'type', 'gender', 'country',
            'begin_date', 'end_date',
            'begin_area_name', 'begin_area_name_full', 'begin_area_latlng',
            'area_name', 'area_name_full', 'area_latlng',
        )
    ]

In [11]:
# Write the table to disk without the index column, then preview the first rows.
df.to_csv(
    path_or_buf='data/mb.csv',
    encoding='utf-8',
    index=False,
)
df.head()

Unnamed: 0,name,type,gender,country,begin_date,end_date,begin_area_name,begin_area_name_full,begin_area_latlng,area_name,area_name_full,area_latlng
0,Modest Mouse,Group,,,1993,,Issaquah,"Issaquah, King County, Washington, United States","47.5301011,-122.0326191",Portland,"Portland, Multnomah County, Oregon, United States","45.5230622,-122.6764816"
1,David Byrne,Person,Male,US,1952-05-14,,Dumbarton,"Dumbarton, West Dunbartonshire, Scotland, Unit...","55.945287,-4.564554",United States,United States,"37.09024,-95.712891"
2,Pink Floyd,Group,,GB,1965,2014,London,"London, England, United Kingdom","51.5073509,-0.1277583",United Kingdom,United Kingdom,"55.378051,-3.435973"
3,Tinariwen,Group,,ML,1982,,Kidal,"Kidal, Mali","18.4520713,1.4096535",Mali,Mali,"17.570692,-3.996166"
4,David Bowie,Person,Male,GB,1947-01-08,2016-01-10,Brixton,"Brixton, Lambeth, London, England, United Kingdom","51.4612794,-0.1156148",United Kingdom,United Kingdom,"55.378051,-3.435973"
