In [None]:
# Load the Discogs API token that the api.discogs.com requests below append
# to their URLs. 'discogs_token.txt' is the name the message below asks for;
# the previously used 'discord_token.txt' spelling is still accepted so an
# existing setup keeps working.
token = None  # explicit default so a stale kernel value is never reused
for token_path in ('discogs_token.txt', 'discord_token.txt'):
    try:
        with open(token_path, 'r') as f:
            # Drop surrounding whitespace (e.g. a trailing newline), which
            # would otherwise be interpolated into the request URLs.
            token = f.read().strip()
        break
    except FileNotFoundError:
        continue
else:
    # Neither candidate file exists.
    print("No discogs_token.txt file found. Please create one with your discogs API token in it.")

# Data collection **Italian Progressive Rock Ontology**

In [None]:
import json
import os
import re
import time
from collections import defaultdict
from pprint import pp

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

## Download artists from Wikidata

Only Italian progressive rock bands formed from 1966 to 1975.

In [None]:
def sparql_query(query, url='https://query.wikidata.org/sparql'):
    """Run a SPARQL query against an endpoint and return the decoded JSON.

    Args:
        query: SPARQL query text.
        url: Endpoint URL; defaults to the Wikidata Query Service.

    Returns:
        The parsed JSON body of the response (``r.json()``).

    Raises:
        Exception: If the endpoint answers with a non-200 status code.
    """
    r = requests.get(url, params={'format': 'json', 'query': query})
    # Fail loudly here, in the same way the Discogs/Prog Archives helpers do,
    # instead of letting a non-JSON error page blow up inside r.json().
    if r.status_code != 200:
        raise Exception(f'Error: {r.status_code}, {r.content}, {r.url}, {r.headers}')
    return r.json()

In [None]:
# SPARQL text sent to the Wikidata Query Service through sparql_query().
# Per the section heading it selects Italian progressive rock bands; the
# FILTER keeps ?inizio values from 1966-01-01 up to, but not including,
# 1975-01-01, and LIMIT caps the result at 100 rows.
# NOTE(review): the heading says "from 1966 to 1975", yet the `<` bound
# excludes bands whose ?inizio falls inside 1975 - confirm which is intended.
# ?discogsId and ?progarchivesId are OPTIONAL, so a result row may lack
# either key; later cells index both keys directly.
get_artists = '''
SELECT ?item ?itemLabel ?inizio ?discogsId ?progarchivesId WHERE {
  ?item wdt:P31 wd:Q215380.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?item wdt:P136 wd:Q49451;
    wdt:P495 wd:Q38;
    wdt:P2031 ?inizio.
  FILTER((?inizio >= "1966-01-01T00:00:00Z"^^xsd:dateTime) && (?inizio < "1975-01-01T00:00:00Z"^^xsd:dateTime))
  OPTIONAL { ?item wdt:P1953 ?discogsId. }
  OPTIONAL { ?item wdt:P7670 ?progarchivesId. }
}
ORDER BY (?itemLabel)
LIMIT 100
'''

result = sparql_query(get_artists)
# One entry per result row; later cells read fields as row[<variable>]['value'].
artist_list = result['results']['bindings']

# Persist the rows so the following cells can reload them from disk.
os.makedirs('wikidata', exist_ok=True)
with open('wikidata/artists.json', 'w+') as f:
    json.dump(artist_list, f)

## Download releases from Discogs

Only master releases on which the artist has the *Main* role, published from 1966 to 1975 (inclusive).

In [None]:
def discogs_query_artist_releases(artist_id, year_from=1966, year_to=1975):
    """Collect an artist's main master releases from the Discogs API.

    Requests every page of ``/artists/{artist_id}/releases`` (authenticated
    with the module-level ``token``) and keeps the entries whose ``role`` is
    ``'Main'``, that have a ``year`` within ``[year_from, year_to]``
    (inclusive) and whose ``type`` is ``'master'``.

    Args:
        artist_id: Discogs artist identifier.
        year_from: First release year to keep (default 1966).
        year_to: Last release year to keep (default 1975).

    Returns:
        list: The matching release dicts, as found in the API response.

    Raises:
        Exception: If any page request returns a non-200 status code.
    """
    releases = []
    n_page = 1
    num_pages = 1  # placeholder; replaced by the page count of each response
    while n_page <= num_pages:
        r = requests.get(f'https://api.discogs.com/artists/{artist_id}/releases?page={n_page}&token={token}')
        if r.status_code != 200:
            # NOTE: r.url embeds the API token, so this message contains it.
            raise Exception(f'Error: {r.status_code}, {r.content}, {r.url}, {r.headers}')
        # Deliberately not called `json`, which would shadow the json module.
        payload = r.json()
        releases.extend(
            release for release in payload['releases']
            if release['role'] == 'Main'
            and 'year' in release
            and year_from <= release['year'] <= year_to
            and release['type'] == 'master'
        )
        num_pages = payload['pagination']['pages']
        n_page += 1
    return releases

In [None]:
# Reload the artist rows saved by the Wikidata step; the file is closed
# before the slow network loop starts.
with open('wikidata/artists.json', 'r') as f:
    artists = json.load(f)

releases = []

# ?discogsId is OPTIONAL in the Wikidata query, so a row may not carry it.
# Such artists cannot be looked up on Discogs and are skipped instead of
# aborting the whole download with a KeyError.
for artist in tqdm([a for a in artists if 'discogsId' in a]):
    discogs_id = artist['discogsId']['value']
    releases.append({
        'discogsID': discogs_id,
        'releases': discogs_query_artist_releases(discogs_id)
        })
    time.sleep(1)  # pause between artists (presumably for API rate limiting)

os.makedirs('discogs', exist_ok=True)
with open('discogs/releases.json', 'w+') as out:
    json.dump(releases, out)

100%|██████████| 34/34 [01:50<00:00,  3.26s/it]


## Download master releases from Discogs

Download the complete information for each main release.

In [None]:
def discogs_query_resource_url(obj):
    """Fetch the resource that ``obj['resource_url']`` points to.

    Args:
        obj: Mapping with a ``'resource_url'`` entry (e.g. a release or
            member dict taken from an earlier Discogs response).

    Returns:
        The parsed JSON body of the response.

    Raises:
        Exception: If the request returns a non-200 status code; the message
            includes ``obj`` to help locate the failing entry.
    """
    # Hand the token to `params` so it is merged correctly even if
    # resource_url already has a query string; appending '?token=...' by hand
    # would then produce a second '?' and a malformed URL.
    r = requests.get(obj['resource_url'], params={'token': token})
    if r.status_code != 200:
        raise Exception(f'Error: {r.status_code}, {r.content}, {r.url}, {r.headers}\nObject: {obj}')
    return r.json()
    

In [None]:
release_details_list = []

# Reload the per-artist release lists written by the previous step.
with open('discogs/releases.json', 'r') as f:
    releases = json.load(f)

# Fetch the full resource behind every listed release, pausing one second
# after each request.
for artist_releases in tqdm(releases):
    for release in artist_releases['releases']:
        release_id = release['id']
        master = discogs_query_resource_url(release)
        release_details_list.append({'releaseID': release_id, 'master': master})
        time.sleep(1)

with open('discogs/release_details.json', 'w+') as out:
    json.dump(release_details_list, out)

100%|██████████| 35/35 [06:19<00:00, 10.86s/it]


## Download artist information from Discogs

In [None]:
def discogs_query_artist(artist_id):
    """Return the Discogs API record for one artist.

    Args:
        artist_id: Discogs artist identifier.

    Returns:
        The parsed JSON body of ``/artists/{artist_id}``.

    Raises:
        Exception: If the request returns a non-200 status code.
    """
    response = requests.get(f'https://api.discogs.com/artists/{artist_id}?token={token}')
    if response.status_code == 200:
        return response.json()
    raise Exception(f'Error: {response.status_code}, {response.content}, {response.url}, {response.headers}')

In [None]:
# Read the artists from the file that is opened here rather than from
# whatever `artist_list` currently holds in the kernel: another cell rebinds
# that name to data with a different layout.
with open('wikidata/artists.json', 'r') as f:
    artists = json.load(f)

artist_list_discogs = []

# ?discogsId is OPTIONAL in the Wikidata query; rows without it cannot be
# looked up on Discogs and are skipped.
for artist in tqdm([a for a in artists if 'discogsId' in a]):
    discogs_id = artist['discogsId']['value']
    artist_list_discogs.append({
        'discogsID': discogs_id,
        'artist': discogs_query_artist(discogs_id)
        })
    time.sleep(1)  # same one-second pause the other Discogs loops use

# Make the cell runnable on its own, even if 'discogs/' was not created yet.
os.makedirs('discogs', exist_ok=True)
with open('discogs/artists.json', 'w+') as out:
    json.dump(artist_list_discogs, out)

100%|██████████| 35/35 [00:18<00:00,  1.91it/s]


## Download members information

In [None]:
artist_details_discogs = []

# Loaded under its own name so the Wikidata rows held in `artist_list` are
# not replaced by Discogs records, which have a different layout.
with open('discogs/artists.json', 'r') as f:
    discogs_artists = json.load(f)

for artist in tqdm(discogs_artists):
    # Treat a record without a 'members' entry as having no members instead
    # of aborting the whole run with a KeyError.
    for member in artist['artist'].get('members', []):
        try:
            m = discogs_query_resource_url(member)
        except Exception:
            # Best effort: keep the member id even when its details cannot
            # be fetched.
            m = None

        artist_details_discogs.append({
            'discogsID': member['id'],
            # NOTE(review): key is spelled 'mermber' (sic). Left unchanged
            # because readers of discogs/members.json may depend on it.
            'mermber': m
            })
        time.sleep(1)

with open('discogs/members.json', 'w+') as out:
    json.dump(artist_details_discogs, out)

100%|██████████| 35/35 [04:41<00:00,  8.04s/it]


## Download start time of band members from Wikidata

In [None]:
band_members = dict()

with open('wikidata/artists.json', 'r') as f:
    artist_list = json.load(f)

# The query text is the same for every band except for the entity id, which
# is spliced in between these two fragments.
query_head = """
                SELECT ?member ?memberLabel (YEAR(?start) AS ?year) ?discogsId WHERE {
                SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                wd:"""
query_tail = """ p:P527 ?statement. # musical_group
                ?statement ps:P527 ?member; # has_part ?member
                    pq:P580 ?start. # start_time ?start
                FILTER((?start >= "1966-01-01T00:00:00Z"^^xsd:dateTime) && (?start < "1975-01-01T00:00:00Z"^^xsd:dateTime))
                OPTIONAL { ?member wdt:P1953 ?discogsId. }
                }
                LIMIT 1000
                """

# One query per band, keyed by the last path segment of the item URI.
for artist in tqdm(artist_list):
    wikidata_id = artist['item']['value'].rsplit('/', 1)[-1]
    query = query_head + wikidata_id + query_tail

    result = sparql_query(query)
    band_members[wikidata_id] = result['results']['bindings']
    time.sleep(1)

with open('wikidata/members.json', 'w+') as out:
    json.dump(band_members, out)
        

100%|██████████| 35/35 [00:47<00:00,  1.35s/it]


## Scrape genres information from Discogs

In [None]:
def discogs_query_genre_and_style(genre_name):
    """Scrape the description text of a Discogs genre or style page.

    The genre URL is tried first and the style URL second. The text is taken
    from the parent of the first link whose href contains
    'http://reference.discogs.com/': everything before 'Contribute', minus
    its first line and minus any line carrying the Wikipedia attribution.

    Args:
        genre_name: Name as it appears in the Discogs URL path.

    Returns:
        The description as a single space-joined string, or None when neither
        page answers with status 200 or no such link is found.
    """
    page = requests.get(f'https://www.discogs.com/genre/{genre_name}')
    if page.status_code != 200:
        page = requests.get(f'https://www.discogs.com/style/{genre_name}')
        if page.status_code != 200:
            return None

    attribution = 'Provided by Wikipedia under Creative Commons Attribution CC-BY-SA 4.0'
    soup = BeautifulSoup(page.content, 'html.parser')
    for link in soup.find_all('a', href=True):
        if 'http://reference.discogs.com/' not in link['href']:
            continue
        before_contribute = link.parent.text.split('Contribute')[0]
        lines = before_contribute.strip().split('\n')[1:]
        return ' '.join(line for line in lines if attribution not in line)
    return None

In [None]:
genres = set()

with open('discogs/release_details.json', 'r') as f:
    release_details = json.load(f)

# Gather every genre and (when present) style name, normalised to the
# lower-case, '+'-separated form that is passed to the scraper below.
for release in release_details:
    master = release['master']
    labels = list(master['genres'])
    if 'styles' in master:
        labels += master['styles']
    genres.update(label.lower().replace(' ', '+') for label in labels)

# Scrape one description per name, pausing one second after each request.
genre_dict = {}
for genre in tqdm(genres):
    genre_dict[genre] = discogs_query_genre_and_style(genre)
    time.sleep(1)

with open('discogs/genres.json', 'w+') as out:
    json.dump(genre_dict, out)

100%|██████████| 47/47 [01:46<00:00,  2.26s/it]


## Scrape album line-ups (musicians and their roles) from Prog Archives

In [None]:
def progarchives_album_id(artist_id, album_name):
    """Look up the Prog Archives album id of an album on an artist page.

    Scans the links of ``artist.asp?id={artist_id}`` and returns the id of
    the first 'album.asp' link (without a '#') whose text contains
    ``album_name``.

    Args:
        artist_id: Prog Archives artist identifier.
        album_name: Title searched for inside the link text (substring match).

    Returns:
        The text after the last '=' of the matching href, or None when no
        link matches.

    Raises:
        Exception: If the artist page returns a non-200 status code.
    """
    response = requests.get(f'http://www.progarchives.com/artist.asp?id={artist_id}')
    if response.status_code != 200:
        raise Exception(f'Error: {response.status_code}, {response.content}, {response.url}, {response.headers}')

    soup = BeautifulSoup(response.content, 'html.parser')
    for link in soup.find_all('a', href=True):
        href = link['href']
        is_album_link = 'album.asp' in href and '#' not in href
        if is_album_link and album_name in link.text:
            return href.rsplit('=', 1)[-1]
    return None

def _parse_lineup(raw):
    """Turn the text of a line-up block into ``{musician: [roles]}``."""
    lineup = {}
    # Only the first paragraph (text up to the first blank line) is read.
    for line in raw.split('\n\n')[0].strip().split('\n'):
        line = line.strip()
        if not line or line[0] != '-':
            continue
        entry = line[1:].strip()
        # 'Name / role, role': the name is the part before the first '/',
        # the roles are the segment between the first and second '/'.
        # An entry without '/' gets an empty role list instead of raising.
        name, _, rest = entry.partition('/')
        roles = [role.strip() for role in rest.split('/')[0].split(',') if role.strip()]
        lineup[name.strip()] = roles
    return lineup


def progarchives_query_members(album_id):
    """Scrape the line-up of an album from its Prog Archives page.

    Takes the element following the first
    ``<strong>Line-up / Musicians</strong>`` heading, converts ``<br/>`` to
    newlines, strips markup and parses the '- Name / role, role' entries.

    Args:
        album_id: Prog Archives album identifier.

    Returns:
        dict: Musician name -> list of role strings, in page order. Empty
        when the page has no line-up heading or no '-' entries.

    Raises:
        Exception: If the page returns a non-200 status code, or if parsing
            fails (the original error is attached as ``__cause__``).
    """
    r = requests.get(f'http://www.progarchives.com/album.asp?id={album_id}')
    if r.status_code != 200:
        raise Exception(f'Error: {r.status_code}, {r.content}, {r.url}, {r.headers}')

    soup = BeautifulSoup(r.content,'html.parser')
    raw = ''
    for p in soup.find_all('strong', text='Line-up / Musicians'):
        # '<.*>' is greedy but '.' does not cross newlines, so on each line
        # everything from the first '<' to the last '>' is removed.
        raw = re.sub('<.*>','', str(p.find_next_sibling()).replace('<br/>', '\n'))
        break

    try:
        return _parse_lineup(raw)
    except Exception as err:
        # Keep the original failure as the cause for debugging.
        raise Exception(f'Error: {r.url}, {raw}') from err
    

In [None]:
with open('wikidata/artists.json', 'r') as f:
    artists = json.load(f)

# Discogs id -> Prog Archives id. Both ids are OPTIONAL in the Wikidata
# query, so only rows carrying both can be mapped.
progarchives_id = {
    a['discogsId']['value']: a['progarchivesId']['value']
    for a in artists
    if 'discogsId' in a and 'progarchivesId' in a
}

# Prog Archives artist id -> {Prog Archives album id -> line-up}.
artist_release_member = defaultdict(dict)
with open('discogs/releases.json', 'r') as f:
    releases_json = json.load(f)

for artist in tqdm(releases_json):
    # The id is the same for every release of the artist, so look it up once.
    artist_pa_id = progarchives_id.get(artist['discogsID'])
    if artist_pa_id is None:
        continue  # no Prog Archives id known for this artist
    for release in artist['releases']:
        release_pa_id = progarchives_album_id(artist_pa_id, release['title'])
        if release_pa_id is not None:
            artist_release_member[artist_pa_id][release_pa_id] = progarchives_query_members(release_pa_id)

os.makedirs('progarchives', exist_ok=True)
with open('progarchives/members.json', 'w+') as out:
    json.dump(artist_release_member, out)

100%|██████████| 34/34 [08:36<00:00, 15.18s/it]


## List instruments

In [None]:
def filter_instruments(instruments, *keywords):
    """Select the instruments whose name contains at least one keyword.

    Matching is a case-insensitive substring test.

    Args:
        instruments: Iterable of instrument name strings.
        *keywords: Substrings to look for.

    Returns:
        set: The instrument names (original spelling) that matched.
    """
    matched = set()
    for name in instruments:
        for keyword in keywords:
            if keyword.lower() in name.lower():
                matched.add(name)
                break  # one hit is enough for this instrument
    return matched

In [None]:
with open('progarchives/members.json', 'r') as f:
    members = json.load(f)

# Flatten artist -> album -> musician -> roles into the set of distinct
# role/instrument strings.
instruments = set()
for albums in members.values():
    for lineup in albums.values():
        for roles in lineup.values():
            instruments.update(roles)

instruments_all = instruments.copy()
print(len(instruments), 'instruments')

category = dict()
instr_in_category = dict()

categories_keywords = {
    'vocals': ['voice', 'vocal'],
    'synths': ['synth'],
    'guitar': ['guitar', 'gtr'],
    'bass': ['bass'],
    'drums': ['drum'],
    'percussion': ['percussion', 'timpani', 'triangle', 'whip', 'timbales', 'congas', 'cymbal', 'tabla', 'bell', 'vibraphone', 'gong', 'vibes', 'bongos', 'celesta', 'tambourine', 'sistro'],
    'keyboards': ['keys', 'keyboard', 'chord', 'mellotron', 'clavinet', 'spinet'],
    'piano': ['piano'],
    'organ': ['organ', 'hammond', 'harmonium'],
    'wind': ['flute', 'piccolo', 'clarinet', 'oboe', 'bassoon', 'clarino', 'melodica'],
    'brass': ['brass', 'trombone', 'trumpet', 'tuba', 'horn', 'cornet', 'harmonica', 'sax'],
    'strings_drawn': ['violin', 'viola', 'cello', 'sarangi'],
    'strings_plucked': ['balalaika', 'mandolin', 'sitar', 'theorbo', 'mandola', 'zither'],
    'production_roles': ['producer', 'director', 'orchestra', 'composer', 'arrangement', 'recorder'],
    'effects': ['effect', 'fx', 'audio', 'freq']
}

# Every category is matched against the full instrument set, so a name can
# appear in several categories of instr_in_category; in `category` the
# category processed last wins. `instruments` ends up holding the names no
# category matched.
for cat, keywords in categories_keywords.items():
    matched = filter_instruments(instruments_all, *keywords)
    instr_in_category[cat] = matched
    instruments -= matched
    category.update(dict.fromkeys(matched, cat))

print(len(instruments), 'not filtered')
pp(instr_in_category)

with open('progarchives/instruments.json', 'w+') as out:
    json.dump(category, out)

109 instruments
0 not filtered
{'vocals': {'backing vocals', 'vocals'},
 'synths': {'ARP 2600 synthesizer',
            'ARP Solina (synth)',
            'ARP synthesizer',
            'EMS synthesizer',
            'Eminent synthesizer',
            'MiniMoog synthesizer',
            'Moog synthesizer',
            'VCS synthesizer',
            'synthesizer'},
 'guitar': {'12-string acoustic guitar',
            '9-string guitar',
            'acoustic guitar',
            'classical guitar',
            'electric guitar',
            'guitar',
            'steel guitar'},
 'bass': {'acoustic bass',
          'bass',
          'bass clarinet',
          'bowed contrabass',
          'double bass',
          'electric bass'},
 'drums': {'drums', 'steel drums'},
 'percussion': {'bells',
                'bongos',
                'celesta',
                'congas',
                'cymbals',
                'gong',
                'percussions',
                'sistro',
              

## Download band biographies from Prog Archives

In [None]:
def progarchives_query_biography(artist_id):
    """Scrape the biography text of an artist from Prog Archives.

    Uses the first ``<span id="moreBio">`` when the page has one; otherwise
    the parent of the last ``<strong>`` whose text contains 'biography'.
    Markup is stripped, blank lines and repeated spaces are collapsed, and
    the text is cut at the first line containing 'see also'.

    Args:
        artist_id: Prog Archives artist identifier.

    Returns:
        str: The biography, one non-empty stripped line per row.

    Raises:
        Exception: If the page returns a non-200 status code, or if no
            biography element is found.
    """
    r = requests.get(f'http://www.progarchives.com/artist.asp?id={artist_id}')
    if r.status_code != 200:
        raise Exception(f'Error: {r.status_code}, {r.content}, {r.url}, {r.headers}')

    soup = BeautifulSoup(r.content,'html.parser')
    # find() yields None when there is no such span, so `element` is always
    # bound before the check below. Previously a page without any biography
    # node hit an UnboundLocalError instead of the intended error.
    element = soup.find('span', id='moreBio')
    if element is None:
        for p in soup.find_all('strong'):
            if 'biography' in p.text.lower():
                element = p.parent  # no break: the last match is kept

    if element is None:
        raise Exception(f'Biography not found url {r.url}')

    raw = str(element).replace('<br/>', '\n')
    # Greedy, but '.' does not cross newlines: per line, everything from the
    # first '<' to the last '>' is dropped.
    raw = re.sub('<.*>','', raw)
    raw = re.sub(r'\n{2,}', r'\n', raw)
    raw = re.sub(r' {2,}', r' ', raw)
    lines = [l.strip() for l in raw.split('\n') if l.strip()]
    # Cut at the first 'see also' line; keep everything when there is none.
    end = next((i for i, l in enumerate(lines) if 'see also' in l.lower()), len(lines))
    return '\n'.join(lines[:end])

In [None]:
with open('wikidata/artists.json', 'r') as f:
    artists = json.load(f)

# Prog Archives artist id -> biography text.
progarchives_biography = dict()

# ?progarchivesId is OPTIONAL in the Wikidata query; rows without it have no
# page to scrape and are skipped instead of raising a KeyError.
for a in tqdm([artist for artist in artists if 'progarchivesId' in artist]):
    pa_id = a['progarchivesId']['value']
    progarchives_biography[pa_id] = progarchives_query_biography(pa_id)

# Make the cell runnable on its own, even if 'progarchives/' does not exist.
os.makedirs('progarchives', exist_ok=True)
with open('progarchives/biographies.json', 'w+') as out:
    json.dump(progarchives_biography, out)