In [1]:
import requests
import time
import pickle
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
import os
from tqdm import tqdm

In [2]:
# Root of the Discogs reference site; request paths are appended to it directly.
BASE_URL = 'https://reference.discogs.com/'
# Relative data folder. NOTE(review): not referenced by the functions visible here - confirm usage.
DATA_FOLDER = 'data/'
# NOTE(review): presumably where raw crawl results are kept; not used in the visible cells - confirm.
CRAWL_FOLDER = 'crawl/discogs/'
# NOTE(review): looks like a switch for a reduced test run; no visible cell reads it - confirm.
TESTING = False

In [3]:
# get genre of a style

def get_genre_of_style(style_url):
    """Look up the genre a Discogs style page lists under 'Related Genre/s'.

    Args:
        style_url: Path of the style page; it is appended to BASE_URL as-is.

    Returns:
        dict: ``{'genre_url': ..., 'genre_name': ...}`` read from the ``href``
        and ``title`` attributes of the first linked genre, or an empty dict
        when the page does not contain such a link.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not answer within the timeout.
    """
    query_url = BASE_URL + style_url
    # Bound the wait so one unresponsive page cannot stall a whole crawl.
    response = requests.get(query_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    try:
        genre_info = soup.find('td', text='Related Genre/s').find_next('td').li.a
        return {
            'genre_url': genre_info['href'],
            'genre_name': genre_info['title'],
        }
    except (AttributeError, TypeError, KeyError):
        # AttributeError: a step of the td -> td -> li -> a chain hit None.
        # TypeError: the <li> has no <a>, so genre_info itself is None.
        # KeyError: the link lacks an href or title attribute.
        return {}

In [6]:
# get all the styles available in discogs

def get_discogs_styles(num_pages=16):
    """Crawl the Discogs style listing and pair every style with its genre.

    Each listing page embeds its styles as JSON in the ``data-props``
    attribute of the element with id ``cards``. For every style found, the
    style's own page is fetched through ``get_genre_of_style``.

    Args:
        num_pages: Number of listing pages to read, starting at page 1.
            NOTE(review): the default of 16 is the page count this crawl was
            written against - confirm it still matches the live site.

    Returns:
        dict: Maps each style URL to a dict with the keys ``style_name``,
        ``genre_url`` and ``genre_name``. The two genre fields are empty
        strings when no genre could be found for the style.

    Raises:
        requests.exceptions.RequestException: if a request fails or does not
            answer within the timeout.
        TypeError: if a listing page has no element with id ``cards``.
    """
    discogs_styles_dict = dict()

    for page_nbr in range(1, num_pages + 1):
        query_url = BASE_URL + 'browse/style?page=' + str(page_nbr)
        # Bound the wait so one unresponsive page cannot stall the crawl.
        response = requests.get(query_url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')

        styles_data = soup.find(attrs={'id': 'cards'})['data-props']
        styles_data_dict = json.loads(styles_data)

        for style in styles_data_dict['data']['entities']:
            style_url = style['url']
            style_name = style['title']
            # One line per style: each one costs an extra HTTP request, so
            # this doubles as a progress indicator.
            print(style_url)

            genre_dict = get_genre_of_style(style_url)

            discogs_styles_dict[style_url] = {
                'style_name': style_name,
                # An empty genre_dict means no genre was found.
                'genre_url': genre_dict.get('genre_url', ''),
                'genre_name': genre_dict.get('genre_name', ''),
            }

    return discogs_styles_dict

In [7]:
# save discogs styles to csv

def save_discogs_styles(discogs_styles_dict):
    """Write the style-to-genre mapping to ``data/data_discogs_styles.csv``.

    Args:
        discogs_styles_dict: Mapping of style URL to a dict of column values,
            as returned by ``get_discogs_styles``. Each key becomes a row
            label and each inner key a column. Pass ``None`` to have the
            mapping crawled first via ``get_discogs_styles()``.

    Returns:
        None. The CSV file is written as a side effect (UTF-8 encoded).
    """
    # Use the caller's data; crawling again here would repeat every request.
    if discogs_styles_dict is None:
        discogs_styles_dict = get_discogs_styles()

    csv_path = 'data/data_discogs_styles.csv'
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)

    discogs_styles_df = pd.DataFrame.from_dict(discogs_styles_dict, orient='index')
    discogs_styles_df.to_csv(csv_path, encoding='utf-8')