In [44]:
library(tidyverse)
library(magrittr)
library(glue)
library(rvest)
library(polite)
library(xml2)
library(httr)

### Part 1: scrape genres and styles

In [39]:
# Scrape genres and styles from the advanced-search page; each style is a
# sub-category of a genre.
base_url <- 'https://www.allmusic.com/advanced-search'

# Download and parse the search page.
doc <- read_html(base_url)

# Select the filter container that carries both the `filter` and `genres`
# classes.
genre_filter_node <- doc %>% html_nodes('.filter.genres')

# Fixed: the two lookups below used to pipe from `genre_filter`, a name that is
# never assigned in this script; the container lives in `genre_filter_node`.
genre_nodes <- genre_filter_node %>% html_nodes('.genre label')

style_nodes <- genre_filter_node %>% html_nodes('.style')

In [42]:
#' Parse genre label node(s) into a tibble.
#'
#' @param node A node or node set of genre `<label>` elements, as returned by
#'   `html_nodes()`.
#' @return A tibble with columns `id` (the label's `for` attribute) and
#'   `genre` (the label text with surrounding whitespace trimmed).
parse_genre_node <- function(node) {
    tibble(
        # html_attr() reads one attribute directly and accepts a single node
        # as well as a node set; a missing `for` yields NA instead of an error.
        id = node %>% html_attr("for"),
        genre = node %>% html_text() %>% trimws()
    )
}

#' Parse style node(s) into a tibble.
#'
#' @param node A node or node set of `.style` elements, as returned by
#'   `html_nodes()`. The `<label>` descendants supply the style id and name;
#'   the element's own `data-parent` attribute supplies the genre id.
#' @return A tibble with columns `id` (the label's `for` attribute), `style`
#'   (the label text with surrounding whitespace trimmed) and `genre_id`
#'   (the `data-parent` attribute of `node`).
parse_style_node <- function(node) {
    # Look the labels up once and reuse them for both columns.
    labels <- node %>% html_nodes('label')

    tibble(
        # html_attr() works on a single node or a node set and returns NA for
        # a missing attribute instead of raising an error.
        id = labels %>% html_attr("for"),
        style = labels %>% html_text() %>% trimws(),
        genre_id = node %>% html_attr("data-parent")
    )
}

In [43]:
# Build the genre data frame. The typed empty tibble fixes the column
# structure (id, genre) even when no genre nodes were found. Each node is
# parsed on its own (single-bracket indexing keeps a length-one node set) and
# the rows are bound once, instead of growing the frame with rbind() in a loop.
genre_df <- bind_rows(
    tibble(id = character(), genre = character()),
    map(seq_along(genre_nodes), function(i) parse_genre_node(genre_nodes[i]))
)

# Build the style data frame the same way; columns are id, style, genre_id.
style_df <- bind_rows(
    tibble(id = character(), style = character(), genre_id = character()),
    map(seq_along(style_nodes), function(i) parse_style_node(style_nodes[i]))
)

# Persist both tables as CSV files in the working directory.
write.csv(genre_df, './genre.csv', row.names = FALSE)
write.csv(style_df, './style.csv', row.names = FALSE)

### Part 2: scrape albums