In [3]:
library(tidyverse)
library(magrittr)
library(glue)
library(rvest)
library(polite)
library(xml2)
library(httr)

In [4]:
# Root of the site being scraped; paths such as "/advanced-search" and the
# scraped album paths are appended to it to form full URLs.
base_url <- "https://www.allmusic.com"

In [5]:
#' Read one attribute from each node of a node set.
#'
#' @param node An rvest node set.
#' @param attr Name of the attribute to extract, e.g. "href".
#' @return A character vector holding the value of `attr` for each node.
get_attr_value <- function(node, attr) {
  # html_attrs() yields one named character vector per node; map_chr() then
  # picks the entry called `attr` out of each of them.
  attrs_per_node <- html_attrs(node)
  map_chr(attrs_per_node, attr)
}

#' Read the `href` attribute of the given node(s).
#'
#' @param node An rvest node set.
#' @return Whatever `get_attr_value()` returns for the "href" attribute.
get_href <- function(node) {
  get_attr_value(node, "href")
}

#' Extract the text of the given node(s) without surrounding whitespace.
#'
#' @param node An rvest node or node set.
#' @return The result of `html_text()` with leading and trailing whitespace
#'   removed by `trimws()`.
get_text <- function(node) {
  raw_text <- html_text(node)
  trimws(raw_text)
}

### Part 1: scrape genres and styles

In [6]:
# Scrape genres and styles; each style is a sub-category of a genre.
search_path <- "/advanced-search"
doc <- read_html(glue(base_url, search_path))

# Select the element(s) that carry both the `filter` and `genres` classes.
genre_filter_node <- html_nodes(doc, ".filter.genres")

# Genre labels and style entries both live inside that filter element.
genre_nodes <- html_nodes(genre_filter_node, ".genre label")
style_nodes <- html_nodes(genre_filter_node, ".style")

In [8]:
#' Turn genre label node(s) into a genre table.
#'
#' @param node Node set of genre `label` elements.
#' @return A tibble with columns `id` (the label's `for` attribute) and
#'   `genre` (the trimmed label text).
parse_genre_node <- function(node) {
  tibble(
    id = map_chr(html_attrs(node), "for"),
    genre = trimws(html_text(node))
  )
}

#' Turn a style node into a style table linked to its parent genre.
#'
#' @param node Node set holding a `.style` element.
#' @return A tibble with columns `id` (the `for` attribute of each `label`
#'   inside the node), `style` (the trimmed label text) and `genre_id` (the
#'   node's `data-parent` attribute).
parse_style_node <- function(node) {
  # Both `id` and `style` are read from the same label elements.
  label_nodes <- html_nodes(node, "label")

  tibble(
    id = map_chr(html_attrs(label_nodes), "for"),
    style = trimws(html_text(label_nodes)),
    genre_id = map_chr(html_attrs(node), "data-parent")
  )
}

In [9]:
# Typed, empty templates define the column layout of each table, so the
# result keeps its columns even when no node was found on the page.
genre_template <- tibble(id = character(), genre = character())
style_template <- tibble(
  id = character(),
  style = character(),
  genre_id = character()
)

# Parse every node into its own small tibble and stack them in a single
# bind_rows() call instead of re-copying the table with rbind() on every
# iteration. Nodes are passed as length-one node sets (`[i]`), which is what
# the parse_* helpers expect.
genre_df <- bind_rows(c(
  list(genre_template),
  map(seq_along(genre_nodes), function(i) parse_genre_node(genre_nodes[i]))
))

style_df <- bind_rows(c(
  list(style_template),
  map(seq_along(style_nodes), function(i) parse_style_node(style_nodes[i]))
))

write.csv(genre_df, "./genre.csv", row.names = FALSE)
write.csv(style_df, "./style.csv", row.names = FALSE)

### Part 2: scrape albums

In [13]:
#' Parse the album-level fields of an album page.
#'
#' @param doc Parsed album page, as returned by `read_html()`.
#' @return A one-row tibble. Multi-valued fields (recording locations,
#'   genres, styles, moods, themes) are collapsed into a single
#'   ";"-separated string, which is "" when nothing matched.
parse_album <- function(doc) {
  # Collapse a vector of values into one ";"-separated string.
  join_values <- function(values) paste(values, collapse = ";")

  title <- doc %>% html_node(".album-title") %>% get_text()

  # NOTE(review): the ".album-artist a" links used to be scraped here but
  # were never added to the returned tibble, so that dead code was removed.
  # Add album-artist columns below if they are actually wanted.

  # get the ratings
  all_music_rating <- doc %>%
    html_node(".ratings .allmusic-rating") %>%
    get_text()

  # TODO(dexter): should we add review columns?
  album_info_node <- doc %>% html_node(".sidebar .basic-info")
  release_date <- album_info_node %>%
    html_node(".release-date span") %>%
    get_text()
  duration <- album_info_node %>% html_node(".duration span") %>% get_text()
  recording_date <- album_info_node %>%
    html_node(".recording-date div") %>%
    get_text()

  # get_text() and get_href() work on a whole node set at once, so the
  # multi-valued fields below need no per-node loop.

  # there might be multiple locations, since it is a list in the html doc
  recording_locations <- album_info_node %>%
    html_nodes(".recording-location li") %>%
    get_text()

  genre_nodes <- album_info_node %>% html_nodes(".genre a")
  style_nodes <- album_info_node %>% html_nodes(".styles a")
  mood_nodes <- doc %>% html_nodes(".sidebar .moods .mood a")
  theme_nodes <- doc %>% html_nodes(".sidebar .themes .theme a")

  tibble(
    album = title,
    duration = duration,
    release_date = release_date,
    all_music_rating = all_music_rating,
    recording_date = recording_date,
    recording_locations = join_values(recording_locations),
    genre_names = join_values(get_text(genre_nodes)),
    genre_urls = join_values(get_href(genre_nodes)),
    style_names = join_values(get_text(style_nodes)),
    style_urls = join_values(get_href(style_nodes)),
    mood_names = join_values(get_text(mood_nodes)),
    mood_urls = join_values(get_href(mood_nodes)),
    theme_names = join_values(get_text(theme_nodes)),
    theme_urls = join_values(get_href(theme_nodes))
  )
}

In [14]:
#' Parse one row of an album's track listing.
#'
#' @param node Length-one node set holding a `.track` element.
#' @return A one-row tibble with the track number, title, duration, song
#'   URL, the first composer's name and URL (`NA` when the track lists no
#'   composer) and the ";"-separated primary performers.
parse_track <- function(node) {
  num <- node %>% html_node(".tracknum") %>% get_text()

  title_node <- node %>% html_node(".title-composer .title a")
  title <- title_node %>% get_text()
  url <- title_node %>% get_href()

  # artist related
  # TODO(dexter): 1:N ? 1:1?
  # A track may have no composer link. Test for that explicitly instead of
  # swallowing every error, and default to character NA so both columns keep
  # a consistent type across tracks.
  composer_name <- NA_character_
  composer_url <- NA_character_
  composer_nodes <- node %>% html_nodes(".title-composer .composer a")
  if (length(composer_nodes) > 0) {
    # Only the first composer is kept, as before.
    first_composer <- composer_nodes[1]
    composer_name <- first_composer %>% get_text()
    # html_attr() yields NA rather than an error if the link has no href.
    composer_url <- first_composer %>% html_attr("href")
  }

  # get_href() and get_text() handle the whole node set in one call.
  performer_nodes <- node %>% html_nodes(".performer .primary a")
  performer_urls <- get_href(performer_nodes)
  performer_names <- get_text(performer_nodes)

  duration <- node %>% html_node(".time") %>% get_text()

  tibble(
    num = num,
    title = title,
    duration = duration,
    url = url,
    composer_url = composer_url,
    composer_name = composer_name,
    performer_urls = paste(performer_urls, collapse = ";"),
    performer_names = paste(performer_names, collapse = ";")
  )
}

In [15]:
#' Download one album page and parse the album and its tracks.
#'
#' @param url Full URL of the album page.
#' @return A list with two elements: `album`, the tibble from
#'   `parse_album()` with a `url` column added, and `tracks`, the stacked
#'   `parse_track()` rows (an empty tibble when the page lists no tracks).
scrape_album <- function(url) {
  doc <- read_html(url)

  # add url to album
  album <- parse_album(doc) %>% mutate(url = url)

  # Parse each track as a length-one node set and stack the rows once,
  # rather than growing the tibble inside a loop.
  track_nodes <- doc %>% html_nodes(".track-listing .track")
  tracks <- seq_along(track_nodes) %>%
    map(function(i) parse_track(track_nodes[i])) %>%
    bind_rows()

  list(
    album = album,
    tracks = tracks
  )
}


album,duration,release_date,all_music_rating,recording_date,recording_locations,genre_names,genre_urls,style_names,style_urls,mood_names,mood_urls,theme_names,theme_urls,url
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Live: Cookin' with Blue Note at Montreux,45:22,"December 9, 2022",10,,,R&B;Jazz,https://www.allmusic.com/genre/r-b-ma0000002809;https://www.allmusic.com/genre/jazz-ma0000002674,Crossover Jazz;Funk;Jazz-Funk;Jazz Instrument;Trumpet Jazz,https://www.allmusic.com/style/crossover-jazz-ma0000012142;https://www.allmusic.com/style/funk-ma0000002606;https://www.allmusic.com/style/jazz-funk-ma0000011905;https://www.allmusic.com/style/jazz-instrument-ma0000002947;https://www.allmusic.com/style/trumpet-jazz-ma0000002485,Confident;Hedonistic;Sophisticated;Amiable/Good-Natured;Celebratory;Street-Smart;Stylish;Brassy;Driving;Earthy;Fierce;Freewheeling;Gritty;Improvisatory;Kinetic;Powerful;Rollicking;Rousing;Swaggering;Tight;Uplifting;Visceral,https://www.allmusic.com/mood/confident-xa0000000748;https://www.allmusic.com/mood/hedonistic-xa0000001018;https://www.allmusic.com/mood/sophisticated-xa0000001100;https://www.allmusic.com/mood/amiable-good-natured-xa0000000934;https://www.allmusic.com/mood/celebratory-xa0000000703;https://www.allmusic.com/mood/street-smart-xa0000001112;https://www.allmusic.com/mood/stylish-xa0000001113;https://www.allmusic.com/mood/brassy-xa0000000950;https://www.allmusic.com/mood/driving-xa0000000978;https://www.allmusic.com/mood/earthy-xa0000000981;https://www.allmusic.com/mood/fierce-xa0000001001;https://www.allmusic.com/mood/freewheeling-xa0000001005;https://www.allmusic.com/mood/gritty-xa0000001014;https://www.allmusic.com/mood/improvisatory-xa0000001023;https://www.allmusic.com/mood/kinetic-xa0000001030;https://www.allmusic.com/mood/powerful-xa0000000747;https://www.allmusic.com/mood/rollicking-xa0000001080;https://www.allmusic.com/mood/rousing-xa0000000759;https://www.allmusic.com/mood/swaggering-xa0000001118;https://www.allmusic.com/mood/tight-xa0000001448;https://www.allmusic.com/mood/uplifting-xa0000001132;https://www.allmusic.com/mood/visceral-xa0000001135,Guys Night Out;TGIF;Girls Night 
Out;Partying,https://www.allmusic.com/theme/guys-night-out-ma0000006291;https://www.allmusic.com/theme/tgif-ma0000006308;https://www.allmusic.com/theme/girls-night-out-ma0000006292;https://www.allmusic.com/theme/partying-ma0000004272,https://www.allmusic.com/album/live-cookin-with-blue-note-at-montreux-mw0003845952

num,title,duration,url,composer_url,composer_name,performer_urls,performer_names
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,Black Byrd,08:11,https://www.allmusic.com/song/black-byrd-mt0061536559,https://www.allmusic.com/artist/laurence-mizell-mn0000785399,Laurence Mizell,https://www.allmusic.com/artist/donald-byrd-mn0000149946,Donald Byrd
2,You've Got It Bad Girl,07:42,https://www.allmusic.com/song/youve-got-it-bad-girl-mt0061536560,https://www.allmusic.com/artist/stevie-wonder-mn0000622805,Stevie Wonder,https://www.allmusic.com/artist/donald-byrd-mn0000149946,Donald Byrd
3,The East,09:19,https://www.allmusic.com/song/the-east-mt0061536561,https://www.allmusic.com/artist/donald-byrd-mn0000149946,Donald Byrd,https://www.allmusic.com/artist/donald-byrd-mn0000149946,Donald Byrd
4,Introductions,02:58,https://www.allmusic.com/song/introductions-mt0061536562,,,https://www.allmusic.com/artist/donald-byrd-mn0000149946,Donald Byrd
5,Kwame,11:50,https://www.allmusic.com/song/kwame-mt0061536563,https://www.allmusic.com/artist/donald-byrd-mn0000149946,Donald Byrd,https://www.allmusic.com/artist/donald-byrd-mn0000149946,Donald Byrd
6,Poco-Mania,05:22,https://www.allmusic.com/song/poco-mania-mt0061536564,https://www.allmusic.com/artist/donald-byrd-mn0000149946,Donald Byrd,https://www.allmusic.com/artist/donald-byrd-mn0000149946,Donald Byrd


In [12]:
# HTTP headers attached to the advanced-search POST request.
headers <- c(
  "User-Agent" =
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
  # Required: without this header the response comes back empty.
  "Content-Type" =
    "application/x-www-form-urlencoded; charset=UTF-8",
  # Required: without this header the server answers 404.
  "referer" =
    "https://www.allmusic.com/advanced-search"
)

In [13]:
#' Scrape every album on one advanced-search result page of a release year.
#'
#' Posts the year filter to the advanced-search endpoint, scrapes each album
#' linked from the result page and writes `albums.csv` and `tracks.csv` to
#' `./data/{year}/{page}`. Uses the file-level `headers` and `base_url`.
#'
#' @param year Release year used for both the start and the end filter.
#' @param page Result page number, appended to the search URL.
#' @return The number of albums scraped from the page. Stops with an error
#'   when the search request does not return status 200 or when any album
#'   fails to scrape; in the latter case nothing is written for the page.
scrape_albums_by_year_page <- function(year, page) {
  # there are some special char in form data that should be encoded,
  # here we use raw data to reduce complexity
  url <- glue("https://www.allmusic.com/advanced-search/results/{page}")
  raw_data <- glue("filters%5B%5D=%26releaseYearStart%3Dsy{year}&filters%5B%5D=%26releaseYearEnd%3Dey{year}&sort=")

  response <- POST(url, body = raw_data, add_headers(.headers = headers))
  # Bail out early unless the request was successful (status code 200)
  if (status_code(response) != 200) {
    stop(glue("Error: {http_status(response)$reason}\n"))
  }

  nodes <- response %>%
    content("text") %>%
    read_html() %>%
    html_nodes(".discography-item-container .cover a")
  album_path_list <- nodes %>% html_attrs() %>% map_chr("href")

  # One slot per album; the rows are stacked once after the loop instead of
  # re-binding the growing tables on every iteration.
  results <- vector("list", length(album_path_list))
  for (i in seq_along(album_path_list)) {
    # paste0(), not glue(): glue() would evaluate anything between braces in
    # the scraped path as R code.
    album_url <- paste0(base_url, album_path_list[i])
    results[[i]] <- tryCatch({
      data <- scrape_album(album_url)

      # small pause between album requests
      Sys.sleep(0.5)
      data
    }, error = function(err) {
      # Report which album failed, then re-raise the original error.
      cat(glue("Error when scrape album {album_url}"), "\n", sep = "")
      stop(err)
    })
  }
  albums <- bind_rows(map(results, "album"))
  tracks <- bind_rows(map(results, "tracks"))

  # save the result by page since we don't want to lose lots of data if an
  # error happens
  dir_path <- glue("./data/{year}/{page}")
  if (!dir.exists(dir_path)) {
    dir.create(dir_path, recursive = TRUE)
  }
  write.csv(albums, glue("{dir_path}/albums.csv"), row.names = FALSE)
  write.csv(tracks, glue("{dir_path}/tracks.csv"), row.names = FALSE)

  nrow(albums)
}

In [15]:
# Scrape result page 2 of the albums released in 2022.
scrape_albums_by_year_page(2022, 2)

Error when scrape album https://www.allmusic.com/album/strauss-mw0003652556

ERROR: Error in open.connection(x, "rb"): Timeout was reached: [www.allmusic.com] SSL connection timeout
