## Before getting started

In [77]:
library(tidyverse)
library(magrittr)
library(glue)
library(rvest)
library(polite)
library(xml2)
library(httr)

In [78]:
# Base url of the site; the scraped paths (search page, album links) are
# appended to it to build full urls
base_url <- "https://www.allmusic.com"

This block contains some helper functions that simplify the scraping code; they were added as the need arose during coding.

In [79]:

# Get the value of an attribute from an html node, or from every node of a
# node set.
#
# node: an html node / node set as returned by html_node() / html_nodes().
# attr: name of the attribute to read, e.g. "href".
#
# Returns a character vector with one entry per node. A node that does not
# carry the attribute yields NA instead of aborting the whole scrape.
get_attr_value <- function(node, attr) {
  # html_attr() is vectorised over node sets and also accepts a single node
  node %>% html_attr(attr)
}

# Read the link target (the "href" attribute) of the html node
get_href <- function(node) {
  get_attr_value(node, attr = "href")
}

# Extract the text of the html node, with leading and trailing whitespace
# stripped
get_text <- function(node) {
  raw_text <- html_text(node)
  trimws(raw_text)
}

## Part 1: scrape genres, styles, themes and moods

In part 1, we scrape all the genres, styles, themes and moods. Since they are all listed on the ["advanced search"](https://www.allmusic.com/advanced-search) page, Part 1 only has to be run once to scrape all of this data. The scraped data will be saved at:
  * ./original/genre.csv
  * ./original/style.csv
  * ./original/theme.csv
  * ./original/mood.csv

In [80]:
# Download the advanced search page once; genres, styles, themes and moods
# are all parsed from this single document
search_path <- "/advanced-search"
doc <- glue("{base_url}{search_path}") %>% read_html()

### Part 1.1 scrape genres & styles

In [81]:
# empty genre table: one row per genre (id, genre)
genre_df <- tribble(~id, ~genre)

# empty style table: a style is a sub-category of a genre, so every style row
# also carries the id of its parent genre
style_df <- tribble(~id, ~style, ~genre_id)

In [82]:
# Build genre rows (id, genre) from a genre label node: the id is read from
# the "for" attribute, the genre name from the node text
parse_genre_node <- function(node) {
  genre_id <- get_attr_value(node, "for")
  genre_name <- get_text(node)

  tibble(id = genre_id, genre = genre_name)
}

# Build style rows (id, style, genre_id) from a style node: id and name come
# from the <label> elements inside the node, the parent genre id from the
# node's own "data-parent" attribute
parse_style_node <- function(node) {
  labels <- html_nodes(node, "label")

  tibble(
    id = get_attr_value(labels, "for"),
    style = get_text(labels),
    genre_id = get_attr_value(node, "data-parent")
  )
}

# select the genre filter block (the element carrying both classes)
genre_filter_node <- doc %>% html_nodes(".filter.genres")
genre_nodes <- genre_filter_node %>% html_nodes(".genre label")
style_nodes <- genre_filter_node %>% html_nodes(".style")

# parse_genre_node() handles a whole node set, so all genre rows are appended
# in one step instead of growing the table row by row
genre_df %<>% rbind(parse_genre_node(genre_nodes))

# every style node is parsed on its own so that its labels stay paired with
# that node's "data-parent" genre id; the rows are appended with one rbind
style_df <- do.call(
  rbind,
  c(
    list(style_df),
    lapply(seq_along(style_nodes), function(i) parse_style_node(style_nodes[i]))
  )
)

# write.csv() does not create missing folders, so make sure the target exists
dir.create("./original", showWarnings = FALSE, recursive = TRUE)
write.csv(genre_df, "./original/genre.csv", row.names = FALSE)
write.csv(style_df, "./original/style.csv", row.names = FALSE)

### Part 1.2 scrape moods

In [83]:
# empty mood table: one row per mood (id, mood)
mood_df <- tribble(~id, ~mood)

In [84]:
# Build mood rows (id, mood) from a mood <input> node, using its "id" and
# "value" attributes
parse_mood_node <- function(node) {
  mood_id <- get_attr_value(node, "id")
  mood_name <- get_attr_value(node, "value")

  tibble(id = mood_id, mood = mood_name)
}

mood_nodes <- doc %>% html_nodes(".filter.moods .options input")

# parse_mood_node() handles a whole node set, so all mood rows are appended
# in one step instead of growing the table row by row
mood_df %<>% rbind(parse_mood_node(mood_nodes))

# write.csv() does not create missing folders, so make sure the target exists
dir.create("./original", showWarnings = FALSE, recursive = TRUE)
write.csv(mood_df, "./original/mood.csv", row.names = FALSE)

### Part 1.3 scrape themes

In [85]:
# empty theme table: one row per theme (id, theme)
theme_df <- tribble(~id, ~theme)

In [86]:
# Build theme rows (id, theme) from a theme <input> node, using its "id" and
# "value" attributes
parse_theme_node <- function(node) {
  theme_id <- get_attr_value(node, "id")
  theme_name <- get_attr_value(node, "value")

  tibble(id = theme_id, theme = theme_name)
}

theme_nodes <- doc %>% html_nodes(".filter.themes .options input")

# parse_theme_node() handles a whole node set, so all theme rows are appended
# in one step instead of growing the table row by row
theme_df %<>% rbind(parse_theme_node(theme_nodes))

# write.csv() does not create missing folders, so make sure the target exists
dir.create("./original", showWarnings = FALSE, recursive = TRUE)
write.csv(theme_df, "./original/theme.csv", row.names = FALSE)

## Part2: scrape albums

In part2, we scrape the details of albums and tracks.

### Part2.1 Function that parse the album

In [87]:
# Parse the album-level fields from an album page.
#
# doc: the parsed album page (as returned by read_html()).
#
# Returns a one-row tibble. Multi-valued fields (recording locations, genres,
# styles, moods, themes and album artists) are each collapsed into a single
# ";"-separated string, which is "" when the page lists none.
# The album artist columns are appended after the existing columns: they were
# collected before but never made it into the result.
parse_album <- function(doc) {
  # text / href of every node of a node set, joined with ";"
  join_text <- function(nodes) paste(get_text(nodes), collapse = ";")
  join_href <- function(nodes) paste(get_href(nodes), collapse = ";")

  title <- doc %>% html_node(".album-title") %>% get_text()
  album_artist_nodes <- doc %>% html_nodes(".album-artist a")

  # get the ratings
  all_music_rating <- doc %>%
    html_node(".ratings .allmusic-rating") %>%
    get_text()

  album_info_node <- doc %>% html_node(".sidebar .basic-info")
  release_date <- album_info_node %>%
    html_node(".release-date span") %>%
    get_text()
  duration <- album_info_node %>% html_node(".duration span") %>% get_text()
  recording_date <- album_info_node %>%
    html_node(".recording-date div") %>%
    get_text()

  # there might be multiple locations, since it is a list in the html doc
  recording_locations_nodes <- album_info_node %>%
    html_nodes(".recording-location li")

  genre_nodes <- album_info_node %>% html_nodes(".genre a")
  style_nodes <- album_info_node %>% html_nodes(".styles a")

  # moods and themes are looked up in the whole sidebar, not in basic-info
  mood_nodes <- doc %>% html_nodes(".sidebar .moods .mood a")
  theme_nodes <- doc %>% html_nodes(".sidebar .themes .theme a")

  tibble(
    album = title,
    duration = duration,
    release_date = release_date,
    all_music_rating = all_music_rating,
    recording_date = recording_date,
    recording_locations = join_text(recording_locations_nodes),
    genre_names = join_text(genre_nodes),
    genre_urls = join_href(genre_nodes),
    style_names = join_text(style_nodes),
    style_urls = join_href(style_nodes),
    mood_names = join_text(mood_nodes),
    mood_urls = join_href(mood_nodes),
    theme_names = join_text(theme_nodes),
    theme_urls = join_href(theme_nodes),
    album_artist_names = join_text(album_artist_nodes),
    album_artist_urls = join_href(album_artist_nodes)
  )
}

### Part 2.2 Function that parse the track

In [88]:
# Parse a single track node of the track listing into a one-row tibble.
#
# node: the html node of one track.
#
# Composer and performer fields may hold several entries; their names and
# urls are each collapsed into one ";"-separated string ("" when none).
parse_track <- function(node) {
  title_link <- html_node(node, ".title-composer .title a")

  # artist related links
  composer_links <- html_nodes(node, ".title-composer .composer a")
  performer_links <- html_nodes(node, ".performer .primary a")

  tibble(
    num = get_text(html_node(node, ".tracknum")),
    title = get_text(title_link),
    duration = get_text(html_node(node, ".time")),
    url = get_href(title_link),
    composer_urls = paste(get_href(composer_links), collapse = ";"),
    composer_names = paste(get_text(composer_links), collapse = ";"),
    performer_urls = paste(get_href(performer_links), collapse = ";"),
    performer_names = paste(get_text(performer_links), collapse = ";")
  )
}

### Part 2.3 Function that scrape the album page, parse album and tracks

In [89]:
# Download one album page and parse the album together with its tracks.
#
# url: full url of the album page.
#
# Returns a list with two tibbles:
#   album  - the row from parse_album() plus a `url` column
#   tracks - one row per track from parse_track() plus an `album_url` column
#            (a tibble without rows and columns when no track is listed)
scrape_album <- function(url) {
  doc <- read_html(url)

  # add the album url; `.env$url` always refers to the function argument,
  # never to a column of the tibble
  album <- parse_album(doc) %>% mutate(url = .env$url)

  # get the track list and parse the tracks
  track_nodes <- doc %>% html_nodes(".track-listing .track")
  track_rows <- lapply(seq_along(track_nodes), function(i) {
    # parse_track() returns its own `url` column (the track link); a bare
    # `url` inside mutate() would pick up that column instead of the album
    # url, so the argument is addressed explicitly
    parse_track(track_nodes[i]) %>% mutate(album_url = .env$url)
  })

  list(
    album = album,
    tracks = bind_rows(track_rows)
  )
}


### Part2.4 Function that scrape the album list(advanced search result)

In [90]:
# Request headers sent with every advanced-search POST request
headers <- character()
headers[["User-Agent"]] <- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
# This header is crucial or we will get nothing in the response
headers[["Content-Type"]] <- "application/x-www-form-urlencoded; charset=UTF-8"
# This header is crucial or we will get 404
headers[["referer"]] <- "https://www.allmusic.com/advanced-search"

In [91]:
# Scrape one page of the advanced-search results for a release year.
#
# year: release year, used for both the start and the end year filter.
# page: number of the result page to request.
#
# Every album linked from the result page is scraped with scrape_album() and
# the collected rows are written to ./original/{year}/{page}/albums.csv and
# ./original/{year}/{page}/tracks.csv.
#
# Returns the number of albums scraped on this page (0 when the page lists no
# albums). Stops with an error when the search request does not return
# status 200.
scrape_albums_by_year_page <- function(year, page) {
  # sprintf() keeps the trailing newline, so every progress line ends cleanly
  cat(sprintf("scraping page %s of %s\n", page, year))

  # there are some special chars in the form data that should be encoded,
  # here we send the already encoded raw data to reduce complexity
  url <- glue("https://www.allmusic.com/advanced-search/results/{page}")
  raw_data <- glue("filters%5B%5D=%26releaseYearStart%3Dsy{year}&filters%5B%5D=%26releaseYearEnd%3Dey{year}&sort=")

  response <- POST(url, body = raw_data, add_headers(.headers = headers))
  # anything but status code 200 means the request failed
  if (status_code(response) != 200) {
    stop(glue("Error: {http_status(response)$reason}\n"))
  }

  # an empty body cannot be parsed as html; it is treated as a page that
  # lists no albums
  body <- response %>% content("text")
  has_body <- length(body) == 1 && !is.na(body) && nzchar(trimws(body))
  album_path_list <- if (has_body) {
    body %>%
      read_html() %>%
      html_nodes(".discography-item-container .cover a") %>%
      get_href()
  } else {
    character(0)
  }

  # first attempt plus five retries per album
  max_attempts <- 6
  album_rows <- vector("list", length(album_path_list))
  track_rows <- vector("list", length(album_path_list))

  for (i in seq_along(album_path_list)) {
    album_url <- glue(base_url, album_path_list[i])

    attempt <- 0
    data <- NULL
    while (is.null(data) && attempt < max_attempts) {
      attempt <- attempt + 1
      data <- tryCatch(
        scrape_album(album_url),
        error = function(err) {
          # report why the attempt failed instead of swallowing the error
          cat(sprintf(
            "Error when scrape album %s, attempt %s, will stop after retrying 5 times: %s\n",
            album_url, attempt, conditionMessage(err)
          ))
          NULL
        }
      )
      # short pause after every attempt, failed ones included, so retries do
      # not hit the site back to back
      Sys.sleep(0.2)
    }

    # an album that failed on every attempt is skipped
    if (!is.null(data)) {
      album_rows[[i]] <- data$album
      track_rows[[i]] <- data$tracks
    }
  }

  # skipped albums left NULL entries, which bind_rows() ignores
  albums <- bind_rows(album_rows)
  tracks <- bind_rows(track_rows)

  if (length(album_path_list) > 0 && nrow(albums) == 0) {
    # the returned 0 would otherwise look exactly like a page without albums
    warning(
      glue("none of the {length(album_path_list)} albums on page {page} of {year} could be scraped"),
      call. = FALSE
    )
  }

  # save the result by page since we don't want to lose lots of data if an
  # error happens
  dir_path <- glue("./original/{year}/{page}")
  dir.create(dir_path, showWarnings = FALSE, recursive = TRUE)
  write.csv(albums, glue("{dir_path}/albums.csv"), row.names = FALSE)
  write.csv(tracks, glue("{dir_path}/tracks.csv"), row.names = FALSE)

  nrow(albums)
}

### Part 2.5 Actually scrape the album and track data

Since the search results do not provide any pagination information, we assume that once a page returns no albums we have iterated over all the pages. We pass the page as a parameter because scraping is a time-consuming process: if something goes wrong and the process is shut down, we can see how far it got by checking the "original" folder, and then continue the process from the page where it stopped.

In [94]:
# release year to scrape and the result page to start from
year <- 2022
page <- 1

# keep requesting result pages until one of them yields no albums
repeat {
  num <- scrape_albums_by_year_page(year, page)
  if (num != 0) {
    page <- page + 1
  } else {
    break
  }
}


## Limitations
