In [11]:
library(stringr)
library(dplyr)
# lubridate: used to convert release_date to the Date class for date-related operations.
library(lubridate)
library(tidyverse)
library(magrittr)


In [17]:
#' Clean the raw album table.
#'
#' Derives `album_id` from `url`, drops the genre/style/mood/theme/recording
#' columns, parses `release_date` as a month-day-year date, lower-cases the
#' album name and removes duplicate rows.
#'
#' @param data A data frame with at least the columns `url`, `album`,
#'   `release_date` and every column listed in `dropped_cols` below.
#' @return A data frame of the same class as `data` with `album_id` as the
#'   first column, `url` as the last column, `album` renamed to `album_name`,
#'   and duplicate rows removed.
wrangle_album <- function(data) {
  # Columns that are not kept in the album table.
  dropped_cols <- c(
    "genre_names", "genre_urls",
    "style_names", "style_urls",
    "mood_names", "mood_urls",
    "theme_names", "theme_urls",
    "recording_date", "recording_locations"
  )

  data %>%
    mutate(
      # `.data$` forces a column lookup so a missing `url` column cannot
      # silently resolve to the base R function `url()`.
      # The id is the first "mw<digits>" token found in the album URL.
      album_id = str_extract(.data$url, "mw\\d+"),
      # mdy() yields NA (with a "failed to parse" warning) for values that
      # are not in month-day-year order.
      release_date = mdy(.data$release_date),
      # Normalise album names to lower case.
      album = tolower(.data$album)
    ) %>%
    select(-all_of(dropped_cols)) %>%
    # album_id first, url last, everything else in its original order.
    relocate(album_id) %>%
    relocate(url, .after = last_col()) %>%
    rename(album_name = album) %>%
    # A single de-duplication pass is sufficient.
    distinct()
}


In [25]:
#' Clean the raw track table.
#'
#' Derives `track_id` from `url` and `album_id` from `album_url`, lower-cases
#' the track title, keeps only the five output columns and removes duplicate
#' rows.
#'
#' Any other column in `data` (e.g. `num`, `composer_names`,
#' `performer_urls`) is discarded by the final `select()`, so such columns
#' are optional.
#'
#' @param data A data frame with at least the columns `title`, `duration`,
#'   `url` and `album_url`.
#' @return A data frame of the same class as `data` with the columns
#'   `track_id`, `track_title`, `duration`, `album_id`, `url` (in that order)
#'   and duplicate rows removed.
wrangle_track <- function(data) {
  data %>%
    mutate(
      # `.data$` forces a column lookup so a missing `url` column cannot
      # silently resolve to the base R function `url()`.
      # Track ids are the first "mq<digits>" or "mt<digits>" token in the URL.
      track_id = str_extract(.data$url, "(mq|mt)\\d+"),
      # Album ids are the first "mw<digits>" token in the album URL.
      album_id = str_extract(.data$album_url, "mw\\d+"),
      # Normalise titles to lower case under the output column name.
      track_title = tolower(.data$title)
    ) %>%
    # Whitelisting the output columns also fixes their order and drops
    # everything that does not belong in the track table.
    select(track_id, track_title, duration, album_id, url) %>%
    distinct()
}



In [28]:
# Driver: read every albums.csv / tracks.csv found exactly two directory
# levels below `directory_path` (<directory_path>/<year>/<page>/<file>),
# clean the combined tables and write them to ./wrangled/.
directory_path <- "./original"
output_dir <- "./wrangled"

year_folders <- list.dirs(directory_path, full.names = TRUE, recursive = FALSE)

# as.character() keeps these as character(0) rather than NULL when nothing
# is found, so the endsWith() calls below stay valid.
page_folders <- as.character(unlist(
  lapply(year_folders, list.dirs, full.names = TRUE, recursive = FALSE),
  use.names = FALSE
))
csv_candidates <- as.character(unlist(
  lapply(page_folders, list.files, full.names = TRUE, recursive = FALSE),
  use.names = FALSE
))

album_files <- csv_candidates[endsWith(csv_candidates, "albums.csv")]
track_files <- csv_candidates[endsWith(csv_candidates, "tracks.csv")]

# Read all files first and bind once; appending inside a loop re-copies the
# accumulated table for every file.
albums <- as_tibble(bind_rows(lapply(album_files, read.csv)))
tracks <- as_tibble(bind_rows(lapply(track_files, read.csv)))

wrangled_albums <- wrangle_album(albums)
wrangled_tracks <- wrangle_track(tracks)

# write.csv() does not create missing directories.
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)
write.csv(wrangled_albums, file.path(output_dir, "albums.csv"), row.names = FALSE)
write.csv(wrangled_tracks, file.path(output_dir, "tracks.csv"), row.names = FALSE)

“ 1 failed to parse.”
