In [1]:
library(stringr)
library(dplyr)
# lubridate supplies mdy(), which is used to convert release_date to the Date class.
library(lubridate)
library(tidyverse)
library(magrittr)



Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



Attaching package: 'lubridate'


The following objects are masked from 'package:base':

    date, intersect, setdiff, union


── [1mAttaching core tidyverse packages[22m ──────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mforcats[39m 1.0.0     [32m✔[39m [34mreadr  [39m 2.1.4
[32m✔[39m [34mggplot2[39m 3.4.3     [32m✔[39m [34mtibble [39m 3.2.1
[32m✔[39m [34mpurrr  [39m 1.0.1     [32m✔[39m [34mtidyr  [39m 1.3.0
── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflic

In [2]:
# Build a long-format album <-> tag lookup from one ";"-separated URL column.
#
# Arguments:
#   album_ids: vector of album IDs, one per album row.
#   tag_urls:  vector parallel to `album_ids`; each element holds zero or more
#              URLs joined by ";".
#   id_col:    name to give the tag ID column in the result (e.g. "genre_id").
#
# Returns a tibble with columns `album_id` and `<id_col>`, one row per
# (album, URL) pair. The tag ID is the last 12 characters of each URL.
# Albums whose field is empty or NA contribute no rows.
.album_tag_map <- function(album_ids, tag_urls, id_col) {
  stopifnot(length(album_ids) == length(tag_urls))

  # as.character() guards against an all-empty column that read.csv()
  # imported as logical NA, which strsplit() would reject.
  url_lists <- strsplit(as.character(tag_urls), ";", fixed = TRUE)
  urls <- unlist(url_lists, use.names = FALSE)
  if (is.null(urls)) {
    urls <- character(0)
  }

  # Repeat each album ID once per URL it owns so both vectors stay aligned.
  owners <- rep(album_ids, times = lengths(url_lists))

  keep <- !is.na(urls) & nzchar(urls)
  urls <- urls[keep]
  n_chars <- nchar(urls)
  tag_ids <- substr(urls, n_chars - 11L, n_chars)

  as_tibble(setNames(list(owners[keep], tag_ids), c("album_id", id_col)))
}

# Clean the raw album table.
#
# Steps performed on `data`:
#   * adds `album_id`, extracted from `url` with the pattern "mw<digits>";
#   * drops the *_names columns, `recording_date` and `recording_locations`;
#   * parses `release_date` with mdy(); values that cannot be parsed become
#     NA and lubridate emits a "failed to parse" warning;
#   * lower-cases `album` and renames it to `album_name`;
#   * puts `album_id` first and `url` last;
#   * removes duplicate rows.
#
# Arguments:
#   data:        data frame of raw albums. Must contain `url`, `album`,
#                `release_date` and the six columns that are dropped.
#   return_maps: when FALSE (the default) only the cleaned data frame is
#                returned. When TRUE, `data` must also contain `genre_urls`,
#                `style_urls`, `mood_urls` and `theme_urls`, and the result is
#                a list with the cleaned table plus the four lookup tables.
#
# Returns the cleaned data frame, or (with `return_maps = TRUE`) a list with
# elements `albums`, `genre_albums_map`, `style_albums_map`,
# `mood_albums_map` and `theme_albums_map`.
wrangle_album <- function(data, return_maps = FALSE) {
  data$album_id <- str_extract(data$url, "mw\\d+")

  data <- select(
    data,
    -genre_names, -style_names, -mood_names, -theme_names,
    -recording_date, -recording_locations
  )

  data$release_date <- mdy(data$release_date)
  data$album <- tolower(data$album)

  # distinct() runs after lower-casing so rows that differed only by the
  # case of the album title collapse into one.
  data <- data %>%
    relocate(album_id) %>%
    relocate(url, .after = last_col()) %>%
    distinct() %>%
    rename(album_name = album)

  if (!return_maps) {
    return(data)
  }

  list(
    albums = data,
    genre_albums_map = .album_tag_map(data$album_id, data$genre_urls, "genre_id"),
    style_albums_map = .album_tag_map(data$album_id, data$style_urls, "style_id"),
    mood_albums_map = .album_tag_map(data$album_id, data$mood_urls, "mood_id"),
    theme_albums_map = .album_tag_map(data$album_id, data$theme_urls, "theme_id")
  )
}


In [3]:
# Clean the raw track table.
#
# Expects `data` to contain the columns `url`, `album_url`, `title`, `num`,
# `duration` and the four composer/performer columns that are discarded.
# Returns a de-duplicated data frame with the columns
# track_id, num, track_title, duration, album_id, url (in that order).
wrangle_track <- function(data) {
  data %>%
    # Composer and performer details are not kept in this table.
    select(-composer_names, -composer_urls, -performer_names, -performer_urls) %>%
    # IDs are embedded in the URLs: "mq"/"mt" + digits for tracks,
    # "mw" + digits for albums.
    mutate(
      track_id = str_extract(url, "(mq|mt)\\d+"),
      album_id = str_extract(album_url, "mw\\d+")
    ) %>%
    select(-album_url) %>%
    rename(track_title = title) %>%
    mutate(track_title = tolower(track_title)) %>%
    select(track_id, num, track_title, duration, album_id, url) %>%
    distinct()
}



In [4]:
# Load every albums.csv / tracks.csv found exactly two folder levels below
# `directory_path` (the original loops called these levels "year" and "page"
# folders), wrangle the albums and write them to ./wrangled/albums.csv.
directory_path <- "./original"

# Read every path ending in `suffix` and stack the results in one
# bind_rows() call. Binding once avoids re-copying the accumulated table
# for each file. Starting from tibble() keeps the result a tibble and
# yields an empty tibble when nothing matches.
read_matching_csv <- function(paths, suffix) {
  matches <- paths[endsWith(paths, suffix)]
  bind_rows(tibble(), lapply(matches, read.csv))
}

year_folders <- list.dirs(directory_path, full.names = TRUE, recursive = FALSE)

# as.character() turns the NULL that unlist() returns for an empty list
# into character(0), so endsWith() still works when no folders exist.
page_folders <- as.character(unlist(
  lapply(year_folders, list.dirs, full.names = TRUE, recursive = FALSE)
))
files <- as.character(unlist(
  lapply(page_folders, list.files, full.names = TRUE, recursive = FALSE)
))

albums <- read_matching_csv(files, "albums.csv")
tracks <- read_matching_csv(files, "tracks.csv")

wrangled_albums <- wrangle_album(albums)
# wrangled_tracks <- wrangle_track(tracks)
if (!dir.exists("./wrangled")) {
  dir.create("./wrangled", recursive = TRUE)
}
write.csv(wrangled_albums, "./wrangled/albums.csv", row.names = FALSE)
#write.csv(wrangled_tracks, "./wrangled/tracks.csv", row.names = FALSE)

" 1 failed to parse."
