In [11]:
library(stringr)
library(dplyr)
# lubridate provides mdy(), used to convert release_date to the Date class.
library(lubridate)
library(tidyverse)
library(magrittr)


In [17]:
# Clean a raw album table.
#
# Steps performed:
#   * derive `album_id` from the "mw<digits>" token found in `url`
#   * drop the genre/style/mood/theme and recording metadata columns
#     (columns that are not present are simply skipped)
#   * parse `release_date` with mdy() into Date values
#   * lower-case the album title
#   * order the columns as album_id, <everything else>, url
#   * remove duplicate rows
#   * rename `album` to `album_name`
#
# Args:
#   data: data frame / tibble containing at least the columns `url`,
#         `release_date` and `album`.
#
# Returns:
#   The cleaned table, of the same data-frame class as the input.
#
# Errors:
#   Stops with an explicit message when a required column is missing.
wrangle_album <- function(data) {
  required_cols <- c("url", "release_date", "album")
  missing_cols <- setdiff(required_cols, names(data))
  if (length(missing_cols) > 0) {
    stop(
      "wrangle_album(): input is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Metadata columns that are not kept in the wrangled output.
  dropped_cols <- c(
    "genre_names", "genre_urls",
    "style_names", "style_urls",
    "mood_names", "mood_urls",
    "theme_names", "theme_urls",
    "recording_date", "recording_locations"
  )

  # Extract the album ID from the url column.
  data$album_id <- str_extract(data$url, "mw\\d+")

  # any_of() tolerates inputs in which some of these columns are absent.
  data <- select(data, -any_of(dropped_cols))

  # mdy() yields NA (and emits a "failed to parse" warning) for values it
  # cannot interpret as month-day-year.
  data$release_date <- mdy(data$release_date)

  # Normalise album titles to lower case.
  data$album <- tolower(data$album)

  # album_id first, url last, remaining columns in their existing order.
  # List-style indexing never collapses the result to a bare vector.
  middle_cols <- setdiff(names(data), c("album_id", "url"))
  data <- data[c("album_id", middle_cols, "url")]

  # One de-duplication pass is sufficient; then give the title column its
  # final name.
  data %>%
    distinct() %>%
    rename(album_name = album)
}


In [20]:
# Collect every "albums.csv" stored as <directory_path>/<year>/<page>/<file>,
# combine them into one table, wrangle it and write the result to disk.
directory_path <- "./original"

# Walk exactly two directory levels (year, then page); only files that sit
# directly inside a page folder are considered.
year_folders <- list.dirs(directory_path, full.names = TRUE, recursive = FALSE)
page_folders <- as.character(unlist(
  lapply(year_folders, list.dirs, full.names = TRUE, recursive = FALSE)
))
album_files <- as.character(unlist(
  lapply(page_folders, list.files, full.names = TRUE, recursive = FALSE)
))
album_files <- album_files[endsWith(album_files, "albums.csv")]

if (length(album_files) == 0) {
  stop("No albums.csv files found under ", directory_path, call. = FALSE)
}

# Read all files first and bind once: growing a data frame inside a loop
# re-copies the accumulated rows on every iteration.
# The leading empty tibble keeps the combined result a tibble.
albums <- bind_rows(c(list(tibble()), lapply(album_files, read.csv)))

wrangled_albums <- wrangle_album(albums)

# write.csv() cannot create missing directories, so make sure the target
# folder exists before writing.
dir.create("./wrangled", showWarnings = FALSE, recursive = TRUE)
write.csv(wrangled_albums, "./wrangled/albums.csv", row.names = FALSE)



“ 1 failed to parse.”


In [None]:
# Load the two 2023 album exports and stack them row-wise into `data`.
albums_2023_1 <- read.csv(file = "albums_2023_1.csv")
albums_2023_2 <- read.csv(file = "albums_2023_2.csv")
data <- rbind(albums_2023_1, albums_2023_2)

# Derive album_id from the "mw<digits>" token embedded in each url.
data$album_id <- str_extract(data$url, "mw\\d+")

# Discard the genre/style/mood/theme and recording metadata columns.
data <- data %>%
  select(
    -genre_names, -genre_urls,
    -style_names, -style_urls,
    -mood_names, -mood_urls,
    -theme_names, -theme_urls,
    -recording_date, -recording_locations
  )

# Put album_id in front; every other column keeps its relative position.
col_names <- setdiff(names(data), "album_id")
new_order <- c("album_id", col_names)
data <- data[, new_order]

# Parse release_date with mdy() (month-day-year) into Date values.
data$release_date <- mdy(data$release_date)


# Convert a single colon-separated duration string into minutes.
#
# Accepted shapes (each part must be numeric):
#   "H:M:S" -> H * 60 + M + S / 60
#   "M:S"   -> M + S / 60
#   "S"     -> S / 60   (a lone value is treated as seconds)
#
# Args:
#   time: one duration value (character, or anything as.character() can
#         convert). The function is scalar; apply it element-wise to a column.
#
# Returns:
#   Minutes as a double rounded to 2 decimal places, or NA_real_ when the
#   value is missing, empty, not a single value, or does not have 1-3 parts.
time_to_minutes <- function(time) {
  # A missing duration is unknown, not zero minutes long.
  if (length(time) != 1L || is.na(time)) {
    return(NA_real_)
  }

  # as.character() lets factor / numeric input through strsplit().
  # Non-numeric parts become NA (with the usual coercion warning).
  parts <- as.numeric(strsplit(as.character(time), ":", fixed = TRUE)[[1]])

  total_minutes <- switch(
    as.character(length(parts)),
    "3" = parts[1] * 60 + parts[2] + parts[3] / 60,
    "2" = parts[1] + parts[2] / 60,
    "1" = parts[1] / 60,
    # Empty string (zero parts) or more than three parts: not a duration.
    NA_real_
  )

  round(total_minutes, 2)
}

# Convert every duration to minutes and store it in duration_minutes.
# vapply() guarantees a plain double vector (sapply() would return a list for
# empty input and attach the raw strings as names); as.character() keeps an
# all-NA column, which read.csv() reads as logical, from breaking the split.
data$duration_minutes <- vapply(
  as.character(data$duration),
  time_to_minutes,
  numeric(1),
  USE.NAMES = FALSE
)

# The raw duration text is no longer needed.
data <- select(data, -duration)

# Normalise album titles to lower case.
data$album <- tolower(data$album)

# Move 'url' to the last position.
data <- data[, c(setdiff(names(data), "url"), "url")]

# Remove duplicate rows (a single pass is enough), then rename the
# 'album' column to 'album_name'.
data <- data %>%
  distinct() %>%
  rename(album_name = album)