In [9]:
using CSV
using DataFrames

In [10]:
df = DataFrame()
origin_path = "./original"  # Replace with your directory path

# Walk the two-level layout <origin_path>/<year>/<page>/tracks.csv and stack every
# per-page track table into `df`.
for year in readdir(origin_path)
    year_path = joinpath(origin_path, year)
    isdir(year_path) || continue  # ignore plain files next to the year directories
    for page in readdir(year_path)
        page_path = joinpath(year_path, page)
        isdir(page_path) || continue
        file = joinpath(page_path, "tracks.csv")
        if !isfile(file)
            # Without this guard a single page directory lacking tracks.csv makes
            # CSV.File throw and aborts the whole load.
            @warn "Skipping page directory without tracks.csv" page_path
            continue
        end
        single = CSV.File(file) |> DataFrame
        # promote=true lets a column's element type widen when files disagree
        # (which is why a column can end up typed `Any` in the combined table).
        append!(df, single, promote=true)
    end
end

first(df, 5)


Row,num,title,duration,url,composer_urls,composer_names,performer_urls,performer_names,album_url
Unnamed: 0_level_1,Int64,String,Any,String,String?,String?,String,String,String
1,1,I Don't Worry About a Thing,02:19,https://www.allmusic.com/song/i-dont-worry-about-a-thing-mt0059358822,https://www.allmusic.com/artist/mose-allison-mn0000927627;https://www.allmusic.com/artist/count-basie-mn0000127044;https://www.allmusic.com/artist/harry-edison-mn0002784146,Mose Allison;Count Basie;Harry Edison,https://www.allmusic.com/artist/mose-allison-mn0000927627,Mose Allison,https://www.allmusic.com/album/complete-atlantic-elektra-albums-1962-1983-mw0003533754
2,2,It Didn't Turn Out That Way,02:44,https://www.allmusic.com/song/it-didnt-turn-out-that-way-mt0059358823,https://www.allmusic.com/artist/mose-allison-mn0000927627;https://www.allmusic.com/artist/count-basie-mn0000127044;https://www.allmusic.com/artist/harry-edison-mn0002784146,Mose Allison;Count Basie;Harry Edison,https://www.allmusic.com/artist/mose-allison-mn0000927627,Mose Allison,https://www.allmusic.com/album/complete-atlantic-elektra-albums-1962-1983-mw0003533754
3,3,Your Mind Is on Vacation,02:38,https://www.allmusic.com/song/your-mind-is-on-vacation-mt0059358824,https://www.allmusic.com/artist/mose-allison-mn0000927627;https://www.allmusic.com/artist/count-basie-mn0000127044;https://www.allmusic.com/artist/harry-edison-mn0002784146,Mose Allison;Count Basie;Harry Edison,https://www.allmusic.com/artist/mose-allison-mn0000927627,Mose Allison,https://www.allmusic.com/album/complete-atlantic-elektra-albums-1962-1983-mw0003533754
4,4,Let Me See,04:12,https://www.allmusic.com/song/let-me-see-mt0059358825,https://www.allmusic.com/artist/mose-allison-mn0000927627;https://www.allmusic.com/artist/count-basie-mn0000127044;https://www.allmusic.com/artist/harry-edison-mn0002784146,Mose Allison;Count Basie;Harry Edison,https://www.allmusic.com/artist/mose-allison-mn0000927627,Mose Allison,https://www.allmusic.com/album/complete-atlantic-elektra-albums-1962-1983-mw0003533754
5,5,Everything I Have Is Yours,04:09,https://www.allmusic.com/song/everything-i-have-is-yours-mt0059358826,https://www.allmusic.com/artist/harold-adamson-mn0000665176;https://www.allmusic.com/artist/mose-allison-mn0000927627;https://www.allmusic.com/artist/burton-lane-mn0000011535,Harold Adamson;Mose Allison;Burton Lane,https://www.allmusic.com/artist/mose-allison-mn0000927627,Mose Allison,https://www.allmusic.com/album/complete-atlantic-elektra-albums-1962-1983-mw0003533754


In [11]:
"""
    extract_id_from_url(url::AbstractString) -> String

Return the identifier at the very end of `url`: a run of lowercase ASCII letters
immediately followed by a run of digits (e.g. `"mn0000927627"` in
`".../artist/mose-allison-mn0000927627"`).

Return the sentinel `"unknown"` when `url` does not end in such an identifier.

The result is always a `String`, regardless of the concrete string type passed in,
so `split` results (`SubString`) can be passed directly without converting first.
"""
function extract_id_from_url(url::Union{String,SubString{String}})
    m = match(r"([a-z]+\d+)$", url)
    # The capture is a SubString into `url`; materialise it so both branches
    # return the same type.
    return m === nothing ? "unknown" : String(m.captures[1])
end

# Regex matching only supports String / SubString{String}; convert any other string
# type (e.g. the inline strings CSV.jl may produce for short columns) first.
extract_id_from_url(url::AbstractString) = extract_id_from_url(String(url))

# Create an empty DataFrame for Composers with the specified column names
composers = DataFrame("artist_id" => String[], "artist_name" => String[], "url" => String[])

# Populate the Composers DataFrame: one row per (url, name) pair of every track.
# Both columns hold ';'-separated lists that are expected to line up one-to-one.
for row in eachrow(df)
    # A missing cell means the track has no composer information; treat it as an
    # empty list (typed, so the loop below does not iterate a Vector{Any}).
    url_list = ismissing(row.composer_urls) ? SubString{String}[] :
               split(row.composer_urls, ";")
    name_list = ismissing(row.composer_names) ? SubString{String}[] :
                split(row.composer_names, ";")

    # Names and URLs should be the same size. Indexing the names by URL position
    # raised a BoundsError when names were fewer; report the mismatch and pair only
    # the entries both lists have.
    if length(url_list) != length(name_list)
        @warn "Composer URL/name counts differ; pairing only the common prefix" track =
            row.url n_urls = length(url_list) n_names = length(name_list)
    end

    for (url, name) in zip(url_list, name_list)
        # String(...) because split yields SubStrings
        push!(composers, (extract_id_from_url(String(url)), name, url))
    end
end

# Remove duplicate entries (in place)
unique!(composers)
size(composers)

(2054, 3)

In [12]:
# Create an empty DataFrame for Performers
performers = DataFrame("artist_id" => String[], "artist_name" => String[], "url" => String[])

# Populate the Performers DataFrame: one row per (url, name) pair of every track.
# Both columns hold ';'-separated lists that are expected to line up one-to-one.
for row in eachrow(df)
    url_list = split(row.performer_urls, ";")
    name_list = split(row.performer_names, ";")

    if length(url_list) != length(name_list)
        @warn "Performer URL/name counts differ; pairing only the common prefix" track =
            row.url n_urls = length(url_list) n_names = length(name_list)
    end

    # Pair the lists in lock-step. A two-variable `for a in xs, b in ys` loop is a
    # Cartesian product and would attach every name to every URL on tracks with
    # more than one performer.
    for (url, name) in zip(url_list, name_list)
        # String(...) because split yields SubStrings
        push!(performers, (extract_id_from_url(String(url)), name, url))
    end
end

# Remove duplicate entries (in place)
unique!(performers)
size(performers)

(546, 3)

In [13]:
# Composer -> track link table: one row per composer URL listed on a track
composers_tracks_map = DataFrame("composer_id" => String[], "track_id" => String[])

# Fill the link table row by row
for track in eachrow(df)
    track_id = extract_id_from_url(track.url)

    # Tracks with no composer list contribute no links
    raw_urls = track.composer_urls
    ismissing(raw_urls) && continue

    for piece in split(raw_urls, ";")
        # split yields SubStrings; the extractor is called with a String
        push!(composers_tracks_map, [extract_id_from_url(String(piece)), track_id])
    end
end

size(composers_tracks_map)

(8005, 2)

In [14]:
# Performer -> track link table: one row per performer URL listed on a track
performers_tracks_map = DataFrame("performer_id" => String[], "track_id" => String[])

# Fill the link table row by row
for track in eachrow(df)
    track_id = extract_id_from_url(track.url)
    # split yields SubStrings; the extractor is called with a String
    performer_ids = [extract_id_from_url(String(piece)) for piece in split(track.performer_urls, ";")]
    for performer_id in performer_ids
        push!(performers_tracks_map, [performer_id, track_id])
    end
end

size(performers_tracks_map)


(8184, 2)

In [15]:
# Composers and performers share one schema (artist_id, artist_name, url), so stack
# them into a single artists table and drop rows that appear in both.
artists = unique(vcat(composers, performers))

# Make sure the output directory exists before writing; opening a file inside a
# missing directory fails.
output_dir = "wrangled"
mkpath(output_dir)

CSV.write(joinpath(output_dir, "artists.csv"), artists)
CSV.write(joinpath(output_dir, "composers_tracks_map.csv"), composers_tracks_map)
CSV.write(joinpath(output_dir, "performers_tracks_map.csv"), performers_tracks_map);
