# Get album list as CSV

## Download raw HTML

In [15]:
# URL of the page that is downloaded and later parsed for the album list
ALBUM_LIST_URL = 'https://papadosio.bandcamp.com/music'

In [16]:
import requests

# Send a GET request to the URL.
# The timeout keeps the cell from hanging forever on a stalled connection;
# network-level errors (DNS failure, refused connection, timeout) are reported
# the same way as a bad status code instead of surfacing as a raw traceback.
try:
    response = requests.get(ALBUM_LIST_URL, timeout=30)
except requests.RequestException as error:
    response = None
    print('Failed to retrieve the HTML content. Error:', error)

if response is not None and response.status_code == 200:
    # Get the HTML content of the page
    html_content = response.text

    # The file where you want to save the HTML content
    file_path = 'papadosio_bandcamp_music.html'

    # Write the HTML content to a file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(html_content)

    print(f'HTML content saved to {file_path}')
elif response is not None:
    print('Failed to retrieve the HTML content. Status Code:', response.status_code)


HTML content saved to papadosio_bandcamp_music.html


## Parse HTML into CSV

In [17]:
# Site root prepended to relative album and track hrefs (e.g. '/album/...')
# scraped from the pages, so the stored links are full URLs.
BASE_URL = 'https://papadosio.bandcamp.com'

In [18]:
import csv
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Path to the saved HTML file. Relative, so it points at the same file the
# download step wrote regardless of the current working directory.
file_path = 'papadosio_bandcamp_music.html'

# The output CSV file path
output_csv = 'papadosio_albums.csv'

# Read the HTML file
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Find the 'ol' tag with id 'music-grid'.
# Checked before the CSV is opened so that a page without the grid fails with
# a clear message and does not truncate an existing CSV file.
music_grid = soup.find('ol', id='music-grid')
if music_grid is None:
    raise ValueError(f"No <ol id='music-grid'> element found in {file_path}")

# Open a CSV file to write the data
with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['Album Name', 'Album Link'])  # Writing the header row

    # Iterate through each 'li' in the 'ol' tag
    for li in music_grid.find_all('li'):
        # Find the 'a' tag and read its 'href'; .get() tolerates anchors
        # that carry no href attribute.
        a_tag = li.find('a')
        href = a_tag.get('href') if a_tag else None
        if href:
            # urljoin resolves any relative href ('/album/...', '/track/...')
            # against the site root and leaves absolute URLs untouched.
            album_link = urljoin(BASE_URL, href)
        else:
            album_link = 'No link'

        # Find the 'p' tag, extract the text, and trim spaces and newlines
        p_tag = li.find('p')
        album_name = p_tag.get_text().strip() if p_tag else 'No name'

        # Write the album name and link to the CSV file
        writer.writerow([album_name, album_link])

print(f'Album data has been written to {output_csv}')


Album data has been written to papadosio_albums.csv


## Generate JSON of each album's contents

In [19]:
import csv
import json
import requests
from bs4 import BeautifulSoup

# The CSV file path - Adjust as needed.
# Relative, so it resolves to the 'papadosio_albums.csv' written by the
# CSV step whatever the working directory is (not only under /content).
csv_file_path = 'papadosio_albums.csv'
# The output JSON file path - Adjust as needed
output_json_path = 'papadosio_albums_tracks.json'

# Function to download HTML and parse tracks
def parse_album_tracks(album_url, timeout=30):
    """Download an album page and return the tracks listed on it.

    Args:
        album_url: URL of the album page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one dict per ``<td class="title-col">`` cell on the page.
        Each dict holds whichever of ``'title'``, ``'duration'`` and ``'url'``
        could be found in that cell. An empty list is returned when the page
        cannot be downloaded (request error or non-200 status); a message is
        printed in that case.
    """
    from urllib.parse import urljoin

    # Request errors (timeouts, connection failures, malformed URLs such as a
    # placeholder value) are reported and treated like a failed download so
    # one bad album does not abort the whole run.
    try:
        response = requests.get(album_url, timeout=timeout)
    except requests.RequestException as error:
        print(f'Failed to download album page: {album_url} ({error})')
        return []

    if response.status_code != 200:
        print(f'Failed to download album page: {album_url}')
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    tracks = []
    for td in soup.find_all('td', class_='title-col'):
        track_info = {}

        title_span = td.find('span', class_='track-title')
        if title_span:
            track_info['title'] = title_span.text.strip()

        time_span = td.find('span', class_='time')
        if time_span:
            track_info['duration'] = time_span.text.strip()

        # .get() tolerates anchors without an href; urljoin resolves relative
        # hrefs against the site root and leaves absolute URLs untouched.
        a_tag = td.find('a')
        href = a_tag.get('href') if a_tag else None
        if href:
            track_info['url'] = urljoin(BASE_URL, href)

        tracks.append(track_info)
    return tracks

# Read the CSV file and process each album
albums_list = []
# newline='' lets the csv module handle line endings inside quoted fields.
with open(csv_file_path, mode='r', encoding='utf-8', newline='') as csv_file:
    reader = csv.DictReader(csv_file)
    for index, row in enumerate(reader, start=1):
        print(f'Processing album {index}...')
        album_link = row['Album Link']

        # Rows whose link is not a full URL (e.g. a placeholder written when
        # no link was found) cannot be downloaded; keep the album with an
        # empty track list instead of letting the request raise and lose
        # everything collected so far.
        if album_link.startswith(('http://', 'https://')):
            tracks = parse_album_tracks(album_link)
        else:
            print(f'Skipping download, not a valid URL: {album_link}')
            tracks = []

        album_data = {
            'album_name': row['Album Name'],
            'album_link': album_link,
            'tracks': tracks
        }
        albums_list.append(album_data)

# Write the JSON output
with open(output_json_path, 'w', encoding='utf-8') as json_file:
    json.dump(albums_list, json_file, ensure_ascii=False, indent=4)

print(f'Data has been written to {output_json_path}')


Processing album 1...
Processing album 2...
Processing album 3...
Processing album 4...
Processing album 5...
Processing album 6...
Processing album 7...
Processing album 8...
Processing album 9...
Processing album 10...
Processing album 11...
Processing album 12...
Processing album 13...
Processing album 14...
Processing album 15...
Processing album 16...
Processing album 17...
Processing album 18...
Processing album 19...
Processing album 20...
Processing album 21...
Processing album 22...
Processing album 23...
Processing album 24...
Processing album 25...
Processing album 26...
Processing album 27...
Processing album 28...
Processing album 29...
Processing album 30...
Processing album 31...
Processing album 32...
Processing album 33...
Processing album 34...
Processing album 35...
Processing album 36...
Processing album 37...
Processing album 38...
Processing album 39...
Processing album 40...
Processing album 41...
Processing album 42...
Processing album 43...
Processing album 44.

## Create a list of all track names

In [21]:
import json

# The path to the JSON file. Relative, so it resolves to the
# 'papadosio_albums_tracks.json' produced by the JSON step regardless of the
# current working directory.
json_file_path = 'papadosio_albums_tracks.json'
# The output file for track names
output_file_path = 'sorted_unique_track_names.txt'

# Read the JSON file
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    albums_data = json.load(json_file)

# Extract and process track names, ensuring uniqueness
track_names = set()
for album in albums_data:
    for track in album['tracks']:
        # A track entry only has a 'title' key when one was found on the
        # album page, so read it defensively and skip untitled entries.
        track_name = track.get('title', '').strip().lower()  # Strip and make case-insensitive
        if track_name:
            track_names.add(track_name)

# Sort the track names alphabetically
sorted_track_names = sorted(track_names)

# Write the sorted, unique track names to a new file, one per line
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    output_file.writelines(f'{track_name}\n' for track_name in sorted_track_names)

print(f'Sorted, unique track names have been written to {output_file_path}')


Sorted, unique track names have been written to sorted_unique_track_names.txt
