# Process Papadosio albums and sets from Bandcamp

In [None]:
import requests
import json
from bs4 import BeautifulSoup


def parse_iso8601_duration_to_seconds(duration):
  """
  Parses an ISO 8601 duration string into whole seconds.

  Days, hours, minutes, and seconds are supported. The 'P' and 'T' markers are
  optional and ignored, so both 'PT3M45S' and the marker-less 'P00H03M45S'
  form are accepted. Because 'T' is ignored, 'M' is always read as minutes
  (never months). Components may be missing, have leading zeros, or have an
  empty value (treated as zero). Fractional seconds ('PT3M45.5S' or with a
  comma as the decimal sign) are truncated to whole seconds.

  Args:
  - duration (str): The ISO 8601 duration string.

  Returns:
  - int: The duration in seconds.

  Raises:
  - ValueError: If a component value is not a number.
  """
  # Remove the 'P' and 'T' markers; only the unit designators matter below
  remaining = duration.replace('P', '').replace('T', '')

  total_seconds = 0

  # Consume the integer components left to right, largest unit first
  for designator, unit_seconds in (('D', 86400), ('H', 3600), ('M', 60)):
    value, found, rest = remaining.partition(designator)
    if not found:
      continue
    if value:
      total_seconds += int(value) * unit_seconds
    remaining = rest

  # Whatever precedes 'S' (or the whole remainder if 'S' is absent) is seconds
  seconds_text = remaining.partition('S')[0].replace(',', '.')
  if seconds_text:
    whole, point, fraction = seconds_text.partition('.')
    # Validate the fraction explicitly so garbage such as '45.x' is rejected
    if point and not fraction.isdigit():
      raise ValueError(f'Invalid seconds value in duration: {duration!r}')
    if whole:
      total_seconds += int(whole)

  return total_seconds


def parse_album(album_url, timeout=30):
  """
  Downloads an album page and extracts album and track metadata from the
  JSON-LD ('application/ld+json') script embedded in it.

  Args:
  - album_url (str): Absolute URL of the album page.
  - timeout (float): Seconds to wait for the server before giving up.

  Returns:
  - dict: Keys 'name', 'url', 'date_published', 'duration' (total seconds)
    and 'tracks' (list of dicts with 'name', 'duration', 'url'). An empty
    dict is returned, after printing the reason, when the page cannot be
    downloaded or does not contain the expected data.
  """
  # Without a timeout a stalled connection would hang the whole run
  try:
    response = requests.get(album_url, timeout=timeout)
  except requests.RequestException as error:
    print(f'Failed to download album page: {album_url} ({error})')
    return {}

  if response.status_code != 200:
    print(f'Failed to download album page: {album_url}')
    return {}

  soup = BeautifulSoup(response.content, 'html.parser')

  script_tag = soup.find('script', type='application/ld+json')

  # script_tag.string is None when the tag is empty, which json.loads rejects
  if not script_tag or not script_tag.string:
    print("JSON data not found")
    return {}

  try:
    # Extract and parse the JSON data
    json_data = json.loads(script_tag.string)
  except json.JSONDecodeError as error:
    print(f'JSON data could not be parsed: {album_url} ({error})')
    return {}

  if not isinstance(json_data, dict):
    print("JSON data not found")
    return {}

  # The release describing the page itself shares the page's '@id'
  page_id = json_data.get('@id')
  matching_release = next(
    (
      release
      for release in json_data.get('albumRelease', [])
      if page_id is not None and release.get('@id') == page_id
    ),
    None,
  )

  if not matching_release:
    print("No matching albumRelease found.")
    return {}

  try:
    album = {
      'name': matching_release['name'],
      'url': json_data['mainEntityOfPage'],
      'date_published': json_data['datePublished'],
      'duration': 0,
      'tracks': []
    }

    for track in json_data['track']['itemListElement']:
      item = track['item']
      # A track without a duration contributes zero seconds
      duration_seconds = parse_iso8601_duration_to_seconds(item.get('duration') or '')
      album['duration'] += duration_seconds
      album['tracks'].append({
        'name': item['name'],
        'duration': duration_seconds,
        'url': item['mainEntityOfPage']
      })
  except KeyError as error:
    print(f'Album data is missing expected field {error}: {album_url}')
    return {}

  return album


BASE_URL = 'https://papadosio.bandcamp.com'


# Send a GET request to the URL; the timeout prevents hanging on a stalled connection
response = requests.get(f'{BASE_URL}/music', timeout=30)

# Nothing below can work without the page, so stop with a clear error
if response.status_code != 200:
  print('Failed to retrieve the HTML content. Status Code:', response.status_code)
  raise RuntimeError(f'Could not download {BASE_URL}/music (status {response.status_code})')

# Get the HTML content of the page
html_content = response.text

# The file where you want to save the HTML content
file_path = 'music_page.html'

# Write the HTML content to a file
with open(file_path, 'w', encoding='utf-8') as file:
  file.write(html_content)

print(f'HTML content saved to {file_path}')


soup = BeautifulSoup(html_content, 'html.parser')

# Find the 'ol' tag with id 'music-grid'
music_grid = soup.find('ol', id='music-grid')
if music_grid is None:
  raise RuntimeError("Could not find the 'music-grid' list in the music page")

music_data = []

# Iterate through each 'li' in the 'ol' tag
for count, li in enumerate(music_grid.find_all('li'), start=1):
  print(f'Processing album #{count}')
  # Find the 'a' tag and extract the 'href' attribute
  a_tag = li.find('a')
  if a_tag is None or not a_tag.get('href'):
    print(f'No link found for album #{count}, skipping')
    continue
  album_link = a_tag['href']
  # Convert any site-relative link to a full URL; requests rejects URLs without a scheme
  if album_link.startswith('/') and not album_link.startswith('//'):
    album_link = BASE_URL + album_link
  album_json = parse_album(album_link)
  # parse_album returns an empty dict on failure; keep those out of the output
  if album_json:
    music_data.append(album_json)

# print(json.dumps(music_data, indent=4))
with open('papadosio.json', 'w', encoding='utf-8') as json_file:
  json.dump(music_data, json_file, ensure_ascii=False, indent=4)

print('Done!')

In [2]:
output_file_path = 'sorted_unique_track_names.txt'

# Collect every track name once, stripped and lowercased so duplicates that
# differ only by case or surrounding whitespace collapse into one entry.
track_names = set()
for album in music_data:
  # .get tolerates entries without 'tracks' (parse_album returns {} on failure)
  track_names.update(
    track['name'].strip().lower() for track in album.get('tracks', [])
  )

# Sort the track names alphabetically
sorted_track_names = sorted(track_names)

# Write the sorted, unique track names to a new file, one per line
with open(output_file_path, 'w', encoding='utf-8') as output_file:
  output_file.writelines(f'{track_name}\n' for track_name in sorted_track_names)

print(f'Sorted, unique track names have been written to {output_file_path}')


Sorted, unique track names have been written to sorted_unique_track_names.txt


In [None]:
# TODO: Create search mapping file (e.g. "2 am" -> "2am")