# VGMusic Downloader

This [Python](https://www.python.org/)/[Jupyter](https://jupyter.org/) notebook scrapes all songs from the amazing [VGMusic](https://www.vgmusic.com/) website.

## Initialization

Make sure you install the dependencies from the [requirements.txt](requirements.txt) file:

* [beautifulsoup4](https://pypi.org/project/beautifulsoup4/)
* [html5lib](https://pypi.org/project/html5lib/)
* ~~[python-slugify](https://pypi.org/project/python-slugify/)~~
* [requests](https://pypi.org/project/requests/)
* [tqdm](https://pypi.org/project/tqdm/)
* [unidecode](https://pypi.org/project/unidecode/)

In [None]:
import re
import requests
import unicodedata
import unidecode
from bs4 import BeautifulSoup, Tag
from collections import namedtuple
from pathlib import Path
from tqdm.autonotebook import tqdm
from typing import Generator
from urllib.parse import urljoin

In [None]:
# A single module-wide session is shared by every request so that the same
# TCP connection gets reused (keep-alive) instead of being reopened each time.
# https://docs.python-requests.org/en/latest/user/advanced/#keep-alive
SESSION = requests.Session()
# Identify this tool to the server on every request made through SESSION.
SESSION.headers["User-Agent"] = "VGMusic Downloader https://github.com/denilsonsa/vgmusic-downloader"

def GET(url, timeout=60):
    """Fetch *url* through the shared SESSION and return the response.

    Args:
        url: Address to request.
        timeout: Passed straight to ``SESSION.get``. Without a timeout,
            requests waits forever on a stalled server, which would freeze
            a long unattended run. Pass ``None`` to restore that unbounded
            wait, or a ``(connect, read)`` tuple for finer control.

    Returns:
        The response object returned by ``SESSION.get``.

    Raises:
        Whatever ``SESSION.get`` raises (including timeouts), plus the error
        raised by ``raise_for_status()`` when the server answers with an
        HTTP error status.
    """
    response = SESSION.get(url, timeout=timeout)
    # Turn 4xx/5xx answers into exceptions instead of returning an error page.
    response.raise_for_status()
    return response

In [None]:
# python-slugify is almost good, but not good enough.
# I'm now writing my own function.
#
# SLUG_DASH_REPLACEMENT = "SSLUGIFYDASHH"
# SLUG_REJECTED_PATTERN = re.compile(r"[^-+_a-zA-Z0-9() ]+")
# def slug_it(unsafe: str) -> str:
#     # Working around https://github.com/un33k/python-slugify/issues/107
#     without_dashes = unsafe.replace("-", SLUG_DASH_REPLACEMENT)
#     slug = slugify(without_dashes, lowercase=False, separator="_", regex_pattern=SLUG_REJECTED_PATTERN)
#     return slug.replace(SLUG_DASH_REPLACEMENT, "-").strip()
#
# slug_it(" Aa'bB\"cC/dD\\e(f)g[h]i{j}k?l!m:n;o<p>q*r_s-t+u=v@w#x$y%z&0 1.2,3")

In [None]:
# Filename sanitizer. Ideas borrowed from:
# * https://github.com/un33k/python-slugify/blob/master/slugify/slugify.py
# * https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename

# Any run of characters outside this whitelist is considered unsafe.
UNSAFE_CHARS = re.compile(r"[^-+_%. a-zA-Z0-9()\[\]]+")
# What each unsafe run is replaced with.
SAFE_SEPARATOR = "_"
# Two or more consecutive separators.
SAFE_SEPARATOR_DUPES = re.compile(f"({SAFE_SEPARATOR}){{2,}}")
# Two or more consecutive spaces.
SAFE_SPACE_DUPES = re.compile(" {2,}")


def safe_filename(text: str) -> str:
    """Return a version of *text* that is safe to use as a file or folder name.

    The steps, in order:

    1. NFKD-normalize the text and transliterate it with ``unidecode``.
    2. Spell out ``&`` as `` and ``.
    3. Replace every run of characters outside the whitelist with
       ``SAFE_SEPARATOR``.
    4. Collapse repeated separators and repeated spaces.
    5. Strip leading/trailing spaces and dots.
    """
    assert isinstance(text, str)
    transliterated = unidecode.unidecode(unicodedata.normalize('NFKD', text))
    assert isinstance(transliterated, str)

    spelled_out = transliterated.replace("&", " and ")
    separated = UNSAFE_CHARS.sub(SAFE_SEPARATOR, spelled_out)
    deduped = SAFE_SEPARATOR_DUPES.sub(SAFE_SEPARATOR, separated)
    single_spaced = SAFE_SPACE_DUPES.sub(" ", deduped)

    return single_spaced.strip(" .")

# Manual check of safe_filename(): the sample mixes quotes, slashes, brackets,
# punctuation, ampersands, doubled spaces and leading/trailing dots and spaces.
# As the last expression of a notebook cell, its result is shown as the output.
safe_filename(" Aa'bB\"cC/dD\\e(f)g[h]i{j}k?l!m:n;o<p>q*r_s-t+u=v@w#x$y%z&0  1.2,3 D&D C & C . ")

## Configuration

Please change these values to fit your needs.

In [None]:
# Page whose sidebar is parsed (by extract_sidebar_links) to discover every
# system page; relative links found there are resolved against this URL.
STARTING_URL = "https://www.vgmusic.com/music/"

In [None]:
# Root folder under which all downloaded files are stored.
DESTINATION = Path("/path/to/save/VGMusic")

# The root folder is never created automatically: stopping here means a
# mistyped path cannot silently start a download into the wrong place.
if not DESTINATION.is_dir():
    message = "The DESTINATION path does not exist! Aborting to prevent mistakes. `{}`"
    raise ValueError(message.format(DESTINATION))

## Extracting the list of pages

Parses the sidebar menu to find all the pages/categories.

In [None]:
# One sidebar entry: the menu heading it sits under (group), the link text
# (name), the absolute link target (url), and filename-safe versions of the
# group and name.
System = namedtuple("System", "group name url safegroup safename")

In [None]:
def extract_sidebar_links(url: str) -> Generator[System, None, None]:
    """Yield one System per link found in the sidebar menu of the page at *url*.

    Every ``<p>`` inside an element with ``id="sidebar"`` is inspected:

    * class ``menuhead`` or ``menularge``: its text becomes the current group
      heading for the links that follow;
    * class ``menu``: each ``<a>`` inside it is yielded as a System under the
      current heading, with its ``href`` resolved against *url*.

    Paragraphs with none of those classes are ignored.
    """
    page = BeautifulSoup(GET(url).text, "html5lib")
    current_group = ""
    for sidebar in page.find_all(id="sidebar"):
        for paragraph in sidebar.find_all("p"):
            css_classes = paragraph.get("class", [])

            if "menuhead" in css_classes or "menularge" in css_classes:
                # A heading: remember it for the entries below it.
                current_group = paragraph.get_text().strip()
                continue

            if "menu" not in css_classes:
                continue

            for link in paragraph.find_all("a"):
                label = link.get_text().strip()
                target = urljoin(url, link.get("href", ""))
                yield System(
                    group=current_group,
                    name=label,
                    url=target,
                    safegroup=safe_filename(current_group),
                    safename=safe_filename(label),
                )

In [None]:
# Preview cell: print every System found in the sidebar of STARTING_URL.
# This performs a real HTTP request.
for stuff in extract_sidebar_links(STARTING_URL):
    print(stuff)

## Extracting each individual song

In [None]:
# One downloadable song row: the game heading it is listed under, the link
# text (name), the absolute file URL, the listed size in bytes, the author
# text, and filename-safe versions of game, name and author.
Song = namedtuple("Song", "game name url size author safegame safename safeauthor")

In [None]:
def extract_songs_from_page(url: str) -> Generator[Song, None, None]:
    """Yield one Song per downloadable row of the listing page at *url*.

    Rows are read from ``body > table > tbody > tr``:

    * a row with class ``gameheader`` sets the current game title, taken from
      its ``<td colspan="5">`` cell;
    * any other row with exactly four element children (name, size, author,
      comments) that appears after a game header is treated as a song row.

    A song row is skipped when its name cell has no ``<a>`` or the link has no
    ``href``, since there is nothing to download. The size is the integer that
    precedes the text `` bytes`` in the size cell.

    Raises:
        ValueError: if the size cell of a row that does have a link does not
            start with an integer.
    """
    soup = BeautifulSoup(GET(url).text, "html5lib")
    game = ""
    for tr in soup.select("body > table > tbody > tr"):
        classes = tr.get("class", [])
        if "gameheader" in classes:
            game = tr.find("td", colspan="5").get_text().strip()
            continue

        # Only element children count; whitespace text nodes are ignored.
        cells = [td for td in tr.children if isinstance(td, Tag)]
        if not game or len(cells) != 4:
            continue

        td_name, td_size, td_author, _td_comments = cells

        # find() returns None when the cell holds no link; treat that the same
        # as a link without href instead of failing on None.
        a = td_name.find("a")
        href = a.get("href") if a is not None else None
        if not href:
            continue

        # Parse the size only for rows that will actually be yielded.
        size_text, _, _ = td_size.get_text().strip().partition(" bytes")
        size = int(size_text)

        name = a.get_text().strip()
        author = td_author.get_text().strip()
        yield Song(
            game,
            name,
            urljoin(url, href),
            size,
            author,
            safe_filename(game),
            safe_filename(name),
            safe_filename(author),
        )

In [None]:
# Preview cell: print every Song parsed from one sample listing page.
# This performs a real HTTP request.
for stuff in extract_songs_from_page("https://www.vgmusic.com/music/console/3do/3do/"):
    print(stuff)

## File handling

In [None]:
def path_builder(system: System, song: Song) -> tuple[Path, str]:
    """Return ``(directory, filename)`` describing where *song* is stored.

    The directory is ``DESTINATION/<safegroup>/<safename>/<safegame>`` and the
    filename is ``<safename> {<safeauthor>}.mid``, all built from the
    filename-safe fields of *system* and *song*. Nothing is created on disk.
    """
    directory = DESTINATION / system.safegroup / system.safename / song.safegame
    filename = "".join((song.safename, " {", song.safeauthor, "}.mid"))
    return directory, filename

## Main loop

In [None]:
def download_everything():
    """Download every song of every system listed at STARTING_URL.

    For each system found by ``extract_sidebar_links`` and each song found by
    ``extract_songs_from_page``, the target path is computed with
    ``path_builder`` and its directory is created if needed. A song is
    downloaded when the target file is missing or its size on disk differs
    from the size listed on the page; otherwise it is skipped.

    Three counters (total, downloaded, skipped) and two nested progress bars
    (systems, songs of the current system) are displayed with tqdm.

    Any exception raised while fetching or writing propagates and stops the
    run; files already written stay on disk, so a rerun skips them.
    """
    with tqdm(desc="Total songs", unit="MID") as progress_total:
        with tqdm(desc="Downloaded songs", unit="MID") as progress_downloaded:
            with tqdm(desc="Skipped songs", unit="MID") as progress_skipped:
                # The generators are materialized with list() so that tqdm
                # knows the total and can show a proper progress bar.
                for system in tqdm(list(extract_sidebar_links(STARTING_URL)), desc="Systems", unit="systems"):
                    for song in tqdm(list(extract_songs_from_page(system.url)), desc="MID Songs for " + system.name, unit="MID"):
                        d, f = path_builder(system, song)
                        d.mkdir(parents=True, exist_ok=True)
                        fullpath = d / f

                        must_download = (not fullpath.exists()) or (fullpath.stat().st_size != song.size)
                        if must_download:
                            # Fetch first, write second: opening the file for
                            # writing truncates it, so a failed request must
                            # not happen after the file has been opened.
                            content = GET(song.url).content
                            fullpath.write_bytes(content)
                            progress_downloaded.update(1)
                        else:
                            progress_skipped.update(1)
                        progress_total.update(1)

In [None]:
# Start the full download. Songs already on disk with the listed size are
# skipped, so this cell can be run again after an interruption.
download_everything()

## TODO

Ideas for improvements to this code:

* [ ] Consider using [inquirer](https://pypi.org/project/inquirer/) to let the user select options. Maybe not worth the trouble, as it doesn't work inside a Jupyter notebook.
* [ ] Consider moving this out of a notebook and into a stand-alone Python script.
* [ ] Consider downloading in parallel. Probably not worth the trouble. Just leave it running overnight.