In [8]:
import requests
from bs4 import BeautifulSoup
import os 

In [9]:
# Fetch the top-level music index page.
url = "https://www.vgmusic.com/music/"
# A timeout keeps this cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
print(f"Status code: {response.status_code}")

Status code: 200


In [10]:
# Parse the downloaded page into a searchable tree using the "html.parser" backend.
soup = BeautifulSoup(response.content, 'html.parser')

In [11]:
# Peek at the start of the undecoded response body. `response.content` is
# bytes, so the slice is the first 2000 bytes and prints as a b'...' literal.
html_preview = response.content[:2000]
print("Raw HTML:")
print(html_preview)
print("\n-------------------\n")

Raw HTML:
b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">\r\n<html>\r\n\t<head>\r\n\t\t<title>VGMusic - 31806 Game Music MIDI files</title>\r\n\t\t<meta name="description" content="Video Game MIDI music, since 1996!">\r\n\t\t<meta name="keywords" content="MIDI,MID,videogame,game,video game,music,archive">\r\n\t\t<meta name="robots" content="index,follow">\r\n\t\t<meta http-equiv="content-type" content="text/html;charset=utf-8">\r\n\t\t<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">\r\n\t\t<link rel="stylesheet" href="/include/style-static.css">\r\n\t</head>\r\n\t<body>\r\n\t\t<div class="center">\r\n\t\t\t<div class="content vboxed hboxed">\r\n\t\t\t\t\t <div class="logo">\r\n\t\t\t\t\t\t<a href="/"><img width="493" height="299" src="/images/logos/vglogo4.jpg" alt="Videogame Music Archive Logo made by Alexander Farris"></a><br>\r\n\t\t\t\t\t\tMario and Link &copy; Nintendo. Sonic the Hedgehog and Miles "Tails" Prower &copy

In [12]:
# Collect every anchor (<a>) element found in the parsed page, then dump the list.
links = soup.find_all(name='a')
print(links)

[<a href="/"><img alt="Videogame Music Archive Logo made by Alexander Farris" height="299" src="/images/logos/vglogo4.jpg" width="493"/></a>, <a href="http://www.ocremix.org/">
<img alt="OCRemix" height="43" src="/images/sites/ocr4_logo_link01.jpg" width="140"/></a>, <a href="http://www.vgmdb.net/">
<img alt="VGMdb" height="50" src="/images/sites/button_vgmdb_140x50.gif" width="140"/></a>, <a href="https://www.vgmusic.com/">Home</a>, <a href="/updates/">Archive Updates</a>, <a href="/faq/">Freq. Asked Questions</a>, <a href="/new-files/">Newly Submitted Files</a>, <a href="/new-files/upload/">Upload Files</a>, <a href="/information/links.php">Related Links</a>, <a href="/information/addlink.php">Linking to Our Site</a>, <a href="/information/milestones.php">VGMusic Milestones</a>, <a href="console/nintendo">
									Nintendo
								 </a>, <a href="/music/console/nintendo/nes/">NES</a>, <a href="/music/console/nintendo/gameboy/">Game Boy</a>, <a href="/music/console/nintendo/snes/">SN

In [13]:
# Show the visible text and the target of the first five anchors.
print("Parsed links:")
for anchor in links[:5]:
    label, target = anchor.text, anchor.get('href')
    print(f"Link text: {label}")
    print(f"Link URL: {target}")
    print()

Parsed links:
Link text: 
Link URL: /

Link text: 

Link URL: http://www.ocremix.org/

Link text: 

Link URL: http://www.vgmdb.net/

Link text: Home
Link URL: https://www.vgmusic.com/

Link text: Archive Updates
Link URL: /updates/



In [14]:
# Print only the hrefs that contain a "music/" path segment, skipping the
# bare "music/" link itself. Anchors without an href count as "".
all_hrefs = [anchor.get('href', '') for anchor in links]
for href in all_hrefs:
    if href != 'music/' and 'music/' in href:
        print(href)

/music/console/nintendo/nes/
/music/console/nintendo/gameboy/
/music/console/nintendo/snes/
/music/console/nintendo/n64/
/music/console/nintendo/virtualboy/
/music/console/nintendo/gba/
/music/console/nintendo/gamecube/
/music/console/nintendo/ds/
/music/console/nintendo/3ds/
/music/console/nintendo/wii/
/music/console/nintendo/wiiu/
/music/console/nintendo/switch/
/music/console/sega/
/music/console/sega/master/
/music/console/sega/gamegear/
/music/console/sega/genesis/
/music/console/sega/segacd/
/music/console/sega/32x/
/music/console/sega/saturn/
/music/console/sega/dreamcast/
/music/console/sony/ps1/
/music/console/sony/ps2/
/music/console/sony/ps3/
/music/console/sony/ps4/
/music/console/sony/psp/
/music/console/microsoft/xbox/
/music/console/microsoft/xbox360/
/music/console/nec/tg16/
/music/console/nec/tduo/
/music/console/nec/sgx/
/music/console/nec/pcfx/
/music/console/snk/neogeo/
/music/console/snk/neogeopocket/
/music/console/atari/2600/
/music/console/atari/7800/
/music/co

Since there are quite a lot of consoles, let's write a function that finds the MIDI files listed on a specific console page.

In [15]:
from urllib.parse import urljoin
def get_midi_links(console_url, timeout=30):
    """Get all the MIDI links from a console page.

    Args:
        console_url: URL of the page to scan for links ending in ``.mid``.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list of absolute MIDI URLs in page order. The list is empty when
        the page has no MIDI links or when the server answers with an HTTP
        error status (a message is printed in that case).

    Raises:
        requests.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    # Make the request
    response = requests.get(console_url, timeout=timeout)
    if not response.ok:
        # An error page is not a listing; report it instead of parsing it
        # and silently returning "no MIDI files".
        print(f"Failed to fetch {console_url}: Status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all links
    midi_links = []
    for link in soup.find_all('a'):
        href = link.get('href', '')
        # Check if the link ends with .mid (case-insensitive)
        if href.lower().endswith('.mid'):
            # Convert relative URL to absolute URL
            full_url = urljoin(console_url, href)
            print(f"Found MIDI: {link.text}")
            print(f"URL: {full_url}\n")
            midi_links.append(full_url)

    return midi_links

In [16]:
# Try the scraper on a single section first: the NES page.
console = "https://www.vgmusic.com/music/console/nintendo/nes/"
midi_links = get_midi_links(console)
midi_total = len(midi_links)
print(f"Total MIDIs found: {midi_total}")

Found MIDI: Kick Off
URL: https://www.vgmusic.com/music/console/nintendo/nes/10-Yard_Fight-Kick_Off.mid

Found MIDI: "Raid and Pacific Attack" Title Screen Song
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943.mid

Found MIDI: Assault on Surface Forces B
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943sab.mid

Found MIDI: Level 1
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943-lev1.mid

Found MIDI: Level 1 - Pre-Boss
URL: https://www.vgmusic.com/music/console/nintendo/nes/43pbos1.mid

Found MIDI: Level 1 - Pre-Boss (Orchestrated)
URL: https://www.vgmusic.com/music/console/nintendo/nes/43pbos12.mid

Found MIDI: Level 3
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943-lev3.mid

Found MIDI: Level 3 (Wingroove)
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943-Lev3Win.mid

Found MIDI: Mission Failure
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943lost.mid

Found MIDI: Mission Success
URL: https://www.vgmusic.com/music

Now get all the console URLs so they can be passed to the `get_midi_links` function.

In [17]:
def get_console_url(base_url="https://www.vgmusic.com/music/", timeout=30):
    """Get links to all console sections listed on the index page.

    Args:
        base_url: URL of the index page; relative hrefs are resolved
            against it.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of absolute URLs, in page order, for every anchor whose href
        contains ``music/`` (the bare ``music/`` link itself is excluded).

    Raises:
        requests.HTTPError: If the index page answers with an HTTP error
            status. Nothing downstream can work without the index, so this
            fails loudly instead of returning an empty list.
        requests.RequestException: If the request itself fails.
    """
    # Make a request
    response = requests.get(base_url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    console_links = []
    for link in soup.find_all('a'):
        href = link.get('href', '')
        if 'music/' in href and href != 'music/':
            console_links.append(urljoin(base_url, href))
    return console_links

# Fetch the section URLs once and report how many were found.
consoles = get_console_url()
url_count = len(consoles)
print(f"There are {url_count} URLs ")


There are 57 URLs 


Now scan all the console URLs to collect the links to all the MIDI files.

In [18]:
import time 
def scan_all_consoles():
    """Collect the MIDI links of every console section.

    Returns:
        A dict mapping a console name (the last path segment of the
        section URL) to the list of MIDI URLs found on that section's page.
    """
    root_url = "https://www.vgmusic.com/music/"
    results = {}

    # Discover the sections first, then visit them one by one.
    for section_url in get_console_url(root_url):
        # Section URLs end with "/", so the name is the second-to-last piece.
        section_name = section_url.split('/')[-2]
        # Pause before each request to be nice to the server.
        time.sleep(1)
        found = get_midi_links(section_url)
        results[section_name] = found
        print(f"Found {len(found)} MIDI files from {section_name} url")

    return results

In [19]:
# Crawl every console page and collect its MIDI URLs, keyed by console name.
# scan_all_consoles() sleeps one second before each page request, so this cell is slow.
midi_files = scan_all_consoles()

Found MIDI: Kick Off
URL: https://www.vgmusic.com/music/console/nintendo/nes/10-Yard_Fight-Kick_Off.mid

Found MIDI: "Raid and Pacific Attack" Title Screen Song
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943.mid

Found MIDI: Assault on Surface Forces B
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943sab.mid

Found MIDI: Level 1
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943-lev1.mid

Found MIDI: Level 1 - Pre-Boss
URL: https://www.vgmusic.com/music/console/nintendo/nes/43pbos1.mid

Found MIDI: Level 1 - Pre-Boss (Orchestrated)
URL: https://www.vgmusic.com/music/console/nintendo/nes/43pbos12.mid

Found MIDI: Level 3
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943-lev3.mid

Found MIDI: Level 3 (Wingroove)
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943-Lev3Win.mid

Found MIDI: Mission Failure
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943lost.mid

Found MIDI: Mission Success
URL: https://www.vgmusic.com/music

Now move on to downloading the MIDI files.

In [20]:
from urllib.parse import unquote
import re
def download_midi(url, console_dir, timeout=30):
    """Download a single MIDI file into ``console_dir``.

    The file name is the percent-decoded last path segment of ``url`` with
    characters that are invalid in file names replaced by ``_``. A file that
    already exists is left untouched.

    Args:
        url: Absolute URL of the MIDI file.
        console_dir: Existing directory the file is saved into.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        True if the file was downloaded or already existed, False on any
        failure (bad URL, HTTP error status, network or file-system error).
        Errors are printed, never raised.
    """
    try:
        # Get filename from URL and clean it
        filename = unquote(url.split('/')[-1])
        # Remove invalid characters from filename
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename)

        # A URL ending in "/" yields an empty name; joining that onto the
        # directory gives the directory itself, which would then be reported
        # as an "existing file".
        if not filename:
            print(f"Error downloading {url}: URL does not contain a file name")
            return False

        # Create full file path
        file_path = os.path.join(console_dir, filename)

        # Skip if file already exists
        if os.path.exists(file_path):
            print(f"Skipping existing file: {filename}")
            return True

        # Download the file
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to download {filename}: Status code {response.status_code}")
            return False

        # Write to a temporary name and move it into place afterwards, so an
        # interrupted write can never leave a truncated file that later runs
        # would skip as already downloaded.
        part_path = file_path + ".part"
        try:
            with open(part_path, 'wb') as f:
                f.write(response.content)
            os.replace(part_path, file_path)
        finally:
            # Only still present if the write or the move failed.
            if os.path.exists(part_path):
                os.remove(part_path)

        print(f"Downloaded: {filename}")
        return True

    except Exception as e:
        print(f"Error downloading {url}: {str(e)}")
        return False

In [21]:
def download_console_midis(console_url, base_dir="vgmusic_downloads"):
    """Download every MIDI file linked from one console page.

    Files are stored in ``<base_dir>/<console name>``, where the console
    name is the last path segment of ``console_url``. Note that the default
    ``base_dir`` here is "vgmusic_downloads", while download_all_consoles()
    defaults to "vgmusic_dataset".

    Args:
        console_url: URL of the console page (expected to end with "/").
        base_dir: Root directory for downloads; created if missing.

    Returns:
        A ``(successful, failed)`` tuple: the number of files that were
        downloaded or already present, and the list of URLs that failed.
    """
    # The URL ends with "/", so the name is the second-to-last piece.
    section_name = console_url.split('/')[-2]

    # Make sure the target directory exists.
    section_dir = os.path.join(base_dir, section_name)
    os.makedirs(section_dir, exist_ok=True)

    print(f"\nDownloading MIDIs for {section_name}...")

    midi_urls = get_midi_links(console_url)
    print(f"Found {len(midi_urls)} MIDI files")

    # Fetch the files one at a time, pausing before each request.
    outcomes = []
    for midi_url in midi_urls:
        time.sleep(1)
        outcomes.append((midi_url, download_midi(midi_url, section_dir)))

    failed = [midi_url for midi_url, ok in outcomes if not ok]
    successful = len(outcomes) - len(failed)

    # Per-console summary.
    print(f"\nConsole {section_name} download complete:")
    print(f"Successfully downloaded: {successful}")
    print(f"Failed downloads: {len(failed)}")

    return successful, failed

In [None]:
def download_all_consoles(base_dir="vgmusic_dataset"):
    """Download the MIDI files of every console section.

    Each console gets its own sub-directory of ``base_dir``. When at least
    one download fails, the failed URLs are written, one per line, to
    ``<base_dir>/failed_downloads.txt``.

    Args:
        base_dir: Root directory for the downloads; created if missing.
    """
    os.makedirs(base_dir, exist_ok=True)

    # Discover the sections to visit.
    sections = get_console_url("https://www.vgmusic.com/music/")
    print(f"Found {len(sections)} console sections")

    # Running totals across all sections.
    downloaded_total = 0
    failures = []
    for section_url in sections:
        ok_count, bad_urls = download_console_midis(section_url, base_dir)
        downloaded_total += ok_count
        failures += bad_urls

    # Overall summary.
    print("\nDownload Complete!")
    print(f"Total files downloaded: {downloaded_total}")
    print(f"Total failed downloads: {len(failures)}")

    if not failures:
        return

    # Keep a record of what failed.
    log_path = os.path.join(base_dir, "failed_downloads.txt")
    with open(log_path, 'w') as log_file:
        log_file.writelines(f"{bad_url}\n" for bad_url in failures)
    print(f"Failed downloads have been logged to: {log_path}")


In [23]:
# The complete script with all previous functions
def main():
    """Interactive entry point: ask what to download, then do it.

    Reads the answers from standard input. "1" lists the consoles and
    downloads the one picked by number, "2" downloads every console, and
    anything else prints "Invalid choice!" and returns.
    """
    print("VGMusic MIDI Downloader")
    print("----------------------")

    # Present the menu.
    print("\nChoose download option:")
    print("1. Download from a single console")
    print("2. Download from all consoles")

    choice = input("\nEnter your choice (1 or 2): ").strip()

    if choice == "2":
        download_all_consoles()
        return

    if choice != "1":
        print("Invalid choice!")
        return

    # Single-console mode: list the sections with 1-based numbers.
    console_links = get_console_url("https://www.vgmusic.com/music/")

    print("\nAvailable consoles:")
    for number, section_url in enumerate(console_links, start=1):
        print(f"{number}. {section_url.split('/')[-2]}")

    # Keep asking until a number inside the list is entered.
    idx = None
    while idx is None:
        try:
            candidate = int(input("\nEnter console number: ")) - 1
        except ValueError:
            print("Please enter a valid number")
            continue
        if 0 <= candidate < len(console_links):
            idx = candidate
        else:
            print("Invalid number, please try again")

    download_console_midis(console_links[idx])

In [None]:
# Start the interactive downloader when this cell (or an exported script) is
# executed directly; main() will prompt for input.
if __name__ == "__main__":
    main()

VGMusic MIDI Downloader
----------------------

Choose download option:
1. Download from a single console
2. Download from all consoles

Available consoles:
1. nes
2. gameboy
3. snes
4. n64
5. virtualboy
6. gba
7. gamecube
8. ds
9. 3ds
10. wii
11. wiiu
12. switch
13. sega
14. master
15. gamegear
16. genesis
17. segacd
18. 32x
19. saturn
20. dreamcast
21. ps1
22. ps2
23. ps3
24. ps4
25. psp
26. xbox
27. xbox360
28. tg16
29. tduo
30. sgx
31. pcfx
32. neogeo
33. neogeopocket
34. 2600
35. 7800
36. lynx
37. intellivision
38. colecovision
39. odyssey2
40. 3do
41. cd-i
42. windows
43. commodore
44. msx
45. atari
46. amiga
47. amstradcpc
48. appleii
49. macintosh
50. pc-88
51. pc-98
52. tutor
53. x68000
54. spectrum
55. arcade
56. medley
57. piano

Downloading MIDIs for nes...
Found MIDI: Kick Off
URL: https://www.vgmusic.com/music/console/nintendo/nes/10-Yard_Fight-Kick_Off.mid

Found MIDI: "Raid and Pacific Attack" Title Screen Song
URL: https://www.vgmusic.com/music/console/nintendo/nes/194