In [1]:
import requests
from bs4 import BeautifulSoup
import os 

In [2]:
# Fetch the VGMusic index page and report the HTTP status code of the reply.
url = "https://www.vgmusic.com/music/"
response = requests.get(url)
print("Status code: {}".format(response.status_code))

Status code: 200


In [3]:
# Parse the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.content, features='html.parser')

In [6]:
# response.content is a bytes object, so this slice is the first 2000 bytes
# of the page and is printed in its b'...' repr form.
raw_preview = response.content[:2000]
print("Raw HTML:")
print(raw_preview)
print("\n-------------------\n")

Raw HTML:
b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">\r\n<html>\r\n\t<head>\r\n\t\t<title>VGMusic - 31806 Game Music MIDI files</title>\r\n\t\t<meta name="description" content="Video Game MIDI music, since 1996!">\r\n\t\t<meta name="keywords" content="MIDI,MID,videogame,game,video game,music,archive">\r\n\t\t<meta name="robots" content="index,follow">\r\n\t\t<meta http-equiv="content-type" content="text/html;charset=utf-8">\r\n\t\t<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">\r\n\t\t<link rel="stylesheet" href="/include/style-static.css">\r\n\t</head>\r\n\t<body>\r\n\t\t<div class="center">\r\n\t\t\t<div class="content vboxed hboxed">\r\n\t\t\t\t\t <div class="logo">\r\n\t\t\t\t\t\t<a href="/"><img width="493" height="299" src="/images/logos/vglogo4.jpg" alt="Videogame Music Archive Logo made by Alexander Farris"></a><br>\r\n\t\t\t\t\t\tMario and Link &copy; Nintendo. Sonic the Hedgehog and Miles "Tails" Prower &copy

In [7]:
# Collect every anchor (<a>) tag from the parsed page; later cells reuse `links`.
links = soup.find_all("a")
print(links)

[<a href="/"><img alt="Videogame Music Archive Logo made by Alexander Farris" height="299" src="/images/logos/vglogo4.jpg" width="493"/></a>, <a href="http://www.ocremix.org/">
<img alt="OCRemix" height="43" src="/images/sites/ocr4_logo_link01.jpg" width="140"/></a>, <a href="http://www.vgmdb.net/">
<img alt="VGMdb" height="50" src="/images/sites/button_vgmdb_140x50.gif" width="140"/></a>, <a href="https://www.vgmusic.com/">Home</a>, <a href="/updates/">Archive Updates</a>, <a href="/faq/">Freq. Asked Questions</a>, <a href="/new-files/">Newly Submitted Files</a>, <a href="/new-files/upload/">Upload Files</a>, <a href="/information/links.php">Related Links</a>, <a href="/information/addlink.php">Linking to Our Site</a>, <a href="/information/milestones.php">VGMusic Milestones</a>, <a href="console/nintendo">
									Nintendo
								 </a>, <a href="/music/console/nintendo/nes/">NES</a>, <a href="/music/console/nintendo/gameboy/">Game Boy</a>, <a href="/music/console/nintendo/snes/">SN

In [8]:
# Preview the first five anchors: visible text, then the raw href value,
# followed by a blank separator line.
print("Parsed links:")
for link in links[:5]:
    print(f"Link text: {link.text}\nLink URL: {link.get('href')}\n")

Parsed links:
Link text: 
Link URL: /

Link text: 

Link URL: http://www.ocremix.org/

Link text: 

Link URL: http://www.vgmdb.net/

Link text: Home
Link URL: https://www.vgmusic.com/

Link text: Archive Updates
Link URL: /updates/



In [7]:
# Print only the hrefs that point somewhere below the music/ directory.
for link in links:
    href = link.get('href', '')
    if href == 'music/' or 'music/' not in href:
        continue
    print(href)

/music/console/nintendo/nes/
/music/console/nintendo/gameboy/
/music/console/nintendo/snes/
/music/console/nintendo/n64/
/music/console/nintendo/virtualboy/
/music/console/nintendo/gba/
/music/console/nintendo/gamecube/
/music/console/nintendo/ds/
/music/console/nintendo/3ds/
/music/console/nintendo/wii/
/music/console/nintendo/wiiu/
/music/console/nintendo/switch/
/music/console/sega/
/music/console/sega/master/
/music/console/sega/gamegear/
/music/console/sega/genesis/
/music/console/sega/segacd/
/music/console/sega/32x/
/music/console/sega/saturn/
/music/console/sega/dreamcast/
/music/console/sony/ps1/
/music/console/sony/ps2/
/music/console/sony/ps3/
/music/console/sony/ps4/
/music/console/sony/psp/
/music/console/microsoft/xbox/
/music/console/microsoft/xbox360/
/music/console/nec/tg16/
/music/console/nec/tduo/
/music/console/nec/sgx/
/music/console/nec/pcfx/
/music/console/snk/neogeo/
/music/console/snk/neogeopocket/
/music/console/atari/2600/
/music/console/atari/7800/
/music/co

Since there are quite a lot of consoles, let's write a function that finds every MIDI file linked from a specific console page.

In [None]:
from urllib.parse import urljoin
def get_midi_links(console_url):
    """Return absolute URLs for every ``.mid`` link on a console page.

    The page at ``console_url`` is downloaded and parsed, and each anchor
    whose href ends in ``.mid`` (case-insensitive) is reported on stdout and
    added to the result.

    Args:
        console_url: URL of the page to scan; relative hrefs are resolved
            against it.

    Returns:
        list: Absolute MIDI URLs in the order they appear on the page.
    """
    page = requests.get(console_url)
    parsed = BeautifulSoup(page.content, 'html.parser')

    collected = []
    for anchor in parsed.find_all('a'):
        target = anchor.get('href', '')
        # Skip anything that is not a MIDI file link.
        if not target.lower().endswith('.mid'):
            continue
        # Resolve the (possibly relative) href against the page URL.
        absolute = urljoin(console_url, target)
        print(f"Found MIDI: {anchor.text}")
        print(f"URL: {absolute}\n")
        collected.append(absolute)

    return collected

In [12]:
# Try the scraper on the NES page and report how many MIDI links it found.
console = "https://www.vgmusic.com/music/console/nintendo/nes/"
midi_links = get_midi_links(console)
print("Total MIDIs found: {}".format(len(midi_links)))

Found MIDI: Kick Off
URL: https://www.vgmusic.com/music/console/nintendo/nes/10-Yard_Fight-Kick_Off.mid

Found MIDI: "Raid and Pacific Attack" Title Screen Song
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943.mid

Found MIDI: Assault on Surface Forces B
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943sab.mid

Found MIDI: Level 1
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943-lev1.mid

Found MIDI: Level 1 - Pre-Boss
URL: https://www.vgmusic.com/music/console/nintendo/nes/43pbos1.mid

Found MIDI: Level 1 - Pre-Boss (Orchestrated)
URL: https://www.vgmusic.com/music/console/nintendo/nes/43pbos12.mid

Found MIDI: Level 3
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943-lev3.mid

Found MIDI: Level 3 (Wingroove)
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943-Lev3Win.mid

Found MIDI: Mission Failure
URL: https://www.vgmusic.com/music/console/nintendo/nes/1943lost.mid

Found MIDI: Mission Success
URL: https://www.vgmusic.com/music

Now collect all the console URLs so that each one can be passed to the `get_midi_links` function.

In [16]:
def get_console_url(base_url="https://www.vgmusic.com/music/"):
    """Collect the absolute URLs of the console pages linked from the index.

    The page at ``base_url`` is downloaded and parsed. Every anchor whose
    href contains ``music/`` (other than a bare ``music/`` link) is treated
    as a console page; its name is printed and its absolute URL is recorded.

    Args:
        base_url: Index page to scan; relative hrefs are resolved against it.

    Returns:
        list[str]: Absolute console-page URLs in page order. Each entry is a
        plain string, so it can be passed directly to ``get_midi_links``.
    """
    # Make a request
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, "html.parser")

    console_links = []
    for link in soup.find_all('a'):
        href = link.get('href', '')
        if 'music/' in href and href != 'music/':
            full_url = urljoin(base_url, href)
            # hrefs end with a slash (e.g. /music/console/nintendo/nes/), so
            # the console name is the second-to-last path segment of the href.
            console_name = href.split('/')[-2]
            print(f"Found console: {console_name}")
            # Store the resolved URL string, not the BeautifulSoup Tag, so
            # the result can be fed straight into an HTTP request.
            console_links.append(full_url)
    return console_links

# Gather the console pages from the default index and report how many were found.
consoles = get_console_url()
print("There are {} consoles ".format(len(consoles)))

Found console: nes
Found console: gameboy
Found console: snes
Found console: n64
Found console: virtualboy
Found console: gba
Found console: gamecube
Found console: ds
Found console: 3ds
Found console: wii
Found console: wiiu
Found console: switch
Found console: sega
Found console: master
Found console: gamegear
Found console: genesis
Found console: segacd
Found console: 32x
Found console: saturn
Found console: dreamcast
Found console: ps1
Found console: ps2
Found console: ps3
Found console: ps4
Found console: psp
Found console: xbox
Found console: xbox360
Found console: tg16
Found console: tduo
Found console: sgx
Found console: pcfx
Found console: neogeo
Found console: neogeopocket
Found console: 2600
Found console: 7800
Found console: lynx
Found console: intellivision
Found console: colecovision
Found console: odyssey2
Found console: 3do
Found console: cd-i
Found console: windows
Found console: commodore
Found console: msx
Found console: atari
Found console: amiga
Found console: amst