In [25]:
import os
import re
import csv
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pathlib import Path

In [30]:
BASE = "https://www.midiworld.com"
NUM_PAGES = 2
DOWNLOAD_DIR = "midi_downloads"
CSV_FILE = "movie_themes.csv"
OMDB_KEY = "YOUR_KEY_HERE"        # ← free at omdbapi.com
os.makedirs(DOWNLOAD_DIR, exist_ok=True)                     # seconds between downloads (be nice)


Generate the pages to scrape data from

In [31]:
PAGES = [i for i in range(1, NUM_PAGES+1)]
print(PAGES)

[1, 2]


Request the year of release for a title, using the OMDb API (later)

In [3]:
year_cache = {}
def get_year(title):
    if title in year_cache: return year_cache[title]
    clean = re.sub(r"\s*\(.*?\)$", "", title.split("-")[0]).strip()
    url = "http://www.omdbapi.com/"
    try:
        r = requests.get(url, params={"apikey": OMDB_KEY, "t": clean}, timeout=5).json()
        y = r.get("Year", "Unknown").split("–")[0]
    except:
        y = "Unknown"
    year_cache[title] = y
    return y

Download all .midi files for each movie inside each page

In [None]:
rows = []

for page in PAGES:
    url = f"https://www.midiworld.com/search/{page}/?q=movie%20themes"
    print(f"\nScraping page {page}")
    soup = BeautifulSoup(requests.get(url).text, "html.parser")

    for li in soup.find_all("li"):
        text = li.get_text()
        if "(Movie Themes)" not in text:
            continue

        # Extract title
        match = re.search(r"^([\s\S]*?)\s*\(Movie Themes\)", str(text))
    
        if not match:
            continue

        title = match.group(1).strip()
        title = re.sub(r"\s*-?\s*$", "", title)
        print(f"Title: {title}")

        # Get download links inside <li> tags
        links = li.find_all("a", href=True, string="download")
        midis = []
        print(links)

        for a in links:
            dl_url = urljoin("https://www.midiworld.com", a["href"])
            # The real .mid is in redirect → follow it
            try:
                real_url = requests.head(dl_url, allow_redirects=True).url
                # if not real_url.endswith(".mid"):
                #     continue
            except:
                continue

            clean_title = title.strip()
            clean_title = re.sub(r'\s+', '_', clean_title)   # spaces → single _
            clean_title = re.sub(r'[^\w\-]', '', clean_title)
            filename   = f"{clean_title}.midi"

            # fname = f"{title} - {os.path.basename(real_url)}"
            # fpath = os.path.join(DOWNLOAD_DIR, re.sub(r'[<>:"/\\|?*]', '_', fname))
            # fpath = os.path.join(DOWNLOAD_DIR, filename)
            fpath = Path(DOWNLOAD_DIR) / filename

            if not os.path.exists(fpath):
                print(f"  ↓ {filename}")
                r = requests.get(real_url, allow_redirects=True)
                fpath.write_bytes(r.content)
                time.sleep(0.4)

                
                print("clean: ", clean_title)

            else:
                print(f"  Already have {filename}")

            midis.append(filename)

        # year = get_year(title)
        year = "-"
        rows.append([title, year, "; ".join(midis) if midis else ""])


Scraping page 1...
→ 1492 Conquest of Paradise
[<a href="https://www.midiworld.com/download/3777" target="_blank">download</a>]
  ↓ 1492_Conquest_of_Paradise.midi
clean:  1492_Conquest_of_Paradise
→ 1941
[<a href="https://www.midiworld.com/download/3778" target="_blank">download</a>]
  ↓ 1941.midi
clean:  1941
→ 2001 - Also Sprach Zarathustra Richard Strauss
[<a href="https://www.midiworld.com/download/3779" target="_blank">download</a>]
  ↓ 2001_-_Also_Sprach_Zarathustra_Richard_Strauss.midi
clean:  2001_-_Also_Sprach_Zarathustra_Richard_Strauss
→ 20th Century Fox
[<a href="https://www.midiworld.com/download/3780" target="_blank">download</a>]
  ↓ 20th_Century_Fox.midi
clean:  20th_Century_Fox
→ 54 - If You Could Read My Mind by Gordon Lightfoot
[<a href="https://www.midiworld.com/download/3781" target="_blank">download</a>]
  ↓ 54_-_If_You_Could_Read_My_Mind_by_Gordon_Lightfoot.midi
clean:  54_-_If_You_Could_Read_My_Mind_by_Gordon_Lightfoot
→ 633 Squadron
[<a href="https://www.midiw

In [27]:
print(rows)

[['1492 Conquest of Paradise', '-', '1492_Conquest_of_Paradise.midi'], ['1941', '-', '1941.midi'], ['2001 - Also Sprach Zarathustra Richard Strauss', '-', '2001_-_Also_Sprach_Zarathustra_Richard_Strauss.midi'], ['20th Century Fox', '-', '20th_Century_Fox.midi'], ['54 - If You Could Read My Mind by Gordon Lightfoot', '-', '54_-_If_You_Could_Read_My_Mind_by_Gordon_Lightfoot.midi'], ['633 Squadron', '-', '633_Squadron.midi'], ['The Accidental Tourist', '-', 'The_Accidental_Tourist.midi'], ['Ace Ventura Pet Detective', '-', 'Ace_Ventura_Pet_Detective.midi'], ['Addams Family', '-', 'Addams_Family.midi'], ['Addicted To Love', '-', 'Addicted_To_Love.midi'], ['Advance To The Rear', '-', 'Advance_To_The_Rear.midi'], ['Against All Odds', '-', 'Against_All_Odds.midi'], ['Air Force One', '-', 'Air_Force_One.midi'], ['Armageddon', '-', 'Armageddon.midi'], ['Austin Powers', '-', 'Austin_Powers.midi'], ['Back to the Future', '-', 'Back_to_the_Future.midi'], ['Batman', '-', 'Batman.midi'], ['Beetlejui