In [26]:
# Scrape note names and frequencies from the web page at WEBSITE_NAME

import requests
import re
from bs4 import BeautifulSoup, PageElement

# URL of the page that is downloaded and parsed for note data
WEBSITE_NAME = 'https://pages.mtu.edu/~suits/notefreqs.html'

def convert_note_name(element: PageElement):
    """Normalise the text of a scraped element into a note name.

    All whitespace is removed from ``element.text`` before matching.

    * A natural note -- a letter ``A``-``G`` followed by an octave digit
      ``0``-``8`` (e.g. ``C4``) -- is returned unchanged.
    * A label that *starts with* a sharp note (e.g. ``C#4``) is returned as
      ``<letter><octave>_#`` (e.g. ``C4_#``). The sharp pattern is anchored
      only at the start, so any text following the octave digit is ignored.

    Raises:
        ValueError: if the cleaned text matches neither form.
    """
    label = re.sub(r'\s', '', element.text)

    # Natural note (eg. C4): already in the desired form
    if re.match(r'^[CDEFGAB][0-8]$', label) is not None:
        return label

    # Sharp note: prefix match only, trailing characters are not inspected
    sharp = re.match(r'^([CDEFGAB])#([0-8])', label)
    if sharp is None:
        raise ValueError('invalid note name')

    letter, octave = sharp.groups()
    return '{}{}_#'.format(letter, octave)

def convert_note_frequency(element: PageElement):
    """Parse the text of a scraped element as a float.

    ``float`` ignores surrounding whitespace; text that is not a valid
    number raises ``ValueError``.
    """
    raw_value = element.text
    return float(raw_value)

def scrape_page(page: BeautifulSoup):
    """Extract note names and frequencies from a parsed page.

    The first table matched by the selector ``center > center > table`` is
    used. For every ``tr`` in it, the row's children (ignoring bare newline
    strings) are read positionally: the first child is converted with
    ``convert_note_name`` and the second with ``convert_note_frequency``.

    Args:
        page: The parsed document to scrape.

    Returns:
        A list of dicts, one per table row, each with the keys ``'name'``
        and ``'frequency'``.

    Raises:
        ValueError: if the table cannot be found, or if a row's name or
            frequency cannot be converted.
    """
    # Use the `page` argument rather than a notebook-level global, so the
    # function works on whatever document the caller passes in.
    table = page.select_one('center > center > table ')
    if table is None:
        raise ValueError('note table not found in page')

    notes = []
    for row in table.select('tr'):
        # Drop the newline text nodes that sit between the cell tags
        cells = [child for child in row.contents if child != '\n']
        notes.append({
            'name': convert_note_name(cells[0]),
            'frequency': convert_note_frequency(cells[1])
        })

    return notes

# Download the page. The timeout (seconds) stops the cell from hanging
# indefinitely if the server never responds.
response = requests.get(WEBSITE_NAME, timeout=30)
# Fail immediately on an HTTP error instead of trying to parse an error page
response.raise_for_status()

soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')
notes = scrape_page(soup)

print('Successfully scraped {} notes from {}'.format(len(notes), WEBSITE_NAME))

Successfully scraped 108 notes


In [27]:
# Dump scraped data to .csv for backup

OUT_FILENAME = 'notes.csv'

with open(OUT_FILENAME, 'w') as outfile:
    # Header row first, then one "name,frequency" line per scraped note
    outfile.write('Name,Frequency\n')
    outfile.writelines(
        '{},{}\n'.format(entry['name'], entry['frequency']) for entry in notes
    )

print('Successfully dumped {} notes to \"{}\"'.format(len(notes), OUT_FILENAME))