In [None]:
# Local folder (relative path) that holds 'names-all.txt' and under which the
# download cells create their target folders.
datasets_folder = '../datasets'

In [None]:
from os.path import join

# Read one name per line. The `with` block closes the file deterministically
# instead of leaving the handle open until garbage collection.
with open(join(datasets_folder, 'names-all.txt'), 'rt') as names_file:
    # Skip blank lines: an empty name would later turn into the pattern '.*',
    # which matches link texts unrelated to any listed name.
    names = [line.strip() for line in names_file if line.strip()]
names

In [None]:
# Build one fnmatch-style pattern per name: the name itself, a literal dot,
# then anything.
patterns = ['{}.*'.format(name) for name in names]
patterns

In [None]:
import requests
from bs4 import BeautifulSoup

def get_anchor_elements(url, timeout=30):
    """Fetch `url` and return every `<a>` element found in its HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds handed to `requests.get`. Without a timeout the request can
        block forever if the server stops responding.

    Returns
    -------
    The result of `soup.find_all('a')` for the parsed page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page is
        not silently parsed as if it were the requested listing.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.find_all('a')

In [None]:
from fnmatch import fnmatch

def is_matching_anchor_element(anchor_element, patterns):
    """Return True if the anchor's visible text matches any of `patterns`.

    The text is taken from `anchor_element.text` with leading/trailing spaces
    and slashes removed, then compared against each pattern with `fnmatch`.
    An empty `patterns` sequence yields False.
    """
    label = anchor_element.text.strip(' /')
    return any(fnmatch(label, candidate) for candidate in patterns)

In [None]:
from os.path import join

def get_matching_link_urls(url, patterns):
    """Return the absolute URLs of the links on `url` whose text matches `patterns`.

    Parameters
    ----------
    url : str
        Page to scan for `<a>` elements.
    patterns : iterable of str
        fnmatch-style patterns tested against each link's text.

    Returns
    -------
    list of str
        One resolved URL per matching anchor, in page order. Anchors without
        an `href` attribute are skipped.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from urllib.parse import urljoin

    # urljoin drops the last path segment of a base that lacks a trailing
    # slash, so make sure the page URL is treated as a directory.
    page_url = url.rstrip('/') + '/'
    link_urls = []
    for anchor_element in get_anchor_elements(url):
        if not is_matching_anchor_element(anchor_element, patterns):
            continue
        href = anchor_element.attrs.get('href')
        if not href:
            # An <a> without a target cannot be followed.
            continue
        # URL-aware resolution: always uses '/', and handles absolute or
        # root-relative hrefs, unlike os.path.join.
        link_urls.append(urljoin(page_url, href))
    return link_urls

In [None]:
# Root URL under which the residential and commercial dataset pages are looked up.
base_url = 'https://openei.org/datasets/files/961/pub'

In [None]:
from os import makedirs
from os.path import basename
from urllib.request import urlretrieve

def download_to_folder(url, folder):
    """Retrieve `url` into `folder` and return the path of the saved file.

    The local file name is the final path component of `url`.
    """
    target_path = join(folder, basename(url))
    urlretrieve(url, target_path)
    return target_path

def make_folder(folder):
    """Create `folder` (including missing parents) if needed and return it.

    An already existing directory is not an error. Any other failure, such as
    the path existing as a regular file or insufficient permissions, raises
    `OSError` instead of being silently ignored, so the problem surfaces here
    rather than later when a download tries to write into the folder.
    """
    makedirs(folder, exist_ok=True)
    return folder

In [None]:
# Download residential data
# For each load level, scrape the level's page for tables matching `patterns`
# and save them under datasets_folder/<usage_name>/<level_category>.
usage_name = 'RESIDENTIAL_LOAD_DATA_E_PLUS_OUTPUT'
level_categories = 'BASE', 'HIGH', 'LOW'
for level_category in level_categories:
    target_folder = make_folder(join(
        datasets_folder, usage_name, level_category))
    # URLs always use '/', so join them explicitly; os.path.join would insert
    # the local path separator (a backslash on Windows).
    page_url = '/'.join([base_url.rstrip('/'), usage_name, level_category])
    table_urls = get_matching_link_urls(page_url, patterns)
    for table_url in table_urls:
        download_to_folder(table_url, target_folder)

In [None]:
# Download commercial data
# The usage page links to one sub-page per matching name; each sub-page is
# scraped for 'RefBldg*' tables, which are saved under a local folder that
# mirrors the sub-page's path relative to base_url.
from os.path import relpath  # no longer used here; kept in case later cells rely on the name
from posixpath import relpath as url_relpath
from urllib.parse import urlparse

usage_name = 'COMMERCIAL_LOAD_DATA_E_PLUS_OUTPUT'
# URLs always use '/', so join them explicitly rather than with os.path.join.
url = '/'.join([base_url.rstrip('/'), usage_name])
page_urls = get_matching_link_urls(url, patterns)
base_url_path = urlparse(base_url).path
for page_url in page_urls:
    # Compare only the URL path components with POSIX rules, so the result
    # does not depend on the local OS path conventions or working directory.
    relative_path = url_relpath(urlparse(page_url).path, base_url_path)
    target_folder = make_folder(join(datasets_folder, relative_path))
    table_urls = get_matching_link_urls(page_url, ['RefBldg*'])
    for table_url in table_urls:
        download_to_folder(table_url, target_folder)