In [None]:
import gzip
import os
import shutil
import sys
from six.moves.urllib.request import urlretrieve, urlcleanup

In [None]:
# Root directory for all downloaded data; the weather files are stored in a
# 'weather' subdirectory created beneath it.
data_dir = '/data'

In [None]:
class DownloadProgress:
    """Console progress reporter usable as the ``reporthook`` of ``urlretrieve``.

    Each time the integer percentage changes, one token is written to stdout:
    the percentage itself (e.g. ``35%``) when it is a multiple of five,
    otherwise a single dot.
    """

    def __init__(self):
        # Last percentage written, so repeated callbacks at the same
        # percentage do not produce duplicate output.
        self.last_percent_reported = None

    def __call__(self, count, blockSize, totalSize):
        """Report progress for one transferred block.

        Args:
            count: Number of blocks transferred so far.
            blockSize: Size of one block in bytes.
            totalSize: Total size of the file in bytes. ``urlretrieve`` passes
                -1 when the server does not report a size.
        """
        if totalSize <= 0:
            # Unknown (or empty) size: a percentage cannot be computed, and
            # dividing would either raise or yield negative values.
            return

        # The final block usually overshoots the file size, so cap at 100.
        percent = min(100, count * blockSize * 100 // totalSize)
        if percent == self.last_percent_reported:
            return

        sys.stdout.write("%s%%" % percent if percent % 5 == 0 else ".")
        sys.stdout.flush()
        self.last_percent_reported = percent

In [None]:
def extract_gz(source, destination):
    """Decompress the gzip file ``source`` into ``destination``.

    The data is first written to ``destination + ".part"`` and only moved to
    ``destination`` after decompression completes. A corrupt archive or an
    interrupted run therefore never leaves a truncated ``destination`` behind.

    Args:
        source: Path of the gzip-compressed input file.
        destination: Path of the decompressed output file.

    Raises:
        Whatever ``gzip``/``open`` raise for a missing or invalid archive; the
        partial output file is removed before the exception propagates.
    """
    partial = destination + ".part"
    try:
        with gzip.open(source, 'rb') as infile:
            with open(partial, 'wb') as outfile:
                # Fixed-size chunk copy; iterating a binary stream "by line"
                # would buffer arbitrarily long runs between newline bytes.
                shutil.copyfileobj(infile, outfile)
        # os.replace overwrites an existing target on every platform but only
        # exists on Python 3.3+; fall back to os.rename elsewhere.
        getattr(os, 'replace', os.rename)(partial, destination)
    finally:
        # Still present only when something failed before the rename.
        if os.path.exists(partial):
            os.remove(partial)

In [None]:
# Target folders: <data_dir>/weather and, inside it, by_year for the
# per-year CSV files.
weather_dir = os.path.join(data_dir, 'weather')
by_year_dir = os.path.join(weather_dir, 'by_year')
# makedirs creates every missing ancestor (including data_dir itself), so one
# call on the deepest folder is enough; os.mkdir would fail if data_dir did
# not exist yet.
if not os.path.isdir(by_year_dir):
    os.makedirs(by_year_dir)

In [None]:
# Download and extract one gzipped CSV per year into by_year_dir. Years whose
# CSV already exists are skipped, so the cell can be re-run to resume.
for year in range(1991, 2018):
    csv = "%d.csv" % year
    csv_path = os.path.join(by_year_dir, csv)

    gz = csv + ".gz"
    gz_path = os.path.join(by_year_dir, gz)

    url = "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/" + gz

    if os.path.exists(csv_path):
        continue

    try:
        print("Downloading weather data for year %d" % year)
        urlcleanup()
        urlretrieve(url, gz_path, reporthook=DownloadProgress())
        print("\nExtracting weather data for year %d" % year)
        extract_gz(gz_path, csv_path)
    except BaseException:
        # Includes KeyboardInterrupt (kernel interrupt). A truncated CSV must
        # not survive, or the existence check above would skip this year on
        # the next run. The CSV did not exist before this iteration, so
        # removing it is safe.
        if os.path.exists(csv_path):
            os.remove(csv_path)
        raise
    finally:
        # The archive is only an intermediate artifact; drop it on success
        # and on failure alike.
        if os.path.exists(gz_path):
            os.remove(gz_path)

In [None]:
# Download the dataset's reference files into weather_dir. Each entry is
# (path relative to base_url, local file name). Files already present are
# skipped.
base_url = "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/"
files = [("readme.txt", "readme.txt"),
         ("ghcnd-inventory.txt", "inventory.txt"),
         ("ghcnd-stations.txt", "stations.txt"),
         ("by_year/readme.txt", "by_year_readme.txt")]
for source, destination in files:
    url = base_url + source
    path = os.path.join(weather_dir, destination)
    if not os.path.exists(path):
        print("Downloading " + destination)
        urlcleanup()
        try:
            urlretrieve(url, path, reporthook=DownloadProgress())
        except BaseException:
            # Includes KeyboardInterrupt. Remove a partially written file so
            # the existence check does not treat it as complete on re-run.
            if os.path.exists(path):
                os.remove(path)
            raise
        print("")