In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Download the GHCN-Daily "by_year" files, keep only the quality-checked
# TMAX/TMIN observations, and store the combined table in an HDF5 file.

# These are the years we want to get the data for.
# NOTE: np.arange(11)+2005 spans 2005 through 2015 inclusive (11 files),
# even though the output file name below says "10Year".
years = np.arange(11)+2005

# One cleaned DataFrame per year; concatenated after the loop.
ghcnd = []

# For each year, download the corresponding data, and perform cleaning
for year in tqdm(years):
    # The file has no header row. `names` labels all eight columns by
    # position, while `usecols` loads only ID, Date, Element, Data_Value and
    # Q-FLAG. The station ID (column 0) becomes the index.
    df = pd.read_csv('ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/{}.csv.gz'.format(year),
                     compression='gzip',
                     index_col=0,
                     header=None,
                     usecols=[0, 1, 2, 3, 5],
                     names=['ID', 'Date', 'Element', 'Data_Value', 'M-FLAG', 'Q-FLAG', 'S-FLAG', 'OBS-TIME'])

    # Keep temperature extremes only, drop every row that carries a quality
    # flag, and drop rows without a measurement.
    keep = (df['Element'].isin(['TMAX', 'TMIN'])
            & df['Q-FLAG'].isnull()
            & df['Data_Value'].notnull())

    # After filtering Q-FLAG is null on every remaining row, so it is dropped.
    df = df.loc[keep].drop('Q-FLAG', axis=1)

    # Parse dates only on the rows that survived the filter, with an explicit
    # format instead of the deprecated `infer_datetime_format` option. Dates
    # are expected as YYYYMMDD (per the GHCN-Daily by_year layout); a value
    # that does not match now raises instead of silently staying unparsed.
    # `assign` returns a new frame, avoiding chained-assignment warnings on
    # the filtered result.
    df = df.assign(Date=pd.to_datetime(df['Date'].astype(str), format='%Y%m%d'))

    ghcnd.append(df)

# Create master DataFrame
ghcnd_df = pd.concat(ghcnd)

# Save to hdf5 (`key` is passed by keyword: positional use is deprecated in
# pandas 2.x and rejected in pandas 3.0)
ghcnd_df.to_hdf('GHCND_10Year.h5', key='ghcn_df')