In [None]:
import ulmo
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sn
import sys
import os

def get_data(station_id):

    """

    Fetch daily precipitation and min/max temperature for one station.

    This queries NCDC with ulmo for the PRCP, TMIN and TMAX elements of
    a single station, drops the rows of each element whose ``value`` is
    NaN, and joins the three element tables on their index. The joins
    use pandas' default inner merge, so only index entries that have a
    non-NaN value for all three elements are kept.

    The function ulmo.ncdc.ghcn_daily.get_stations can query for
    multiple stations by state and date but not for individual stations.
    Units for precip are tenths of mm. Units for temperature are tenths
    of degrees Celsius.
    Metadata: https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt

    :param station_id: ID from https://www.ncdc.noaa.gov/cdo-web/results
    :returns: a single dataframe containing all station data, with the
        value columns named Precip, Tmin and Tmax, the flag columns
        suffixed _p, _tmin and _tmax, and a STATION_ID column holding
        ``station_id`` on every row

    """

    elements = ['PRCP', 'TMIN', 'TMAX']
    single_station_dict = ulmo.ncdc.ghcn_daily.get_data(
        station_id, elements=elements, as_dataframe=True)

    # Stack the per-element frames once under their element names; the
    # same stacked frame serves all three lookups below instead of being
    # rebuilt for every element.
    stacked = pd.concat(single_station_dict.values(),
                        keys=single_station_dict.keys())
    precip, tmin, tmax = (
        stacked.loc[element].dropna(axis=0, subset=['value'])
        for element in elements
    )

    # Index-on-index inner joins. The first merge suffixes the clashing
    # column names with _x (precip) and _y (tmin); the tmax columns then
    # no longer clash and keep their bare names.
    merged = precip.merge(tmin, left_index=True, right_index=True)
    merged = merged.merge(tmax, left_index=True, right_index=True)

    rename_dict = {
        'value_x': 'Precip', 'mflag_x': 'mflag_p',
        'qflag_x': 'qflag_p', 'sflag_x': 'sflag_p',
        'value_y': 'Tmin', 'mflag_y': 'mflag_tmin',
        'qflag_y': 'qflag_tmin', 'sflag_y': 'sflag_tmin',
        'value': 'Tmax', 'mflag': 'mflag_tmax',
        'qflag': 'qflag_tmax', 'sflag': 'sflag_tmax',
    }
    merged = merged.rename(columns=rename_dict)

    # A scalar is broadcast to every row, so no explicit Series is needed.
    merged['STATION_ID'] = station_id

    return merged

def savedfs(id_dictionary):

    """

    Download, store and reload the data of several stations.

    For every entry of id_dictionary this calls get_data() with the
    station ID, adds a 'Name' column holding the human-readable station
    name, converts the Precip, Tmin and Tmax columns with
    pd.to_numeric, converts the frame with to_timestamp(how='end'), and
    writes it to the entry's HDF path under the key 'sheet1'. The file
    is opened with mode='w' for the write and is then read back; the
    frame that was read back is what ends up in the result.
    id_dict can eventually be put into a config file for modularity.

    :param id_dictionary: dictionary with station Name as key, tuple
        containing ID and hdf path as values.
    :returns: dictionary mapping each station Name to the dataframe read
        back from its HDF file

    """

    numeric_columns = ('Precip', 'Tmin', 'Tmax')
    frames_by_name = {}

    for station_name, station_info in id_dictionary.items():
        frame = get_data(station_info[0])
        frame['Name'] = pd.Series(station_name, index=frame.index)

        for column in numeric_columns:
            frame[column] = pd.to_numeric(frame[column])

        frame = frame.to_timestamp(how='end')

        # Write the table, then reopen the file read-only and return
        # what was actually stored.
        with pd.HDFStore(station_info[1], mode='w') as store:
            store.put('sheet1', frame, format='table', data_columns=True)
        with pd.HDFStore(station_info[1], mode='r') as store:
            frames_by_name[station_name] = store.select('sheet1')

    return frames_by_name

# Stations to download: station name -> (station ID, path of the HDF file
# that savedfs() writes for that station).
id_dict = {
    'SANTA BARBARA, CA US': (
        'USC00047902',
        '../data/raw/SANTABARBARA.h5',
    ),
    'CACHUMA LAKE, CA US': (
        'USC00041253',
        '../data/raw/CACHUMALAKE.h5',
    ),
}

# Fetch every configured station, store it, and keep the resulting
# dataframes keyed by station name.
dict_of_dataframes = savedfs(id_dict)


In [None]:
# Look up the Santa Barbara dataframe returned by savedfs(); as the only
# expression in this notebook cell it is presumably shown as the cell output.
dict_of_dataframes['SANTA BARBARA, CA US']
