# In[61]:
from pymongo import MongoClient
import os
import time
import pandas as pd
import datetime as dt

# Open a connection to the MongoDB server on this machine (port 27017)
# and select the 'usgs' database.
client = MongoClient(host="localhost", port=27017)
db = client["usgs"]

# In[211]:
# Integer code assigned to each single-letter quality flag. The loader
# looks flags up by the first character of the eighth column of a data row.
# NOTE(review): the letters presumably follow the USGS approval-status
# convention -- confirm their meanings against the data documentation.
qa_codes = dict(A=0, P=1, R=2, W=3)

# Offset from UTC, in whole hours, for each time-zone abbreviation that can
# appear in the data files (local time = UTC + offset).
# CDT/CST were previously swapped: Central Daylight Time is UTC-5 and
# Central Standard Time is UTC-6, matching the daylight = standard + 1
# pattern of every other pair in this table.
tz_codes = {
    'AKDT': -8,
    'AKST': -9,
    'AST' : -4,
    'CDT' : -5,
    'CST' : -6,
    'EDT' : -4,
    'EST' : -5,
    # NOTE(review): -2 is kept as found; if 'GST' denotes Guam Standard
    # Time in these files the offset should be +10 -- confirm against the
    # data provider's time-zone code table.
    'GST' : -2,
    'HST' : -10,
    'MDT' : -6,
    'MST' : -7,
    'PDT' : -7,
    'PST' : -8,
}

# Translates the type tag embedded in a data file name into the name of the
# MongoDB collection that holds that kind of series.
mc_codes = dict(meas='measured', comp='corrected')

# In[238]:
def parse_datetime(d, t, tz):
    """Convert a local date/time and its UTC offset into a Unix timestamp.

    Args:
        d: Date string laid out as ``YYYYMMDD``.
        t: Time-of-day string laid out as ``HHMMSS``.
        tz: Offset of the local zone from UTC, in hours (e.g. ``-5`` for
            a zone five hours behind UTC).

    Returns:
        Whole seconds since the Unix epoch (UTC) for that instant, as int.

    Raises:
        ValueError: If the strings do not contain valid date/time digits.
    """
    # The offset is attached to the datetime itself instead of going through
    # time.mktime(): mktime interprets its argument in the *host's* local
    # time zone, which made the result machine-dependent, and the offset was
    # being added where it has to be subtracted (UTC = local - offset).
    zone = dt.timezone(dt.timedelta(hours=tz))
    local = dt.datetime(year=int(d[0:4]),
                        month=int(d[4:6]),
                        day=int(d[6:8]),
                        hour=int(t[0:2]),
                        minute=int(t[2:4]),
                        second=int(t[4:6]),
                        tzinfo=zone)
    return int(local.timestamp())


def load_sites_info(db, data_folder, sites_info_file, overwrite = True, verbosity = 100):
    """Populate the ``sites`` collection with one document per data file.

    Every regular file in ``data_folder`` whose name has at least four
    dot-separated fields is considered; the second field is the site number
    and the fourth is a type tag. Files tagged ``web`` or ``comp`` are
    skipped. For each remaining file the site's row is looked up in the CSV
    and combined with a precision value read from the file.

    Args:
        db: Database-like object; ``db['sites']`` must provide
            ``delete_many`` and ``insert_one``.
        data_folder: Directory containing the tab-separated data files.
        sites_info_file: CSV with the columns SITE_NO, STATION_NM,
            DEC_LAT_VA, DEC_LONG_V, STATE_NM, DISTRICT_N, DRAIN_AREA and
            STATUS_15.
        overwrite: When true, empty the collection before inserting.
        verbosity: Print progress for every ``verbosity``-th file.

    Returns:
        The ``sites`` collection object.

    Raises:
        IndexError: If a file's site number has no row in the CSV.
        ValueError: If the site number or precision field is not an integer.
    """
    filelist = [f for f in os.listdir(data_folder)
                if os.path.isfile(os.path.join(data_folder, f))]
    df = pd.read_csv(sites_info_file)

    collection = db['sites']
    if overwrite:
        collection.delete_many({})

    for i, f in enumerate(filelist):
        tags = f.split('.')
        if i % verbosity == 0:
            print(i, f)

        # Names with fewer than four fields carry no type tag and cannot be
        # data files; skip them instead of failing on tags[3].
        if len(tags) < 4 or tags[3] in ('web', 'comp'):
            continue

        with open(os.path.join(data_folder, f)) as file:
            # Skip the leading '#' comment block. readline() returns '' at
            # end of file, so startswith() also copes with an empty file.
            line = file.readline()
            while line.startswith('#'):
                line = file.readline()
            # The loop above consumed the first header line; one more header
            # line precedes the data (presumably the column-format row).
            file.readline()
            line = file.readline()

        if line.strip():
            # Precision is the fifth tab-separated field of the first row.
            prec = int(line.split('\t')[4])
        else:
            # No data rows in this file.
            prec = -1

        site_no = int(tags[1])
        site_info = df.loc[df['SITE_NO'] == site_no].iloc[0]
        site = {
            "site_no": site_no,
            "description": site_info['STATION_NM'],
            "lat": site_info['DEC_LAT_VA'],
            "lon": site_info['DEC_LONG_V'],
            "state": site_info['STATE_NM'],
            "district": site_info['DISTRICT_N'],
            "drain_area": site_info['DRAIN_AREA'],
            "status": site_info['STATUS_15'],
            "precision": prec,
        }
        collection.insert_one(site)

    return collection

        
def load_measurement_data(db, data_folder, overwrite = True, verbosity = 100):
    """Load every data file in ``data_folder`` into its MongoDB collection.

    Every regular file whose name has at least four dot-separated fields is
    considered; the second field is the site number and the fourth is a type
    tag. Files tagged ``web`` are skipped; the remaining tags are mapped
    through ``mc_codes`` to pick the target collection. Each file becomes one
    document holding the site number, the list of timestamps and the list of
    values from the fourth column.

    Args:
        db: Database-like object indexed by collection name.
        data_folder: Directory containing the tab-separated data files.
        overwrite: When true, empty both target collections first.
        verbosity: Print progress for every ``verbosity``-th file.

    Returns:
        Tuple of the two collections that were written to, in the order
        (``mc_codes['meas']``, ``mc_codes['comp']``).

    Raises:
        KeyError: On an unknown file tag, time-zone code or quality flag.
        ValueError: If a site number or value field is not numeric.
    """
    filelist = [f for f in os.listdir(data_folder)
                if os.path.isfile(os.path.join(data_folder, f))]

    if overwrite:
        db[mc_codes['meas']].delete_many({})
        db[mc_codes['comp']].delete_many({})

    for i, f in enumerate(filelist):
        tags = f.split('.')

        if i % verbosity == 0:
            print(i, f)

        # Names with fewer than four fields carry no type tag and cannot be
        # data files; skip them instead of failing on tags[3].
        if len(tags) < 4 or tags[3] == 'web':
            continue

        site_no = int(tags[1])

        utc = []
        gh = []
        qa = []

        with open(os.path.join(data_folder, f)) as file:
            # Skip the leading '#' comment block. readline() returns '' at
            # end of file, so startswith() also copes with an empty file.
            line = file.readline()
            while line.startswith('#'):
                line = file.readline()
            # The loop above consumed the first header line; one more header
            # line precedes the data (presumably the column-format row).
            file.readline()
            for line in file:
                # Lines keep their trailing newline, so test the stripped
                # text to really skip blank lines.
                if not line.strip():
                    continue
                val = line.split('\t')
                ts = parse_datetime(val[0], val[1], tz_codes[val[2]])
                utc.append(ts)
                gh.append(float(val[3]))
                qa.append(qa_codes[val[7][0]])

        # The three lists grow in lockstep, so one emptiness check suffices.
        if not utc:
            continue

        # NOTE(review): the quality flags in `qa` are parsed (and thereby
        # validated against qa_codes) but not stored in the document --
        # confirm whether a "qa" field was meant to be persisted.
        measurement = {
            "site_no": site_no,
            "utc": utc,
            "gh": gh,
        }

        code = mc_codes[tags[3]]
        db[code].insert_one(measurement)

    # Return the collections that were actually written; the raw tags
    # 'meas'/'comp' name different, never-populated collections.
    return db[mc_codes['meas']], db[mc_codes['comp']]