In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell execution.
%load_ext autoreload
%autoreload 2

While developing 030-mos.ipynb I ran into a lot of NaNs in the dataset.
I'm thinking this might be due to too much rigidity in our assimilation pipeline.
This notebook aims to play around with the queries to the observation database and see if we can fix the problem.


### Conclusion

My conclusion from this notebook is that there are some gaps in the MESONET database, but they aren't huge. 
Notably there seems to have been an outage on 2020-10-20.
Interpolation for the few missing values I have seems to be the way to go.
xarray has a function for this kind of interpolation (e.g. `interpolate_na`). I would only have to load the observations from the database into an xarray object.

In [None]:
import calplot
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pymongo

In [None]:
from smc01.interpolate.obs import MongoIEMDatabase

In [None]:
# Connection settings for the MongoDB instance queried in this notebook.
MONGO_HOST = 'localhost'
MONGO_PORT = 27017
# No credentials are set; both values are passed as-is to pymongo.MongoClient.
MONGO_USER = None
MONGO_PASS = None
# Database and collection read by the cells below.
MONGO_DB = 'smc01_raw_obs'
MONGO_COLLECTION = 'iem'
# Passed as `authSource` when a client is created.
ADMIN_DB = "admin"

In [None]:
def dates_to_series(dates, begin, end):
    """Count observations per calendar day.

    Parameters
    ----------
    dates : iterable of datetime.datetime
        Observation timestamps. Each one is bucketed by its ``.date()``.
        Timestamps falling outside ``[begin, end)`` are still counted and
        show up in the result.
    begin, end : datetime.datetime
        Every calendar day from ``begin`` (inclusive) up to ``end``
        (exclusive, in whole days) is guaranteed to be present in the
        result, with a count of 0 when no observation fell on that day.

    Returns
    -------
    pandas.Series
        Integer counts indexed by a chronologically sorted
        ``DatetimeIndex`` with one entry per day.
    """
    # Seed every day of the requested window with zero so that gaps are
    # explicit in the result instead of simply missing.
    n_days = (end - begin).days
    counts = {
        (begin + datetime.timedelta(days=offset)).date(): 0
        for offset in range(n_days)
    }

    for d in dates:
        day = d.date()
        counts[day] = counts.get(day, 0) + 1

    # Sort so the index is monotonic: gap days sit at their chronological
    # position rather than being appended after the observed days.
    days = sorted(counts)
    return pd.Series(
        [counts[day] for day in days],
        index=pd.DatetimeIndex(days),
        dtype="int64",
    )

In [None]:
# Open a client, wrap it in the project's MongoIEMDatabase helper and fetch
# the station metadata while the connection is open.
with pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT, username=MONGO_USER, password=MONGO_PASS, authSource=ADMIN_DB) as client:
    db = MongoIEMDatabase(client, db=MONGO_DB, collection=MONGO_COLLECTION)
    # This assignment used to be commented out, which made the following
    # `station_info` cell fail with a NameError on a fresh kernel.
    station_info = db.station_info()

In [None]:
# Display the station metadata.
station_info

In [None]:
# NOTE(review): this list of station codes is only displayed, never assigned
# or used below. It looks like output pasted from another cell; confirm what
# these stations represent or remove the cell.
['BAD', 'CQC', 'JCT', 'MWA', 'MWC', 'OFP', 'OUN', 'PAVC', 'PSX', 'RKD',
       'SEE', 'TQH', 'TZR']

In [None]:
# Time window used for the station query and for the per-day counts below.
begin = datetime.datetime(2018, 12, 28)
end = datetime.datetime(2021, 1, 1)

# Run the query inside its own client context. Any client opened by an
# earlier `with` block has been closed by the time this cell runs, and
# PyMongo 4+ refuses to run operations on a closed MongoClient.
with pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT, username=MONGO_USER, password=MONGO_PASS, authSource=ADMIN_DB) as client:
    db = MongoIEMDatabase(client, db=MONGO_DB, collection=MONGO_COLLECTION)
    obs = db.station_observations('UCP', begin, end, tolerance=20)

In [None]:
# Number of observations returned for the station over the whole window.
len(obs)

In [None]:
# Collect the 'valid' field of every observation; a distinct loop name avoids
# shadowing the `obs` list inside the comprehension.
dates = [record['valid'] for record in obs]

In [None]:
# Per-day observation counts over [begin, end); days without data get 0.
daily_counts = dates_to_series(dates, begin, end)

In [None]:
# Display the per-day counts.
daily_counts

In [None]:
# Draw the calendar heatmap and save it through the figure handle that
# calplot returns, rather than through pyplot's implicit current figure.
fig, axes = calplot.calplot(daily_counts, suptitle='Observations per day for station UCP')
fig.savefig('smc01_calendar.png', dpi=200)

In [None]:
# Show only the days that have fewer than 8 observations.
sparse_days = daily_counts < 8
daily_counts[sparse_days]

## Check inside the mongo database

Check whether there is data for every day in the Mongo database.

In [None]:
# Query the raw collection directly (without MongoIEMDatabase) for every
# document of station LUF whose 'valid' timestamp lies in the window.
query = {
    'station': 'LUF',
    'valid': {
        '$gte': datetime.datetime(2018, 12, 31),
        '$lt': datetime.datetime(2021, 1, 1, 12),
    },
}

with pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT, username=MONGO_USER, password=MONGO_PASS, authSource=ADMIN_DB) as client:
    collection = client[MONGO_DB][MONGO_COLLECTION]
    response = collection.find(query)

    # Materialise the cursor while the client is still open.
    obs = list(response)

In [None]:
# Turn the list of raw documents into a DataFrame (one row per document).
df = pd.DataFrame(obs)

In [None]:
# Rows between 2020-01-26 (inclusive) and 2020-01-28 (exclusive) that have
# a non-missing 'tmpf' value.
in_window = (df['valid'] >= '2020-01-26') & (df['valid'] < '2020-01-28')
df[in_window & df['tmpf'].notna()]

In [None]:
# `valids` was never assigned anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel. Rebuild it from the raw documents in `obs`.
# NOTE(review): documents outside the 2020-04-01..2020-12-31 window are still
# counted by dates_to_series; confirm whether they should be filtered first.
valids = [record['valid'] for record in obs]
counts = dates_to_series(valids, datetime.datetime(2020, 4, 1), datetime.datetime(2020, 12, 31))

In [None]:
# Calendar heatmap of the per-day counts computed from the raw collection.
calplot.calplot(counts)

In [None]:
# Smallest per-day count; 0 means at least one day has no data at all.
counts.min()

In [None]:
# Display the per-day counts.
counts