# Metar assimilation

The purpose of this notebook is to fetch Metar data from the
Mesonet archive https://mesonet.agron.iastate.edu/request/download.phtml and
insert them into a MongoDB database.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import concurrent
import concurrent.futures
import csv
import datetime
import io
import os
import pathlib
import urllib.parse

import pandas as pd
from pymongo import MongoClient
from tqdm.notebook import tqdm

In [None]:
from smc01 import iem, util
from smc01.iem import fetch_one_station

In [None]:
# Run the project's environment loader.
# NOTE(review): presumably this populates os.environ (e.g. DATA_DIR, read in the
# next cell) — confirm in smc01.util.
util.load_environment()

In [None]:
# Root directory for local data files, taken from the DATA_DIR environment variable.
# os.getenv returns None when the variable is unset, in which case pathlib.Path
# raises a TypeError.
DATA_DIR = pathlib.Path(os.getenv('DATA_DIR'))

# MongoDB connection settings used by the MongoClient(...) calls below.
MONGO_URL = 'localhost'  # passed as host
MONGO_PORT = 27017
MONGO_USER = None  # no username is passed to MongoClient
MONGO_PASS = None  # no password is passed to MongoClient
MONGO_DB = 'smc01_iem_test'  # database holding the observations
MONGO_COLLECTION = 'iem_hourly'  # collection the observations are inserted into
ADMIN_DB = 'admin'  # passed as authSource

## Load stations from database directly

In [None]:
# Stations belonging to the networks returned by iem.ca_networks().
# NOTE(review): presumably the Canadian networks — confirm in smc01.iem.
can_stations = iem.get_stations_from_networks(iem.ca_networks())

In [None]:
# Stations belonging to the networks returned by iem.us_networks().
# NOTE(review): presumably the US networks — confirm in smc01.iem.
us_stations = iem.get_stations_from_networks(iem.us_networks())

In [None]:
# Gather both station collections into one list, can_stations first.
stations = []
stations.extend(can_stations)
stations.extend(us_stations)

In [None]:
# Sanity check: number of stations gathered from the two network queries.
len(stations)

## Load stations from a file

In [None]:
# Read the selected-stations CSV that lives under DATA_DIR.
coverage_file = DATA_DIR.joinpath('2021-05-11-selected-stations.csv')
coverage = pd.read_csv(coverage_file)

In [None]:
# Display the station table read from the CSV file.
coverage

In [None]:
# Rebinds `stations` to the 'station' column of the CSV, replacing the list
# built from the network queries in the previous section.
stations = list(coverage['station'])

In [None]:
# Sanity check: number of stations read from the coverage file.
len(stations)

In [None]:
def insert_obs_in_mongo(obs):
    """Insert a batch of observations into the configured MongoDB collection.

    A new MongoClient is opened for the call and is always closed before
    returning, even when the insert raises.

    Args:
        obs: List of observation documents handed to ``insert_many``. An
            empty (or ``None``) batch is skipped without connecting, because
            ``insert_many`` rejects an empty document list.

    Raises:
        Any exception raised by ``MongoClient`` or ``insert_many`` (for
        example connection failures or bulk write errors) is propagated.
    """
    if not obs:
        return

    mongo_client = MongoClient(host=MONGO_URL, port=MONGO_PORT, username=MONGO_USER, password=MONGO_PASS, authSource=ADMIN_DB)
    try:
        collection = mongo_client[MONGO_DB][MONGO_COLLECTION]
        collection.insert_many(obs)
    finally:
        # Release the connection even if the insert failed.
        mongo_client.close()

In [None]:
def prepare_collection():
    """Drop the target collection and recreate its indexes.

    Destructive: every document currently in MONGO_COLLECTION is removed.
    Afterwards the collection has single-field indexes on 'station' and
    'valid', plus a unique compound index on (station, valid).

    The MongoClient opened for the call is always closed before returning.
    """
    mongo_client = MongoClient(host=MONGO_URL, port=MONGO_PORT, username=MONGO_USER, password=MONGO_PASS, authSource=ADMIN_DB)
    try:
        collection = mongo_client[MONGO_DB][MONGO_COLLECTION]

        collection.drop()

        collection.create_index('station')
        collection.create_index('valid')
        # Uniqueness on (station, valid) rejects a second document for the
        # same station and timestamp.
        collection.create_index([('station', 1), ('valid', 1)], unique=True)
    finally:
        # Release the connection even if dropping or indexing failed.
        mongo_client.close()

In [None]:
def on_done(future):
    """Store the observations produced by a finished fetch future.

    Args:
        future: Completed future whose result is the fetched observations.
            ``future.result()`` re-raises any exception raised by the fetch.
    """
    fetched = future.result()
    if not fetched:
        # Nothing came back; skip the database round trip.
        return
    insert_obs_in_mongo(fetched)

In [None]:
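# Destructive: prepare_collection() drops the collection and recreates its indexes.
# Left commented out on purpose; uncomment and run only to start from an empty collection.
#prepare_collection()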

In [None]:


# Open a connection and keep handles on the target database and collection;
# the following cells use `db` and `collection` to inspect what is stored.
mongo_client = MongoClient(host=MONGO_URL, port=MONGO_PORT, username=MONGO_USER, password=MONGO_PASS, authSource=ADMIN_DB)
db = mongo_client[MONGO_DB]
collection = db[MONGO_COLLECTION]


In [None]:
# Collections that currently exist in the target database.
db.list_collection_names()

In [None]:
# Distinct values of the 'station' field currently present in the collection,
# i.e. stations that already have at least one stored document.
done_stations = set(collection.distinct('station'))

In [None]:
# Number of stations that already have data in the collection.
len(done_stations)

In [None]:
# Discards the set computed from the database, so every station counts as not done.
# NOTE(review): when the notebook is run top to bottom this always overrides the
# query above; skip this cell to keep already-stored stations out of stations_left.
done_stations = set()

In [None]:
# Number of candidate stations before removing the ones already done.
len(stations)

In [None]:
# Candidate stations that are not in done_stations (duplicates collapse in the set).
stations_left = set(stations).difference(done_stations)

In [None]:
# Time window passed to the single-station fetch in the next cell.
begin = datetime.datetime(2018, 12, 31)
end = datetime.datetime(2019, 1, 1)


In [None]:
# Fetch station '04W' over the begin/end window as a small manual test.
stn_data = fetch_one_station('04W', begin, end)

In [None]:
# Build a frame from the fetched records, keep only rows that have a 'tmpf'
# value, keep the first row for each repeated 'valid' timestamp, and renumber
# the rows from zero. (Hourly resampling via df.resample('1H').nearest() is
# left disabled.)
df = (
    pd.DataFrame(stn_data)
    .loc[lambda frame: frame['tmpf'].notnull()]
    .loc[lambda frame: ~frame['valid'].duplicated(keep='first')]
    .reset_index(drop=True)
)

In [None]:
# Column names of the single-station frame.
df.columns

In [None]:
# Period to download for every station.
startts = datetime.datetime(2021, 1, 1)
endts = datetime.datetime(2021, 1, 15)

# station -> exception, for every station whose fetch or insert failed.
failed_stations = {}

with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    # Submit only the stations that are still to do. `stations_left` was computed
    # above but never used; it also collapses duplicate station ids, which would
    # collide on the unique (station, valid) index if prepare_collection() was run.
    station_by_future = {
        executor.submit(fetch_one_station, station, startts, endts): station
        for station in stations_left
    }

    # Insert from the main thread as each fetch completes. Exceptions raised
    # inside add_done_callback callbacks are only logged and then ignored, so
    # calling on_done here is what allows failures to be recorded.
    for future in tqdm(concurrent.futures.as_completed(station_by_future), total=len(station_by_future)):
        try:
            on_done(future)
        except Exception as error:  # keep going, but remember what failed
            failed_stations[station_by_future[future]] = error

# Shown as the cell output; empty when every station succeeded.
failed_stations

In [None]:
# Peek at the first ten items returned for CYVR for the 2020-01-01 to 2020-02-01 window.
cyvr_start = datetime.datetime(2020, 1, 1)
cyvr_end = datetime.datetime(2020, 2, 1)
fetch_one_station('CYVR', cyvr_start, cyvr_end)[:10]

In [None]:
# Creates another client and rebinds mongo_client; the client opened earlier
# in the notebook is not closed first.
mongo_client = MongoClient(host=MONGO_URL, port=MONGO_PORT, username=MONGO_USER, password=MONGO_PASS, authSource=ADMIN_DB)

In [None]:
# Databases visible through this connection.
mongo_client.list_database_names()

In [None]:
# Handle on the target database through the new client.
db = mongo_client[MONGO_DB]

In [None]:
# Handle on the observations collection through the new client.
collection = db[MONGO_COLLECTION]

In [None]:
# Distinct station ids stored in the collection (the whole list is displayed).
collection.distinct('station')

In [None]:
# Look up one document stamped 2018-01-01 00:00 in the kernel's local time zone.
local_tz = datetime.datetime.now().astimezone().tzinfo
query_time = datetime.datetime(2018, 1, 1, 0, 0, tzinfo=local_tz)
collection.find_one({'valid': query_time})

In [None]:
# Fetch a single document for station 'CBAR' (None when there is none).
collection.find_one({'station': 'CBAR'})

In [None]:
# NOTE(review): leftover scratch cell — it only evaluates to an empty list and
# can be deleted.
list()

# Remove obs when there are too many

In [None]:
# Fetch CYUL for the 2020-01-01 to 2020-02-01 window to look at observation frequency.
data = fetch_one_station('CYUL', datetime.datetime(2020, 1, 1), datetime.datetime(2020, 2, 1))

In [None]:
# Rebinds `df` (previously the '04W' frame) to the CYUL records.
df = pd.DataFrame(data)

From this, we see that even at Montreal Trudeau, there are no more than 2-3 obs 
per hour. When they do obs in between hours it's actually interesting data 
(probably manual intervention).

Consequently we won't filter the obs here.

In [None]:
# Display the CYUL frame.
df

In [None]:
# Number of rows whose 'valid' timestamp has minute == 0.
df['valid'].dt.minute.eq(0).sum()

In [None]:
# NOTE(review): pymongo is already imported at the top of this notebook, so this
# install only helps when it is run before that import cell in a fresh environment.
!conda install -y pymongo