### README - nsrdb_aggregator.ipynb

This notebook is the final one in the series. Its purpose is to create an aggregated database from the raw download files in ../downloads/raw/. This notebook should run without any alteration to the configs and produce a city_state_M.db aggregated database. Note that the "M" stands for monthly; future iterations could include daily or weekly aggregations. Those changes would be made in the aggregation section of [config.yml](../source/config.yml).

In [1]:
import glob
import os
import site
import sqlite3
import sys

import logzero
import numpy as np
import pandas as pd
import yaml
from logzero import logger
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_nb
from yaml import dump, load, safe_load

In [2]:
# Make the project's ../source directory importable so the local `queries`
# module (which provides the SQL statements executed further down) can be loaded.
sys.path.append("../source")
import queries

In [3]:
# Rotating file log for this notebook; the stderr logger is disabled so log
# records go to the file only.
log_path = "logs/"
log_file = "nsrdb_aggregator.log"

# Create the log directory up front: opening the log file fails on a fresh
# checkout when the directory is missing.
os.makedirs(log_path, exist_ok=True)

# maxBytes is a byte count, so pass an integer rather than the float 1e5.
logzero.logfile(log_path + log_file, maxBytes=100_000, backupCount=5, disableStderrLogger=True)
logger.info(f"{log_path}, {log_file}\n")

In [4]:
# Load the run configuration. Every later cell depends on it, so a failure is
# logged with its traceback and re-raised: the notebook stops at this cell with
# a visible error. Only file-access and YAML parsing errors are handled, so
# unrelated exceptions (e.g. KeyboardInterrupt) are not relabelled as config
# failures.
try:
    with open("../source/config.yml", "r") as config_in:
        configs = safe_load(config_in)
except (OSError, yaml.YAMLError):
    logger.exception("config file open failure.")
    raise
logger.info(f"{configs}\n")

# Directory locations read from the "file_paths" section.
zip_db_path = configs["file_paths"]["zip_db_path"]
db1_path = configs["file_paths"]["downloads_path_db"]
data_path = configs["file_paths"]["downloads_path_zips"]
csv_path = configs["file_paths"]["downloads_path_raw"]

# Aggregation period (used below as the resample rule) and the location
# that names the output database.
period = configs["aggregation"]["period"]
city = configs["location_info"]["city"]
state = configs["location_info"]["state"]

# Output database file name: <city>_<state>_<period>.db
db1_file = city + "_" + state + "_" + period + ".db"
zip_db_file = configs["file_names"]["zip_db_file"]

logger.info(f"{db1_path}, {db1_file}")

# NOTE(review): the zip_import entry is indexed with the boolean True and
# num_rows with 0 -- confirm this matches the layout of config.yml.
zip_import = configs["zip_import"][True]
logger.info(f"zip_import: {zip_import}\n")

nrows = configs["num_rows"][0]
logger.info(f"number of rows: {nrows}\n")

In [5]:
# Glob patterns for the raw download file names; each "?" matches exactly one
# character, so the two patterns cannot match each other's files.
nsrdb_conv = "nsrdb_?????_????.csv"
nsrdb_meta_conv = "nsrdb_meta_?????_????.csv"


def get_csv_files(csv_path, name_conv):
    """Return the sorted file names in ``csv_path`` that match ``name_conv``.

    Parameters
    ----------
    csv_path : str
        Directory prefix to search. It is concatenated directly with
        ``name_conv``, so it must end with a path separator.
    name_conv : str
        Glob pattern for the file names (e.g. ``"nsrdb_?????_????.csv"``).

    Returns
    -------
    list of str
        Bare file names (no directory part) in ascending order; an empty
        list when nothing matches.
    """
    # os.path.basename strips the directory with the platform's own separator
    # rules; splitting on "/" leaves part of the directory in the name on Windows.
    return sorted(os.path.basename(match) for match in glob.glob(csv_path + name_conv))


# Collect the data files and the metadata files found in the raw download
# directory, each as a sorted list of bare file names.
nsrdb_files = get_csv_files(csv_path, nsrdb_conv)
nsrdb_meta_files = get_csv_files(csv_path, nsrdb_meta_conv)

In [6]:
# Quick sanity check of the resolved paths and of the files that were found:
# one printed line per entry, fields separated by a single space.
summary_rows = [
    (csv_path,),
    (db1_path, db1_file),
    (len(nsrdb_files), len(nsrdb_meta_files)),
    (nsrdb_files[:3],),
    (nsrdb_meta_files[:3],),
]
for row in summary_rows:
    print(*row)

../downloads/raw/
../downloads/db/ city_state_M.db
2 2
['nsrdb_12019_2020.csv', 'nsrdb_12027_2020.csv']
['nsrdb_meta_12019_2020.csv', 'nsrdb_meta_12027_2020.csv']


In [7]:
# Open the aggregate SQLite database and get a cursor for the statements
# issued in the following cells.
db1_location = db1_path + db1_file
conn = sqlite3.connect(db1_location)
cursor = conn.cursor()

In [8]:
# Run the two table-creation statements from the queries module, committing
# after each one. The statements are looked up one at a time, in order.
for ddl_name in ("create_table_monthly_nsrdb", "create_table_geo_zipcodes"):
    cursor.execute(getattr(queries, ddl_name))
    conn.commit()

In [9]:
# Copy the zipcode reference rows from the standalone geo_zipcodes database
# into this database. The copy runs only when the config enables it AND the
# local geo_zipcodes table is still empty, so re-running the notebook does not
# insert the same rows a second time.
# NOTE(review): the source database path is hard-coded here, while zip_db_path
# and zip_db_file are read from the config but unused -- confirm which is intended.
if zip_import:
    cursor.execute("SELECT COUNT(*) FROM geo_zipcodes;")
    existing_rows = cursor.fetchone()[0]

    if existing_rows:
        logger.info(f"geo_zipcodes already holds {existing_rows} rows, import skipped\n")
    else:
        cursor.execute("""ATTACH DATABASE '../data/db/geo_zipcodes.db' AS gzc_db;""")
        try:
            cursor.execute("""INSERT INTO 'geo_zipcodes' SELECT * FROM gzc_db.geo_zipcodes;""")
            conn.commit()
        except sqlite3.Error:
            # Close the failed transaction first; an attached database cannot
            # be detached while a transaction is still open on it.
            conn.rollback()
            raise
        finally:
            # Always release the attached database, even when the copy failed.
            cursor.execute("DETACH gzc_db")

### Download link information
https://developer.nrel.gov/docs/solar/nsrdb/psm3-download/

In [10]:
# Columns read from each raw NSRDB csv file.
# The following columns were listed in an earlier, immediately overwritten
# version of this list and are intentionally not loaded:
#   Year, Month, Day, Hour, Cloud_Type, Fill_Flag, Solar_Zenith_Angle,
#   Surface_Albedo, Wind_Direction
cols = [
    "date_time",
    "zipcode",
    "location_id",
    "Temperature",
    "Clearsky_DHI",
    "Clearsky_DNI",
    "Clearsky_GHI",
    "Dew_Point",
    "DHI",
    "DNI",
    "GHI",
    "Relative_Humidity",
    "Pressure",
    "Precipitable_Water",
    "Wind_Speed",
    "Global_Horizontal_UV_Irradiance_(280-400nm)",
    "Global_Horizontal_UV_Irradiance_(295-385nm)",
]

In [11]:
# Set to True to display every aggregated frame before it is written.
debug = False

# Short database column names for the two UV irradiance columns.
uv_short_names = {
    "Global_Horizontal_UV_Irradiance_(280-400nm)": "GHI_UV_wd",
    "Global_Horizontal_UV_Irradiance_(295-385nm)": "GHI_UV_nw",
}

# For each raw csv file: load the selected columns, average them per
# aggregation period, and append the result to the "nsrdb" table.
for csv_name in tqdm_nb(nsrdb_files):
    raw = pd.read_csv(csv_path + csv_name, parse_dates=True, usecols=cols)
    raw["date_time"] = pd.to_datetime(raw["date_time"])

    aggregated = (
        raw.set_index("date_time")
        .resample(period)
        .mean()
        .round(decimals=5)
        .reset_index()
        .rename(columns=uv_short_names)
    )

    # The mean turns the identifier columns into floats; cast them back to
    # integers (zipcode is then stored with object dtype).
    aggregated["zipcode"] = aggregated["zipcode"].astype("int64").astype("object")
    aggregated["location_id"] = aggregated["location_id"].astype("int64")

    if debug:
        display(aggregated)

    aggregated.to_sql("nsrdb", conn, if_exists="append", index=False, method="multi")

  0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Commit any pending writes, then close the database connection; `conn` and
# `cursor` cannot be used after this cell.
conn.commit()
conn.close()