### NSRDB Aggregator

In [1]:
import glob
import os
import site
import sqlite3
import sys

import logzero
import numpy as np
import pandas as pd
import yaml
from logzero import logger
from tqdm import tqdm
from tqdm.notebook import tqdm
from yaml import dump, load, safe_load

In [2]:
# Make the project's ../source directory importable so the local `queries`
# module (which supplies the CREATE TABLE statements used below) can be found.
sys.path.append("../source")
import queries

In [4]:
# Send log output to a rotating file under ../logs/.
log_path = "../logs/"
log_file = "nsrdb_aggregator.log"

# Rotate at about 1 MB and keep 5 backups; disableStderrLogger=True switches
# off logzero's default stderr handler so messages go to the file only.
logzero.logfile(log_path + log_file, maxBytes=1e6, backupCount=5, disableStderrLogger=True)
logger.info(f"{log_path}, {log_file}\n")

In [9]:
# Load the run configuration from ../config.yml; the following statements read
# every setting from `configs`.
configs = None
try:
    with open("../config.yml", "r") as config_in:
        configs = load(config_in, Loader=yaml.SafeLoader)
except (OSError, yaml.YAMLError):
    # Catch only "cannot read" / "cannot parse" failures, and log the
    # traceback so the actual cause ends up in the log file.
    logger.exception("config file open failure.")
    # Same effect as the site-provided exit(1), without depending on `site`.
    raise SystemExit(1)
else:
    logger.info(f"{configs}\n")

# An empty YAML document loads as None; stop here with a clear message instead
# of failing on the first key lookup.
if not isinstance(configs, dict):
    logger.error("config file is empty or not a mapping.")
    raise SystemExit(1)

# Pull the individual settings used by this notebook out of the loaded config.
cfg_vars = configs["url_variables"]
logger.info(f"variables: {cfg_vars}\n")

years = configs["request_years"]
logger.info(f"years: {years}\n")

# Directory locations; db_path and csv_path are concatenated directly with
# file names further down, so they are expected to end with a separator.
db_path = configs["file_paths"]["download_db_path"]
data_path = configs["file_paths"]["download_path_zips"]
csv_path = configs["file_paths"]["download_path_raw"]

city = configs["location_info"]["city"]
state = configs["location_info"]["state"]
# NOTE(review): this city/state-derived name is overwritten a few lines below,
# so it currently has no effect.
db_file = city + "_" + state + ".db"

# Manual switch between output database / resampling period. The active pair
# is the monthly one; the commented lines are the daily alternative.
# db_file = "nsrdb_daily.db"
db_file = "nsrdb_monthly_f.db"
# period = "D"
period = "M"

# db_file = "geo_zipcodes.db"

db_table1 = configs["table_names"]["db_table1"]
db_table2 = configs["table_names"]["db_table2"]

# File name of the separate zipcode geography database.
db_file2 = configs["file_names"]["db_file_gzc"]

logger.info(f"{db_path}, {db_file}")

# NOTE(review): these entries are indexed with 0 and True respectively; the
# config schema is not visible here — confirm the shape of both entries.
nrows = configs["num_rows"][0]
zip_import = configs["zip_import"][True]

logger.info(f"number of rows: {nrows}\n")

In [10]:
# csv_path = "/home/gmyers/_UMSI/697/data/nsrdb_csv2/"

# Glob patterns for the data and metadata CSVs; each `?` matches exactly one
# character (e.g. nsrdb_12019_1998.csv — presumably <zipcode>_<year>).
nsrdb_conv = "nsrdb_?????_????.csv"
nsrdb_meta_conv = "nsrdb_meta_?????_????.csv"


def get_csv_files(csv_path, name_conv):
    """Return the sorted base names of files in *csv_path* matching *name_conv*.

    Args:
        csv_path: Directory to search, with or without a trailing separator.
        name_conv: Glob pattern for the file names, e.g.
            ``"nsrdb_?????_????.csv"``.

    Returns:
        Sorted list of matching file names without their directory part;
        empty when nothing matches.
    """
    # os.path.join / basename instead of string concatenation and split("/"),
    # so a missing trailing slash or a non-"/" separator still works.
    pattern = os.path.join(csv_path, name_conv)
    return sorted(os.path.basename(match) for match in glob.glob(pattern))


# Collect the data and metadata file listings from the configured CSV folder.
nsrdb_files = get_csv_files(csv_path, nsrdb_conv)
nsrdb_meta_files = get_csv_files(csv_path, nsrdb_meta_conv)

# Sanity-check output: database locations, the two file counts, then the
# first three names of each listing.
print(db_path, db_file, db_file2)
print(*map(len, (nsrdb_files, nsrdb_meta_files)))
for listing in (nsrdb_files, nsrdb_meta_files):
    print(listing[:3])

../../../../data/db/ nsrdb_monthly_f.db geo_zipcodes.db
3197 3197
['nsrdb_12019_1998.csv', 'nsrdb_12019_1999.csv', 'nsrdb_12019_2000.csv']
['nsrdb_meta_12019_1998.csv', 'nsrdb_meta_12019_1999.csv', 'nsrdb_meta_12019_2000.csv']


In [11]:
# Notebook inspection cell: only the last bare expression (db_file) is echoed.
db_path
db_file

'nsrdb_monthly_f.db'

In [12]:
# Open the SQLite database at db_path + db_file (sqlite3 creates the file if
# it does not exist) and get the cursor that the following cells reuse.
conn = sqlite3.connect(db_path + db_file)
cursor = conn.cursor()

In [13]:
# Run the CREATE TABLE statements for the monthly NSRDB table and the zipcode
# geography table, committing after each one.
for ddl in (queries.create_table_monthly_nsrdb, queries.create_table_geo_zipcodes):
    cursor.execute(ddl)
    conn.commit()

In [14]:
# Copy the zipcode geography rows from the separate geo_zipcodes database
# into this database's geo_zipcodes table.
if zip_import:
    # Skip the copy when the table already holds rows, so re-running the
    # notebook does not insert duplicates.
    if cursor.execute("SELECT 1 FROM geo_zipcodes LIMIT 1;").fetchone() is None:
        # Use the configured location (db_path + db_file2) instead of a
        # hard-coded relative path, bound as a parameter rather than pasted
        # into the SQL text.
        cursor.execute("ATTACH DATABASE ? AS gzc_db;", (db_path + db_file2,))
        cursor.execute("""INSERT INTO 'geo_zipcodes' SELECT * FROM gzc_db.geo_zipcodes;""")
        # Commit the copied rows before detaching the source database.
        conn.commit()
        cursor.execute("DETACH gzc_db")
    else:
        logger.info("geo_zipcodes already populated; skipping import\n")

In [15]:
# Load the zipcode reference CSV keyed by its "zip" column; verify_integrity
# makes set_index raise if any zip value appears more than once.
df_zip = pd.read_csv("../../../../data/zip_code_database.csv").set_index(
    "zip", verify_integrity=True
)
# df_zip.loc[74145][["primary_city", "state", "county"]]

In [16]:
# Fetch every distinct zipcode stored in geo_zipcodes as a flat list.
qry_zips = "select DISTINCT zipcode from geo_zipcodes;"
zips_qry = cursor.execute(qry_zips)
# Each result row is a 1-tuple; unpack it to the bare zipcode value.
zipss = [zipcode for (zipcode,) in zips_qry.fetchall()]

In [17]:
# Fill in city/state/county on each geo_zipcodes row from the zipcode
# reference table (df_zip, indexed by its "zip" column).
update_locale_sql = """update geo_zipcodes SET city=:primary_city,
        state=:state, county=:county
        where zipcode=:zipcode;"""

# Select the needed columns once; a missing column fails here, up front,
# rather than being mistaken for a missing zipcode inside the loop.
locale_lookup = df_zip[["primary_city", "state", "county"]]

for zips in zipss:
    try:
        locale_data = locale_lookup.loc[int(zips)].to_dict()
    except (KeyError, TypeError, ValueError):
        # Zipcode absent from the reference table (or not convertible to
        # int): leave that row untouched instead of aborting the whole pass.
        logger.warning(f"no locale data for zipcode {zips!r}; skipped")
        continue
    locale_data["zipcode"] = zips
    cursor.execute(update_locale_sql, locale_data)

conn.commit()

### Download link information
https://developer.nrel.gov/docs/solar/nsrdb/psm3-download/

In [18]:
# Columns read from each NSRDB CSV for aggregation. The first three identify
# the row; the rest are the measurements that get averaged.
# Not loaded (they were listed in an earlier, immediately overwritten
# definition): Year, Month, Day, Hour, Cloud_Type, Fill_Flag,
# Solar_Zenith_Angle, Surface_Albedo, Wind_Direction.
cols = [
    "date_time",
    "zipcode",
    "location_id",
    "Temperature",
    "Clearsky_DHI",
    "Clearsky_DNI",
    "Clearsky_GHI",
    "Dew_Point",
    "DHI",
    "DNI",
    "GHI",
    "Relative_Humidity",
    "Pressure",
    "Precipitable_Water",
    "Wind_Speed",
    "Global_Horizontal_UV_Irradiance_(280-400nm)",
    "Global_Horizontal_UV_Irradiance_(295-385nm)",
]

In [19]:
# Set to True to display each aggregated frame (uses the notebook's `display`).
debug = False

# Older SQLite builds accept at most 999 bound parameters per statement. A
# multi-row INSERT uses rows x columns parameters, so the rows per statement
# are capped below to stay under that limit for any resampling period.
max_sql_params = 999

# Aggregate every NSRDB CSV to one row per `period` and append it to the
# "nsrdb" table.
for i, file in enumerate(nsrdb_files):
    print(i, end="\r")  # progress counter, overwritten in place

    # Read only the needed columns. date_time is converted explicitly so an
    # unparseable timestamp raises instead of silently staying a string.
    df = pd.read_csv(os.path.join(csv_path, file), usecols=cols)
    df["date_time"] = pd.to_datetime(df["date_time"])

    # Average every column within each resampling bucket.
    df = df.set_index("date_time").resample(period).mean()
    # Buckets with no source rows come back as all-NaN; drop them so the
    # integer casts below cannot fail on NaN.
    df = df.dropna(subset=["zipcode", "location_id"])
    df = df.round(decimals=5).reset_index(drop=False)
    df = df.rename(
        columns={
            "Global_Horizontal_UV_Irradiance_(280-400nm)": "GHI_UV_wd",
            "Global_Horizontal_UV_Irradiance_(295-385nm)": "GHI_UV_nw",
        }
    )

    # The mean turned the identifier columns into floats; restore integers.
    # zipcode is stored as an object column holding ints.
    df["zipcode"] = df["zipcode"].astype("int64").astype("object")
    df["location_id"] = df["location_id"].astype("int64")

    if debug:
        display(df)

    df.to_sql(
        "nsrdb",
        conn,
        if_exists="append",
        index=False,
        method="multi",
        chunksize=max(1, max_sql_params // len(df.columns)),
    )

3196

In [20]:
# Commit anything still pending, then close the database connection.
conn.commit()
conn.close()