<a href="https://colab.research.google.com/github/pakdaniel/MECS-4995-AML-Group-24/blob/main/check_CASTNET_files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark
!pip install findspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=6b151c3093cc20ebb598f2f925ee87b2f813fca5e8c80f8a97d7268ce5e18c4c
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

spark = (SparkSession
         .builder
         .master("local[*]")
         .appName("Check CASTNET Files")
         .config("spark.ui.port", "4050")
         .getOrCreate())

spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [4]:
import os
import shutil
import pandas as pd
import glob
from typing import List

FILE_OUTPUT = "output"
if not os.path.exists(FILE_OUTPUT):
    os.makedirs(FILE_OUTPUT)
SHELL_SCRIPTS = "shell_scripts"
if not os.path.exists(SHELL_SCRIPTS):
    os.makedirs(SHELL_SCRIPTS)

get_filename = lambda p: os.path.normpath(p).split(os.path.sep)[-1]

def check_files_for_schema_match(files: List[str]) -> None:
  """
  Checks .csv files to see if they are all the same
  """
  df_current = None
  for i in range(0, len(files)-1):
    if df_current is None:
      df_current = pd.read_csv(files[0])
    df_next = pd.read_csv(files[i+1])

    current_file = get_filename(files[i])
    next_file = get_filename(files[i+1])

    if df_current.equals(df_next):
      print(f"{current_file} and {next_file} are the same")
    else:
      print(f"{current_file} and {next_file} are NOT the same")

def set_df_columns_nullable(spark, df, column_list, nullable=True):
    for struct_field in df.schema:
        if struct_field.name in column_list:
            struct_field.nullable = nullable
    df_mod = spark.createDataFrame(df.rdd, df.schema)
    return df_mod


csv_options = {
    "header": True,
    "sep": ",",
    "inferSchema": False,
    "mode": "PERMISSIVE"
}

# Hourly Gas

## Download files (run this on Colab)

In [5]:
sh = """for y in {2013..2022}
do
    wget --no-check-certificate "https://gaftp.epa.gov/castnet/CASTNET_Outgoing/data/hourly_gas_$y.zip"
    mkdir -p hourly_gas/$y
    unzip hourly_gas_$y.zip -d hourly_gas/$y
    rm hourly_gas_$y.zip
    sleep 1
done
"""
with open(os.path.join(SHELL_SCRIPTS, "hourly_gas.sh"), "w") as file:
  file.write(sh)

In [6]:
!bash shell_scripts/hourly_gas.sh

--2023-11-01 02:50:20--  https://gaftp.epa.gov/castnet/CASTNET_Outgoing/data/hourly_gas_2013.zip
Resolving gaftp.epa.gov (gaftp.epa.gov)... 134.67.100.99, 2620:117:506f:c7::f063
Connecting to gaftp.epa.gov (gaftp.epa.gov)|134.67.100.99|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 1429534 (1.4M) [application/zip]
Saving to: ‘hourly_gas_2013.zip’


2023-11-01 02:50:20 (10.5 MB/s) - ‘hourly_gas_2013.zip’ saved [1429534/1429534]

Archive:  hourly_gas_2013.zip
  inflating: hourly_gas/2013/hourly_gas_2013.csv  
  inflating: hourly_gas/2013/hourly_gas_2013_columninfo.csv  
  inflating: hourly_gas/2013/hourly_gas_2013_tableinfo.csv  
--2023-11-01 02:50:21--  https://gaftp.epa.gov/castnet/CASTNET_Outgoing/data/hourly_gas_2014.zip
Resolving gaftp.epa.gov (gaftp.epa.gov)... 134.67.100.99, 2620:117:506f:c7::f063
Connecting to gaftp.epa.gov (gaftp.epa.gov)|134.67.100.99|:443... connected.
  Unable to locally verify the 

## Check schema by reading the metadata *_columninfo.csv

In [7]:
hg_columninfo_files = os.path.join("hourly_gas", "*", "*_columninfo.csv")
hg_columninfo_files = glob.glob(hg_columninfo_files, recursive=True)
hg_columninfo_files.sort()

check_files_for_schema_match(hg_columninfo_files)

hourly_gas_2013_columninfo.csv and hourly_gas_2014_columninfo.csv are the same
hourly_gas_2014_columninfo.csv and hourly_gas_2015_columninfo.csv are the same
hourly_gas_2015_columninfo.csv and hourly_gas_2016_columninfo.csv are the same
hourly_gas_2016_columninfo.csv and hourly_gas_2017_columninfo.csv are the same
hourly_gas_2017_columninfo.csv and hourly_gas_2018_columninfo.csv are the same
hourly_gas_2018_columninfo.csv and hourly_gas_2019_columninfo.csv are the same
hourly_gas_2019_columninfo.csv and hourly_gas_2020_columninfo.csv are the same
hourly_gas_2020_columninfo.csv and hourly_gas_2021_columninfo.csv are the same
hourly_gas_2021_columninfo.csv and hourly_gas_2022_columninfo.csv are the same


In [8]:
display(pd.read_csv(hg_columninfo_files[0]).sort_values("COLUMN_ID"))

Unnamed: 0,COLUMN_ID,COLUMN_NAME,UNIT,DESCRIPTION,DATA_TYPE,DATA_LENGTH,PRIMARY_KEY
0,1,SITE_ID,,Site identification code,CHAR,6,1.0
1,2,DATE_TIME,,"Date and time sample collection began, Local S...",DATE,19,2.0
2,3,PARAMETER,,Abbreviated chemical symbol of parameter measured,CHAR,10,3.0
3,4,VALUE,ppb,Numeric value of parameter measured; ppb.,NUMBER,164,
4,5,VALUE_F,,Data quality code for VALUE,CHAR,1,
5,6,QA_CODE,,Quality assurance level of the record. (see QA...,CHAR,2,
6,7,UPDATE_DATE,,Date and time of last record update,DATE,19,


In [9]:
schema = StructType()

schema.add("SITE_ID", StringType(), False)
schema.add("DATE_TIME", StringType(), False)
schema.add("PARAMETER", StringType(), False)
schema.add("VALUE", DecimalType(16,4))
schema.add("VALUE_F", StringType())
schema.add("QA_CODE", StringType())
schema.add("UPDATE_DATE", StringType())

hg_files = os.path.join("hourly_gas", "*", "hourly_gas_20[1-2][0-9].csv")

df_hourly_gas = (spark
               .read
               .format("csv")
               .schema(schema)
               .options(**csv_options)
               .load(hg_files)
               )
df_hourly_gas = (df_hourly_gas
               .withColumn("DATE_TIME", F.to_timestamp("DATE_TIME"))
               .withColumn("UPDATE_DATE", F.to_timestamp("DATE_TIME"))
               .withColumn("Filename", F.element_at(F.split(F.input_file_name(), "/"), -1))
)

df_hourly_gas.printSchema()

root
 |-- SITE_ID: string (nullable = true)
 |-- DATE_TIME: timestamp (nullable = true)
 |-- PARAMETER: string (nullable = true)
 |-- VALUE: decimal(16,4) (nullable = true)
 |-- VALUE_F: string (nullable = true)
 |-- QA_CODE: string (nullable = true)
 |-- UPDATE_DATE: timestamp (nullable = true)
 |-- Filename: string (nullable = true)



In [10]:
display(df_hourly_gas.limit(10))

SITE_ID,DATE_TIME,PARAMETER,VALUE,VALUE_F,QA_CODE,UPDATE_DATE,Filename
BVL130,2020-01-01 00:00:00,CO,270.3,I,3,2020-01-01 00:00:00,hourly_gas_2020.csv
BVL130,2020-01-01 00:00:00,NO,0.022,U,3,2020-01-01 00:00:00,hourly_gas_2020.csv
BVL130,2020-01-01 00:00:00,NOY,2.178,,3,2020-01-01 00:00:00,hourly_gas_2020.csv
BVL130,2020-01-01 00:00:00,NOYDIF,2.155,,3,2020-01-01 00:00:00,hourly_gas_2020.csv
BVL130,2020-01-01 00:00:00,SO2_GA,0.373,U,3,2020-01-01 00:00:00,hourly_gas_2020.csv
BVL130,2020-01-01 01:00:00,CO,227.5,I,3,2020-01-01 01:00:00,hourly_gas_2020.csv
BVL130,2020-01-01 01:00:00,NO,0.029,U,3,2020-01-01 01:00:00,hourly_gas_2020.csv
BVL130,2020-01-01 01:00:00,NOY,2.588,,3,2020-01-01 01:00:00,hourly_gas_2020.csv
BVL130,2020-01-01 01:00:00,NOYDIF,2.559,,3,2020-01-01 01:00:00,hourly_gas_2020.csv
BVL130,2020-01-01 01:00:00,SO2_GA,0.449,U,3,2020-01-01 01:00:00,hourly_gas_2020.csv


In [11]:
display(df_hourly_gas.groupBy("Filename").count().orderBy("Filename"))

Filename,count
hourly_gas_2013.csv,208824
hourly_gas_2014.csv,200952
hourly_gas_2015.csv,324120
hourly_gas_2016.csv,326112
hourly_gas_2017.csv,311280
hourly_gas_2018.csv,328608
hourly_gas_2019.csv,341637
hourly_gas_2020.csv,342576
hourly_gas_2021.csv,341640
hourly_gas_2022.csv,341640


In [12]:
output_folder = "hourly_gas_combined"
df_hourly_gas.coalesce(1).write.format("parquet").mode("append").save(os.path.join(FILE_OUTPUT, output_folder))

parquet_file = [i for i in os.listdir(os.path.join(FILE_OUTPUT, output_folder)) if i.endswith("parquet")]
assert len(parquet_file) == 1, "Did you run this cell multiple times?"
parquet_file = parquet_file[0]

shutil.move(os.path.join(FILE_OUTPUT, output_folder, parquet_file), os.path.join(FILE_OUTPUT, f"{output_folder}.snappy.parquet"))
shutil.rmtree(os.path.join(FILE_OUTPUT, output_folder))

In [13]:
df_hourly_gas.unpersist()
del df_hourly_gas

In [14]:
df = pd.read_parquet(os.path.join(FILE_OUTPUT, "hourly_gas_combined.snappy.parquet"))
df = df.groupby(["Filename"])["Filename"].count()
display(df)

Filename
hourly_gas_2013.csv    208824
hourly_gas_2014.csv    200952
hourly_gas_2015.csv    324120
hourly_gas_2016.csv    326112
hourly_gas_2017.csv    311280
hourly_gas_2018.csv    328608
hourly_gas_2019.csv    341637
hourly_gas_2020.csv    342576
hourly_gas_2021.csv    341640
hourly_gas_2022.csv    341640
Name: Filename, dtype: int64

# Meteorlogical

## Download files (run this on Colab)

In [15]:
sh = """for y in {2013..2022}
do
    wget --no-check-certificate "https://gaftp.epa.gov/castnet/CASTNET_Outgoing/data/metdata_$y.zip"
    mkdir -p metdata/$y
    unzip metdata_$y.zip -d metdata/$y
    rm metdata_$y.zip
    sleep 1
done
"""
with open(os.path.join(SHELL_SCRIPTS, "metdata.sh"), "w") as file:
  file.write(sh)

In [16]:
!bash shell_scripts/metdata.sh

--2023-11-01 02:52:18--  https://gaftp.epa.gov/castnet/CASTNET_Outgoing/data/metdata_2013.zip
Resolving gaftp.epa.gov (gaftp.epa.gov)... 134.67.100.99, 2620:117:506f:c7::f063
Connecting to gaftp.epa.gov (gaftp.epa.gov)|134.67.100.99|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 13482548 (13M) [application/zip]
Saving to: ‘metdata_2013.zip’


2023-11-01 02:52:18 (37.2 MB/s) - ‘metdata_2013.zip’ saved [13482548/13482548]

Archive:  metdata_2013.zip
  inflating: metdata/2013/metdata_2013.csv  
  inflating: metdata/2013/metdata_2013_columninfo.csv  
  inflating: metdata/2013/metdata_2013_tableinfo.csv  
--2023-11-01 02:52:20--  https://gaftp.epa.gov/castnet/CASTNET_Outgoing/data/metdata_2014.zip
Resolving gaftp.epa.gov (gaftp.epa.gov)... 134.67.100.99, 2620:117:506f:c7::f063
Connecting to gaftp.epa.gov (gaftp.epa.gov)|134.67.100.99|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP reques

In [17]:
md_columninfo_files = os.path.join("metdata", "*", "*_columninfo.csv")
md_columninfo_files = glob.glob(md_columninfo_files, recursive=True)
md_columninfo_files.sort()

check_files_for_schema_match(md_columninfo_files)

metdata_2013_columninfo.csv and metdata_2014_columninfo.csv are the same
metdata_2014_columninfo.csv and metdata_2015_columninfo.csv are the same
metdata_2015_columninfo.csv and metdata_2016_columninfo.csv are the same
metdata_2016_columninfo.csv and metdata_2017_columninfo.csv are the same
metdata_2017_columninfo.csv and metdata_2018_columninfo.csv are the same
metdata_2018_columninfo.csv and metdata_2019_columninfo.csv are the same
metdata_2019_columninfo.csv and metdata_2020_columninfo.csv are the same
metdata_2020_columninfo.csv and metdata_2021_columninfo.csv are the same
metdata_2021_columninfo.csv and metdata_2022_columninfo.csv are the same


In [18]:
display(pd.read_csv(md_columninfo_files[0]).sort_values("COLUMN_ID"))

Unnamed: 0,COLUMN_ID,COLUMN_NAME,UNIT,DESCRIPTION,DATA_TYPE,DATA_LENGTH,PRIMARY_KEY
0,1,SITE_ID,,Site identification code,CHAR,6,1.0
1,2,DATE_TIME,,"Date and time sample collection began, Local S...",DATE,19,2.0
2,3,TEMPERATURE,degrees c,Temperature; degrees C.,NUMBER,164,
3,4,TEMPERATURE_F,,Data quality code for TEMPERATURE,CHAR,3,
4,5,TEMPERATURE_DELTA,degrees c,Temperature difference between 9m and 2m probe...,NUMBER,164,
5,6,TEMPERATURE_DELTA_F,,Data quality code for TEMPERATURE_DELTA,CHAR,3,
6,7,RELATIVE_HUMIDITY,%,Percent of relative humidity,NUMBER,164,
7,8,RELATIVE_HUMIDITY_F,,Data quality code for RELATIVE_HUMIDITY,CHAR,3,
8,9,SOLAR_RADIATION,watt/square meter,Solar Radiation; watt/square meter.,NUMBER,164,
9,10,SOLAR_RADIATION_F,,Data quality code for SOLAR_RADIATION,CHAR,3,


In [19]:
schema = StructType()
schema.add("SITE_ID", StringType(), False)
schema.add("DATE_TIME", StringType(), False)
schema.add("TEMPERATURE", DecimalType(16,4))
schema.add("TEMPERATURE_F", StringType())
schema.add("TEMPERATURE_DELTA", DecimalType(16,4))
schema.add("TEMPERATURE_DELTA_F", StringType())
schema.add("RELATIVE_HUMIDITY", DecimalType(16,4))
schema.add("RELATIVE_HUMIDITY_F", StringType())
schema.add("SOLAR_RADIATION", DecimalType(16,4))
schema.add("SOLAR_RADIATION_F", StringType())
schema.add("OZONE", DecimalType(16,4))
schema.add("OZONE_F", StringType())
schema.add("PRECIPITATION", DecimalType(16,4))
schema.add("PRECIPITATION_F", StringType())
schema.add("WINDSPEED", DecimalType(16,4))
schema.add("WINDSPEED_F", StringType())
schema.add("WIND_DIRECTION", DecimalType(16,4))
schema.add("WIND_DIRECTION_F", StringType())
schema.add("SIGMA_THETA", DecimalType(16,4))
schema.add("SIGMA_THETA_F", StringType())
schema.add("FLOW_RATE", DecimalType(16,4))
schema.add("FLOW_RATE_F", StringType())
schema.add("WINDSPEED_SCALAR", DecimalType(16,4))
schema.add("WINDSPEED_SCALAR_F", StringType())
schema.add("WETNESS", DecimalType(16,4))
schema.add("WETNESS_F", StringType())
schema.add("SHELTER_TEMPERATURE", DecimalType(16,4))
schema.add("SHELTER_TEMPERATURE_F", StringType())
schema.add("QA_CODE", StringType())
schema.add("UPDATE_DATE", StringType())


md_files = os.path.join("metdata", "*", "metdata_20[1-2][0-9].csv")

df_metdata = (spark
               .read
               .format("csv")
               .schema(schema)
               .options(**csv_options)
               .load(md_files)
               )
df_metdata = (df_metdata
               .withColumn("DATE_TIME", F.to_timestamp("DATE_TIME"))
               .withColumn("UPDATE_DATE", F.to_timestamp("DATE_TIME"))
               .withColumn("Filename", F.element_at(F.split(F.input_file_name(), "/"), -1))
              )
df_metdata.printSchema()

root
 |-- SITE_ID: string (nullable = true)
 |-- DATE_TIME: timestamp (nullable = true)
 |-- TEMPERATURE: decimal(16,4) (nullable = true)
 |-- TEMPERATURE_F: string (nullable = true)
 |-- TEMPERATURE_DELTA: decimal(16,4) (nullable = true)
 |-- TEMPERATURE_DELTA_F: string (nullable = true)
 |-- RELATIVE_HUMIDITY: decimal(16,4) (nullable = true)
 |-- RELATIVE_HUMIDITY_F: string (nullable = true)
 |-- SOLAR_RADIATION: decimal(16,4) (nullable = true)
 |-- SOLAR_RADIATION_F: string (nullable = true)
 |-- OZONE: decimal(16,4) (nullable = true)
 |-- OZONE_F: string (nullable = true)
 |-- PRECIPITATION: decimal(16,4) (nullable = true)
 |-- PRECIPITATION_F: string (nullable = true)
 |-- WINDSPEED: decimal(16,4) (nullable = true)
 |-- WINDSPEED_F: string (nullable = true)
 |-- WIND_DIRECTION: decimal(16,4) (nullable = true)
 |-- WIND_DIRECTION_F: string (nullable = true)
 |-- SIGMA_THETA: decimal(16,4) (nullable = true)
 |-- SIGMA_THETA_F: string (nullable = true)
 |-- FLOW_RATE: decimal(16,4) (

In [20]:
display(df_metdata.limit(10))

SITE_ID,DATE_TIME,TEMPERATURE,TEMPERATURE_F,TEMPERATURE_DELTA,TEMPERATURE_DELTA_F,RELATIVE_HUMIDITY,RELATIVE_HUMIDITY_F,SOLAR_RADIATION,SOLAR_RADIATION_F,OZONE,OZONE_F,PRECIPITATION,PRECIPITATION_F,WINDSPEED,WINDSPEED_F,WIND_DIRECTION,WIND_DIRECTION_F,SIGMA_THETA,SIGMA_THETA_F,FLOW_RATE,FLOW_RATE_F,WINDSPEED_SCALAR,WINDSPEED_SCALAR_F,WETNESS,WETNESS_F,SHELTER_TEMPERATURE,SHELTER_TEMPERATURE_F,QA_CODE,UPDATE_DATE,Filename
ABT147,2021-01-01 00:00:00,-1.625,,,M,,M,,M,24.74,,,M,,M,,M,,M,1.501,,,M,,M,23.47,,3,2021-01-01 00:00:00,metdata_2021.csv
ABT147,2021-01-01 01:00:00,-1.783,,,M,,M,,M,24.27,<,,M,,M,,M,,M,1.501,,,M,,M,23.46,,3,2021-01-01 01:00:00,metdata_2021.csv
ABT147,2021-01-01 02:00:00,-2.433,,,M,,M,,M,22.12,Y,,M,,M,,M,,M,1.501,,,M,,M,23.33,,3,2021-01-01 02:00:00,metdata_2021.csv
ABT147,2021-01-01 03:00:00,-2.287,,,M,,M,,M,23.58,,,M,,M,,M,,M,1.501,,,M,,M,23.57,,3,2021-01-01 03:00:00,metdata_2021.csv
ABT147,2021-01-01 04:00:00,-2.469,,,M,,M,,M,24.04,,,M,,M,,M,,M,1.501,,,M,,M,23.43,,3,2021-01-01 04:00:00,metdata_2021.csv
ABT147,2021-01-01 05:00:00,-2.629,,,M,,M,,M,24.89,,,M,,M,,M,,M,1.501,,,M,,M,23.71,,3,2021-01-01 05:00:00,metdata_2021.csv
ABT147,2021-01-01 06:00:00,-2.961,,,M,,M,,M,24.49,,,M,,M,,M,,M,1.501,,,M,,M,23.59,,3,2021-01-01 06:00:00,metdata_2021.csv
ABT147,2021-01-01 07:00:00,-2.855,,,M,,M,,M,25.61,,,M,,M,,M,,M,1.501,,,M,,M,23.48,,3,2021-01-01 07:00:00,metdata_2021.csv
ABT147,2021-01-01 08:00:00,-1.713,,,M,,M,,M,26.35,,,M,,M,,M,,M,1.501,,,M,,M,23.39,,3,2021-01-01 08:00:00,metdata_2021.csv
ABT147,2021-01-01 09:00:00,-0.657,,,M,,M,,M,27.23,,,M,,M,,M,,M,1.501,,,M,,M,23.23,,3,2021-01-01 09:00:00,metdata_2021.csv


In [21]:
display(df_metdata.groupBy("Filename").count().orderBy("Filename"))

Filename,count
metdata_2013.csv,795926
metdata_2014.csv,817272
metdata_2015.csv,831024
metdata_2016.csv,831648
metdata_2017.csv,836783
metdata_2018.csv,845064
metdata_2019.csv,847584
metdata_2020.csv,870984
metdata_2021.csv,883344
metdata_2022.csv,884184


In [22]:
output_folder = "metdata_combined"
df_metdata.coalesce(1).write.format("parquet").mode("append").save(os.path.join(FILE_OUTPUT, output_folder))

parquet_file = [i for i in os.listdir(os.path.join(FILE_OUTPUT, output_folder)) if i.endswith("parquet")]
assert len(parquet_file) == 1, "Did you run this cell multiple times?"
parquet_file = parquet_file[0]

shutil.move(os.path.join(FILE_OUTPUT, output_folder, parquet_file), os.path.join(FILE_OUTPUT, f"{output_folder}.snappy.parquet"))
shutil.rmtree(os.path.join(FILE_OUTPUT, output_folder))

In [23]:
df_metdata.unpersist()
del df_metdata

In [25]:
df = pd.read_parquet(os.path.join(FILE_OUTPUT, "metdata_combined.snappy.parquet"))
df = df.groupby(["Filename"])["Filename"].count()
display(df)

Filename
metdata_2013.csv    795926
metdata_2014.csv    817272
metdata_2015.csv    831024
metdata_2016.csv    831648
metdata_2017.csv    836783
metdata_2018.csv    845064
metdata_2019.csv    847584
metdata_2020.csv    870984
metdata_2021.csv    883344
metdata_2022.csv    884184
Name: Filename, dtype: int64

# Site

In [26]:
sh = """
  wget --no-check-certificate "https://gaftp.epa.gov/castnet/CASTNET_Outgoing/data/site.zip"
  mkdir -p site
  unzip site.zip -d site
  rm -rf site.zip
  cp site/site.csv output/site.csv
  rm -rf site
"""
with open(os.path.join(SHELL_SCRIPTS, "site.sh"), "w") as file:
  file.write(sh)

In [27]:
!bash shell_scripts/site.sh

--2023-11-01 02:59:07--  https://gaftp.epa.gov/castnet/CASTNET_Outgoing/data/site.zip
Resolving gaftp.epa.gov (gaftp.epa.gov)... 134.67.100.99, 2620:117:506f:c7::f063
Connecting to gaftp.epa.gov (gaftp.epa.gov)|134.67.100.99|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 9683 (9.5K) [application/zip]
Saving to: ‘site.zip’


2023-11-01 02:59:07 (137 MB/s) - ‘site.zip’ saved [9683/9683]

Archive:  site.zip
  inflating: site/site.csv           
  inflating: site/site_columninfo.csv  
  inflating: site/site_tableinfo.csv  


In [30]:
os.stat("output/metdata_combined.snappy.parquet").st_size/(1024**2)

107.9228572845459