For each weather dataset:
- save the raw data to a parquet file
- join with the vegetation data to reduce the number of grid cells


In [1]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np

In [2]:
import os
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

In [3]:
# Report the Python version and the version of each core package
def check_python_version():
    """Print the interpreter version, then the pandas, geopandas, xarray and pyproj versions.

    Each entry is written as two lines: a label line followed by the version string.
    """
    import sys

    labelled_versions = (
        ("Python version", sys.version),
        ("Pandas version", pd.__version__),
        ("Geopandas version", gpd.__version__),
        ("Xarray version", xr.__version__),
        ("Pyproj version", pyproj.__version__),
    )
    for label, version in labelled_versions:
        print(label)
        print(version)

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Geopandas version
0.14.2
Xarray version
2023.6.0
Pyproj version
3.6.1


In [4]:
# Disable column-width truncation so long cell values are displayed in full
pd.set_option('display.max_colwidth', None)

In [5]:
# os.listdir returns entries in arbitrary order, but later cells depend on
# position (files[0] is the reference grid, compatible_files[5:] resumes a run).
# Sort case-insensitively for a stable order and keep only the NetCDF files.
files = sorted(
    (name for name in os.listdir('../Weather_Data/') if name.endswith('.nc')),
    key=str.lower,
)
files

['dead_fuel_moisture_1000hr.2001.2023.CA.nc',
 'dead_fuel_moisture_100hr.2001.2023.CA.nc',
 'expanded_Palmer_Drought_Severity_Index.2001.2023.CA.nc',
 'max_air_temperature.2001.2023.CA-008.nc',
 'max_relative_humidity.2001.2023.CA.nc',
 'max_wind_speed.2001.2023.CA.nc',
 'min_air_temperature.2001.2023.CA-009.nc',
 'min_relative_humidity.2001.2023.CA.nc',
 'Palmer_Drought_Severity_Index.2001.2023.CA.nc',
 'precipitation_amount.2001.2023.CA.nc',
 'specific_humidity.2001.2023.CA.nc',
 'surface_downwelling_shortwave_flux.2001.2023.CA.nc',
 'wind_from_direction.2001.2023.CA.nc',
 'wind_speed.2001.2023.CA.nc']

In [6]:
# Relative path of the directory that holds the raw NetCDF weather files
data_dir = '../Weather_Data/'
# Load the first dataset to use as a reference
# reference_ds = xr.open_dataset(os.path.join(data_dir, files[0]))

In [7]:
# Go through all the files in ../Weather_Data/ and print their dimensions,
# coordinates and data variables
for file in files:
    # Open as a context manager so each file handle is released once the
    # summary has been printed instead of staying open for the whole session
    with xr.open_dataset(os.path.join(data_dir, file)) as ds:
        print(f"File: {file}")
        print(f"Dimensions: {ds.dims}")
        print(f"Coordinates: {ds.coords}")
        print(f"Data variables: {ds.data_vars}")
        print()

File: dead_fuel_moisture_1000hr.2001.2023.CA.nc
Dimensions: Frozen({'day': 8400, 'lon': 259, 'lat': 240})
Coordinates: Coordinates:
  * day      (day) datetime64[ns] 2001-01-01 2001-01-02 ... 2023-12-31
  * lon      (lon) float64 -124.8 -124.7 -124.7 -124.6 ... -114.1 -114.1 -114.0
  * lat      (lat) float64 41.98 41.94 41.9 41.86 ... 32.15 32.11 32.07 32.03
Data variables: Data variables:
    dead_fuel_moisture_1000hr  (day, lat, lon) float32 ...

File: dead_fuel_moisture_100hr.2001.2023.CA.nc
Dimensions: Frozen({'day': 8400, 'lon': 259, 'lat': 240})
Coordinates: Coordinates:
  * day      (day) datetime64[ns] 2001-01-01 2001-01-02 ... 2023-12-31
  * lon      (lon) float64 -124.8 -124.7 -124.7 -124.6 ... -114.1 -114.1 -114.0
  * lat      (lat) float64 41.98 41.94 41.9 41.86 ... 32.15 32.11 32.07 32.03
Data variables: Data variables:
    dead_fuel_moisture_100hr  (day, lat, lon) float32 ...

File: expanded_Palmer_Drought_Severity_Index.2001.2023.CA.nc
Dimensions: Frozen({'day': 8400, 'l

In [7]:
# Decide whether two datasets share the same grid
def check_compatibility(ds1, ds2):
    """Return True when both datasets have equal dims and equal coordinates on every dim of ds1.

    Parameters
    ----------
    ds1, ds2 : objects exposing ``dims`` and ``coords`` (e.g. xarray Datasets)

    Returns
    -------
    bool
        False as soon as the dims differ or any coordinate of ``ds1`` does not
        ``equals`` its counterpart in ``ds2``; True otherwise.
    """
    if not (ds1.dims == ds2.dims):
        return False
    for dim_name in ds1.dims:
        if not ds1.coords[dim_name].equals(ds2.coords[dim_name]):
            return False
    return True

# Collect the datasets (and their file names) that share the reference grid
datasets = []
compatible_files = []

# The first file defines the reference dimensions and coordinates
reference_file = files[0]
reference_ds = xr.open_dataset(os.path.join(data_dir, reference_file))
datasets.append(reference_ds)
compatible_files.append(reference_file)

# Compare every remaining file against the reference
for file in files[1:]:
    file_name = file.split('.')[0]
    opened_ds = xr.open_dataset(os.path.join(data_dir, file))
    ds = opened_ds
    # Temperature and relative-humidity files are given a data variable named
    # after the file (e.g. max_air_temperature) so max/min variants stay distinct
    if 'temperature' in file or 'relative_humidity' in file:
        # Only the first data variable is renamed
        current_data_var_name = list(opened_ds.data_vars)[0]
        ds = opened_ds.rename({current_data_var_name: file_name})
        print(f"Renaming {current_data_var_name} to {file_name}")
    if check_compatibility(reference_ds, ds):
        # Compatible datasets are kept open on purpose: they are stored in `datasets`
        datasets.append(ds)
        compatible_files.append(file)
    else:
        print(f"Dimensions or coordinates do not match for file: {file}")
        # The rejected dataset is not used again, so release its file handle.
        # Close via the originally opened object: a renamed Dataset is a new
        # object and may not own the handle.
        opened_ds.close()

Renaming air_temperature to max_air_temperature
Renaming relative_humidity to max_relative_humidity
Renaming air_temperature to min_air_temperature
Renaming relative_humidity to min_relative_humidity
Dimensions or coordinates do not match for file: Palmer_Drought_Severity_Index.2001.2023.CA.nc


In [9]:
compatible_files

['dead_fuel_moisture_1000hr.2001.2023.CA.nc',
 'dead_fuel_moisture_100hr.2001.2023.CA.nc',
 'expanded_Palmer_Drought_Severity_Index.2001.2023.CA.nc',
 'max_air_temperature.2001.2023.CA-008.nc',
 'max_relative_humidity.2001.2023.CA.nc',
 'max_wind_speed.2001.2023.CA.nc',
 'min_air_temperature.2001.2023.CA-009.nc',
 'min_relative_humidity.2001.2023.CA.nc',
 'precipitation_amount.2001.2023.CA.nc',
 'specific_humidity.2001.2023.CA.nc',
 'surface_downwelling_shortwave_flux.2001.2023.CA.nc',
 'wind_from_direction.2001.2023.CA.nc',
 'wind_speed.2001.2023.CA.nc']

In [16]:
# Convert every compatible NetCDF file to a flat table and save it as Parquet
output_dir = '../Clean_Data/Weather_Data'
# Create the target directory up front so the first write cannot fail on a missing folder
os.makedirs(output_dir, exist_ok=True)

for file in compatible_files:
    # The context manager closes the NetCDF file as soon as its contents have
    # been materialised as a DataFrame
    with xr.open_dataset(os.path.join(data_dir, file)) as ds:
        panda_df = ds.to_dataframe().reset_index()

    # NOTE(review): the file is re-opened here, so the data-variable rename done
    # while building `datasets` is not applied; the max/min temperature and
    # humidity tables keep their original variable names. Confirm downstream
    # code expects that.
    output_file = f'{output_dir}/{file.split(".")[0]}.parquet'
    panda_df.to_parquet(output_file)

    # Print the action
    print(f"Saved {file} as {output_file}")

Saved dead_fuel_moisture_1000hr.2001.2023.CA.nc as ../Clean_Data/Weather_Data/dead_fuel_moisture_1000hr.parquet
Saved dead_fuel_moisture_100hr.2001.2023.CA.nc as ../Clean_Data/Weather_Data/dead_fuel_moisture_100hr.parquet
Saved expanded_Palmer_Drought_Severity_Index.2001.2023.CA.nc as ../Clean_Data/Weather_Data/expanded_Palmer_Drought_Severity_Index.parquet
Saved max_air_temperature.2001.2023.CA-008.nc as ../Clean_Data/Weather_Data/max_air_temperature.parquet
Saved max_relative_humidity.2001.2023.CA.nc as ../Clean_Data/Weather_Data/max_relative_humidity.parquet
Saved max_wind_speed.2001.2023.CA.nc as ../Clean_Data/Weather_Data/max_wind_speed.parquet
Saved min_air_temperature.2001.2023.CA-009.nc as ../Clean_Data/Weather_Data/min_air_temperature.parquet
Saved min_relative_humidity.2001.2023.CA.nc as ../Clean_Data/Weather_Data/min_relative_humidity.parquet
Saved precipitation_amount.2001.2023.CA.nc as ../Clean_Data/Weather_Data/precipitation_amount.parquet
Saved specific_humidity.2001.202

In [9]:
# merge all datasets
# merged_weather_dat = xr.merge(datasets)

In [10]:
# Print the dataset summary
# print(merged_weather_dat)

# Print the coordinates
# print(merged_weather_dat.coords)

# Print the data variables
# print(merged_weather_dat.data_vars)

<xarray.Dataset>
Dimensions:                                    (day: 8400, lon: 259, lat: 240)
Coordinates:
  * day                                        (day) datetime64[ns] 2001-01-0...
  * lon                                        (lon) float64 -124.8 ... -114.0
  * lat                                        (lat) float64 41.98 ... 32.03
Data variables: (12/14)
    dead_fuel_moisture_1000hr                  (day, lat, lon) float32 ...
    dead_fuel_moisture_100hr                   (day, lat, lon) float32 ...
    pdsi                                       (day, lat, lon) float32 ...
    category                                   (day, lat, lon) float32 ...
    max_air_temperature                        (day, lat, lon) float64 ...
    max_relative_humidity                      (day, lat, lon) float32 ...
    ...                                         ...
    min_relative_humidity                      (day, lat, lon) float32 ...
    precipitation_amount                       (day, 

In [11]:
# save the merged weather data
# panda_df = merged_weather_dat.to_dataframe().reset_index()

MemoryError: Unable to allocate 23.3 GiB for an array with shape (12, 522144000) and data type float32

In [15]:
# read the parquet file
# panda_df = pd.read_parquet('../Clean_Data/merged_weather_data.parquet')

In [16]:
# Drop rows in which every weather variable is missing.
# NOTE(review): `panda_df` and `merged_weather_dat` are created by cells that
# are currently commented out, so this cell fails on a fresh kernel.
panda_df = panda_df.dropna(subset=list(merged_weather_dat.data_vars), how='all')

In [17]:
panda_df.shape

(519859200, 14)

In [27]:
# Build the list of weather variables used for the second N/A filter:
# every data variable of merged_weather_dat except the two air-temperature ones
cols_wo_air_temperature = list(
    filter(
        lambda var_name: var_name not in ['max_air_temperature', 'min_air_temperature'],
        merged_weather_dat.data_vars,
    )
)
cols_wo_air_temperature

['dead_fuel_moisture_1000hr',
 'dead_fuel_moisture_100hr',
 'max_relative_humidity',
 'min_relative_humidity',
 'precipitation_amount',
 'specific_humidity',
 'surface_downwelling_shortwave_flux_in_air',
 'wind_from_direction',
 'wind_speed']

In [28]:
# Keep only rows that have at least one non-NaN value among cols_wo_air_temperature
panda_df = panda_df.dropna(subset=cols_wo_air_temperature, how='all')

In [29]:
panda_df.shape

(207328800, 14)

In [30]:
# Save the N/A-filtered weather table to Parquet (it is read back in the next cell)
panda_df.to_parquet('../Clean_Data/merged_weather_data_complete.parquet')

In [5]:
# Reload the N/A-filtered weather table from its Parquet file
panda_df = pd.read_parquet('../Clean_Data/merged_weather_data_complete.parquet')

In [6]:
panda_df.shape

(207328800, 14)

In [13]:
# Load the weather grid points matched to vegetation data
# (presumably produced by another notebook — TODO confirm the source)
lon_lat_pair_weather_match_veg = pd.read_parquet('../Clean_Data/lon_lat_pair_weather_match_veg.parquet')

In [14]:
lon_lat_pair_weather_match_veg.shape

(17703, 4)

In [19]:
lon_lat_pair_weather_match_veg.head()

Unnamed: 0,lon,lat,type,distance
2197,-124.391667,40.441667,7.0,1.826642
2198,-124.391667,40.4,25.0,1.827314
2434,-124.35,40.566667,1.0,1.824622
2435,-124.35,40.525,8.0,1.825296
2436,-124.35,40.483333,25.0,1.825969


In [20]:
# Count repeated (lon, lat) pairs in lon_lat_pair_weather_match_veg;
# a result of 0 means every pair is unique
lon_lat_pair_weather_match_veg.duplicated(subset=['lon', 'lat']).sum()

0

In [11]:
# Files from index 5 onwards — presumably the resume point after the full
# filtering loop stopped with a MemoryError; TODO confirm the index is still right
compatible_files[5:]

['max_wind_speed.2001.2023.CA.nc',
 'min_air_temperature.2001.2023.CA-009.nc',
 'min_relative_humidity.2001.2023.CA.nc',
 'precipitation_amount.2001.2023.CA.nc',
 'specific_humidity.2001.2023.CA.nc',
 'surface_downwelling_shortwave_flux.2001.2023.CA.nc',
 'wind_from_direction.2001.2023.CA.nc',
 'wind_speed.2001.2023.CA.nc']

In [21]:
# Filter each saved weather table down to the grid points matched to vegetation data
# The join keys are identical for every file, so select them once
veg_grid_points = lon_lat_pair_weather_match_veg[['lon', 'lat']]
# Make sure the output folder exists before the first write
os.makedirs('../Clean_Data/Weather_Data_w_Veg_Filter', exist_ok=True)

for file in compatible_files:
    # print the file name
    print(file)
    # Drop the references to the previous iteration's frames before loading the
    # next file; otherwise two full-size tables are alive at the same time,
    # which raises peak memory (a run of this loop ended in a MemoryError)
    panda_df = panda_df_filtered = None
    panda_df = pd.read_parquet(f'../Clean_Data/Weather_Data/{file.split(".")[0]}.parquet')
    # Inner join keeps only the rows whose (lon, lat) is in the vegetation-matched grid
    panda_df_filtered = pd.merge(veg_grid_points, panda_df, on=['lon', 'lat'], how='inner')
    # Save the filtered DataFrame as a Parquet file
    output_file = f'../Clean_Data/Weather_Data_w_Veg_Filter/{file.split(".")[0]}_filtered.parquet'
    panda_df_filtered.to_parquet(output_file)
    # show # of rows before and after merging
    print(panda_df.shape, panda_df_filtered.shape)
    # print separator
    print("="*50)

dead_fuel_moisture_1000hr.2001.2023.CA.nc
(522144000, 4) (148705200, 4)
dead_fuel_moisture_100hr.2001.2023.CA.nc
(522144000, 4) (148705200, 4)
expanded_Palmer_Drought_Severity_Index.2001.2023.CA.nc
(522144000, 5) (148705200, 5)
max_air_temperature.2001.2023.CA-008.nc
(522144000, 4) (148705200, 4)
max_relative_humidity.2001.2023.CA.nc
(522144000, 4) (148705200, 4)
max_wind_speed.2001.2023.CA.nc
(522144000, 4) (148705200, 4)
min_air_temperature.2001.2023.CA-009.nc


MemoryError: Unable to allocate 1.11 GiB for an array with shape (148705200,) and data type int64

In [15]:
# Resume the vegetation-grid filtering from index 5 of compatible_files
# The join keys are identical for every file, so select them once
veg_grid_points = lon_lat_pair_weather_match_veg[['lon', 'lat']]
# Make sure the output folder exists before the first write
os.makedirs('../Clean_Data/Weather_Data_w_Veg_Filter', exist_ok=True)

for file in compatible_files[5:]:
    # print the file name
    print(file)
    # Drop the references to the previous iteration's frames before loading the
    # next file so that only one full-size table is held in memory at a time
    panda_df = panda_df_filtered = None
    panda_df = pd.read_parquet(f'../Clean_Data/Weather_Data/{file.split(".")[0]}.parquet')
    # Inner join keeps only the rows whose (lon, lat) is in the vegetation-matched grid
    panda_df_filtered = pd.merge(veg_grid_points, panda_df, on=['lon', 'lat'], how='inner')
    # Save the filtered DataFrame as a Parquet file
    output_file = f'../Clean_Data/Weather_Data_w_Veg_Filter/{file.split(".")[0]}_filtered.parquet'
    panda_df_filtered.to_parquet(output_file)
    # show # of rows before and after merging
    print(panda_df.shape, panda_df_filtered.shape)
    # print separator
    print("="*50)

max_wind_speed.2001.2023.CA.nc
(522144000, 4) (148705200, 4)
min_air_temperature.2001.2023.CA-009.nc
(522144000, 4) (148705200, 4)
min_relative_humidity.2001.2023.CA.nc
(522144000, 4) (148705200, 4)
precipitation_amount.2001.2023.CA.nc
(522144000, 4) (148705200, 4)
specific_humidity.2001.2023.CA.nc
(522144000, 4) (148705200, 4)
surface_downwelling_shortwave_flux.2001.2023.CA.nc
(522144000, 4) (148705200, 4)
wind_from_direction.2001.2023.CA.nc
(522144000, 4) (148705200, 4)
wind_speed.2001.2023.CA.nc
(522144000, 4) (148705200, 4)


In [14]:
# in panda_df, drop rows where lon and lat are not in lon_lat_pair_weather_match_veg using merge
# weather_dat = panda_df.merge(lon_lat_pair_weather_match_veg[['lon', 'lat','type']], on=['lon', 'lat'], how='inner')
# weather_dat.shape

(146193600, 14)

In [None]:
# weather_dat.to_parquet('../Clean_Data/merged_weather_data_w_veg.parquet')

In [36]:
# old code, only save limited locations where fire event happened
# panda_df.to_parquet('../Clean_Data/merged_weather_data_limited_locations.parquet')