For each weather dataset that has already been filtered with the vegetation data (output of the 02 03 step):
- join it with the subregion data for further filtering

In [2]:
# Force a garbage-collection pass before any data is loaded.
# The cell displays the value returned by gc.collect().
import gc
gc.collect()

0

In [3]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np

In [4]:
from datetime import datetime

In [5]:
import os
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

In [6]:
# Report the interpreter version plus the pandas and pyproj versions in use
def check_python_version():
    """Print the Python, pandas and pyproj versions, each under its own heading."""
    import sys

    # (heading, version string) pairs, printed in this order
    versions = (
        ("Python version", sys.version),
        ("Pandas version", pd.__version__),
        ("Pyproj version", pyproj.__version__),
    )
    for heading, version in versions:
        print(heading)
        print(version)

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Pyproj version
3.6.1


In [7]:
# Never truncate cell contents when DataFrames are displayed
pd.options.display.max_colwidth = None

In [8]:
# Directory that is scanned for the weather parquet files
# (per its name, the output of the vegetation-filter step — TODO confirm)
path_to_read = '../../Clean_Data/Weather_Data/Combined_Weather_Data_w_Veg_Filter/'
# List every entry in that directory and display the list
files = os.listdir(path_to_read)
files

['dead_fuel_moisture_1000h.parquet',
 'dead_fuel_moisture_100h.parquet',
 'max_air_temperature.parquet',
 'max_relative_humidity.parquet',
 'min_air_temperature.parquet',
 'min_relative_humidity.parquet',
 'precipitation_amount.parquet',
 'specific_humidity.parquet',
 'surface_downwelling_shortwave_flux.parquet',
 'SWE.parquet',
 'wind_from_direction.parquet',
 'wind_speed.parquet']

In [9]:
# Location of the lon/lat -> SubRegion lookup table
path_to_subregion = '../../Clean_Data/Extended_Data_w_Veg_Filter/SubRegion'
file_name = 'lon_lat_pair_weather_match_subregion.parquet'
# Read the named file explicitly. Previously `file_name` was unused and the
# whole directory was passed to read_parquet, which would silently pick up
# any other parquet file placed next to the intended one.
subregion = pd.read_parquet(os.path.join(path_to_subregion, file_name))

In [11]:
# Preview the first rows of the subregion table
subregion.head()

Unnamed: 0,lon,lat,SubRegion
0,-124.391667,40.441667,North Coast
2,-124.35,40.525,North Coast
3,-124.35,40.483333,North Coast
4,-124.35,40.441667,North Coast
5,-124.35,40.4,North Coast


In [12]:
# (rows, columns) of the subregion table
subregion.shape

(13048, 3)

In [13]:
# True when every (lon, lat) pair occurs exactly once in the subregion table
len(subregion.drop_duplicates(subset=['lon', 'lat'])) == len(subregion)

True

In [14]:
path_to_read = '../../Clean_Data/Weather_Data/Combined_Weather_Data_w_Veg_Filter/'
path_to_save = '../../Clean_Data/Weather_Data/Combined_Weather_Data_w_Veg_SubRegion_Filter/'
# Ensure the output directory exists (exist_ok avoids the check-then-create race)
os.makedirs(path_to_save, exist_ok=True)


def _record_missing_rates(frame, column, store):
    """Record the per-year share of null values of ``column`` in ``store``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table that can be grouped by ``'year'`` and contains ``column``.
    column : str
        Name of the column whose missing rate is measured.
    store : dict
        Nested ``{year: {column: rate}}`` mapping; updated in place.
    """
    rates = frame.groupby('year')[column].apply(lambda values: values.isnull().mean())
    for year, rate in rates.items():
        store.setdefault(year, {})[column] = rate


missing_rate_dict_before_merge = {}
missing_rate_dict_after_merge = {}
log_messages = []
log_messages.append("Task: Data cleaning on combined weather data with subregion filter")
log_messages.append(f"Processing started on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Only parquet files are processed, so a stray non-parquet entry in the folder
# cannot crash the loop. The case-insensitive sort makes the processing order
# (and therefore the column order of the missing-rate tables) deterministic.
weather_files = sorted(
    (name for name in os.listdir(path_to_read) if name.lower().endswith('.parquet')),
    key=str.lower,
)

for file in tqdm(weather_files):

    log_messages.append("-" * 50)
    weather_dat = pd.read_parquet(f'{path_to_read}/{file}')
    log_messages.append(f"Processing file: {file}, shape: {weather_dat.shape}")

    # The weather variable is taken positionally from the 4th column
    # NOTE(review): assumes every input file shares this column layout — confirm
    weather_variable = weather_dat.columns[3]

    # Missing rate per year before the subregion filter
    _record_missing_rates(weather_dat, weather_variable, missing_rate_dict_before_merge)

    # Keep only the rows whose (lon, lat) pair appears in the subregion table.
    # validate='many_to_one' raises if the subregion keys are not unique, which
    # would otherwise silently duplicate weather rows.
    dat_filtered = pd.merge(
        weather_dat,
        subregion[['lon', 'lat']],
        on=['lon', 'lat'],
        how='inner',
        validate='many_to_one',
    )
    log_messages.append(f"Filtered DataFrame shape: {dat_filtered.shape}")
    dat_filtered.to_parquet(f'{path_to_save}/{file}')
    log_messages.append(f"Saved filtered data to: {path_to_save}/{file}")

    # Missing rate per year after the subregion filter
    _record_missing_rates(dat_filtered, weather_variable, missing_rate_dict_after_merge)

    # Release the large frames before loading the next file
    del weather_dat, dat_filtered
    gc.collect()

100%|██████████| 12/12 [14:19<00:00, 71.60s/it]


In [15]:
# Convert the dictionary to a DataFrame
missing_rate_df_before = pd.DataFrame.from_dict(missing_rate_dict_before_merge, orient='index').sort_index()
missing_rate_df_before.index.name = 'year'

In [16]:
# Display the per-year missing rates computed before the subregion merge
missing_rate_df_before

Unnamed: 0_level_0,dead_fuel_moisture_1000hr,dead_fuel_moisture_100hr,max_air_temperature,max_relative_humidity,min_air_temperature,min_relative_humidity,precipitation_amount,specific_humidity,surface_downwelling_shortwave_flux_in_air,SWE,wind_from_direction,wind_speed
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1993,,,,,,,,,,0.03963,,
1994,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.03963,0.023152,0.023152
1995,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.03963,0.023152,0.023152
1996,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.03963,0.023152,0.023152
1997,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.03963,0.023152,0.023152
1998,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.03963,0.023152,0.023152
1999,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.03963,0.023152,0.023152
2000,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.023152,0.03963,0.023152,0.023152
2001,0.020371,0.020371,0.000417,0.020371,0.000417,0.020371,0.746026,0.020371,0.020371,0.03963,0.0218,0.020371
2002,0.020371,0.020371,0.000417,0.020371,0.000417,0.020372,0.81749,0.020371,0.020371,0.03963,0.021884,0.020371


In [17]:
# Convert the dictionary to a DataFrame
missing_rate_df_after = pd.DataFrame.from_dict(missing_rate_dict_after_merge, orient='index').sort_index()
missing_rate_df_after.index.name = 'year'

In [18]:
# Display the per-year missing rates computed after the subregion merge
missing_rate_df_after

Unnamed: 0_level_0,dead_fuel_moisture_1000hr,dead_fuel_moisture_100hr,max_air_temperature,max_relative_humidity,min_air_temperature,min_relative_humidity,precipitation_amount,specific_humidity,surface_downwelling_shortwave_flux_in_air,SWE,wind_from_direction,wind_speed
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1993,,,,,,,,,,0.017091,,
1994,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.017091,0.003602,0.003602
1995,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.017091,0.003602,0.003602
1996,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.017091,0.003602,0.003602
1997,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.017091,0.003602,0.003602
1998,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.017091,0.003602,0.003602
1999,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.017091,0.003602,0.003602
2000,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.003602,0.017091,0.003602,0.003602
2001,0.000996,0.000996,0.000307,0.000996,0.000307,0.000996,0.733842,0.000996,0.000996,0.017091,0.002501,0.000996
2002,0.000996,0.000996,0.000307,0.000996,0.000307,0.000997,0.807864,0.000996,0.000996,0.017091,0.002599,0.000996


In [19]:
path_to_save = '../../Summary_Data/Missing_Rate/'
# Ensure the output directory exists; exist_ok=True keeps the cell re-runnable
# without a separate check-then-create step
os.makedirs(path_to_save, exist_ok=True)

# Write the missing-rate tables from before and after the subregion merge
missing_rate_df_before.to_csv(os.path.join(path_to_save, 'weather_data_w_veg_filter.csv'))
missing_rate_df_after.to_csv(os.path.join(path_to_save, 'weather_data_w_veg_subregion_filter.csv'))

In [20]:
log_save_path = '../../Logs/Clean_Extended_Data/'
# Ensure the log directory exists; exist_ok=True keeps the cell re-runnable
os.makedirs(log_save_path, exist_ok=True)

# os.path.join avoids the doubled separator that '{dir}/{name}' produced
log_file_path = os.path.join(log_save_path, 'filter_weather_veg_data_w_subregion_log.txt')
# Explicit encoding so the log does not depend on the platform default
with open(log_file_path, 'w', encoding='utf-8') as log_file:
    log_file.write('\n'.join(log_messages))

In [None]:
# Fraction removed by the subregion filter, computed from two hard-coded counts.
# NOTE(review): presumably 157594531 and 142966936 are the total row counts
# before and after filtering — confirm against the processing log.
(157594531-142966936)/157594531 # 9.28% were removed after filtering with subregion

0.09281791003267746