In [1]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np

In [14]:
import os
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from datetime import datetime

In [3]:
# Report the interpreter version plus the pandas and pyproj versions
def check_python_version():
    """Print the Python, pandas and pyproj versions, each below a label line."""
    import sys

    version_report = (
        ("Python version", sys.version),
        ("Pandas version", pd.__version__),
        ("Pyproj version", pyproj.__version__),
    )
    for label, version in version_report:
        # sep="\n" puts the label and its version on separate lines
        print(label, version, sep="\n")

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Pyproj version
3.6.1


In [4]:
# Show full cell contents instead of truncating long values when displaying frames
pd.options.display.max_colwidth = None

In [5]:
# Force a full garbage-collection pass before the heavy loading starts.
# The value displayed below the cell is gc.collect()'s return value
# (the number of unreachable objects it found).
import gc
gc.collect()

41

## Merge Weather Data

In [7]:
data_dir = '../../Clean_Data/Weather_Data/Combined_Weather_Data_w_Veg_SubRegion_Filter'
# os.listdir returns entries in arbitrary order and includes every entry in the
# directory. Keep only the parquet inputs (anything else would make
# pd.read_parquet fail in the merge loop) and sort case-insensitively so the
# merge order, and therefore the merged column order, is reproducible.
files = sorted(
    (name for name in os.listdir(data_dir) if name.endswith('.parquet')),
    key=str.lower,
)
files

['dead_fuel_moisture_1000h.parquet',
 'dead_fuel_moisture_100h.parquet',
 'max_air_temperature.parquet',
 'max_relative_humidity.parquet',
 'min_air_temperature.parquet',
 'min_relative_humidity.parquet',
 'precipitation_amount.parquet',
 'specific_humidity.parquet',
 'surface_downwelling_shortwave_flux.parquet',
 'SWE.parquet',
 'wind_from_direction.parquet',
 'wind_speed.parquet']

In [15]:
# Start the run log with the task description and the start timestamp
log_messages = [
    "Task: Merge weather data after veg and subregion filtering",
    f"Processing started on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
]

In [18]:
# SWE.parquet is the base table that the other weather variables are merged onto
all_features = (
    pd.read_parquet(f'{data_dir}/SWE.parquet')
    # the merge below joins on 'day', so the 'time' column is renamed to match
    .rename(columns={'time': 'day'})
)
log_messages.append(f"Loaded SWE.parquet with shape: {all_features.shape}")

In [19]:
# Merge every remaining weather file onto the SWE base table, with a progress bar.
# NOTE: this cell reassigns all_features, so it is not idempotent; re-run the
# SWE loading cell before running it again.
for file in tqdm(files, desc="Merging weather files"):
    # SWE.parquet is already loaded as all_features. Skip it *before* reading so
    # the base file is not loaded from disk a second time only to be discarded.
    if file == 'SWE.parquet':
        continue
    panda_df = pd.read_parquet(f'{data_dir}/{file}')
    # Drop 'year' when present: the base table keeps its own 'year' column, and a
    # second copy would be suffixed by the merge instead of being joined on.
    panda_df = panda_df.drop(columns=['year'], errors='ignore')
    # Inner join: only (lon, lat, day) keys present in both frames are kept.
    all_features = pd.merge(all_features, panda_df, on=['lon', 'lat', 'day'], how='inner')
    log_messages.append(f"Merged {file} with shape: {panda_df.shape} into all_features, new shape: {all_features.shape}")

Merging weather files: 100%|██████████| 12/12 [23:38<00:00, 118.21s/it]


In [20]:
# Inspect the column dtypes of the merged frame
all_features.dtypes

day                                          datetime64[ns]
lat                                                 float64
lon                                                 float64
SWE                                                 float32
year                                                  int32
dead_fuel_moisture_1000hr                           float32
dead_fuel_moisture_100hr                            float32
max_air_temperature                                 float64
max_relative_humidity                               float32
min_air_temperature                                 float64
min_relative_humidity                               float32
precipitation_amount                                float32
specific_humidity                                   float32
surface_downwelling_shortwave_flux_in_air           float32
wind_from_direction                                 float32
wind_speed                                          float32
dtype: object

In [22]:
save_path = '../../Clean_Data/Extended_Feature_Data/'
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# which removes the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(save_path, exist_ok=True)
# save_path already ends with '/', so f'{save_path}/...' produced a doubled
# separator in both the written path and the log line; os.path.join avoids it.
merged_path = os.path.join(save_path, 'Weather_Data_w_Veg_SubRegion_Filter_Merged.parquet')
all_features.to_parquet(merged_path, index=False)
log_messages.append(f"Saved merged weather DataFrame to {merged_path}")

In [23]:
log_save_path = '../../Logs/Clean_Extended_Data/'
# Ensure the log directory exists (no-op when it is already there)
os.makedirs(log_save_path, exist_ok=True)
# log_save_path already ends with '/', so join instead of adding another separator
log_file_path = os.path.join(log_save_path, 'merge_weather_data_w_veg_subregion_log.txt')
# Mode 'w' replaces any previous log; the encoding is explicit so the file does
# not depend on the platform's default locale encoding.
with open(log_file_path, 'w', encoding='utf-8') as log_file:
    log_file.write('\n'.join(log_messages))

In [47]:
# Force a full garbage-collection pass after the merge and save steps.
# The value displayed below the cell is gc.collect()'s return value
# (the number of unreachable objects it found).
import gc
gc.collect()

1444

## Check Data

In [25]:
# Force a full garbage-collection pass before reloading the merged file.
# The value displayed below the cell is gc.collect()'s return value
# (the number of unreachable objects it found).
import gc
gc.collect()

0

In [26]:
input_path = '../../Clean_Data/Extended_Feature_Data/'
file_name = 'Weather_Data_w_Veg_SubRegion_Filter_Merged.parquet'

# input_path already ends with '/', so use os.path.join rather than
# f'{input_path}/{file_name}' to avoid a doubled separator in the path.
all_features = pd.read_parquet(os.path.join(input_path, file_name))

In [27]:
# Display the (rows, columns) shape together with the per-column dtypes of the reloaded frame
all_features.shape, all_features.dtypes

((127478960, 16),
 day                                          datetime64[ns]
 lat                                                 float64
 lon                                                 float64
 SWE                                                 float32
 year                                                  int32
 dead_fuel_moisture_1000hr                           float32
 dead_fuel_moisture_100hr                            float32
 max_air_temperature                                 float64
 max_relative_humidity                               float32
 min_air_temperature                                 float64
 min_relative_humidity                               float32
 precipitation_amount                                float32
 specific_humidity                                   float32
 surface_downwelling_shortwave_flux_in_air           float32
 wind_from_direction                                 float32
 wind_speed                                          float32
 dtype

In [29]:
# Earliest and latest values in the 'day' column
all_features['day'].min(), all_features['day'].max()

(Timestamp('1994-01-01 00:00:00'), Timestamp('2020-09-30 00:00:00'))

In [31]:
# Percentage of missing values per column (fraction of nulls scaled to 0-100)
all_features.isna().mean().mul(100)

day                                           0.000000
lat                                           0.000000
lon                                           0.000000
SWE                                           1.709074
year                                          0.000000
dead_fuel_moisture_1000hr                     0.167830
dead_fuel_moisture_100hr                      0.167830
max_air_temperature                           0.116906
max_relative_humidity                         0.167830
min_air_temperature                           0.116906
min_relative_humidity                         0.167832
precipitation_amount                         57.369750
specific_humidity                             0.167830
surface_downwelling_shortwave_flux_in_air     0.167830
wind_from_direction                           0.267880
wind_speed                                    0.167830
dtype: float64

In [32]:
# Record the first and last day covered by the dataset in the run log
day_column = all_features['day']
log_messages.extend([
    f"Minimum date in the dataset: {day_column.min().strftime('%Y-%m-%d')}",
    f"Maximum date in the dataset: {day_column.max().strftime('%Y-%m-%d')}",
])

In [33]:
# Add one log line per column with its missing-value percentage (two decimals)
missing_rates = all_features.isnull().mean() * 100
log_messages.extend(
    f"Missing rate for {col}: {rate:.2f}%"
    for col, rate in missing_rates.items()
)

In [35]:
log_save_path = '../../Logs/Clean_Extended_Data/'
# Ensure the log directory exists (no-op when it is already there)
os.makedirs(log_save_path, exist_ok=True)
# log_save_path already ends with '/', so join instead of adding another separator
log_file_path = os.path.join(log_save_path, 'merge_weather_data_w_veg_subregion_log.txt')
# Mode 'w' overwrites the log written earlier with the full, extended message
# list; the encoding is explicit so the file does not depend on the platform's
# default locale encoding.
with open(log_file_path, 'w', encoding='utf-8') as log_file:
    log_file.write('\n'.join(log_messages))