In [6]:
import pandas as pd
import pyproj
from tqdm import tqdm
import numpy as np
import os
import matplotlib.pyplot as plt
from datetime import datetime

In [7]:
# Report the interpreter version and the versions of the key packages in use
def check_python_version():
    """Print the Python, pandas and pyproj versions of the running kernel."""
    import sys
    print("Python version")
    print(sys.version)
    print("Pandas version")
    print(pd.__version__)
    # The value printed below is pyproj's version, so label it as pyproj
    # (it was previously mislabelled as "Geopandas version").
    print("Pyproj version")
    print(pyproj.__version__)

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Geopandas version
3.6.1


In [3]:
# Do not truncate long cell values when a frame is displayed (None = no width limit)
pd.set_option('display.max_colwidth', None)

In [1]:
# Force garbage collection
# gc.collect() runs a full collection and returns the number of unreachable
# objects it found; that integer is what this cell displays.
import gc
gc.collect()

34

In [3]:
# Location and name of the merged feature file to clean in this notebook
input_path = '../../Clean_Data/Extended_Feature_Data/'
file_name =  'Weather_Data_w_Veg_SubRegion_Filter_Merged_Add_population_lai.parquet'

# Load the whole parquet file into memory as a single DataFrame
all_features = pd.read_parquet(os.path.join(input_path, file_name))

In [4]:
# Dimensions of the loaded frame as (rows, columns)
all_features.shape

(127478960, 18)

In [5]:
# Column dtypes of the loaded frame
all_features.dtypes

day                                          datetime64[ns]
lat                                                 float64
lon                                                 float64
SWE                                                 float32
year                                                  int32
dead_fuel_moisture_1000hr                           float32
dead_fuel_moisture_100hr                            float32
max_air_temperature                                 float64
max_relative_humidity                               float32
min_air_temperature                                 float64
min_relative_humidity                               float32
precipitation_amount                                float32
specific_humidity                                   float32
surface_downwelling_shortwave_flux_in_air           float32
wind_from_direction                                 float32
wind_speed                                          float32
population_density                      

In [9]:
# Start the run log with a timestamped header line; later cells append to this list
log_messages = [
    f"Task: Feature cleaning and processing started on: {datetime.now():%Y-%m-%d %H:%M:%S}"
]

In [10]:
# Record which input file this run processed
log_messages.append(f"Input file: {file_name}")

### Add wind direction

In [12]:
# Eight 45-degree compass sectors, each stored as (start_angle, end_angle) in degrees.
# Sectors are centred on 0, 45, 90, ... so each one starts 22.5 degrees before its
# centre; 'N' therefore wraps through 0/360 and is stored as (337.5, 22.5).
wind_direction_ranges = {
    sector: ((337.5 + 45 * position) % 360, (22.5 + 45 * position) % 360)
    for position, sector in enumerate(['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW'])
}

In [13]:
# Label every row with the compass sector its wind_from_direction falls in.
# Rows whose angle matches no sector (e.g. missing values, for which every
# comparison is False) keep the label None.
wind_from_direction = all_features['wind_from_direction'].to_numpy()

# Build the labels in a plain object array and assign the column once, instead of
# writing into the DataFrame with .loc once per sector.
wind_labels = np.full(wind_from_direction.shape, None, dtype=object)

# The sector boundaries come only from wind_direction_ranges, so the mapping that
# is logged is exactly the mapping that is applied.
for category, (min_angle, max_angle) in wind_direction_ranges.items():
    if min_angle > max_angle:
        # Sector wraps through 0/360 degrees (the 'N' sector): match either side
        mask = (wind_from_direction >= min_angle) | (wind_from_direction < max_angle)
    else:
        mask = (wind_from_direction >= min_angle) & (wind_from_direction < max_angle)
    wind_labels[mask] = category

all_features['wind_direction_category'] = wind_labels

In [14]:
# Convert the labels to an ordered categorical running clockwise from N to NW
wind_direction_order = ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW']
all_features['wind_direction_category'] = all_features['wind_direction_category'].astype(
    pd.CategoricalDtype(categories=wind_direction_order, ordered=True)
)

In [15]:
# Sanity-check the binning: per category, the smallest and largest angle and the row count.
# observed=False is passed explicitly: grouping on a categorical column without it
# raises a pandas FutureWarning because the default is changing. False keeps the
# current behaviour of listing every category, including any with no rows.
all_features.groupby('wind_direction_category', observed=False).agg(
    wind_from_direction_min=('wind_from_direction', 'min'),
    wind_from_direction_max=('wind_from_direction', 'max'),
    count=('wind_from_direction', 'count')
).reset_index()

  all_features.groupby('wind_direction_category').agg(


Unnamed: 0,wind_direction_category,wind_from_direction_min,wind_from_direction_max,count
0,N,0.0,360.0,14076527
1,NE,23.0,67.0,8588427
2,E,68.0,112.0,6781408
3,SE,113.0,157.0,10112788
4,S,158.0,202.0,18898187
5,SW,203.0,247.0,24280011
6,W,248.0,292.0,20345834
7,NW,293.0,337.0,24054287


In [16]:
# Separator line, then the sector mapping used to build the wind categories
log_messages.extend([
    "=" * 50,
    f"Wind direction categories is created with mapping: {wind_direction_ranges}",
])

### Check air temperature

In [18]:
# Collect rows whose daily maximum temperature is below the daily minimum
check_rows = all_features.loc[
    all_features['max_air_temperature'].lt(all_features['min_air_temperature'])
]
# Only log a warning when at least one such row exists
if not check_rows.empty:
    log_messages.append(
        f"Warning: Found {len(check_rows)} rows where max_air_temperature is less than min_air_temperature."
    )

In [22]:
# Where the maximum is below the minimum, replace the maximum with the minimum;
# every other row (including rows where the comparison is False because of NaN)
# keeps its original maximum.
all_features['max_air_temperature'] = all_features['max_air_temperature'].mask(
    all_features['max_air_temperature'] < all_features['min_air_temperature'],
    all_features['min_air_temperature'],
)

In [21]:
# Record how the inverted-temperature rows were corrected.
# NOTE(review): this line is appended unconditionally, while the warning it refers
# to is only logged when such rows were found - confirm that is intended.
log_messages.append("for those rows, max_air_temperature is set to min_air_temperature")

In [24]:
# Inspect the column dtypes again
all_features.dtypes

day                                          datetime64[ns]
lat                                                 float64
lon                                                 float64
SWE                                                 float32
year                                                  int32
dead_fuel_moisture_1000hr                           float32
dead_fuel_moisture_100hr                            float32
max_air_temperature                                 float64
max_relative_humidity                               float32
min_air_temperature                                 float64
min_relative_humidity                               float32
precipitation_amount                                float32
specific_humidity                                   float32
surface_downwelling_shortwave_flux_in_air           float32
wind_from_direction                                 float32
wind_speed                                          float32
population_density                      

### Check relative_humidity

In [25]:
# Collect rows whose maximum relative humidity is below the minimum
check_rows = all_features.loc[
    all_features['max_relative_humidity'].lt(all_features['min_relative_humidity'])
]
# Only log a warning when at least one such row exists
if not check_rows.empty:
    log_messages.append(
        f"Warning: Found {len(check_rows)} rows where max_relative_humidity is less than min_relative_humidity."
    )

In [27]:
# Number of rows currently held in check_rows (the rows flagged by the latest check)
check_rows.shape[0]

0

### Check that the weather grid matches between the old source and the extended data

In [None]:
# Unique (lat, lon) grid points present on one day in 1994
all_features_1994 = (
    all_features
    .loc[all_features['day'] == '1994-01-01', ['lat', 'lon']]
    .drop_duplicates()
)
# Unique (lat, lon) grid points present on one day in 2006
all_features_2006 = (
    all_features
    .loc[all_features['day'] == '2006-01-01', ['lat', 'lon']]
    .drop_duplicates()
)

In [35]:
# Equal shapes only prove that both dates have the same number of unique points.
# Also compare the (lat, lon) pairs themselves so that a shifted or partially
# different grid is detected as a mismatch.
(
    all_features_1994.shape == all_features_2006.shape
    and set(zip(all_features_1994['lat'], all_features_1994['lon']))
    == set(zip(all_features_2006['lat'], all_features_2006['lon']))
)

True

Confirmed: both dates yield the same number of unique (lat, lon) grid points.

## Fill NA

In [28]:
# Fraction of missing values per column, highest first, restricted to the
# columns that actually contain missing values
missing_rate = (
    all_features.isna().mean()
    .sort_values(ascending=False)
    .loc[lambda rates: rates > 0]
)
missing_rate

precipitation_amount                         0.573698
LAI                                          0.030399
SWE                                          0.017091
wind_direction_category                      0.002679
wind_from_direction                          0.002679
min_relative_humidity                        0.001678
specific_humidity                            0.001678
wind_speed                                   0.001678
dead_fuel_moisture_1000hr                    0.001678
dead_fuel_moisture_100hr                     0.001678
max_relative_humidity                        0.001678
surface_downwelling_shortwave_flux_in_air    0.001678
min_air_temperature                          0.001169
max_air_temperature                          0.001169
population_density                           0.000766
dtype: float64

In [36]:
# Replace missing values with 0 in precipitation_amount, LAI and population_density
for zero_fill_col in ('precipitation_amount', 'LAI', 'population_density'):
    all_features[zero_fill_col] = all_features[zero_fill_col].fillna(0)

In [37]:
# Log the missing-rate table computed earlier, then what was done about it
log_messages.extend([
    "=" * 50,
    f"cols that are missing:\n{missing_rate}",
    "=" * 50,
    "Deal with missing value",
    "Filled NA values in precipitation_amount, LAI, and population_density with 0.",
])

In [39]:
# Replace missing SWE values with 0 and record that in the log
all_features['SWE'] = all_features['SWE'].fillna(0)
log_messages.append("Filled NA values in SWE with 0.")

In [40]:
# Columns in which a missing value disqualifies the whole row
col_missing = ['dead_fuel_moisture_1000hr','dead_fuel_moisture_100hr',
               'max_relative_humidity', 'min_relative_humidity','specific_humidity','surface_downwelling_shortwave_flux_in_air',
               'wind_speed','wind_from_direction','wind_direction_category',
               'max_air_temperature', 'min_air_temperature']

# Accumulate a single keep-mask across the columns and filter the frame once at
# the end, rather than re-filtering (and copying) the full frame for every column.
# The running count logged after each column is the number of rows with no missing
# value in any column processed so far, i.e. the same figure a sequential
# column-by-column filter would report.
rows_to_keep = np.ones(len(all_features), dtype=bool)
for col in col_missing:
    rows_to_keep &= all_features[col].notna().to_numpy()
    log_messages.append(f"Removed rows with missing values in {col} column. Remaining rows: {int(rows_to_keep.sum())}")

all_features = all_features[rows_to_keep]

In [41]:
# Count of remaining null values per column
all_features.isnull().sum()

day                                          0
lat                                          0
lon                                          0
SWE                                          0
year                                         0
dead_fuel_moisture_1000hr                    0
dead_fuel_moisture_100hr                     0
max_air_temperature                          0
max_relative_humidity                        0
min_air_temperature                          0
min_relative_humidity                        0
precipitation_amount                         0
specific_humidity                            0
surface_downwelling_shortwave_flux_in_air    0
wind_from_direction                          0
wind_speed                                   0
population_density                           0
LAI                                          0
wind_direction_category                      0
dtype: int64

In [42]:
# Write the cleaned frame next to the input file, under a name ending in "_Completed".
# NOTE: input_path and file_name are reassigned here, so after this cell file_name
# refers to the output file rather than the file that was loaded.
input_path = '../../Clean_Data/Extended_Feature_Data/'
file_name =  'Weather_Data_w_Veg_SubRegion_Filter_Merged_Add_population_lai_Completed.parquet'
# index=False: the DataFrame index is not written to the parquet file
all_features.to_parquet(os.path.join(input_path, file_name), index=False)

In [43]:
# Write all collected log messages to a text file, one message per line
log_save_path = '../../Logs/Clean_Extended_Data/'
# Ensure the log directory exists; exist_ok=True makes this a no-op when it is
# already there and avoids a separate check-then-create step
os.makedirs(log_save_path, exist_ok=True)
# os.path.join avoids the doubled separator that string formatting produced
# (log_save_path already ends with '/'); the encoding is set explicitly so the
# file content does not depend on the platform default.
with open(os.path.join(log_save_path, 'assign_wind_direction_and_deal_missing_rows.txt'), 'w', encoding='utf-8') as log_file:
    log_file.write('\n'.join(log_messages))