For LAI data 
- Merge and Save the raw data to parquet file
  - source1: extended data
  - source2: original data (../New_Data/glass.lai.2001.2021.CA.daily_interpolated.nc)
- Join w veg data to reduce # of grids
  - Exclude water, agricultural lands, and urban areas based on the vegetation type data.

In [1]:
# del all variables
import sys
for name in dir():
    if not name.startswith('_'):
        del globals()[name]

In [2]:
# Force garbage collection
import gc
gc.collect()

34

In [3]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np
import os
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from datetime import datetime

In [4]:
# check python version and all packages version
def check_python_version():
    import sys
    print("Python version")
    print (sys.version)
    print("Pandas version")
    print(pd.__version__)
    print("Geopandas version")
    print(gpd.__version__)
    print("Xarray version")
    print(xr.__version__)
    print("Pyproj version")
    print(pyproj.__version__)

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Geopandas version
0.14.2
Xarray version
2023.6.0
Pyproj version
3.6.1


In [5]:
entity = 'Vegetation'
data_dir = f'../../Extended_Data/{entity}/'

## Merge Data from old and extended

In [6]:
files = os.listdir(data_dir)
files

['LAI.1994.2000.CA.daily_interpolated.nc', 'LAI.1994.2000.CA.nc']

In [7]:
file = 'LAI.1994.2000.CA.daily_interpolated.nc'
ds = xr.open_dataset(os.path.join(data_dir, file))
print(f"File: {file}")
print(f"Dimensions: {ds.dims}")
print(f"Day range: {ds.coords['time'].min().values} to {ds.coords['time'].max().values}")
print(f"Coordinates: {ds.coords}")
print(f"Data variables: {ds.data_vars}")

File: LAI.1994.2000.CA.daily_interpolated.nc
Dimensions: Frozen({'time': 2558, 'lat': 238, 'lon': 258})
Day range: 1994-01-01T00:00:00.000000000 to 2001-01-01T00:00:00.000000000
Coordinates: Coordinates:
  * time     (time) datetime64[ns] 1994-01-01 1994-01-02 ... 2001-01-01
  * lat      (lat) float64 32.32 32.36 32.4 32.44 ... 42.07 42.11 42.15 42.19
  * lon      (lon) float64 -124.7 -124.6 -124.6 -124.6 ... -114.1 -114.0 -114.0
Data variables: Data variables:
    LAI      (time, lat, lon) float64 ...


In [9]:
ds_old = xr.open_dataset('../../New_Data/glass.lai.2001.2021.CA.daily_interpolated.nc')
ds_old = ds_old.assign_coords(time=pd.to_datetime(ds_old['time'].values.astype(int).astype(str), format='%Y%m%d'))
print(f"Dimensions: {ds_old.dims}")
print(f"Day range: {ds.coords['time'].min().values} to {ds.coords['time'].max().values}")
print(f"Coordinates: {ds_old.coords}")
print(f"Data variables: {ds_old.data_vars}")

Dimensions: Frozen({'time': 7666, 'lon': 259, 'lat': 240})
Day range: 1994-01-01T00:00:00.000000000 to 2001-01-01T00:00:00.000000000
Coordinates: Coordinates:
  * time     (time) datetime64[ns] 2001-01-01 2001-01-02 ... 2021-12-27
  * lon      (lon) float64 -124.8 -124.7 -124.7 -124.6 ... -114.1 -114.1 -114.0
  * lat      (lat) float64 41.98 41.94 41.9 41.86 ... 32.15 32.11 32.07 32.03
Data variables: Data variables:
    LAI      (time, lat, lon) float32 ...


In [10]:
# drop 2001-01-01 from ds bc ds_old has it
ds = ds.sel(time=slice('1994-01-01', '2000-12-31'))

In [12]:
ds

In [13]:
print(ds_old.coords)

Coordinates:
  * time     (time) datetime64[ns] 2001-01-01 2001-01-02 ... 2021-12-27
  * lon      (lon) float64 -124.8 -124.7 -124.7 -124.6 ... -114.1 -114.1 -114.0
  * lat      (lat) float64 41.98 41.94 41.9 41.86 ... 32.15 32.11 32.07 32.03


In [None]:
common_lon = np.intersect1d(ds.coords['lon'].values, ds_old.coords['lon'].values)
common_lat = np.intersect1d(ds.coords['lat'].values, ds_old.coords['lat'].values)
len(common_lon), len(common_lat)

(257, 233)

In [14]:
lai_pd = ds.to_dataframe().reset_index()
lai_pd_old = ds_old.to_dataframe().reset_index()

In [51]:
lai_pd.dtypes

time    datetime64[ns]
lat            float64
lon            float64
LAI            float64
dtype: object

In [52]:
lai_pd_old.dtypes

time    datetime64[ns]
lon            float64
lat            float64
LAI            float32
dtype: object

In [15]:
columns_to_keep = ['time', 'lat', 'lon', 'LAI']
lai_pd = lai_pd[columns_to_keep]
lai_pd_old = lai_pd_old[columns_to_keep]

In [16]:
# concatenate the two dataframes
lai_pd_combined = pd.concat([lai_pd, lai_pd_old], ignore_index=True)

In [17]:
# ensure the shape is correct, assert error if not
assert lai_pd_combined.shape[0] == lai_pd.shape[0] + lai_pd_old.shape[0], "Shape mismatch after concatenation"

In [18]:
del ds, ds_old, lai_pd, lai_pd_old

In [19]:
path_to_save = f'../../Clean_Data/Extended_Data/{entity}'
# Ensure the output directory exists
if not os.path.exists(path_to_save):
    os.makedirs(path_to_save)

output_file = f'{path_to_save}/{file.split(".")[0]}.parquet'
lai_pd_combined.to_parquet(output_file)

print(f"Saved {file} as {output_file}")

Saved LAI.1994.2000.CA.daily_interpolated.nc as ../../Clean_Data/Extended_Data/Vegetation/LAI.parquet


In [20]:
lai_pd_combined['year'] = lai_pd_combined['time'].dt.year
lai_pd_combined.groupby('year')['LAI'].apply(lambda x: x.isnull().mean())

year
1994    0.626067
1995    0.626067
1996    0.626067
1997    0.626067
1998    0.626067
1999    0.626067
2000    0.626067
2001    0.673845
2002    0.681087
2003    0.670824
2004    0.668499
2005    0.651382
2006    0.671438
2007    0.680602
2008    0.675228
2009    0.673124
2010    0.669819
2011    0.665930
2012    0.677213
2013    0.679982
2014    0.678052
2015    0.675516
2016    0.676242
2017    0.672758
2018    0.679234
2019    0.668661
2020    0.665746
2021    0.679524
Name: LAI, dtype: float64

## Check lon/lat match with weather grid

In [21]:
reference_weather_data = pd.read_parquet('../../Clean_Data/Weather_Data/Extended_Weather_Data/wind_speed.parquet')
reference_weather_data = reference_weather_data[reference_weather_data['day']=='1998-01-01']
reference_weather_data = reference_weather_data[['lon', 'lat']].drop_duplicates()
reference_weather_data.shape

(61404, 2)

In [22]:
lai_pd_lon_lat = lai_pd_combined[['lon', 'lat']].drop_duplicates()
lai_pd_lon_lat.shape

(63683, 2)

In [23]:
test_merge = lai_pd_lon_lat.merge(reference_weather_data, on=['lon', 'lat'], how='inner')
# if test_merge.shape is same as reference_weather_data, then the merge is successful
if test_merge.shape == reference_weather_data.shape:
    print("Merge successful")
else:
    # print test_merge shape and reference_weather_data shape
    print(f"Merge failed: {test_merge.shape} != {reference_weather_data.shape}")

Merge successful


## Merge w Veg Data

In [24]:
# clean up memory
gc.collect()
# delete all dataframes
for var in list(globals().keys()):
    if isinstance(globals()[var], pd.DataFrame):
        del globals()[var]

In [25]:
import os
from datetime import datetime

In [26]:
# the data is generated from 02 Veg Data Extract.ipynb
import pandas as pd
veg_data = pd.read_parquet('../../Clean_Data/lon_lat_pair_weather_match_veg_v2.parquet')

In [27]:
veg_data.shape

(17703, 7)

In [28]:
veg_data.head()

Unnamed: 0,lon,lat,type,distance,veg_type_details,fire_attribute,veg
0,-124.391667,40.441667,7.0,1.826642,Native Coastal Sage Scrub - fire,fire,Native Coastal Sage Scrub
1,-124.391667,40.4,25.0,1.827314,Native Oak Woodland - fire,fire,Native Oak Woodland
2,-124.35,40.566667,1.0,1.824622,Agriculture - low,low,Agriculture
3,-124.35,40.525,8.0,1.825296,Native Coastal Sage Scrub - mesic,mesic,Native Coastal Sage Scrub
4,-124.35,40.483333,25.0,1.825969,Native Oak Woodland - fire,fire,Native Oak Woodland


In [29]:
veg_data[['lon', 'lat']].duplicated().sum()

0

In [30]:
veg_data_filter = veg_data[~veg_data['veg'].str.contains('Water|Urban|Agriculture')]

In [31]:
veg_data_filter.shape

(14383, 7)

In [32]:
file = 'LAI.parquet'
entity = 'Vegetation'

In [33]:
path_to_read = f'../../Clean_Data/Extended_Data/{entity}'
path_to_save = f'../../Clean_Data/Extended_Data_w_Veg_Filter/{entity}'
# Ensure the output directory exists
if not os.path.exists(path_to_save):
    os.makedirs(path_to_save)

print("Task: Data cleaning on combined LAI data with vegetation filter")
print(f"Processing started on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


dat = pd.read_parquet(f'{path_to_read}/{file}')
print(f"Processing file: {file}, shape: {dat.shape}")
dat_filtered = pd.merge(dat, veg_data_filter[['lon', 'lat']], on=['lon', 'lat'], how='inner')
print(f"Filtered DataFrame shape: {dat_filtered.shape}")
del dat  # Free up memory
dat_filtered.to_parquet(f'{path_to_save}/{file}')

Task: Data cleaning on combined LAI data with vegetation filter
Processing started on: 2025-06-15 16:53:39
Processing file: LAI.parquet, shape: (633528588, 4)
Filtered DataFrame shape: (147037409, 4)


In [34]:
dat_filtered['year'] = dat_filtered['time'].dt.year
dat_filtered.groupby('year')['LAI'].apply(lambda x: x.isnull().mean())

year
1994    0.077522
1995    0.077522
1996    0.077522
1997    0.077522
1998    0.077522
1999    0.077522
2000    0.077522
2001    0.053565
2002    0.050510
2003    0.048234
2004    0.050983
2005    0.051144
2006    0.048921
2007    0.050153
2008    0.053587
2009    0.049206
2010    0.049184
2011    0.045130
2012    0.046274
2013    0.048790
2014    0.046118
2015    0.043975
2016    0.046573
2017    0.048877
2018    0.047935
2019    0.049485
2020    0.043612
2021    0.046295
Name: LAI, dtype: float64