In [3]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np
import os

In [1]:
import matplotlib.pyplot as plt

In [4]:
# check python version and all packages version
def check_python_version():
    """Print the interpreter version and the versions of the core packages.

    Emits a label line followed by a value line for Python, pandas,
    geopandas, xarray and pyproj, in that order. Relies on the module-level
    aliases ``pd``, ``gpd``, ``xr`` and ``pyproj``.
    """
    import sys

    # (label, version) pairs in the order they are reported.
    report = [
        ("Python", sys.version),
        ("Pandas", pd.__version__),
        ("Geopandas", gpd.__version__),
        ("Xarray", xr.__version__),
        ("Pyproj", pyproj.__version__),
    ]
    for label, version in report:
        print(f"{label} version")
        print(version)

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Geopandas version
0.14.2
Xarray version
2023.6.0
Pyproj version
3.6.1


In [5]:
input_path = '../../Clean_Data/Extended_Data_w_Veg_Filter/Vegetation'
# List the directory contents. os.listdir returns names in arbitrary order, so
# sort them to make positional access to the listing (e.g. files[0]) reproducible.
files = sorted(os.listdir(input_path))
files

['LAI.parquet']

In [6]:
# Pick the first entry of the directory listing and load it into a DataFrame.
file_name = files[0]  # Change this to the specific file you want to read
lai_dat = pd.read_parquet(os.path.join(input_path, file_name))

In [7]:
# (rows, columns) of the loaded LAI table
lai_dat.shape

(147037409, 4)

In [8]:
# Missing rate: fraction of null values in each column (mean of the boolean null mask)
lai_dat.isnull().mean()

time    0.000000
lat     0.000000
lon     0.000000
LAI     0.055762
dtype: float64

In [9]:
# Column dtypes of the LAI table
lai_dat.dtypes

time    datetime64[ns]
lat            float64
lon            float64
LAI            float64
dtype: object

In [10]:
# Earliest and latest timestamp in the LAI table, one per line
print(lai_dat['time'].min(), lai_dat['time'].max(), sep='\n')

1994-01-01 00:00:00
2021-12-27 00:00:00


In [11]:
# Total number of rows in the LAI table
len(lai_dat)

147037409

In [10]:
# Number of distinct (time, lat, lon) combinations in the LAI table
len(lai_dat[['time', 'lat', 'lon']].drop_duplicates())

147037409

In [12]:
# Fail fast if any (time, lat, lon) key occurs more than once.
# duplicated(...).any() gives the same verdict as comparing the de-duplicated
# row count with the total, without building a de-duplicated copy of the frame.
assert not lai_dat.duplicated(subset=['time', 'lat', 'lon']).any(), "Time, lat, lon are not unique"

## MERGE with the WEATHER DATA

In [13]:
# Feature table that the LAI column will be joined onto.
input_path = '../../Clean_Data/Extended_Feature_Data/'
file_name = 'Weather_Data_w_Veg_SubRegion_Filter_Merged_Add_population.parquet'

# os.path.join is used instead of f'{input_path}/{file_name}' because input_path
# already ends in '/', which made the f-string produce a doubled separator.
all_features = pd.read_parquet(os.path.join(input_path, file_name))

In [15]:
# Column dtypes of the feature table
all_features.dtypes

day                                          datetime64[ns]
lat                                                 float64
lon                                                 float64
SWE                                                 float32
year                                                  int32
dead_fuel_moisture_1000hr                           float32
dead_fuel_moisture_100hr                            float32
max_air_temperature                                 float64
max_relative_humidity                               float32
min_air_temperature                                 float64
min_relative_humidity                               float32
precipitation_amount                                float32
specific_humidity                                   float32
surface_downwelling_shortwave_flux_in_air           float32
wind_from_direction                                 float32
wind_speed                                          float32
population_density                      

In [17]:
# Rename 'time' to 'day' in lai_dat so it uses the same date column name as
# all_features. inplace=True mutates lai_dat itself rather than returning a new frame.
lai_dat.rename(columns={'time': 'day'}, inplace=True)
# Print the min and max of 'day' in the LAI table in a single sentence
print(f"Min day: {lai_dat['day'].min()}, Max day: {lai_dat['day'].max()}")

Min day: 1994-01-01 00:00:00, Max day: 2021-12-27 00:00:00


In [18]:
# Min and max of 'day' in the feature table, for comparison with the LAI date range
print(f"Min day: {all_features['day'].min()}, Max day: {all_features['day'].max()}")

Min day: 1994-01-01 00:00:00, Max day: 2020-09-30 00:00:00


In [16]:
# Print the (rows, columns) shape of all_features
print(f"Shape of all_features: {all_features.shape}")

Shape of all_features: (127478960, 17)


In [19]:
# Left join: keep every row of all_features and attach the LAI column from
# lai_dat on the shared (lon, lat, day) key.
# validate='many_to_one' makes pandas raise a MergeError if the key is duplicated
# in lai_dat, which would otherwise silently multiply rows of all_features.
# NOTE(review): lat/lon are float64 join keys, so matching relies on exact
# equality — confirm both tables carry identical grid coordinates.
all_features = pd.merge(
    all_features,
    lai_dat,
    on=['lon', 'lat', 'day'],
    how='left',
    validate='many_to_one',
)

In [20]:
# Print the shape after the merge; a left join onto unique right-hand keys
# keeps the row count of the left frame unchanged.
print(f"Shape of all_features after merge: {all_features.shape}")

Shape of all_features after merge: (127478960, 18)


In [21]:
# Fraction of null values per column after the merge. With a left join, LAI is
# null where lai_dat had no matching (lon, lat, day) row or its LAI was itself null.
all_features.isnull().mean()

day                                          0.000000
lat                                          0.000000
lon                                          0.000000
SWE                                          0.017091
year                                         0.000000
dead_fuel_moisture_1000hr                    0.001678
dead_fuel_moisture_100hr                     0.001678
max_air_temperature                          0.001169
max_relative_humidity                        0.001678
min_air_temperature                          0.001169
min_relative_humidity                        0.001678
precipitation_amount                         0.573698
specific_humidity                            0.001678
surface_downwelling_shortwave_flux_in_air    0.001678
wind_from_direction                          0.002679
wind_speed                                   0.001678
population_density                           0.000766
LAI                                          0.030399
dtype: float64

In [22]:
input_path = '../../Clean_Data/Extended_Feature_Data/'
file_name = 'Weather_Data_w_Veg_SubRegion_Filter_Merged_Add_population_lai.parquet'

# Save the merged dataframe to parquet without writing the index.
# os.path.join is used instead of f'{input_path}/{file_name}' because input_path
# already ends in '/', which made the f-string produce a doubled separator.
all_features.to_parquet(os.path.join(input_path, file_name), index=False)