In [1]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np
import os

In [2]:
import matplotlib.pyplot as plt

In [3]:
# Report the interpreter version and the versions of the core libraries in use
def check_python_version():
    """Print the Python version and the pandas, geopandas, xarray and pyproj versions.

    Each entry is written as two lines: a label line followed by the
    version string itself.
    """
    import sys

    # (label, version) pairs, listed in the order they are printed
    versions = (
        ("Python version", sys.version),
        ("Pandas version", pd.__version__),
        ("Geopandas version", gpd.__version__),
        ("Xarray version", xr.__version__),
        ("Pyproj version", pyproj.__version__),
    )
    for label, version in versions:
        print(label)
        print(version)

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Geopandas version
0.14.2
Xarray version
2023.6.0
Pyproj version
3.6.1


In [4]:
# Read the yearly population-density table from its parquet file
path_to_pop = '../../Clean_Data/Extended_Data_w_Veg_Filter/Population'
file_name = 'population_density_yearly_1994_2020.parquet'
population_parquet_path = os.path.join(path_to_pop, file_name)
population_dat = pd.read_parquet(population_parquet_path)

In [5]:
# Dimensions of the population table as (rows, columns)
population_dat.shape

(388341, 4)

In [6]:
# Missing rate: fraction of null values in each column of the population table
population_dat.isnull().mean()

year                  0.000000
lon                   0.000000
lat                   0.000000
population_density    0.029132
dtype: float64

In [10]:
# Column dtypes of the population table (lon, lat and year are later used as join keys)
population_dat.dtypes

year                    int64
lon                   float64
lat                   float64
population_density    float64
dtype: object

## MERGE with the WEATHER DATA

In [7]:
# Load the merged weather feature table from its parquet file
input_path = '../../Clean_Data/Extended_Feature_Data/'
file_name = 'Weather_Data_w_Veg_SubRegion_Filter_Merged.parquet'

# input_path already ends with '/', so formatting it as f'{input_path}/{file_name}'
# yields a doubled separator. os.path.join adds a separator only when needed and
# matches how the population file path is built.
all_features = pd.read_parquet(os.path.join(input_path, file_name))

In [8]:
# Dimensions of the weather feature table as (rows, columns)
all_features.shape

(127478960, 16)

In [9]:
# Column dtypes of the weather feature table
all_features.dtypes

day                                          datetime64[ns]
lat                                                 float64
lon                                                 float64
SWE                                                 float32
year                                                  int32
dead_fuel_moisture_1000hr                           float32
dead_fuel_moisture_100hr                            float32
max_air_temperature                                 float64
max_relative_humidity                               float32
min_air_temperature                                 float64
min_relative_humidity                               float32
precipitation_amount                                float32
specific_humidity                                   float32
surface_downwelling_shortwave_flux_in_air           float32
wind_from_direction                                 float32
wind_speed                                          float32
dtype: object

In [11]:
# Distinct years present in the population table
population_dat['year'].unique()

array([1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020], dtype=int64)

In [12]:
# Distinct years present in the weather feature table (all_features)
all_features['year'].unique()

array([1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020])

In [14]:
# Both tables must cover exactly the same set of years; stop with an
# AssertionError otherwise.
population_years = set(population_dat['year'].unique())
feature_years = set(all_features['year'].unique())
assert population_years == feature_years, "Years in population data and feature data do not match."

In [15]:
# Print the (rows, columns) shape of all_features before the population merge,
# as a baseline for the post-merge shape
print(f"Shape of all_features: {all_features.shape}")

Shape of all_features: (127478960, 16)


In [17]:
# Left join all_features and population_dat by ['lon', 'lat', 'year']:
# every row of all_features is kept, and rows with no matching population
# record get NaN in population_density.
#
# validate='many_to_one' makes pandas raise MergeError if population_dat holds
# duplicate (lon, lat, year) keys; without it such duplicates would silently
# multiply the matching weather rows.
#
# NOTE: lon and lat are float64 join keys, so rows only match when the
# coordinates are exactly equal in both tables.
all_features = pd.merge(
    all_features,
    population_dat,
    on=['lon', 'lat', 'year'],
    how='left',
    validate='many_to_one',
)

In [18]:
# Print the shape after the merge; a left join against unique right-hand keys
# should keep the row count unchanged and add the population column
print(f"Shape of all_features after merge: {all_features.shape}")

Shape of all_features after merge: (127478960, 17)


In [19]:
# Fraction of null values per column of the merged table
all_features.isnull().mean()

day                                          0.000000
lat                                          0.000000
lon                                          0.000000
SWE                                          0.017091
year                                         0.000000
dead_fuel_moisture_1000hr                    0.001678
dead_fuel_moisture_100hr                     0.001678
max_air_temperature                          0.001169
max_relative_humidity                        0.001678
min_air_temperature                          0.001169
min_relative_humidity                        0.001678
precipitation_amount                         0.573698
specific_humidity                            0.001678
surface_downwelling_shortwave_flux_in_air    0.001678
wind_from_direction                          0.002679
wind_speed                                   0.001678
population_density                           0.000766
dtype: float64

In [None]:
# Output location for the merged table (file_name is rebound to the output name here)
input_path = '../../Clean_Data/Extended_Feature_Data/'
file_name =  'Weather_Data_w_Veg_SubRegion_Filter_Merged_Add_population.parquet'

# Save the merged dataframe to parquet without writing the index.
# input_path already ends with '/', so os.path.join is used to avoid the doubled
# separator that f'{input_path}/{file_name}' would produce.
all_features.to_parquet(os.path.join(input_path, file_name), index=False)