In [None]:
import polars as pl

In [43]:
# Read the parquet file from the GCS bucket into a polars DataFrame.
actual = pl.read_parquet("gs://crea-pm25ml-examples/combined__omi_full_nn.parquet")

In [None]:
# Show the loaded frame as this cell's output.
actual

In [35]:
# Total number of rows in `actual`, kept as a float so it can serve as the
# denominator when turning null counts into percentages.
expected_rows = float(actual.height)

In [46]:
# Count the missing values (null, plus NaN for floating-point columns) in every
# column except the "grid_id" and "date" keys, keep only the columns that have at
# least one missing value, and express each count as a percentage of all rows,
# sorted from most to least missing.
#
# The row count is read from `actual` inside this cell rather than from a
# variable assigned in another cell, so the percentages always match the frame
# that is being counted, regardless of the order in which cells were executed.


def _missing_count(name: str, dtype: pl.DataType) -> pl.Expr:
    """Build an expression that counts the missing entries of one column.

    Args:
        name: Column name in `actual`.
        dtype: The polars dtype of that column.

    Returns:
        An expression yielding the number of null entries (and, for
        floating-point columns, NaN entries), aliased to the column name.
    """
    is_missing = pl.col(name).is_null()
    if dtype.is_float():
        # NaN only exists for floating-point data; restricting the check to
        # float columns keeps the cell working if non-numeric columns are present.
        is_missing = is_missing | pl.col(name).is_nan()
    return is_missing.sum().alias(name)


null_counts_df = (
    actual.select(
        [
            _missing_count(name, dtype)
            for name, dtype in actual.schema.items()
            if name not in ("grid_id", "date")
        ]
    )
    # Reshape from one wide row to one (column, null_count) row per column.
    .unpivot(variable_name="column", value_name="null_count")
    .filter(pl.col("null_count") > 0)
    .with_columns(
        missing_percent=pl.col("null_count") / float(actual.height) * 100.0
    )
    .sort("missing_percent", descending=True)
)

In [47]:
# Show the per-column missing-value summary, sorted by missing_percent descending.
null_counts_df

column,null_count,missing_percent
str,u32,f64
"""omi_no2__no2_cloud_screened""",555401,54.169926
"""omi_no2__no2_trop_cloud_screen…",555401,54.169926
"""omi_no2_v4__no2_cloud_screened""",498361,48.606644
"""omi_no2_v4__no2_trop_cloud_scr…",498361,48.606644
"""omi_no2__no2""",471335,45.970717
…,…,…
"""era5_land__leaf_area_index_low…",10075,0.982645
"""era5_land__leaf_area_index_hig…",10075,0.982645
"""era5_land__leaf_area_index_low…",10075,0.982645
"""era5_land__leaf_area_index_hig…",10075,0.982645


In [48]:
# Bar chart of the percentage of missing values, one bar per column.
null_counts_df.plot.bar(x="column", y="missing_percent")

It looks like the columns that have null values are:
 - ERA5 - which we're already handling spatially
 - OMI NO2 - which we will handle as part of the "generation" of data
 - and the columns which we're going to use ML for