**Error Analysis**

In [2]:
# import libraries

import earthaccess
import xarray as xr
import dask
import numpy as np
import pandas as pd
import geopandas as gpd
import cartopy.feature as cfeature
from rasterio import features
from scipy.ndimage import convolve
from scipy.ndimage import distance_transform_edt
import matplotlib.pyplot as plt
from rasterio.transform import from_origin
import matplotlib.pyplot as plt
from pathlib import Path

# colormap for plotting sea ice throughout rest of project

cmap = plt.get_cmap("Blues").copy()
cmap.set_bad(color='lightgray')

# authenticate NASA earth access

auth = earthaccess.login(strategy='interactive', persist = True)

In [2]:
# search NASA database

results = earthaccess.search_data(
    short_name='NSIDC-0051',
    temporal=('1990-01-01', '2025-10-01'),
    bounding_box=(-180, 0, 180, 90),
    cloud_hosted=True
)

print(f"we found {len(results)} results")

# open files in earthaccess

files = earthaccess.open(results)

we found 13204 results


QUEUEING TASKS | :   0%|          | 0/13204 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/13204 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/13204 [00:00<?, ?it/s]

In [4]:
files

[<File-like object HTTPFileSystem, https://data.nsidc.earthdatacloud.nasa.gov/nsidc-cumulus-prod-protected/PM/NSIDC-0051/2/1990/01/01/NSIDC0051_SEAICE_PS_N25km_199001_v2.0.nc>,
 <File-like object HTTPFileSystem, https://data.nsidc.earthdatacloud.nasa.gov/nsidc-cumulus-prod-protected/PM/NSIDC-0051/2/1990/01/01/NSIDC0051_SEAICE_PS_N25km_19900101_v2.0.nc>,
 <File-like object HTTPFileSystem, https://data.nsidc.earthdatacloud.nasa.gov/nsidc-cumulus-prod-protected/PM/NSIDC-0051/2/1990/01/02/NSIDC0051_SEAICE_PS_N25km_19900102_v2.0.nc>,
 <File-like object HTTPFileSystem, https://data.nsidc.earthdatacloud.nasa.gov/nsidc-cumulus-prod-protected/PM/NSIDC-0051/2/1990/01/03/NSIDC0051_SEAICE_PS_N25km_19900103_v2.0.nc>,
 <File-like object HTTPFileSystem, https://data.nsidc.earthdatacloud.nasa.gov/nsidc-cumulus-prod-protected/PM/NSIDC-0051/2/1990/01/04/NSIDC0051_SEAICE_PS_N25km_19900104_v2.0.nc>,
 <File-like object HTTPFileSystem, https://data.nsidc.earthdatacloud.nasa.gov/nsidc-cumulus-prod-protected/

In [6]:
ds = xr.open_mfdataset(files)

KeyboardInterrupt: 

In [None]:
# read in land file from geopandas

land = gpd.read_file("../data/ne_10m_land/ne_10m_land.shp")
land = land.to_crs(epsg=3411)

# create affine transform and make sure arctic is not upside down

dx = float(ds.x.diff('x').mean())
dy = float(ds.y.diff('y').mean())
x0 = float(ds.x.min())
y0 = float(ds.y.max())

transform = [dx, 0, x0, 0, -abs(dy), y0]

# use transform to mask out coastal cells

land_mask = features.rasterize(
    ((geom, 1) for geom in land.geometry),
    out_shape=(ds.sizes['y'], ds.sizes['x']),
    transform=transform,
    fill=0,
    dtype=np.uint8
)

# calculate distance from land using euclidian distance transform

distance_from_land = distance_transform_edt(land_mask == 0)

# convert to xarray.DataArray

distance_xr = xr.DataArray(
    distance_from_land,
    coords={'y': ds.y, 'x': ds.x},
    dims=('y', 'x'),
    name='distance_to_land_cells'
)

# add as data variable in ds

ds['edtl'] = distance_xr

In [None]:
# read in files

folderpath = 'scratch/fld1/visual_ice/'
pathlist = Path(folderpath).glob("*.csv")

# convert rows and columns into lats and lons

row_to_lat = dict(enumerate(ds['x'].values))
col_to_lon = dict(enumerate(ds['y'].values))

In [None]:
# loop through files

for i, file in enumerate(pathlist):

    # read in single file and map to lat / lon

    visual = pd.read_csv(str(file))
    visual["time"] = pd.to_datetime(visual["Date"], yearfirst=True)
    visual['x'] = visual['Row'].map(row_to_lat)
    visual['y'] = visual['Column'].map(col_to_lon)

    # convert to xarray

    da_sparse = visual.set_index(['time', 'y', 'x']).to_xarray()
    da_full = da_sparse.reindex_like(ds, method=None).chunk({'time': 2})

    # concatinate into the main dataset (or create new data variable for first file)

    if i == 0:
        ds['visual_ice'] = da_full['SI frac']
    else:
        ds['visual_ice'] = xr.concat([ds['SI_frac'], da_full['SI frac']], dim='time')

In [4]:
visual = pd.read_csv('../local_data/earth_engine_demos/big_array_coast_cell_smallest_w180w135_1999b_LC05.csv')

EmptyDataError: No columns to parse from file

In [None]:
# data cleaning

condition = ((ds.visual_ice.notnull()) & (ds.F17_ICECON < 1.0))
ds_clean = ds.where(condition, other=np.nan).compute()

df = ds_clean.to_dataframe().reset_index().dropna(axis=1)

In [None]:
# calculate error

error = ((ds_clean['F17_ICECON'] - ds_clean['visual_ice']))

# mse
mse = ((((ds_clean['F17_ICECON'] - ds_clean['visual_ice']))**2)**0.5)
avg_mse = mse.mean(dim=['time', 'x', 'y'], skipna=True)
print('Simple error is', avg_mse.compute().item())

In [None]:
# dependancies
#   – sort dataset see if there are differences (how much land, weather, snow cover, clouds, sun elevation)
#   – look at date but not enough dates in sample data
#   – regression, plotting, divide into populations, light gradient boosting?

In [None]:
# clean that table
# there are lots of 1.012 and 1.016 which is obviously impossible but look in NSIDC docs it means something
# calculate error with a sign