In [6]:
using Pkg
Pkg.activate("..")  # Activate parent directory to use local ArgoData.jl
Pkg.instantiate()
using ArgoData      # Now loads from ../src/ instead of installed package
using Parquet2, Tables, DataFrames, IntervalSets, Glob, Dates
using MeshArrays, DataDeps, GeoJSON, CairoMakie
import TableOperations as TO
Pkg.status()

[32m[1m  Activating[22m[39m project at `~/myWHOI/CrocoLake/ArgoData.jl`


[36m[1mProject[22m[39m ArgoData v0.2.3
[32m[1mStatus[22m[39m `~/myWHOI/CrocoLake/ArgoData.jl/Project.toml`
  [90m[e28b5b4c] [39mBootstrap v2.4.0
  [90m[336ed68f] [39mCSV v0.10.15
  [90m[13f3f980] [39mCairoMakie v0.15.6
  [90m[124859b0] [39mDataDeps v0.7.13
  [90m[a93c6f00] [39mDataFrames v1.8.1
  [90m[9c0b9be8] [39mDataverse v0.2.7
  [90m[01fcc997] [39mFTPClient v1.2.1
  [90m[61d90e0f] [39mGeoJSON v0.8.4[93m [loaded: v0.8.3][39m
  [90m[c27321d9] [39mGlob v1.3.1
  [90m[a98d9a8b] [39mInterpolations v0.16.2
  [90m[8197267c] [39mIntervalSets v0.7.11
  [90m[033835bb] [39mJLD2 v0.6.2
  [90m[cb8c808f] [39mMeshArrays v0.3.23
  [90m[85f8d34a] [39mNCDatasets v0.14.10
  [90m[bac558e1] [39mOrderedCollections v1.8.1
  [90m[98572fba] [39mParquet2 v0.2.33
  [90m[10745b16] [39mStatistics v1.11.1
  [90m[ab02a1b2] [39mTableOperations v1.2.0
  [90m[bd369af6] [39mTables v1.12.1
  [90m[ddb6d928] [39mYAML v0.4.15
  [90m[ade2ca70] [39mDates v1.11.0
  [90m[

# Argo in parquet format

> **Note:** This notebook is based on the data set published in Milanese, E., & Nicholson, D. (2025).

- Milanese, E., & Nicholson, D. (2025). Sample parquet datasets of Argo program ocean data [Data set]. Zenodo. https://doi.org/10.5281/zenodo.15198578
- https://github.com/boom-lab/argo2parquet-public
- https://github.com/boom-lab/crocolaketools-public
- https://euroargodev.github.io/ArgoData.jl/dev/

## Notebook History

- Test data provided by Enrico Milanese (@enrico-mi) and Roo Nicholson (@dnicholson)
- Examples provided by Enrico Milanese as a Jupyter notebook + env
- Gael Forget (@gaelforget): 
  - streamline notebook code
  - add float subset example, Plots
  - convert to `Pluto` notebook
  - streamline code via `Argo_parquet` module in `src/Parquet.jl`
  - streamline code via `Argo_pq` struct in `src/`

## Download Sample Files

In [13]:
# Option 1 (disabled): obtain the sample dataset through ArgoData.jl
# (presumably downloads it and returns the local folder -- confirm in the ArgoData docs).
# folder_pq = Argo_parquet.sample_download("ARGO_PHY_SAMPLE_QC")
# Option 2 (active): use a local copy. This absolute path is machine-specific;
# point it at your own parquet folder before running the cells below.
folder_pq = "/home/enrico/myWHOI/CrocoLake/crocolake-julia/Argo/1003_PHY_ARGO-QC"

"/home/enrico/myWHOI/CrocoLake/crocolake-julia/Argo/1003_PHY_ARGO-QC"

In [14]:
# Collect the paths inside `folder_pq` that match the glob pattern "*parquet".
files = glob("*parquet", folder_pq)

537-element Vector{String}:
 "/home/enrico/myWHOI/CrocoLake/c"[93m[1m ⋯ 34 bytes ⋯ [22m[39m"QC/1003_ARGO-QC_PHY_000.parquet"
 "/home/enrico/myWHOI/CrocoLake/c"[93m[1m ⋯ 34 bytes ⋯ [22m[39m"QC/1003_ARGO-QC_PHY_001.parquet"
 "/home/enrico/myWHOI/CrocoLake/c"[93m[1m ⋯ 34 bytes ⋯ [22m[39m"QC/1003_ARGO-QC_PHY_002.parquet"
 "/home/enrico/myWHOI/CrocoLake/c"[93m[1m ⋯ 34 bytes ⋯ [22m[39m"QC/1003_ARGO-QC_PHY_003.parquet"
 "/home/enrico/myWHOI/CrocoLake/c"[93m[1m ⋯ 34 bytes ⋯ [22m[39m"QC/1003_ARGO-QC_PHY_004.parquet"
 "/home/enrico/myWHOI/CrocoLake/c"[93m[1m ⋯ 34 bytes ⋯ [22m[39m"QC/1003_ARGO-QC_PHY_005.parquet"
 "/home/enrico/myWHOI/CrocoLake/c"[93m[1m ⋯ 34 bytes ⋯ [22m[39m"QC/1003_ARGO-QC_PHY_006.parquet"
 "/home/enrico/myWHOI/CrocoLake/c"[93m[1m ⋯ 34 bytes ⋯ [22m[39m"QC/1003_ARGO-QC_PHY_007.parquet"
 "/home/enrico/myWHOI/CrocoLake/c"[93m[1m ⋯ 34 bytes ⋯ [22m[39m"QC/1003_ARGO-QC_PHY_008.parquet"
 "/home/enrico/myWHOI/CrocoLake/c"[93m[1m ⋯ 34 bytes ⋯ [22m

## Open via `ArgoData.jl`

In [15]:
# Open the folder through ArgoData's `Argo_parquet.Dataset` wrapper. The wrapped
# Parquet2 dataset is reached as `da.Dataset` in the subset examples further down.
da = Argo_parquet.Dataset(folder_pq)

[0m  folder    = [34m/home/enrico/myWHOI/CrocoLake/crocolake-julia/Argo/1003_PHY_ARGO-QC [39m
[0m  files     = [34m537 [39m
[0m  Dataset   = [34mParquet2.Dataset(ncolumns=20) [39m
[0m  schema    = [32mTables.Schema:[39m
[32m :DB_NAME                     Union{Missing, String}[39m
[32m :PLATFORM_NUMBER             Union{Missing, String}[39m
[32m :CYCLE_NUMBER                Union{Missing, Int32}[39m
[32m :DATA_MODE                   Union{Missing, String}[39m
[32m :DATE_UPDATE                 Union{Missing, DateTime}[39m
[32m :LATITUDE                    Union{Missing, Float64}[39m
[32m :LONGITUDE                   Union{Missing, Float64}[39m
[32m :JULD                        Union{Missing, DateTime}[39m
[32m :PRES                        Union{Missing, Float32}[39m
[32m :PRES_QC                     Union{Missing, UInt8}[39m
[32m :PRES_ERROR                  Union{Missing, Float32}[39m
[32m :TEMP                        Union{Missing, Float32}[39m
[32m

## A Closer Look

### Single File Example

Here we look at several ways to access data, column subsets, and row subsets.

In [16]:
# Open one parquet file (the second entry of `files`) directly with Parquet2.
ds = Parquet2.Dataset(files[2])

[34m≔ [39mParquet2.Dataset (46548111 bytes)
	1. [33m"DB_NAME"[39m: [36mUnion{Missing, String}[39m
	2. [33m"PLATFORM_NUMBER"[39m: [36mUnion{Missing, String}[39m
	3. [33m"CYCLE_NUMBER"[39m: [36mUnion{Missing, Int32}[39m
	4. [33m"DATA_MODE"[39m: [36mUnion{Missing, String}[39m
	5. [33m"DATE_UPDATE"[39m: [36mUnion{Missing, DateTime}[39m
	6. [33m"LATITUDE"[39m: [36mUnion{Missing, Float64}[39m
	7. [33m"LONGITUDE"[39m: [36mUnion{Missing, Float64}[39m
	8. [33m"JULD"[39m: [36mUnion{Missing, DateTime}[39m
	9. [33m"PRES"[39m: [36mUnion{Missing, Float32}[39m
	10. [33m"PRES_QC"[39m: [36mUnion{Missing, UInt8}[39m
	11. [33m"PRES_ERROR"[39m: [36mUnion{Missing, Float32}[39m
	12. [33m"TEMP"[39m: [36mUnion{Missing, Float32}[39m
	13. [33m"TEMP_QC"[39m: [36mUnion{Missing, UInt8}[39m
	14. [33m"TEMP_ERROR"[39m: [36mUnion{Missing, Float32}[39m
	15. [33m"PSAL"[39m: [36mUnion{Missing, Float32}[39m
	16. [33m"PSAL_QC"[39m: [36mUnion{Missing, UInt8}[

### Whole Parquet Folder

In [18]:
# Example: working with the whole folder as a single Parquet2 dataset.
# NOTE: the code below is active and runs as-is (nothing needs uncommenting).

ds2 = Parquet2.Dataset(folder_pq)
# Append all row groups (important step)
Parquet2.appendall!(ds2)
# The value of this call is not stored; only the last expression of the cell is shown.
Parquet2.filelist(ds2)
Tables.schema(ds2)

Tables.Schema:
 :DB_NAME                     Union{Missing, String}
 :PLATFORM_NUMBER             Union{Missing, String}
 :CYCLE_NUMBER                Union{Missing, Int32}
 :DATA_MODE                   Union{Missing, String}
 :DATE_UPDATE                 Union{Missing, DateTime}
 :LATITUDE                    Union{Missing, Float64}
 :LONGITUDE                   Union{Missing, Float64}
 :JULD                        Union{Missing, DateTime}
 :PRES                        Union{Missing, Float32}
 :PRES_QC                     Union{Missing, UInt8}
 :PRES_ERROR                  Union{Missing, Float32}
 :TEMP                        Union{Missing, Float32}
 :TEMP_QC                     Union{Missing, UInt8}
 :TEMP_ERROR                  Union{Missing, Float32}
 :PSAL                        Union{Missing, Float32}
 :PSAL_QC                     Union{Missing, UInt8}
 :PSAL_ERROR                  Union{Missing, Float32}
 :ABS_SAL_COMPUTED            Union{Missing, Float32}
 :CONSERVATIVE_TEMP_CO

## Extract data from one region

In [17]:
# Selection box: longitude and latitude ranges written as intervals with `..`
# (IntervalSets is loaded in the setup cell).
lons = -75 .. -50
lats = 25 .. 40
# Time window of the selection, also expressed as an interval.
dates = Dates.DateTime("2001-01-01T00:00:00") .. Dates.DateTime("2024-12-31T23:59:59")
# Columns requested in the result.
variables = (:JULD, :LATITUDE, :LONGITUDE, :PRES, :TEMP, :PLATFORM_NUMBER)

# Extract the matching rows from the wrapped Parquet2 dataset; `verbose=true`
# prints the row-group progress messages shown in the cell output.
df1 = Argo_parquet.get_subset_region(da.Dataset, variables=variables, lons=lons, lats=lats, dates=dates, verbose=true)

Found 275 matching row groups out of 1445
Processing row group 10/275...
Processing row group 20/275...
Processing row group 30/275...
Processing row group 40/275...
Processing row group 50/275...
Processing row group 60/275...
Processing row group 70/275...
Processing row group 80/275...
Processing row group 90/275...
Processing row group 100/275...
Processing row group 110/275...
Processing row group 120/275...
Processing row group 130/275...
Processing row group 140/275...
Processing row group 150/275...
Processing row group 160/275...
Processing row group 170/275...
Processing row group 180/275...
Processing row group 190/275...
Processing row group 200/275...
Processing row group 210/275...
Processing row group 220/275...
Processing row group 230/275...
Processing row group 240/275...
Processing row group 250/275...
Processing row group 260/275...
Processing row group 270/275...
Combining 161 filtered row groups...
Final result: 20319232 rows


Row,JULD,LATITUDE,LONGITUDE,PRES,TEMP,PLATFORM_NUMBER
Unnamed: 0_level_1,DateTime?,Float64?,Float64?,Float32?,Float32?,String?
1,2004-02-02T17:12:29.995,25.365,-62.212,10.0,24.584,1900022
2,2004-02-02T17:12:29.995,25.365,-62.212,20.0,24.574,1900022
3,2004-02-02T17:12:29.995,25.365,-62.212,30.0,24.54,1900022
4,2004-02-02T17:12:29.995,25.365,-62.212,40.0,24.518,1900022
5,2004-02-02T17:12:29.995,25.365,-62.212,50.0,24.52,1900022
6,2004-02-02T17:12:29.995,25.365,-62.212,60.0,24.528,1900022
7,2004-02-02T17:12:29.995,25.365,-62.212,70.0,24.53,1900022
8,2004-02-02T17:12:29.995,25.365,-62.212,80.0,24.532,1900022
9,2004-02-02T17:12:29.995,25.365,-62.212,90.0,24.534,1900022
10,2004-02-02T17:12:29.995,25.365,-62.212,100.0,24.506,1900022


### Helper Functions

In [None]:
# Fetch the "countries.geojson" file with MeshArrays' demo helper and read its
# polygons. `pol` is a global that `plot_lo_la_etc` uses as its default outlines.
fil = MeshArrays.demo.download_polygons("countries.geojson")
pol = MeshArrays.read_polygons(fil)

In [None]:
"""
    plot_lo_la_etc(lo, la; te=[], pol=pol)

Scatter the points (`lo`, `la`) on a map-like axis and overlay the outlines in `pol`.

# Arguments
- `lo`, `la`: x and y coordinates of the points; the axis limits are fixed to
  (-180, 180) and (-90, 90).

# Keywords
- `te`: optional per-point values used as marker colors. When empty (the default)
  every marker is blue and no colorbar is added.
- `pol`: iterable of line geometries, each drawn with `lines!`. Defaults to the
  global `pol`.

# Returns
The `Figure` holding the axis (and the colorbar when `te` is non-empty).
"""
function plot_lo_la_etc(lo, la; te=[], pol=pol)
    fig = Figure()
    ax = Axis(fig[1, 1]; aspect=AxisAspect(1), backgroundcolor=:transparent)
    has_values = !isempty(te)
    # Target `ax` explicitly instead of relying on the implicit "current axis".
    pnts = scatter!(ax, lo, la; color=(has_values ? te : :blue), markersize=4)
    if has_values
        Colorbar(fig[1, 2], pnts; height=Relative(0.75), tickwidth=2,
            tickalign=1, width=14, ticksize=14)
    end
    # Plain loop: the outlines are drawn for their side effect only.
    for outline in pol
        lines!(ax, outline; color=:black, linewidth=0.5)
    end
    limits!(ax, (-180, 180), (-90, 90))
    return fig
end

In [None]:
# Get three vectors from the regional subset `df1` (presumably longitude, latitude
# and temperature, judging by the helper's name -- confirm in ArgoData), then map
# the points, colored by `te`, over the country outlines.
(lo, la, te) = Argo_parquet.get_lon_lat_temp(df1)
plot_lo_la_etc(lo, la; te=te, pol=pol)

## Extract data from one profiler

In [None]:
# Read the PLATFORM_NUMBER column of the whole dataset, then list its distinct values.
ID = Tables.getcolumn(da.Dataset, :PLATFORM_NUMBER)
IDu = unique(ID)

In [None]:
# Extract the rows belonging to one float.
# NOTE(review): PLATFORM_NUMBER is a String column in the schema, while `ID` is
# passed here as an Int -- confirm that get_subset_float handles the conversion.
df3 = Argo_parquet.get_subset_float(da.Dataset, ID=1901730)

In [None]:
"""
    plot_one_profile!(ax, df)

Draw one profile on `ax` as a line of `TEMP` against negated `PRES`.

Rows whose `PRES` is missing are skipped and the remaining rows are ordered by
increasing `PRES`. Only `ax` is modified: `df` (typically one group of a
`GroupedDataFrame`) is left untouched, so the caller's table keeps its row order.

# Arguments
- `ax`: axis that receives the line.
- `df`: table with `PRES` and `TEMP` columns.

# Returns
The plot object returned by `lines!`.
"""
function plot_one_profile!(ax, df)
    # `dropmissing` returns a fresh DataFrame, so the in-place sort below cannot
    # reorder the caller's data. `disallowmissing=false` keeps the column element
    # types unchanged.
    profile = dropmissing(df, :PRES; disallowmissing=false)
    sort!(profile, :PRES)
    return lines!(ax, profile.TEMP, -profile.PRES)
end

"""
    plot_profiles(df3)

Create a figure with a single axis and draw one line per `JULD` group of `df3`,
delegating each group to `plot_one_profile!`. Return the figure.
"""
function plot_profiles(df3)
    fi = Figure()
    ax = Axis(fi[1,1])
    # Each distinct JULD value is treated as one profile.
    for profile in groupby(df3, :JULD)
        plot_one_profile!(ax, profile)
    end
    return fi
end

In [None]:
# Top panel: one line per profile of the selected float.
fig4 = plot_profiles(df3)
# Indices of the rows whose pressure is not missing.
ii = findall((!ismissing).(df3.PRES))
# Bottom panel (new axis at fig4[2,1]): time against negated pressure, colored by
# temperature.
# NOTE(review): Float64.(...) throws if TEMP is missing on a row where PRES is not
# -- confirm that cannot happen in this dataset.
scatter!(Axis(fig4[2,1]), DateTime.(df3.JULD[ii]), -df3.PRES[ii], color=Float64.(df3.TEMP[ii]), markersize=2)
fig4

## Appendix

### Distributed Example

```julia
using Distributed

# `Dates` is needed on every process because the loop body handles `DateTime` values.
@everywhere using Dates, Tables, DataFrames, Parquet2, SharedArrays

"""
    get_lon_lat_juld_loop1(ds::Parquet2.Dataset; verbose=false, nmax=100_000)

Collect the unique (longitude, latitude, date) triplets of every file in `ds`,
processing the files in a `@distributed` loop.

Each file is opened separately and its `LONGITUDE`, `LATITUDE` and `JULD` columns
are read. Rows with a missing value, or with a position outside the open ranges
(-180, 180) and (-90, 90), are discarded; the remaining triplets are deduplicated.

# Keywords
- `verbose`: also print `[f np]` (file index, number of triplets kept) per file.
- `nmax`: number of rows reserved per file in the output arrays.

# Returns
Three `nmax × nfiles` arrays `(LO, LA, JU)`. Column `f` holds the triplets of file
`f` in its leading rows; the rows after them are never written by this function.

# Throws
`ArgumentError` (surfaced through the `@sync` block) if a file yields more than
`nmax` unique triplets.
"""
function get_lon_lat_juld_loop1(ds::Parquet2.Dataset; verbose=false, nmax=100_000)
    files = Parquet2.filelist(ds)
    nf = length(files)
    LO = SharedArray{Float64}(nmax, nf)
    LA = SharedArray{Float64}(nmax, nf)
    JU = SharedArray{DateTime}(nmax, nf)
    @sync @distributed for f in 1:nf
        println([f nf])
        ds1 = Parquet2.Dataset(files[f])
        lo = Tables.getcolumn(ds1, :LONGITUDE)
        la = Tables.getcolumn(ds1, :LATITUDE)
        ju = Tables.getcolumn(ds1, :JULD)
        # Keep complete rows whose position lies strictly inside the valid ranges.
        x = unique([
            (x1, y1, t1) for (x1, y1, t1) in zip(lo, la, ju) if
            !ismissing(x1) && !ismissing(y1) && !ismissing(t1) &&
            -180 < x1 < 180 && -90 < y1 < 90
        ])
        np = length(x)
        np <= nmax || throw(ArgumentError(
            "file $(files[f]) has $np unique positions, more than nmax = $nmax"))
        verbose && println([f np])
        LO[1:np, f] .= getindex.(x, 1)
        LA[1:np, f] .= getindex.(x, 2)
        JU[1:np, f] .= getindex.(x, 3)
    end
    return Array(LO), Array(LA), Array(JU)
end
```