In [17]:
import os
import tarfile
import gzip
import tempfile
import pandas as pd
import numpy as np
import netCDF4 as nc

def read_hdf(filename, var_name = "chl"):
    with nc.Dataset(filename) as file:
        file.set_auto_mask(False)
        variables = {x: file[x][()] for x in file.variables}
    return variables[var_name]


df = dict()

month_name = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

# Loop through each tar file
for year in range(1997, 2024):
    # Open the tar file
    tar_filename = f'data/chl.s.{year}.tar'
    df[year] = pd.DataFrame()
    with tarfile.open(tar_filename, "r:") as tar:
        # Loop through each member of the tar file
        month = 8 if year == 1997 else -1
        for member in tar.getmembers():
            month += 1
            if month == 3 and year == 2022:
                month += 1 # Missing April 2022 data
            # If it's a .hdf.gz file
            if member.name.endswith('.hdf.gz'):
                # Open the .hdf.gz file
                f = tar.extractfile(member)
                with gzip.open(f, 'rb') as gz:
                    # Decompress the .hdf.gz file to a temporary file
                    with tempfile.NamedTemporaryFile() as tmp:
                        tmp.write(gz.read())
                        tmp.seek(0)  # Go back to the start of the file
                        
                        df[year][month_name[month]] = read_hdf(tmp.name).reshape(-1)

In [84]:
df[2020]

Unnamed: 0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
1,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
2,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
3,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
4,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2332795,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
2332796,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
2332797,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
2332798,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0


In [40]:
#Make sure that "cleaned.xlsx" is in the same folder as the notebook
cleaned = pd.read_excel("cleaned.xlsx", index_col=0).dropna(subset = ["Lat", "Long"]) #Drop rows with no coordinates
cleaned

Unnamed: 0,Location,Cruise,Stn/Loc,Lat,Long,Depth,Cr_%,Cr_2pie,Cr_QF,Cr_nmol/kg,...,Filtered_2SEM,Filtered_Cr,Cr(III)_nmol/kg,Cr(III)_SD,Cr(III)_QF,Cr(III) filt out,Seawater,Source,Notes,Year
1,ETNP,TT145,2,21,-132,0.0,,,,,...,,3.684864,0.097561,,1.0,,No,"Murray et al., 1983",lat & long est from fig,1983.0
2,ETNP,TT145,2,21,-132,25.0,,,,3.190219,...,,,0.000000,,1.0,,No,"Murray et al., 1983",lat & long est from fig,1983.0
3,ETNP,TT145,2,21,-132,50.0,,,,3.132063,...,,,0.204878,,1.0,,No,"Murray et al., 1983",lat & long est from fig,1983.0
4,ETNP,TT145,2,21,-132,75.0,,,,,...,,,0.000000,,1.0,,No,"Murray et al., 1983",lat & long est from fig,1983.0
5,ETNP,TT145,2,21,-132,100.0,,,,3.424319,...,,,0.097561,,1.0,,No,"Murray et al., 1983",lat & long est from fig,1983.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532,ETSP,GP16,13,-14,-99,1987.8,0.771818,0.040358,2.0,4.851725,...,,,,,,,,Nasemann unpublished,,
533,ETSP,GP16,13,-14,-99,3378.6,0.769107,0.035261,2.0,5.234698,...,,,,,,,,Nasemann unpublished,,
534,ETSP,GP16,13,-14,-99,3811.1,0.758318,0.037619,2.0,5.088152,...,,,,,,,,Nasemann unpublished,,
538,East Pac Rise,,"HYCY 10,11",12.833333,103.933333,2615.0,,,,15.800000,...,,,,,,,,Jeandel & Minster 1984,"Omit: 35 m from bottom, unfilt",1984.0


In [86]:
#lon       | lat
#-180: 0   | 90: 0
#0: 1080   | 0: 540
#180: 2160 | -90: 1080

import math

for month in month_name:
    for i in cleaned.index:
        year = cleaned.loc[i, "Year"]
        year = int(year) if year == year else "NaN"
        lat, lon = cleaned.loc[i, "Lat"], cleaned.loc[i, "Long"]
        if type(lat) == str:
            lat = float(lat.replace("−", "-"))
        if type(lon) == str:
            lon = float(lon.replace("−", "-"))
        lat = (90 - lat) * 6
        lon = (lon + 180) * 6
        idx = (math.floor(lat) - 1) * 2160 + math.floor(lon) - 1
        if (year not in df) or (month not in df[year].columns) or (df[year][month][idx] < 0):
            #cleaned.loc[i, "chl_" + month] = df["Mean"][month][(math.floor(lat) - 1) * 2160 + math.floor(lon) - 1]
            cnt = 0
            for yy in range(1997, 2024):
                if (month in df[yy].columns) and (df[yy][month][idx] >= 0):
                    if cnt == 0:
                        cleaned.loc[i, "chl_" + month] = df[yy][month][idx]
                    else:
                        cleaned.loc[i, "chl_" + month] += df[yy][month][idx]
                    cnt += 1
            if cnt == 0:
                cleaned.loc[i, "chl_" + month] = -9999 #Missing every year
            else:
                cleaned.loc[i, "chl_" + month] /= cnt
        else:
            cleaned.loc[i, "chl_" + month] = df[year][month][idx]
        
cleaned

Unnamed: 0,Location,Cruise,Stn/Loc,Lat,Long,Depth,Cr_%,Cr_2pie,Cr_QF,Cr_nmol/kg,...,chl_Mar,chl_Apr,chl_May,chl_Jun,chl_Jul,chl_Aug,chl_Sep,chl_Oct,chl_Nov,chl_Dec
1,ETNP,TT145,2,21,-132,0.0,,,,,...,0.052969,0.047427,0.049164,0.048803,0.045213,0.042394,0.044499,0.049289,0.060621,0.066887
2,ETNP,TT145,2,21,-132,25.0,,,,3.190219,...,0.052969,0.047427,0.049164,0.048803,0.045213,0.042394,0.044499,0.049289,0.060621,0.066887
3,ETNP,TT145,2,21,-132,50.0,,,,3.132063,...,0.052969,0.047427,0.049164,0.048803,0.045213,0.042394,0.044499,0.049289,0.060621,0.066887
4,ETNP,TT145,2,21,-132,75.0,,,,,...,0.052969,0.047427,0.049164,0.048803,0.045213,0.042394,0.044499,0.049289,0.060621,0.066887
5,ETNP,TT145,2,21,-132,100.0,,,,3.424319,...,0.052969,0.047427,0.049164,0.048803,0.045213,0.042394,0.044499,0.049289,0.060621,0.066887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532,ETSP,GP16,13,-14,-99,1987.8,0.771818,0.040358,2.0,4.851725,...,0.127056,0.126991,0.123647,0.118205,0.112208,0.107434,0.102281,0.109801,0.114711,0.121321
533,ETSP,GP16,13,-14,-99,3378.6,0.769107,0.035261,2.0,5.234698,...,0.127056,0.126991,0.123647,0.118205,0.112208,0.107434,0.102281,0.109801,0.114711,0.121321
534,ETSP,GP16,13,-14,-99,3811.1,0.758318,0.037619,2.0,5.088152,...,0.127056,0.126991,0.123647,0.118205,0.112208,0.107434,0.102281,0.109801,0.114711,0.121321
538,East Pac Rise,,"HYCY 10,11",12.833333,103.933333,2615.0,,,,15.800000,...,-9999.000000,-9999.000000,-9999.000000,-9999.000000,12.002384,9.575248,9.675007,10.606449,13.203321,15.696847


In [None]:
#Note that chl represents the concentration of chlorophyll, which should be non-negative
#Thus, I check missing values by checking whether the chl values is non-negative