In [None]:
# read new data
# read each page from excel file
import pandas as pd
import numpy as np

# Base name (without extension) of the cleaned workbook.
filename = 'cleaned'

# Open and parse the workbook once: passing a list of sheet names makes
# read_excel return a {sheet_name: DataFrame} dict, instead of re-reading the
# whole file once per ocean.
ocean_sheets = pd.read_excel(
    filename + '.xlsx',
    sheet_name=['Pacific', 'Atlantic', 'Mediterranean', 'Southern Ocean', 'Arctic'],
)

# One DataFrame per ocean; note the Southern Ocean sheet is exposed as `Southern`.
Pacific = ocean_sheets['Pacific']
Atlantic = ocean_sheets['Atlantic']
Mediterranean = ocean_sheets['Mediterranean']
Southern = ocean_sheets['Southern Ocean']
Arctic = ocean_sheets['Arctic']

In [None]:
# Only one ocean is merged at a time; this run uses the Pacific sheet.
# .copy() gives df2 its own data, so the column assignments below do not write
# into a slice of `Pacific` (which triggers SettingWithCopyWarning and has
# ambiguous write-through behaviour).
df2 = Pacific[['Lat', 'Long', 'Year']].copy()

# Some coordinates contain a non-ASCII minus-like character ('−') instead of
# the ASCII hyphen-minus ('-'), which float() cannot parse.
# NOTE(review): the glyph looks like U+2212 MINUS SIGN - confirm the code point.
# Each column is cast to str, the character is replaced literally
# (regex=False), and the result is cast back to float. Missing values survive
# the round trip as NaN ('nan' -> nan).
for coord in ('Lat', 'Long'):
    df2[coord] = (
        df2[coord]
        .astype(str)
        .str.replace('−', '-', regex=False)
        .astype(float)
    )

# Drop rows where either coordinate is missing.
df2 = df2.dropna(subset=['Lat', 'Long'])


In [None]:
import pandas as pd
import tarfile
import gzip
from pyhdf.SD import SD, SDC
import tempfile
import numpy as np
from scipy.interpolate import griddata

# Years to process; one archive per year is expected at data/vgpm.s.<year>.tar.
years = range(1997, 2024)

# Original latitudes and longitudes of the sampling sites.
orig_lats = df2['Lat'].copy()
orig_lons = df2['Long'].copy()

# Site coordinates as their own frame (keeps df2's index) so that the per-year
# frames built below never alias or mutate df2.
site_coords = pd.DataFrame({'Lat': orig_lats, 'Long': orig_lons})
# (n_sites, 2) array of (lat, lon) query points for the interpolation.
xi = site_coords.to_numpy()

# Cache of grid point arrays keyed by data shape, so the (lat, lon) grid is
# built once per distinct shape instead of once per file.
_points_by_shape = {}


def _grid_points(shape):
    """Return an (n_rows * n_cols, 2) array of (lat, lon) pairs for a grid of `shape`.

    Rows run from latitude 90 down to -90 and columns from longitude -180 up
    to 180, in the same row-major order as ``data.ravel()``.
    NOTE(review): linspace puts the first/last nodes exactly on +/-90 and
    +/-180 rather than on cell centres - confirm against the product's grid
    definition.
    """
    if shape not in _points_by_shape:
        lats = np.linspace(90, -90, shape[0])    # shape[0] is the number of rows
        lons = np.linspace(-180, 180, shape[1])  # shape[1] is the number of columns
        lon_grid, lat_grid = np.meshgrid(lons, lats)
        _points_by_shape[shape] = np.column_stack(
            [lat_grid.ravel(), lon_grid.ravel()]
        )
    return _points_by_shape[shape]


def _month_label(member_name, year):
    """Return the zero-padded calendar month ('01'..'12') for an archive member.

    The second dot-separated token of the file name ends in a three-digit day
    of year. That day is converted with the real calendar of `year`; integer
    division by 30 would map day 060 (1 March in a non-leap year) to '02' and
    collide with February.
    """
    base_name = member_name.rsplit('/', 1)[-1]  # ignore any directory prefix
    day_of_year = int(base_name.split('.')[1][-3:])
    when = pd.Timestamp(year=year, month=1, day=1) + pd.Timedelta(days=day_of_year - 1)
    return f'{when.month:02d}'


def _read_npp(compressed_stream):
    """Return the 'npp' dataset of a gzip-compressed HDF stream, fill values as NaN.

    The stream is decompressed into a temporary file because SD opens files by
    path. The SD handle is always closed before the temporary file is removed.
    """
    with gzip.open(compressed_stream, 'rb') as gz, tempfile.NamedTemporaryFile() as tmp:
        tmp.write(gz.read())
        tmp.flush()  # make sure all bytes are on disk before SD opens the path
        hdf_file = SD(tmp.name, SDC.READ)
        try:
            data = hdf_file.select('npp')[:]
        finally:
            hdf_file.end()
    # Replace '-9999.0' with NaN
    data[data == -9999.0] = np.nan
    return data


# One frame per year, concatenated once at the end (avoids quadratic
# re-copying from calling pd.concat inside the loop).
yearly_frames = []

for year in years:
    # Start from a fresh frame every year so that a month missing from this
    # year's archive stays absent (NaN after concat) instead of silently
    # carrying over the previous year's values.
    year_df = site_coords.copy()
    year_df['Year'] = year

    with tarfile.open(f'data/vgpm.s.{year}.tar', "r:") as tar:
        for member in tar.getmembers():
            if not member.name.endswith('.hdf.gz'):
                continue
            stream = tar.extractfile(member)
            if stream is None:  # not a regular file (e.g. a directory entry)
                continue

            month = _month_label(member.name, year)
            data = _read_npp(stream)

            # Nearest-grid-node NPP value at every site for this month.
            year_df[f'npp_{month}'] = griddata(
                _grid_points(data.shape), data.ravel(), xi, method='nearest'
            )

    yearly_frames.append(year_df)

# All sites x all years, with one npp_<MM> column per month encountered.
all_results = pd.concat(yearly_frames)

# save the data
all_results.to_csv('npp_means_new.csv', index=False)

In [None]:
# Row-wise mean over every column whose name contains 'npp_' (the monthly
# columns), stored as a new yearly-mean column.
monthly_npp = all_results.filter(regex='npp_')
all_results['yearly_mean_npp'] = monthly_npp.mean(axis=1)

In [None]:
# Write the results, now including the yearly mean, to CSV without the index.
all_results.to_csv(path_or_buf='npp_means_new.csv', index=False)

In [None]:
## Sanity check: reload the saved CSV and display ten randomly chosen rows.
df_npp_means = pd.read_csv(filepath_or_buffer='npp_means_new.csv')
df_npp_means.sample(n=10)