## Step one: download the files
### Change the URLs below to match your dataset


In [None]:
# download the data from SeaWiFS: http://orca.science.oregonstate.edu/1080.by.2160.monthly.hdf.vgpm.s.chl.a.sst.php

import requests
import os

def download_file(url, filename, timeout=60):
    """Stream the resource at ``url`` to the local path ``filename``.

    The body is first written to ``<filename>.part`` and only renamed to
    ``filename`` once the transfer has finished, so an interrupted download
    never leaves a truncated file under the final name.

    Args:
        url: HTTP(S) address of the file to fetch.
        filename: Destination path; an existing file is overwritten.
        timeout: Seconds ``requests`` waits for the connection and for each
            read before giving up. ``None`` waits indefinitely, which is
            what the call did before this parameter existed.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    partial = f'{filename}.part'
    try:
        # With stream=True the connection stays checked out until the body is
        # consumed or the response is closed; the context manager guarantees
        # the latter even when writing fails half-way.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Atomic on the same filesystem: readers see either the old file or
        # the complete new one.
        os.replace(partial, filename)
    finally:
        # After a successful rename the partial file no longer exists, so
        # this only cleans up after a failure.
        if os.path.exists(partial):
            os.remove(partial)

# Make sure the 'data/' directory exists. exist_ok=True makes this a no-op
# when it is already there and avoids the check-then-create race of testing
# os.path.exists() first.
os.makedirs('data/', exist_ok=True)

#############################################################################################################
##your code#################################################################################################
#############################################################################################################
# URL template for the SeaWiFS-based yearly archives; '{}' receives the year.
base_url = 'http://orca.science.oregonstate.edu/data/1x2/monthly/vgpm.r2014.s.chl.a.sst/hdf/vgpm.s.{}.tar'

# Fetch one tar per year for 1997 through 2009 (range() excludes the stop
# value 2010) and store each under data/.
for year in range(1997, 2010):
    url, filename = base_url.format(year), f'data/vgpm.s.{year}.tar'
    download_file(url, filename)


# download the data from MODIS:  http://orca.science.oregonstate.edu/1080.by.2160.monthly.hdf.vgpm.m.chl.m.sst.php

#############################################################################################################
##your code#################################################################################################
#############################################################################################################
# URL template for the MODIS-based yearly archives; '{}' receives the year.
new_url = 'http://orca.science.oregonstate.edu/data/1x2/monthly/vgpm.r2022.m.chl.m.sst/hdf/vgpm.m.{}.tar'

# Fetch one tar per year for 2010 through 2023 (range() excludes 2024).
# NOTE: the local name keeps the 'vgpm.s.' prefix although the remote file is
# 'vgpm.m.'; the processing cells further down open data/vgpm.s.{year}.tar
# for every year from 1997 to 2023, so the prefix must stay uniform.
for year in range(2010, 2024):
    url, filename = new_url.format(year), f'data/vgpm.s.{year}.tar'
    download_file(url, filename)

## Step two: calculate the mean of the whole dataset
### Please check your data and the final dataset, and name the output file properly

In [4]:
import os
import tarfile
import gzip
import tempfile
from pyhdf.SD import SD, SDC
import pandas as pd
import numpy as np

# Value the grids carry where a pixel has no data (rows made up entirely of
# it show up as -9999.0 in the saved table).
# NOTE(review): confirm against the VGPM product documentation.
FILL_VALUE = -9999.0


def _npp_row_totals(tar, member):
    """Return per-row ``(sum, count)`` of valid 'npp' pixels for one member.

    Args:
        tar: Open ``tarfile.TarFile`` that contains ``member``.
        member: ``TarInfo`` of a gzip-compressed HDF file.

    Returns:
        Two 1-D arrays with one entry per grid row: the sum of the pixels
        that differ from ``FILL_VALUE`` and how many such pixels there are.
    """
    with gzip.open(tar.extractfile(member), 'rb') as gz:
        # pyhdf opens files by path, so the decompressed bytes have to be
        # written to a real (temporary) file first.
        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(gz.read())
            tmp.flush()  # make sure every byte is on disk before reopening

            hdf_file = SD(tmp.name, SDC.READ)
            try:
                grid = np.asarray(hdf_file.select('npp')[:], dtype=float)
            finally:
                # Release the HDF handle even if reading the dataset fails.
                hdf_file.end()

    valid = grid != FILL_VALUE
    # Reduce along axis 1 (the second axis of the 2-D grid), leaving one
    # value per row.
    return np.where(valid, grid, 0.0).sum(axis=1), valid.sum(axis=1)


# One column per year, one row per grid row.
yearly_means = {}

# Loop through each yearly tar file
for year in range(1997, 2024):
    tar_filename = f'data/vgpm.s.{year}.tar'
    row_sum = row_count = None

    with tarfile.open(tar_filename, "r:") as tar:
        for member in tar.getmembers():
            # Only the gzip-compressed HDF members hold data grids
            if not member.name.endswith('.hdf.gz'):
                continue

            sums, counts = _npp_row_totals(tar, member)
            # Accumulate across all files of the year. Assigning each file's
            # result straight to the year's column would keep only the last
            # file processed.
            if row_sum is None:
                row_sum, row_count = sums, counts
            else:
                row_sum = row_sum + sums
                row_count = row_count + counts

    if row_sum is None:
        # The archive held no .hdf.gz member: no column for this year.
        continue

    # Mean over every valid pixel of the row across the year's files; rows
    # without a single valid pixel keep the fill value.
    yearly_means[str(year)] = np.where(
        row_count > 0, row_sum / np.maximum(row_count, 1), FILL_VALUE
    )

# Create the DataFrame that stores the results
df = pd.DataFrame(yearly_means)

# Save the DataFrame to a CSV file
df.to_csv('npp_means.csv', index=False)


In [10]:
# Value the grids carry where a pixel has no data.
# NOTE(review): confirm against the VGPM product documentation.
FILL_VALUE = -9999.0

# One small frame per data file; they are concatenated once at the end
# because growing a DataFrame with pd.concat inside the loop re-copies all
# earlier rows on every iteration.
frames = []

# Loop through each yearly tar file
for year in range(1997, 2024):
    tar_filename = f'data/vgpm.s.{year}.tar'
    with tarfile.open(tar_filename, "r:") as tar:
        for member in tar.getmembers():
            # Only the gzip-compressed HDF members hold data grids
            if not member.name.endswith('.hdf.gz'):
                continue

            f = tar.extractfile(member)
            # pyhdf opens files by path, so decompress to a temporary file
            with gzip.open(f, 'rb') as gz, tempfile.NamedTemporaryFile() as tmp:
                tmp.write(gz.read())
                tmp.flush()  # make sure every byte is on disk before reopening

                hdf_file = SD(tmp.name, SDC.READ)
                try:
                    # Access the 'npp' dataset and its attributes
                    data = hdf_file.select('npp')
                    attrs = data.attributes()
                    latitude_center = attrs['Latitude Center']
                    longitude_center = attrs['Longitude Center']
                    grid = np.asarray(data[:], dtype=float)
                finally:
                    # Release the HDF handle even if reading fails
                    hdf_file.end()

            # Mean of the valid pixels along axis 1 (the second axis of the
            # 2-D grid); rows without any valid pixel keep the fill value.
            valid = grid != FILL_VALUE
            counts = valid.sum(axis=1)
            sums = np.where(valid, grid, 0.0).sum(axis=1)
            mean_data = np.where(
                counts > 0, sums / np.maximum(counts, 1), FILL_VALUE
            )

            frames.append(pd.DataFrame({
                'year': [year] * len(mean_data),
                'latitude': [latitude_center] * len(mean_data),
                'longitude': [longitude_center] * len(mean_data),
                'npp': mean_data
            }))

# pd.concat raises on an empty list, so fall back to an empty frame
df = pd.concat(frames) if frames else pd.DataFrame()

# Save the DataFrame to a CSV file
df.to_csv('npp_means_new.csv', index=False)


## You can stop here and provide Echo with the CSV file for the final merge. Remember to check for yourself that it looks similar to the following format:

In [2]:
# Load the npp_means.csv file
import pandas as pd
# Read back the per-year table that the processing cell saved under this name.
df_npp_means = pd.read_csv('npp_means.csv')
# Bare expression: shown as the cell's output.
df_npp_means

Unnamed: 0,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
1,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
2,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
3,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
4,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
1076,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
1077,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
1078,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0


In [13]:
# (rows, columns) of the loaded table; the recorded output is (1080, 27).
df_npp_means.shape


(1080, 27)

In [4]:
# Evenly spaced coordinate values, one per row of df_npp_means (1080 rows).
# NOTE(review): both axes get 1080 points, so row i pairs lats[i] with
# lons[i]; confirm this matches the grid the NPP rows were derived from.
lats = np.linspace(start=-90, stop=90, num=1080)
lons = np.linspace(start=-180, stop=180, num=1080)

df = pd.DataFrame(dict(Latitude=lats, Longitude=lons))
# Column-wise concatenation: coordinates first, then the per-year columns.
df_npp_means_new = pd.concat(objs=[df, df_npp_means], axis=1)

In [5]:
# Show the table with the Latitude/Longitude columns prepended.
df_npp_means_new

Unnamed: 0,Latitude,Longitude,1997,1998,1999,2000,2001,2002,2003,2004,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,-90.000000,-180.000000,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
1,-89.833179,-179.666358,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
2,-89.666358,-179.332715,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
3,-89.499537,-178.999073,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
4,-89.332715,-178.665431,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,89.332715,178.665431,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
1076,89.499537,178.999073,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
1077,89.666358,179.332715,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
1078,89.833179,179.666358,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0


In [6]:
# read each page from excel file
import pandas as pd
import numpy as np

filename='data/Cr Compilation filtered Apr23'

# Passing a list as sheet_name makes pandas open and parse the workbook once
# and return a dict keyed by sheet name, instead of re-reading the file for
# every sheet.
sheets = pd.read_excel(
    filename+'.xlsx',
    sheet_name=['Pacific', 'Atlantic', 'Mediterranean', 'Southern Ocean', 'Arctic'],
)
Pacific = sheets['Pacific']
Atlantic = sheets['Atlantic']
Mediterranean = sheets['Mediterranean']
Southern = sheets['Southern Ocean']
Arctic = sheets['Arctic']

In [17]:
# List the column labels of the 'Pacific' sheet.
Pacific.columns

Index(['Location', 'Cruise', 'Stn/Loc', 'Lat', 'Long', 'Depth', 'δ⁵³Cr',
       'Unnamed: 7', 'Unnamed: 8', 'Cr', 'Unnamed: 10', 'Unnamed: 11',
       'Filtered out', 'Unnamed: 13', 'Unnamed: 14', 'Cr(III)', 'Unnamed: 16',
       'Unnamed: 17', 'Filtered Out', 'Seawater', 'Source', 'Notes'],
      dtype='object')

In [7]:
import re
def extract_year(source):
    """Return the first stand-alone four-digit number found in ``source``.

    ``source`` is converted with ``str()`` before searching, so non-string
    values are accepted. The match must be delimited by word boundaries,
    which means digits glued to other digits or letters (``12345``,
    ``2015a``) do not count.

    Returns:
        The four digits as a string, or ``np.nan`` when nothing matches.
    """
    found = re.search(r'\b\d{4}\b', str(source))
    return found.group(0) if found else np.nan

In [8]:
# Add a 'Year' column to every sheet, derived from its 'Source' text.
for sheet in (Pacific, Atlantic, Mediterranean, Southern, Arctic):
    sheet['Year'] = sheet['Source'].apply(extract_year)

In [None]:
# Template: df1 and df2 stand for the two dataframes to combine.
# NOTE(review): neither name is assigned in the visible cells; bind them
# before running this cell.
# Give the coordinate columns the names used as merge keys.
df_npp_means_new = df_npp_means_new.rename(columns={"Latitude": "Lat", "Longitude": "Long"})

# Round latitude and longitude in both dataframes to four decimal places so
# the float keys can compare equal.
for frame in (df1, df2):
    for coord in ('Lat', 'Long'):
        frame[coord] = frame[coord].round(4)

# Inner join: keep only rows whose (Lat, Long, Year) occur in both frames.
# NOTE(review): both frames need a 'Year' column; the wide table built above
# has one column per year instead, so it would need reshaping first.
keys = ['Lat', 'Long', 'Year']
merged_df = pd.merge(df1, df2, how='inner', left_on=keys, right_on=keys)



In [None]:
# fillna with np.mean()