# SeaWIFS

In [15]:
import requests
import os

def download_file(url, filename):
    if os.path.exists(filename):
        print(f"File '{filename}' already exists. Skipping download.")
        return

    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

# make sure 'data/' directory exists
if not os.path.exists('data/'):
    os.makedirs('data/')

base_url = 'http://orca.science.oregonstate.edu/data/1x2/monthly/par.r2014.seawifs/hdf/par.s.{}.tar'

for year in range(1997, 2011):  # 2011 is not included
    url = base_url.format(year)
    filename = f'data/par.s.{year}.tar'
    download_file(url, filename)


File 'data/par.s.1997.tar' already exists. Skipping download.
File 'data/par.s.1998.tar' already exists. Skipping download.
File 'data/par.s.1999.tar' already exists. Skipping download.
File 'data/par.s.2000.tar' already exists. Skipping download.
File 'data/par.s.2001.tar' already exists. Skipping download.
File 'data/par.s.2002.tar' already exists. Skipping download.
File 'data/par.s.2003.tar' already exists. Skipping download.
File 'data/par.s.2004.tar' already exists. Skipping download.
File 'data/par.s.2005.tar' already exists. Skipping download.
File 'data/par.s.2006.tar' already exists. Skipping download.
File 'data/par.s.2007.tar' already exists. Skipping download.
File 'data/par.s.2008.tar' already exists. Skipping download.
File 'data/par.s.2009.tar' already exists. Skipping download.
File 'data/par.s.2010.tar' already exists. Skipping download.


## Step two: calculate the mean of whole data

In [3]:
conda update numpy

Retrieving notices: ...working... ERROR conda.notices.fetch:get_channel_notice_response(67): Request error <HTTPSConnectionPool(host='repo.anaconda.com', port=443): Max retries exceeded with url: /pkgs/main/notices.json (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 403 Forbidden')))> for channel: defaults url: https://repo.anaconda.com/pkgs/main/notices.json
ERROR conda.notices.fetch:get_channel_notice_response(67): Request error <HTTPSConnectionPool(host='repo.anaconda.com', port=443): Max retries exceeded with url: /pkgs/r/notices.json (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 403 Forbidden')))> for channel: defaults url: https://repo.anaconda.com/pkgs/r/notices.json
done
Collecting package metadata (current_repodata.json): failed

ProxyError: Conda cannot proceed due to an error in your proxy configuration.
Check for typos and other configuration errors in any '.netrc' file in your home directory,
any 

In [4]:
conda update pyhdf

Collecting package metadata (current_repodata.json): failed

ProxyError: Conda cannot proceed due to an error in your proxy configuration.
Check for typos and other configuration errors in any '.netrc' file in your home directory,
any environment variables ending in '_PROXY', and any other system-wide proxy
configuration settings.



Note: you may need to restart the kernel to use updated packages.


In [29]:
import os
import tarfile
import gzip
import tempfile
import pandas as pd
import numpy as np
from pyhdf.SD import SD, SDC

# Define the latitude and longitude arrays
lats = np.linspace(90, -90, 1080)
lons = np.linspace(-180, 180, 2160)

# Create a meshgrid of latitudes and longitudes
lon_grid, lat_grid = np.meshgrid(lons, lats, indexing='ij')

# Flatten the arrays for dataframe construction
lat_flat = lat_grid.flatten()
lon_flat = lon_grid.flatten()

# Create a multi-index from the latitude and longitude arrays
index = pd.MultiIndex.from_arrays([lat_flat, lon_flat], names=['Lat', 'Long'])

# Create an empty DataFrame with this index
df = pd.DataFrame(index=index)

# Loop through each tar file
for year in range(1997, 2011):
    # Open the tar file
    tar_filename = f'data/par.s.{year}.tar'
    with tarfile.open(tar_filename, "r:") as tar:

         # Create an empty DataFrame to store data for this year
        temp_df = pd.DataFrame(index=index)

        # Loop through each member of the tar file
        for member in tar.getmembers():
            # If it's a .hdf.gz file
            if member.name.endswith('.hdf.gz'):
                # Open the .hdf.gz file
                f = tar.extractfile(member)
                with gzip.open(f, 'rb') as gz:
                    # Decompress the .hdf.gz file to a temporary file
                    with tempfile.NamedTemporaryFile() as tmp:
                        tmp.write(gz.read())
                        tmp.seek(0)  # Go back to the start of the file

                        # Open the temporary .hdf file
                        hdf_file = SD(tmp.name, SDC.READ)

                        # Access the 'npp' dataset
                        data = hdf_file.select('par')[:]

                        # Replace '-9999.0' with NaN
                        data[data == -9999.0] = np.nan

                        # Flatten the data and add it to the temporary DataFrame
                        data_flat = data.flatten()
                        temp_df[member.name] = data_flat

                        # Close the file
                        hdf_file.end()

        # After going through all files for the year, calculate the mean for each location (ignoring NaNs)
        mean_data = temp_df.mean(axis=1, skipna=True)

        # Add this DataFrame to the main DataFrame
        df[str(year)] = mean_data

# Reset the index of the DataFrame, making 'Lat' and 'Long' normal columns
df.reset_index(inplace=True)
                      

# Save the DataFrame to a CSV file
df.to_csv('par_means_s.csv', index=False)


In [45]:
df_par_means = pd.read_csv('par_means_s.csv')
df_par_means.head()

Unnamed: 0,Lat,Long,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
0,90.0,-180.0,,,,,,,,,,,,,,
1,89.833179,-180.0,,,,,,,,,,,,,,
2,89.666358,-180.0,,,,,,,,,,,,,,
3,89.499537,-180.0,,,,,,,,,,,,,,
4,89.332715,-180.0,,,,,,,,,,,,,,


In [31]:
df_par_means.describe()

Unnamed: 0,Lat,Long,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
count,2332800.0,2332800.0,1328358.0,1431556.0,1434942.0,1433280.0,1428070.0,1525700.0,1559677.0,1564153.0,1571626.0,1570545.0,1587869.0,1579729.0,1573119.0,1581705.0
mean,1.347462e-14,5.988442e-16,36.4868,32.93971,32.86141,32.72164,32.57054,34.04737,33.59226,33.40154,33.17612,33.29809,32.87351,33.2142,33.20171,33.01338
std,52.00967,103.9712,14.37401,10.81675,10.97235,11.10469,11.37511,11.46245,11.67125,11.73296,11.86311,11.79777,12.0575,11.93083,11.84485,11.92258
min,-90.0,-180.0,0.001999,0.011993,0.211998,0.029999,0.011993,0.03266356,0.05799689,0.07399689,0.06799689,0.0789969,0.04599689,0.1459969,0.01899689,0.09199689
25%,-45.0,-90.0,29.55235,22.42377,22.30057,21.97562,21.59606,23.79849,22.68313,22.30756,22.21823,22.10608,21.71053,22.092,22.20668,22.07667
50%,7.105427e-15,-1.425249e-14,39.71651,33.52026,33.45121,33.4947,33.71332,35.78067,34.60883,34.5997,34.09866,34.17763,34.01879,34.3645,34.01798,34.08301
75%,45.0,90.0,47.83021,43.51616,43.18112,43.27298,43.26261,43.90462,44.35869,44.29033,44.07848,44.23293,43.95175,44.11874,44.14447,44.00641
max,90.0,180.0,66.248,64.09,63.074,63.104,64.222,67.2,68.12823,69.272,67.89,67.126,69.144,69.416,67.874,67.258


## Step three: read cleaned excel

In [32]:
# read new data
# read each page from excel file
import pandas as pd
import numpy as np
# find the path of the file called 'cleaned.xlsx'
filename='cleaned'
Pacific=pd.read_excel(filename+'.xlsx',sheet_name='Pacific')
Atlantic=pd.read_excel(filename+'.xlsx',sheet_name='Atlantic')
Mediterranean=pd.read_excel(filename+'.xlsx',sheet_name='Mediterranean')
Southern=pd.read_excel(filename+'.xlsx',sheet_name='Southern Ocean')
Arctic=pd.read_excel(filename+'.xlsx',sheet_name='Arctic')

## Step four: Melt for merge

In [33]:
df_par_melt = df_par_means.melt(id_vars=['Lat', 'Long'], var_name='Year', value_name='mld')

## Step five: Select decimal

In [34]:
# Then merge on these columns
# assuming df_melted_variable1-5 and ocean_dfs are your dataframes
ocean_names = ["Pacific", "Atlantic", "Mediterranean", "Southern", "Arctic"]
# change this to your dataframes
df_melted_list = [df_par_melt]
ocean_dfs = [Pacific, Atlantic, Mediterranean, Southern, Arctic]


for df in df_melted_list + ocean_dfs:
    # convert types safely
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce').astype('Int64')
    df['Lat'] = pd.to_numeric(df['Lat'], errors='coerce')
    df['Long'] = pd.to_numeric(df['Long'], errors='coerce')


# round lat and long to 0.0001
# I believe round is a way of thresholding the data, let me know if there is a better wayof matching 
# the lat and longs
for df in df_melted_list + ocean_dfs:
    df['Lat'] = df['Lat'].round(1)
    df['Long'] = df['Long'].round(1)

In [35]:
# check if there is a match

# Create a set of tuples for the Pacific dataframe
pacific_coords = set(zip(Pacific['Lat'], Pacific['Long']))

# Create a set of tuples for the df_npp dataframe
npp_coords = set(zip(df_par_melt['Lat'], df_par_melt['Long']))

# Find the common coordinates
common_coords = pacific_coords.intersection(npp_coords)

print(f"Common coordinates between Pacific and df_npp: {len(common_coords)}")


Common coordinates between Pacific and df_npp: 16


## Step six: Merge

In [36]:
import pandas as pd
import numpy as np

merged_dfs = []  # List to store merged dataframes

for ocean_name, ocean_df in zip(ocean_names, ocean_dfs):
    # merge ocean_df with all dataframes in df_melted_list
    for df_melted in df_melted_list:
        ocean_df = pd.merge(ocean_df, df_melted, on=['Lat', 'Long', 'Year'], how='left')

    # Fill NaN values with the mean of each column
    for col in ocean_df.select_dtypes(include=[np.number]).columns:
        ocean_df[col] = ocean_df[col].astype(float)
        ocean_df[col].fillna(ocean_df[col].mean(), inplace=True)
        if ocean_df[col].apply(float.is_integer).all():  # Check if all values are integer
            ocean_df[col] = ocean_df[col].astype('Int64')  # Change dtype back to integer

    merged_dfs.append(ocean_df)  # Append the merged dataframe to the list

with pd.ExcelWriter('merged_par_s.xlsx') as writer:
    for ocean_name, merged_df in zip(ocean_names, merged_dfs):
        merged_df.to_excel(writer, sheet_name=ocean_name)  # Write each merged dataframe to a different sheet


# MODIS

In [25]:
import requests
import os

def download_file(url, filename):
    if os.path.exists(filename):
        print(f"File '{filename}' already exists. Skipping download.")
        return

    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

# make sure 'data/' directory exists
if not os.path.exists('data/'):
    os.makedirs('data/')


# download the data from MODIS:  http://orca.science.oregonstate.edu/1080.by.2160.monthly.hdf.vgpm.m.chl.m.sst.php

new_url = 'http://orca.science.oregonstate.edu/data/1x2/monthly/par.modis.r2022/hdf/par.m.{}.tar'

for year in range(2002, 2024):  # 2024 is not included
    url = new_url.format(year)
    filename = f'data/par.s.{year}.tar'
    download_file(url, filename)

File 'data/par.s.2002.tar' already exists. Skipping download.
File 'data/par.s.2003.tar' already exists. Skipping download.
File 'data/par.s.2004.tar' already exists. Skipping download.
File 'data/par.s.2005.tar' already exists. Skipping download.
File 'data/par.s.2006.tar' already exists. Skipping download.
File 'data/par.s.2007.tar' already exists. Skipping download.
File 'data/par.s.2008.tar' already exists. Skipping download.
File 'data/par.s.2009.tar' already exists. Skipping download.
File 'data/par.s.2010.tar' already exists. Skipping download.
File 'data/par.s.2011.tar' already exists. Skipping download.
File 'data/par.s.2012.tar' already exists. Skipping download.
File 'data/par.s.2013.tar' already exists. Skipping download.
File 'data/par.s.2014.tar' already exists. Skipping download.
File 'data/par.s.2015.tar' already exists. Skipping download.
File 'data/par.s.2016.tar' already exists. Skipping download.
File 'data/par.s.2017.tar' already exists. Skipping download.
File 'da

In [46]:
# Define the latitude and longitude arrays
lats = np.linspace(90, -90, 1080)
lons = np.linspace(-180, 180, 2160)

# Create a meshgrid of latitudes and longitudes
lon_grid, lat_grid = np.meshgrid(lons, lats, indexing='ij')

# Flatten the arrays for dataframe construction
lat_flat = lat_grid.flatten()
lon_flat = lon_grid.flatten()

# Create a multi-index from the latitude and longitude arrays
index = pd.MultiIndex.from_arrays([lat_flat, lon_flat], names=['Lat', 'Long'])

# Create an empty DataFrame with this index
df = pd.DataFrame(index=index)

# Loop through each tar file
for year in range(1997, 2024):
    # Open the tar file
    tar_filename = f'data/par.s.{year}.tar'
    with tarfile.open(tar_filename, "r:") as tar:

         # Create an empty DataFrame to store data for this year
        temp_df = pd.DataFrame(index=index)

        # Loop through each member of the tar file
        for member in tar.getmembers():
            # If it's a .hdf.gz file
            if member.name.endswith('.hdf.gz'):
                # Open the .hdf.gz file
                f = tar.extractfile(member)
                with gzip.open(f, 'rb') as gz:
                    # Decompress the .hdf.gz file to a temporary file
                    with tempfile.NamedTemporaryFile() as tmp:
                        tmp.write(gz.read())
                        tmp.seek(0)  # Go back to the start of the file

                        # Open the temporary .hdf file
                        hdf_file = SD(tmp.name, SDC.READ)

                        # Access the 'npp' dataset
                        data = hdf_file.select('par')[:]

                        # Replace '-9999.0' with NaN
                        data[data == -9999.0] = np.nan

                        # Flatten the data and add it to the temporary DataFrame
                        data_flat = data.flatten()
                        temp_df[member.name] = data_flat

                        # Close the file
                        hdf_file.end()

        # After going through all files for the year, calculate the mean for each location (ignoring NaNs)
        mean_data = temp_df.mean(axis=1, skipna=True)

        # Add this DataFrame to the main DataFrame
        df[str(year)] = mean_data

# Reset the index of the DataFrame, making 'Lat' and 'Long' normal columns
df.reset_index(inplace=True)
                      

# Save the DataFrame to a CSV file
df.to_csv('par_means_m.csv', index=False)


In [47]:
df_par_means_m = pd.read_csv('par_means_m.csv')
df_par_means_m.head()

Unnamed: 0,Lat,Long,1997,1998,1999,2000,2001,2002,2003,2004,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,90.0,-180.0,,,,,,,,,...,,,,,,,,,,
1,89.833179,-180.0,,,,,,,,,...,,,,,,,,,,
2,89.666358,-180.0,,,,,,,,,...,,,,,,,,,,
3,89.499537,-180.0,,,,,,,,,...,,,,,,,,,,
4,89.332715,-180.0,,,,,,,,,...,,,,,,,,,,


In [48]:
df_par_means_m.describe()

Unnamed: 0,Lat,Long,1997,1998,1999,2000,2001,2002,2003,2004,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
count,2332800.0,2332800.0,1328358.0,1431556.0,1434942.0,1433280.0,1428070.0,1525700.0,1559677.0,1564153.0,...,1574863.0,1597800.0,1618205.0,1605778.0,1609688.0,1616116.0,1629161.0,1616048.0,1615011.0,1349397.0
mean,1.347462e-14,5.988442e-16,36.4868,32.93971,32.86141,32.72164,32.57054,34.04737,33.59226,33.40154,...,33.13329,33.33278,33.1581,33.26634,33.23867,33.25779,32.90469,33.24599,33.46675,39.45527
std,52.00967,103.9712,14.37401,10.81675,10.97235,11.10469,11.37511,11.46245,11.67125,11.73296,...,11.98258,11.69108,11.74541,11.75228,11.76685,11.76853,12.03324,11.72796,11.44901,15.70208
min,-90.0,-180.0,0.001999,0.011993,0.211998,0.029999,0.011993,0.03266356,0.05799689,0.07399689,...,0.01399689,0.2239969,0.02199689,0.04199689,0.05399689,0.1229969,0.07599689,0.02599689,0.02799689,0.01899689
25%,-45.0,-90.0,29.55235,22.42377,22.30057,21.97562,21.59606,23.79849,22.68313,22.30756,...,22.16235,22.47413,22.46878,22.46191,22.24007,22.42867,22.0577,22.43214,23.18759,32.23234
50%,7.105427e-15,-1.425249e-14,39.71651,33.52026,33.45121,33.4947,33.71332,35.78067,34.60883,34.5997,...,33.99547,34.14306,33.73656,34.12461,34.18643,33.99836,33.82202,34.05156,34.16727,41.42003
75%,45.0,90.0,47.83021,43.51616,43.18112,43.27298,43.26261,43.90462,44.35869,44.29033,...,44.26788,44.31431,44.26306,44.05948,44.10228,44.28291,43.93111,44.0021,43.87196,51.759
max,90.0,180.0,66.248,64.09,63.074,63.104,64.222,67.2,68.12823,69.272,...,68.856,67.7615,65.544,69.148,66.582,68.876,67.76,68.63,67.804,68.52


In [49]:
# read new data
# read each page from excel file
import pandas as pd
import numpy as np
# find the path of the file called 'cleaned.xlsx'
filename='cleaned'
Pacific=pd.read_excel(filename+'.xlsx',sheet_name='Pacific')
Atlantic=pd.read_excel(filename+'.xlsx',sheet_name='Atlantic')
Mediterranean=pd.read_excel(filename+'.xlsx',sheet_name='Mediterranean')
Southern=pd.read_excel(filename+'.xlsx',sheet_name='Southern Ocean')
Arctic=pd.read_excel(filename+'.xlsx',sheet_name='Arctic')

In [50]:
df_par_melt_m = df_par_means_m.melt(id_vars=['Lat', 'Long'], var_name='Year', value_name='mld')

In [51]:
# Then merge on these columns
# assuming df_melted_variable1-5 and ocean_dfs are your dataframes
ocean_names = ["Pacific", "Atlantic", "Mediterranean", "Southern", "Arctic"]
# change this to your dataframes
df_melted_list = [df_par_melt_m]
ocean_dfs = [Pacific, Atlantic, Mediterranean, Southern, Arctic]


for df in df_melted_list + ocean_dfs:
    # convert types safely
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce').astype('Int64')
    df['Lat'] = pd.to_numeric(df['Lat'], errors='coerce')
    df['Long'] = pd.to_numeric(df['Long'], errors='coerce')


# round lat and long to 0.0001
# I believe round is a way of thresholding the data, let me know if there is a better wayof matching 
# the lat and longs
for df in df_melted_list + ocean_dfs:
    df['Lat'] = df['Lat'].round(1)
    df['Long'] = df['Long'].round(1)

In [52]:
# check if there is a match

# Create a set of tuples for the Pacific dataframe
pacific_coords = set(zip(Pacific['Lat'], Pacific['Long']))

# Create a set of tuples for the df_npp dataframe
npp_coords = set(zip(df_par_melt_m['Lat'], df_par_melt_m['Long']))

# Find the common coordinates
common_coords = pacific_coords.intersection(npp_coords)

print(f"Common coordinates between Pacific and df_npp: {len(common_coords)}")


Common coordinates between Pacific and df_npp: 16


In [53]:
import pandas as pd
import numpy as np

merged_dfs = []  # List to store merged dataframes

for ocean_name, ocean_df in zip(ocean_names, ocean_dfs):
    # merge ocean_df with all dataframes in df_melted_list
    for df_melted in df_melted_list:
        ocean_df = pd.merge(ocean_df, df_melted, on=['Lat', 'Long', 'Year'], how='left')

    # Fill NaN values with the mean of each column
    for col in ocean_df.select_dtypes(include=[np.number]).columns:
        ocean_df[col] = ocean_df[col].astype(float)
        ocean_df[col].fillna(ocean_df[col].mean(), inplace=True)
        if ocean_df[col].apply(float.is_integer).all():  # Check if all values are integer
            ocean_df[col] = ocean_df[col].astype('Int64')  # Change dtype back to integer

    merged_dfs.append(ocean_df)  # Append the merged dataframe to the list

with pd.ExcelWriter('merged_par_m.xlsx') as writer:
    for ocean_name, merged_df in zip(ocean_names, merged_dfs):
        merged_df.to_excel(writer, sheet_name=ocean_name)  # Write each merged dataframe to a different sheet
