## MODIS (1080x2160)

In [None]:
import requests
import os
from bs4 import BeautifulSoup
import urllib.parse

def download_file(url, filename, timeout=60):
    """Stream the resource at ``url`` into the file ``filename``.

    The body is written in 8 KiB chunks to ``filename + '.part'`` and only
    renamed to ``filename`` once the transfer has finished, so an interrupted
    download never leaves a truncated file under the final name.

    Args:
        url: Address fetched with an HTTP GET.
        filename: Destination path. An existing file is replaced.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``;
            without one a stalled server would block the call indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: For connection problems and timeouts.
        OSError: If the destination cannot be written or renamed.
    """
    import os  # local import keeps the function self-contained

    partial_path = f'{filename}.part'
    # The context manager releases the streamed connection even on errors.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(partial_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    os.replace(partial_path, filename)

# Make sure the 'data/' directory exists. exist_ok=True makes this a no-op when
# the directory is already there and avoids the check-then-create race.
os.makedirs('data/', exist_ok=True)
    

# Fetch the MODIS SST archives (1080x2160), one tar file per year.
# Landing page: http://orca.science.oregonstate.edu/1080.by.2160.monthly.hdf.sst.modis.php

modis_url = 'http://orca.science.oregonstate.edu/data/1x2/monthly/sst.modis.r2022/hdf/sst.m.{}.tar'

# Years 2002 through 2022; the upper bound of range() is exclusive.
for year in range(2002, 2023):
    filename = f'data/sst.m.{year}.tar'
    url = modis_url.format(year)
    download_file(url, filename)
    print(f"Downloaded: {filename}")


# Download the satellite data (1080x2160): http://orca.science.oregonstate.edu/1080.by.2160.monthly.hdf.vgpm.m.chl.m.sst.php

#base_url = 'http://orca.science.oregonstate.edu/data/1x2/monthly/vgpm.r2022.m.chl.m.sst/hdf/vgpm.m.{}.tar'

#for year in range(2002, 2023):  # 2023 is not included
    #url = base_url.format(year)
    #filename = f'data/vgpm.m.{year}.tar'
    #download_file(url, filename)
    #print(f"Downloaded: {filename}")

### Calculating Mean

In [None]:
import os
import tarfile
import gzip
import tempfile
!pip install pyhdf
from pyhdf.SD import *
import pandas as pd
import numpy as np

# Yearly mean SST per grid cell, written to 'sst_m_means.csv' with the columns
# 'Lat', 'Long' and one column per year.

# Grid coordinates: 1080 latitudes from 90 down to -90 and 2160 longitudes from
# -180 up to 180.
# NOTE(review): linspace includes both end points, so the spacing is 180/1079
# and 360/2159 degrees. Confirm against the data provider's grid definition
# (cell edges vs. cell centres).
lats = np.linspace(90, -90, 1080)
lons = np.linspace(-180, 180, 2160)

# With indexing='ij' and (lats, lons) both grids have shape
# (len(lats), len(lons)). Flattening them therefore walks the coordinates in
# the same row-major order as ``data.flatten()`` below. Passing (lons, lats)
# instead would transpose the grid and pair every value with the wrong
# coordinate.
# Assumes each 'sst' array is stored as (latitude rows, longitude columns);
# the shape check in _read_sst enforces the dimensions.
lat_grid, lon_grid = np.meshgrid(lats, lons, indexing='ij')

# Flatten the arrays for dataframe construction
lat_flat = lat_grid.flatten()
lon_flat = lon_grid.flatten()

# One (Lat, Long) index entry per grid cell
index = pd.MultiIndex.from_arrays([lat_flat, lon_flat], names=['Lat', 'Long'])

# Result frame: filled with one column per year below
df = pd.DataFrame(index=index)


def _read_sst(tar, member, expected_shape):
    """Return the 'sst' array of one gzipped HDF member of an open tar file.

    The member is decompressed into a named temporary file because ``SD`` is
    given a file name rather than a file object. Values equal to -9999.0 are
    replaced with NaN.

    Raises:
        ValueError: If the array shape differs from ``expected_shape``.
    """
    with gzip.open(tar.extractfile(member), 'rb') as gz:
        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(gz.read())
            tmp.flush()  # make sure every byte is on disk before reopening by name

            hdf_file = SD(tmp.name, SDC.READ)
            try:
                data = hdf_file.select('sst')[:]
            finally:
                # Close the HDF handle even if reading the dataset fails
                hdf_file.end()

    if data.shape != expected_shape:
        raise ValueError(
            f"{member.name}: 'sst' has shape {data.shape}, "
            f"expected {expected_shape}"
        )

    data[data == -9999.0] = np.nan
    return data


# Loop through each yearly tar file
for year in range(2002, 2023):
    tar_filename = f'data/sst.m.{year}.tar'

    # One column per .hdf.gz member found in this year's archive
    temp_df = pd.DataFrame(index=index)

    with tarfile.open(tar_filename, "r:") as tar:
        for member in tar.getmembers():
            if member.name.endswith('.hdf.gz'):
                temp_df[member.name] = _read_sst(tar, member, lat_grid.shape).flatten()

    # Mean over all files of the year for each location, ignoring NaNs
    df[str(year)] = temp_df.mean(axis=1, skipna=True)

# Reset the index of the DataFrame, making 'Lat' and 'Long' normal columns
df.reset_index(inplace=True)

# Save the DataFrame to a CSV file
df.to_csv('sst_m_means.csv', index=False)

In [None]:
# Load the yearly means from 'sst_m_means.csv' (the file written by the
# mean-calculation cell) so the following cells can start from the CSV.
df_sst_m_means = pd.read_csv('sst_m_means.csv')
# Last expression of the cell: shows the first rows in the notebook output.
df_sst_m_means.head()

In [None]:
# Summary statistics per column, shown as the notebook cell output.
df_sst_m_means.describe()

### Read Cleaned Excel

In [7]:
# read new data
# read each page from excel file
import pandas as pd
import numpy as np

filename='cleaned'

# Passing a list of sheet names makes read_excel parse the workbook once and
# return a dict {sheet name: DataFrame}, instead of reopening the file for
# every sheet.
_ocean_sheets = pd.read_excel(
    filename+'.xlsx',
    sheet_name=['Pacific', 'Atlantic', 'Mediterranean', 'Southern Ocean', 'Arctic'],
)
Pacific=_ocean_sheets['Pacific']
Atlantic=_ocean_sheets['Atlantic']
Mediterranean=_ocean_sheets['Mediterranean']
Southern=_ocean_sheets['Southern Ocean']
Arctic=_ocean_sheets['Arctic']


### Melt for Merge

In [8]:
# Reshape from wide to long: the non-identifier columns become rows, with the
# former column name in 'Year' and its value in 'sst'.
df_sst_m_melt = pd.melt(
    df_sst_m_means,
    id_vars=['Lat', 'Long'],
    var_name='Year',
    value_name='sst',
)

### Select Decimal

In [9]:
# Prepare the join keys ('Lat', 'Long', 'Year') so the ocean sheets and the
# melted SST frame can be merged on them.
ocean_names = ["Pacific", "Atlantic", "Mediterranean", "Southern", "Arctic"]

# change this to your dataframes
df_melted_list = [df_sst_m_melt]
ocean_dfs = [Pacific, Atlantic, Mediterranean, Southern, Arctic]

# Coerce the keys to numeric (unparseable entries become missing values) and
# round the coordinates to ONE decimal place (0.1 degree) so that both sides
# use the same key values. The frames are modified in place.
# The loop variable is deliberately not called `df`, which would overwrite the
# notebook-level `df` holding the yearly means.
# NOTE(review): the SST grid is spaced coarser than 0.1 degree (180/1079 in
# latitude), so after rounding not every tenth of a degree exists in the grid
# and observations falling on such values will find no match in the merge.
# Snapping each observation to its nearest grid cell would be more robust.
for frame in df_melted_list + ocean_dfs:
    frame['Year'] = pd.to_numeric(frame['Year'], errors='coerce').astype('Int64')
    frame['Lat'] = pd.to_numeric(frame['Lat'], errors='coerce').round(1)
    frame['Long'] = pd.to_numeric(frame['Long'], errors='coerce').round(1)

In [None]:
# Sanity check before merging: how many distinct (Lat, Long) pairs of the
# Pacific sheet also occur in the melted SST frame?
pacific_coords = set(zip(Pacific['Lat'], Pacific['Long']))

# The melted frame repeats every grid cell once per year, so reduce it to its
# distinct coordinate pairs in pandas before building the Python set.
# (The name `npp_coords` is kept in case other cells refer to it; it holds the
# SST grid coordinates.)
_sst_pairs = df_sst_m_melt[['Lat', 'Long']].drop_duplicates()
npp_coords = set(zip(_sst_pairs['Lat'], _sst_pairs['Long']))

# Find the common coordinates
common_coords = pacific_coords.intersection(npp_coords)

print(f"Common coordinates between Pacific and df_sst_m_melt: {len(common_coords)}")

### Merge

In [11]:
import pandas as pd
import numpy as np

merged_dfs = []  # One merged dataframe per ocean, in the order of ocean_dfs

for ocean_df in ocean_dfs:
    # Left-merge every melted frame onto the ocean sheet: all ocean rows are
    # kept, rows without a matching (Lat, Long, Year) get NaN in the new columns.
    for df_melted in df_melted_list:
        ocean_df = pd.merge(ocean_df, df_melted, on=['Lat', 'Long', 'Year'], how='left')

    # Fill NaN values of every numeric column with that column's mean.
    # NOTE(review): this also imputes unmatched 'sst' values (and any missing
    # key columns) with the column mean; confirm that is intended.
    for col in ocean_df.select_dtypes(include=[np.number]).columns:
        values = ocean_df[col].astype(float)
        # Assign the result back instead of calling fillna(inplace=True) on the
        # column selection: the in-place form acts on a temporary object and
        # does not update ocean_df when pandas Copy-on-Write is active.
        values = values.fillna(values.mean())
        # Whole-number columns go back to the nullable integer dtype
        # (x % 1 == 0 is False for NaN and infinities, like float.is_integer).
        if values.mod(1).eq(0).all():
            values = values.astype('Int64')
        ocean_df[col] = values

    merged_dfs.append(ocean_df)

with pd.ExcelWriter('merged_sst_m.xlsx') as writer:
    for ocean_name, merged_df in zip(ocean_names, merged_dfs):
        merged_df.to_excel(writer, sheet_name=ocean_name)  # One sheet per ocean

## SeaWIFS/AVHRR (1024x2048)

In [None]:
import requests
import os
from bs4 import BeautifulSoup
import urllib.parse

def download_file(url, filename, timeout=60):
    """Stream the resource at ``url`` into the file ``filename``.

    The body is written in 8 KiB chunks to ``filename + '.part'`` and only
    renamed to ``filename`` once the transfer has finished, so an interrupted
    download never leaves a truncated file under the final name.

    Args:
        url: Address fetched with an HTTP GET.
        filename: Destination path. An existing file is replaced.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``;
            without one a stalled server would block the call indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: For connection problems and timeouts.
        OSError: If the destination cannot be written or renamed.
    """
    import os  # local import keeps the function self-contained

    partial_path = f'{filename}.part'
    # The context manager releases the streamed connection even on errors.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(partial_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    os.replace(partial_path, filename)

# Make sure the 'data/' directory exists. exist_ok=True makes this a no-op when
# the directory is already there and avoids the check-then-create race.
os.makedirs('data/', exist_ok=True)


# Fetch the AVHRR SST archives, one tar file per year.
# Landing page: http://orca.science.oregonstate.edu/1080.by.2160.monthly.hdf.sst.avhrr.php

seawifs_url = 'http://orca.science.oregonstate.edu/data/1x2/monthly/sst.avhrr/hdf/sst.a.{}.tar'

# Years 1997 through 2001; the upper bound of range() (2002) is exclusive.
for year in range(1997, 2002):
    filename = f'data/sst.a.{year}.tar'
    url = seawifs_url.format(year)
    download_file(url, filename)
    print(f"Downloaded: {filename}")


# Download satellite data (1080x2160): http://orca.science.oregonstate.edu/1080.by.2160.monthly.hdf.vgpm.s.chl.a.sst.php

#new_url = 'http://orca.science.oregonstate.edu/data/1x2/monthly/vgpm.r2014.s.chl.a.sst/hdf/vgpm.s.{}.tar'

#for year in range(1997, 2002):  # 2002 is not included
    #url = new_url.format(year)
    #filename = f'data/vgpm.s.{year}.tar'
    #download_file(url, filename)
    #print(f"Downloaded: {filename}")

### Calculating Mean

In [None]:
import os
import tarfile
import gzip
import tempfile
!pip install pyhdf
from pyhdf.SD import *
import pandas as pd
import numpy as np

# Yearly mean SST per grid cell, written to 'sst_a_means.csv' with the columns
# 'Lat', 'Long' and one column per year.

# Grid coordinates: 1024 latitudes from 90 down to -90 and 2048 longitudes from
# -180 up to 180.
# NOTE(review): the archives read below come from a page named
# "1080.by.2160..." and a "data/1x2/" path, which does not obviously agree with
# a 1024x2048 grid. Confirm the grid size; the shape check in _read_sst fails
# with a clear message if the arrays have different dimensions.
# NOTE(review): linspace includes both end points (cell edges vs. cell centres);
# confirm against the data provider's grid definition.
lats = np.linspace(90, -90, 1024)
lons = np.linspace(-180, 180, 2048)

# With indexing='ij' and (lats, lons) both grids have shape
# (len(lats), len(lons)). Flattening them therefore walks the coordinates in
# the same row-major order as ``data.flatten()`` below. Passing (lons, lats)
# instead would transpose the grid and pair every value with the wrong
# coordinate.
# Assumes each 'sst' array is stored as (latitude rows, longitude columns);
# the shape check in _read_sst enforces the dimensions.
lat_grid, lon_grid = np.meshgrid(lats, lons, indexing='ij')

# Flatten the arrays for dataframe construction
lat_flat = lat_grid.flatten()
lon_flat = lon_grid.flatten()

# One (Lat, Long) index entry per grid cell
index = pd.MultiIndex.from_arrays([lat_flat, lon_flat], names=['Lat', 'Long'])

# Result frame: filled with one column per year below
df = pd.DataFrame(index=index)


def _read_sst(tar, member, expected_shape):
    """Return the 'sst' array of one gzipped HDF member of an open tar file.

    The member is decompressed into a named temporary file because ``SD`` is
    given a file name rather than a file object. Values equal to -9999.0 are
    replaced with NaN.

    Raises:
        ValueError: If the array shape differs from ``expected_shape``.
    """
    with gzip.open(tar.extractfile(member), 'rb') as gz:
        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(gz.read())
            tmp.flush()  # make sure every byte is on disk before reopening by name

            hdf_file = SD(tmp.name, SDC.READ)
            try:
                data = hdf_file.select('sst')[:]
            finally:
                # Close the HDF handle even if reading the dataset fails
                hdf_file.end()

    if data.shape != expected_shape:
        raise ValueError(
            f"{member.name}: 'sst' has shape {data.shape}, "
            f"expected {expected_shape}"
        )

    data[data == -9999.0] = np.nan
    return data


# Loop through each yearly tar file
for year in range(1997, 2002):
    tar_filename = f'data/sst.a.{year}.tar'

    # One column per .hdf.gz member found in this year's archive
    temp_df = pd.DataFrame(index=index)

    with tarfile.open(tar_filename, "r:") as tar:
        for member in tar.getmembers():
            if member.name.endswith('.hdf.gz'):
                temp_df[member.name] = _read_sst(tar, member, lat_grid.shape).flatten()

    # Mean over all files of the year for each location, ignoring NaNs
    df[str(year)] = temp_df.mean(axis=1, skipna=True)

# Reset the index of the DataFrame, making 'Lat' and 'Long' normal columns
df.reset_index(inplace=True)

# Save the DataFrame to a CSV file
df.to_csv('sst_a_means.csv', index=False)

In [None]:
# Load the yearly means from 'sst_a_means.csv' (the file written by the
# mean-calculation cell) so the following cells can start from the CSV.
df_sst_a_means = pd.read_csv('sst_a_means.csv')
# Last expression of the cell: shows the first rows in the notebook output.
df_sst_a_means.head()

In [None]:
# Summary statistics per column, shown as the notebook cell output.
df_sst_a_means.describe()

### Read Cleaned Excel

In [17]:
# read new data
# read each page from excel file
import pandas as pd
import numpy as np

filename='cleaned'

# Passing a list of sheet names makes read_excel parse the workbook once and
# return a dict {sheet name: DataFrame}, instead of reopening the file for
# every sheet.
_ocean_sheets = pd.read_excel(
    filename+'.xlsx',
    sheet_name=['Pacific', 'Atlantic', 'Mediterranean', 'Southern Ocean', 'Arctic'],
)
Pacific=_ocean_sheets['Pacific']
Atlantic=_ocean_sheets['Atlantic']
Mediterranean=_ocean_sheets['Mediterranean']
Southern=_ocean_sheets['Southern Ocean']
Arctic=_ocean_sheets['Arctic']

### Melt for Merge

In [18]:
# Reshape from wide to long: the non-identifier columns become rows, with the
# former column name in 'Year' and its value in 'sst'.
df_sst_a_melt = pd.melt(
    df_sst_a_means,
    id_vars=['Lat', 'Long'],
    var_name='Year',
    value_name='sst',
)

### Select Decimal

In [19]:
# Prepare the join keys ('Lat', 'Long', 'Year') so the ocean sheets and the
# melted SST frame can be merged on them.
ocean_names = ["Pacific", "Atlantic", "Mediterranean", "Southern", "Arctic"]

# change this to your dataframes
df_melted_list = [df_sst_a_melt]
ocean_dfs = [Pacific, Atlantic, Mediterranean, Southern, Arctic]

# Coerce the keys to numeric (unparseable entries become missing values) and
# round the coordinates to ONE decimal place (0.1 degree) so that both sides
# use the same key values. The frames are modified in place.
# The loop variable is deliberately not called `df`, which would overwrite the
# notebook-level `df` holding the yearly means.
# NOTE(review): the SST grid is spaced coarser than 0.1 degree, so after
# rounding not every tenth of a degree exists in the grid and observations
# falling on such values will find no match in the merge. Snapping each
# observation to its nearest grid cell would be more robust.
for frame in df_melted_list + ocean_dfs:
    frame['Year'] = pd.to_numeric(frame['Year'], errors='coerce').astype('Int64')
    frame['Lat'] = pd.to_numeric(frame['Lat'], errors='coerce').round(1)
    frame['Long'] = pd.to_numeric(frame['Long'], errors='coerce').round(1)

In [None]:
# Sanity check before merging: how many distinct (Lat, Long) pairs of the
# Pacific sheet also occur in the melted SST frame?
pacific_coords = set(zip(Pacific['Lat'], Pacific['Long']))

# The melted frame repeats every grid cell once per year, so reduce it to its
# distinct coordinate pairs in pandas before building the Python set.
# (The name `npp_coords` is kept in case other cells refer to it; it holds the
# SST grid coordinates.)
_sst_pairs = df_sst_a_melt[['Lat', 'Long']].drop_duplicates()
npp_coords = set(zip(_sst_pairs['Lat'], _sst_pairs['Long']))

# Find the common coordinates
common_coords = pacific_coords.intersection(npp_coords)

print(f"Common coordinates between Pacific and df_sst_a_melt: {len(common_coords)}")

### Merge

In [21]:
import pandas as pd
import numpy as np

merged_dfs = []  # One merged dataframe per ocean, in the order of ocean_dfs

for ocean_df in ocean_dfs:
    # Left-merge every melted frame onto the ocean sheet: all ocean rows are
    # kept, rows without a matching (Lat, Long, Year) get NaN in the new columns.
    for df_melted in df_melted_list:
        ocean_df = pd.merge(ocean_df, df_melted, on=['Lat', 'Long', 'Year'], how='left')

    # Fill NaN values of every numeric column with that column's mean.
    # NOTE(review): this also imputes unmatched 'sst' values (and any missing
    # key columns) with the column mean; confirm that is intended.
    for col in ocean_df.select_dtypes(include=[np.number]).columns:
        values = ocean_df[col].astype(float)
        # Assign the result back instead of calling fillna(inplace=True) on the
        # column selection: the in-place form acts on a temporary object and
        # does not update ocean_df when pandas Copy-on-Write is active.
        values = values.fillna(values.mean())
        # Whole-number columns go back to the nullable integer dtype
        # (x % 1 == 0 is False for NaN and infinities, like float.is_integer).
        if values.mod(1).eq(0).all():
            values = values.astype('Int64')
        ocean_df[col] = values

    merged_dfs.append(ocean_df)

with pd.ExcelWriter('merged_sst_a.xlsx') as writer:
    for ocean_name, merged_df in zip(ocean_names, merged_dfs):
        merged_df.to_excel(writer, sheet_name=ocean_name)  # One sheet per ocean