### MODIS (1080x2160)

In [1]:
# Download the data from MODIS (1080x2160):  http://orca.science.oregonstate.edu/1080.by.2160.monthly.hdf.sst.modis.php

import requests
import os
from bs4 import BeautifulSoup
import urllib.parse

def download_file(url, filename):
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

# Make sure 'data/' directory exists
if not os.path.exists('data/'):
    os.makedirs('data/')

modis_url = 'http://orca.science.oregonstate.edu/data/1x2/monthly/sst.modis.r2022/hdf/sst.m.{}.tar'

for year in range(2002, 2023):  # 2023 is not included
    url = modis_url.format(year)
    filename = f'data/sst.m.{year}.tar'
    download_file(url, filename)
    print(f"Downloaded: {filename}")


# Download the satellite data (1080x2160): http://orca.science.oregonstate.edu/1080.by.2160.monthly.hdf.vgpm.m.chl.m.sst.php

base_url = 'http://orca.science.oregonstate.edu/data/1x2/monthly/vgpm.r2022.m.chl.m.sst/hdf/vgpm.m.{}.tar'

for year in range(2002, 2023):  # 2023 is not included
    url = base_url.format(year)
    filename = f'data/vgpm.m.{year}.tar'
    download_file(url, filename)
    print(f"Downloaded: {filename}")

Downloaded: data/sst.m.2002.tar
Downloaded: data/sst.m.2003.tar
Downloaded: data/sst.m.2004.tar
Downloaded: data/sst.m.2005.tar
Downloaded: data/sst.m.2006.tar
Downloaded: data/sst.m.2007.tar
Downloaded: data/sst.m.2008.tar
Downloaded: data/sst.m.2009.tar
Downloaded: data/sst.m.2010.tar
Downloaded: data/sst.m.2011.tar
Downloaded: data/sst.m.2012.tar
Downloaded: data/sst.m.2013.tar
Downloaded: data/sst.m.2014.tar
Downloaded: data/sst.m.2015.tar
Downloaded: data/sst.m.2016.tar
Downloaded: data/sst.m.2017.tar
Downloaded: data/sst.m.2018.tar
Downloaded: data/sst.m.2019.tar
Downloaded: data/sst.m.2020.tar
Downloaded: data/sst.m.2021.tar
Downloaded: data/sst.m.2022.tar
Downloaded: data/vgpm.m.2002.tar
Downloaded: data/vgpm.m.2003.tar
Downloaded: data/vgpm.m.2004.tar
Downloaded: data/vgpm.m.2005.tar
Downloaded: data/vgpm.m.2006.tar
Downloaded: data/vgpm.m.2007.tar
Downloaded: data/vgpm.m.2008.tar
Downloaded: data/vgpm.m.2009.tar
Downloaded: data/vgpm.m.2010.tar
Downloaded: data/vgpm.m.2011.ta

In [3]:
# Merge the satellite data and MODIS data
# Issue: later while calculating mean of merged data, sst no longer exists
import subprocess
import os
from google.colab import files

# Directory to store the merged TAR files
merged_dir = "merged_tars"
os.makedirs(merged_dir, exist_ok=True)

# Iterate over the years and merge TAR files by year
for year in range(2002, 2023):
    sst_tar_file = f"data/sst.m.{year}.tar"
    vgpm_tar_file = f"data/vgpm.m.{year}.tar"
    
    # Name of merged TAR file
    filename = os.path.join(merged_dir, f"{year}.tar")
    
    # Command to merge TAR files
    command = ["tar", "-cf", filename, sst_tar_file, vgpm_tar_file]
    subprocess.run(command)    
    print(f"Merge complete for {year}.")

    # Download the merged TAR file
    download_file(url, filename)

Merge complete for year 2002.
Merge complete for year 2003.
Merge complete for year 2004.
Merge complete for year 2005.
Merge complete for year 2006.
Merge complete for year 2007.
Merge complete for year 2008.
Merge complete for year 2009.
Merge complete for year 2010.
Merge complete for year 2011.
Merge complete for year 2012.
Merge complete for year 2013.
Merge complete for year 2014.
Merge complete for year 2015.
Merge complete for year 2016.
Merge complete for year 2017.
Merge complete for year 2018.
Merge complete for year 2019.
Merge complete for year 2020.
Merge complete for year 2021.
Merge complete for year 2022.
All merges complete


### SeaWIFS/AVHRR (1024x2048) <-- ISSUE: SeaWIFS VGPM and AVHRR are not the same resolution

In [4]:
# Download the data from SeaWIFS/AVHRR (1024x2048): http://orca.science.oregonstate.edu/1080.by.2160.monthly.hdf.sst.avhrr.php

seawifs_url = 'http://orca.science.oregonstate.edu/data/1x2/monthly/sst.avhrr/hdf/sst.a.{}.tar'

for year in range(1997, 2002):  # 2003 is not included
    url = seawifs_url.format(year)
    filename = f'data/sst.a.{year}.tar'
    download_file(url, filename)
    print(f"Downloaded: {filename}")

# Download satellite data (1080x2160): http://orca.science.oregonstate.edu/1080.by.2160.monthly.hdf.vgpm.s.chl.a.sst.php

new_url = 'http://orca.science.oregonstate.edu/data/1x2/monthly/vgpm.r2014.s.chl.a.sst/hdf/vgpm.s.{}.tar'

for year in range(1997, 2002):  # 2023 is not included
    url = new_url.format(year)
    filename = f'data/vgpm.s.{year}.tar'
    download_file(url, filename)
    print(f"Downloaded: {filename}")

# Merge the satellite data and SeaWIFS/AVHRR data
# Waiting to resolve resolution issue

Downloaded: data/sst.a.1997.tar
Downloaded: data/sst.a.1998.tar
Downloaded: data/sst.a.1999.tar
Downloaded: data/sst.a.2000.tar
Downloaded: data/sst.a.2001.tar
Downloaded: data/sst.a.2002.tar
Downloaded: data/vgpm.s.1997.tar
Downloaded: data/vgpm.s.1998.tar
Downloaded: data/vgpm.s.1999.tar
Downloaded: data/vgpm.s.2000.tar
Downloaded: data/vgpm.s.2001.tar
Downloaded: data/vgpm.s.2002.tar


### Calculating MODIS Mean

In [2]:
import os
import tarfile
import gzip
import tempfile
!pip install pyhdf
from pyhdf.SD import *
import pandas as pd
import numpy as np

# Create a DataFrame to store the results
df = pd.DataFrame()

# Loop through each tar file
for year in range(2002, 2023):
    # Open the tar file
    tar_filename = f'data/sst.m.{year}.tar'
    with tarfile.open(tar_filename, "r:") as tar:
        # Loop through each member of the tar file
        for member in tar.getmembers():
            # If it's a .hdf.gz file
            if member.name.endswith('.hdf.gz'):
                # Open the .hdf.gz file
                f = tar.extractfile(member)
                with gzip.open(f, 'rb') as gz:
                    # Decompress the .hdf.gz file to a temporary file
                    with tempfile.NamedTemporaryFile() as tmp:
                        tmp.write(gz.read())
                        tmp.seek(0)  # Go back to the start of the file

                        # Open the temporary .hdf file
                        hdf_file = SD(tmp.name, SDC.READ)

                        # Access the 'sst' dataset
                        data = hdf_file.select('sst')[:]

                        # Take the mean along the time (2nd) dimension and reshape to a 1D array
                        mean_data = np.mean(data, axis=1).reshape(-1)

                        # Add this to the DataFrame
                        df[str(year)] = mean_data

                        # Close the file
                        hdf_file.end()

# Save the DataFrame to a CSV file
df.to_csv('sst_m_means.csv', index=False)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyhdf
  Downloading pyhdf-0.10.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (739 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m739.8/739.8 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyhdf
Successfully installed pyhdf-0.10.5


In [3]:
# Create a DataFrame to store the results
df = pd.DataFrame()

# Loop through each tar file
for year in range(2002, 2023):
    # Open the tar file
    tar_filename = f'data/sst.m.{year}.tar'
    with tarfile.open(tar_filename, "r:") as tar:
        # Loop through each member of the tar file
        for member in tar.getmembers():
            # If it's a .hdf.gz file
            if member.name.endswith('.hdf.gz'):
                # Open the .hdf.gz file
                f = tar.extractfile(member)
                with gzip.open(f, 'rb') as gz:
                    # Decompress the .hdf.gz file to a temporary file
                    with tempfile.NamedTemporaryFile() as tmp:
                        tmp.write(gz.read())
                        tmp.seek(0)  # Go back to the start of the file

                        # Open the temporary .hdf file
                        hdf_file = SD(tmp.name, SDC.READ)

                        # Access the 'sst' dataset
                        data = hdf_file.select('sst')

                        # Retrieve the latitude and longitude attributes
                        latitude_center = data.attributes()['Latitude Center']
                        longitude_center = data.attributes()['Longitude Center']

                        # Take the mean along the time (2nd) dimension
                        mean_data = np.mean(data[:], axis=1).reshape(-1)

                        # Add this to the DataFrame
                        df_temp = pd.DataFrame({
                            'year': [year] * len(mean_data),
                            'latitude': [latitude_center] * len(mean_data),
                            'longitude': [longitude_center] * len(mean_data),
                            'sst': mean_data
                        })
                        df = pd.concat([df, df_temp])

                        # Close the file
                        hdf_file.end()

# Save the DataFrame to a CSV file
df.to_csv('sst_m_means_new.csv', index=False)


### Calculating Merged MODIS and Satellite Mean

In [8]:
# Waiting to resolve merge issue

import os
import tarfile
import gzip
import tempfile
!pip install pyhdf
from pyhdf.SD import *
import pandas as pd
import numpy as np

# Create a DataFrame to store the results
df = pd.DataFrame()

# Loop through each tar file
for year in range(2002, 2023):
    # Open the tar file
    tar_filename = f'merged_tars/{year}.tar'
    with tarfile.open(tar_filename, "r:") as tar:
        # Loop through each member of the tar file
        for member in tar.getmembers():
            # If it's a .hdf.gz file
            if member.name.endswith('.hdf.gz'):
                # Open the .hdf.gz file
                f = tar.extractfile(member)
                with gzip.open(f, 'rb') as gz:
                    # Decompress the .hdf.gz file to a temporary file
                    with tempfile.NamedTemporaryFile() as tmp:
                        tmp.write(gz.read())
                        tmp.seek(0)  # Go back to the start of the file

                        # Open the temporary .hdf file
                        hdf_file = SD(tmp.name, SDC.READ)

                        # Access the 'sst' dataset
                        data = hdf_file.select('sst')[:]

                        # Take the mean along the time (2nd) dimension and reshape to a 1D array
                        mean_data = np.mean(data, axis=1).reshape(-1)

                        # Add this to the DataFrame
                        df[str(year)] = mean_data

                        # Close the file
                        hdf_file.end()

# Save the DataFrame to a CSV file
df.to_csv('merged_sst_m_means.csv', index=False)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


HDF4Error: ignored

### Calculating SeaWIFS/AVHRR Mean

In [9]:
# Waiting to resolve merge issue

import os
import tarfile
import gzip
import tempfile
!pip install pyhdf
from pyhdf.SD import *
import pandas as pd
import numpy as np

# Create a DataFrame to store the results
df = pd.DataFrame()

# Loop through each tar file
for year in range(1997, 2002):
    # Open the tar file
    tar_filename = f'data/sst.a.{year}.tar'
    with tarfile.open(tar_filename, "r:") as tar:
        # Loop through each member of the tar file
        for member in tar.getmembers():
            # If it's a .hdf.gz file
            if member.name.endswith('.hdf.gz'):
                # Open the .hdf.gz file
                f = tar.extractfile(member)
                with gzip.open(f, 'rb') as gz:
                    # Decompress the .hdf.gz file to a temporary file
                    with tempfile.NamedTemporaryFile() as tmp:
                        tmp.write(gz.read())
                        tmp.seek(0)  # Go back to the start of the file

                        # Open the temporary .hdf file
                        hdf_file = SD(tmp.name, SDC.READ)

                        # Access the 'sst' dataset
                        data = hdf_file.select('sst')[:]

                        # Take the mean along the time (2nd) dimension and reshape to a 1D array
                        mean_data = np.mean(data, axis=1).reshape(-1)

                        # Add this to the DataFrame
                        df[str(year)] = mean_data

                        # Close the file
                        hdf_file.end()

# Save the DataFrame to a CSV file
df.to_csv('sst_a_means.csv', index=False)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
# Create a DataFrame to store the results
df = pd.DataFrame()

# Loop through each tar file
for year in range(1997, 2002):
    # Open the tar file
    tar_filename = f'data/sst.a.{year}.tar'
    with tarfile.open(tar_filename, "r:") as tar:
        # Loop through each member of the tar file
        for member in tar.getmembers():
            # If it's a .hdf.gz file
            if member.name.endswith('.hdf.gz'):
                # Open the .hdf.gz file
                f = tar.extractfile(member)
                with gzip.open(f, 'rb') as gz:
                    # Decompress the .hdf.gz file to a temporary file
                    with tempfile.NamedTemporaryFile() as tmp:
                        tmp.write(gz.read())
                        tmp.seek(0)  # Go back to the start of the file

                        # Open the temporary .hdf file
                        hdf_file = SD(tmp.name, SDC.READ)

                        # Access the 'sst' dataset
                        data = hdf_file.select('sst')

                        # Retrieve the latitude and longitude attributes
                        latitude_center = data.attributes()['Latitude Center']
                        longitude_center = data.attributes()['Longitude Center']

                        # Take the mean along the time (2nd) dimension
                        mean_data = np.mean(data[:], axis=1).reshape(-1)

                        # Add this to the DataFrame
                        df_temp = pd.DataFrame({
                            'year': [year] * len(mean_data),
                            'latitude': [latitude_center] * len(mean_data),
                            'longitude': [longitude_center] * len(mean_data),
                            'sst': mean_data
                        })
                        df = pd.concat([df, df_temp])

                        # Close the file
                        hdf_file.end()

# Save the DataFrame to a CSV file
df.to_csv('sst_a_means_new.csv', index=False)