In [1]:
import xarray as xr # for netCDF4 
import pandas as pd

# Open nc file

When you open the file without a group defined, you get the global attributes with no variables. You need to include a group='PRODUCT' to get the data product.

In [2]:
# Local path to the NetCDF file (machine-specific; adjust for your environment).
file_path = r"C:\\Users\\joonw\\Downloads\\t3.nc"

# Alternative input file that can be swapped in instead:
# file_path = r"C:\\Users\\joonw\\Downloads\\ct5km_ssta_v3.1_20230101.nc"

# Open the NetCDF file, reading the 'PRODUCT' group.
ds = xr.open_dataset(file_path, group='PRODUCT')

# Show a preview of the dataset. The dataset is not closed here because
# later cells read from `ds`; call ds.close() once it is no longer needed.
ds.head()


In [16]:
# Flatten 'delta_time' into a table. reset_index() turns the index levels
# (time, scanline, ground_pixel in the printed output) into ordinary columns
# next to the delta_time values.
dd = ds['delta_time'].to_dataframe().reset_index()

# Spot-check a few row windows spread through the table. Each pair is a
# half-open positional range (start inclusive, stop exclusive) for .iloc.
for start, stop in ((1, 5), (100, 105), (200, 205), (20000, 20005)):
    print(dd.iloc[start:stop, :])


        time  scanline  ground_pixel              delta_time
1 2022-10-21       0.0           1.0 2022-10-21 18:12:55.073
2 2022-10-21       0.0           2.0 2022-10-21 18:12:55.073
3 2022-10-21       0.0           3.0 2022-10-21 18:12:55.073
4 2022-10-21       0.0           4.0 2022-10-21 18:12:55.073
          time  scanline  ground_pixel              delta_time
100 2022-10-21       0.0         100.0 2022-10-21 18:12:55.073
101 2022-10-21       0.0         101.0 2022-10-21 18:12:55.073
102 2022-10-21       0.0         102.0 2022-10-21 18:12:55.073
103 2022-10-21       0.0         103.0 2022-10-21 18:12:55.073
104 2022-10-21       0.0         104.0 2022-10-21 18:12:55.073
          time  scanline  ground_pixel              delta_time
200 2022-10-21       0.0         200.0 2022-10-21 18:12:55.073
201 2022-10-21       0.0         201.0 2022-10-21 18:12:55.073
202 2022-10-21       0.0         202.0 2022-10-21 18:12:55.073
203 2022-10-21       0.0         203.0 2022-10-21 18:12:55.073
20

For now, I want to use 'latitude', 'longitude', 'ozone_total_vertical_column' only.

In [17]:
# Variables of interest; this single list drives every step below.
selected_variables = ['latitude', 'longitude', 'ozone_total_vertical_column']

# Restrict the dataset to those variables and flatten it into a table.
# reset_index() moves the index levels into regular columns, which is
# presumably why the frame has 6 columns (3 index levels + 3 variables)
# -- TODO confirm against df.columns.
selected_ds = ds[selected_variables]
df = selected_ds.to_dataframe().reset_index()

# Keep only the three variables of interest.
df2 = df[selected_variables]
print(df2.shape)

# Drop every row where any of the three variables is NaN.
df2_cleaned = df2.dropna(subset=selected_variables)
print(df2_cleaned.shape)

(1877400, 3)
(1607313, 3)


Now I want to save the dataframe as a CSV file so that I can work in R.

In [75]:
import os

# Output folder and file name for the cleaned table (machine-specific path).
directory_path = r"C:\\Users\\joonw\Downloads\\TCO_analysis"
csv_file_path = os.path.join(directory_path, 'df2_cleaned.csv')

# to_csv does not create missing folders, so make sure the target exists
# first; exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(directory_path, exist_ok=True)

# Write the cleaned DataFrame without the row index column.
df2_cleaned.to_csv(csv_file_path, index=False)

# How to read HDF file in Python

In [None]:
import numpy as np
import os
import pyhdf    # hd4
from pyhdf.SD import SD
import pandas as pd
import xarray as xr # for netCDF4 

# Sea surface temperature anomaly 

## Data types

National Environmental Satellite, Data and Information Service(NESDIS)

1) 2000 to 2020: Twice-weekly global 50km resolution HDF data.

Each dataset corresponds to one day (a single observation per day) and has a shape of (238320, 4), with columns longitude, latitude, SST_anomaly and surface_flag.

https://coralreefwatch.noaa.gov/product/50km/index.php

2) 1985 to present: Daily regional 5km resolution NetCDF4 data.

Each dataset corresponds to one day (a single observation per day) and has a shape of (641602, 6),(341902) with 'time', 'longitude', 'latitude', 'sea_surface_temperature_anomaly', 'mask' and 'crs'.

https://coralreefwatch.noaa.gov/product/5km/index.php

3) 2000 to 2020: 227 stations' time series data recorded twice-weekly. 

Each dataset contains observations from 2000 to 2020 at a fixed longitude and latitude.


https://coralreefwatch.noaa.gov/product/50km/list_vs_group_latlon_201103.php


#### Extract data from the website 

https://coastwatch.pfeg.noaa.gov/erddap/griddap/NOAA_DHW.html

https://coastwatch.pfeg.noaa.gov/erddap/griddap/NOAA_DHW.csv?CRW_SSTANOMALY%5B(2024-02-1T12:00:00Z):1:(2024-02-1T12:00:00Z)%5D%5B(-20):1:(20)%5D%5B(-20):1:(20)%5D


Terminology:

BAA: bleaching alert area    
mask: pixel characteristics flag    
DHW: degree heating week, Celsius weeks   
HOTSPOT: coral bleaching hotspot, Celsius   
SEAICE: sea ice fraction, 1   

## Check if your HDF file is hdf4 or hdf5

hdf4 format:  b'\x0e\x03\x13\x01'   
hdf5 format:  b'\x89HDF'   


In [None]:
hdf4_file_path = r"C:\\Users\\joonw\\Downloads\\td1.hdf"

# Read the first 4 bytes (the file signature) to tell HDF4 from HDF5.
# The context manager closes the handle even if the read fails.
with open(hdf4_file_path, 'rb') as h:
    bts = h.read(4)
print(bts)

In [None]:
hdf_file = SD(hdf4_file_path)
try:
    # Print the name of every dataset reported by the file.
    datasets = hdf_file.datasets()
    print("Available Datasets:")
    for dataset_name in datasets:
        print(dataset_name)
finally:
    # Always release the file, even if listing the datasets raised.
    hdf_file.end()

## Reshape the data

In [None]:
hdf_file = SD(hdf4_file_path)
try:
    longitude_data = hdf_file.select("longitude").get()
    latitude_data = hdf_file.select("latitude").get()
    # NOTE: despite its name, `wind_data` holds the "CRW_SSTANOMALY" dataset.
    wind_data = hdf_file.select("CRW_SSTANOMALY").get()
finally:
    # Always release the file, even if one of the datasets is missing.
    hdf_file.end()

# Expand the coordinate vectors so there is one (longitude, latitude) pair per
# data value: the longitude vector is tiled once per latitude, and each
# latitude is repeated once per longitude.
# Assumes CRW_SSTANOMALY flattens in (latitude, longitude) row-major order
# -- TODO confirm against wind_data.shape.
longitude_data2 = np.tile(longitude_data, (1, len(latitude_data)))
latitude_data2 = np.repeat(latitude_data, len(longitude_data))
# Open question: why is longitude ordered - to + while latitude runs + to -?

# Build one row per grid point.
df = pd.DataFrame({
    'Longitude': longitude_data2.flatten(),
    'Latitude': latitude_data2.flatten(),
    'SST_Anomaly': wind_data.flatten()
})

print(df.shape)
df.describe()