# Script 3 - Clusters on Satellite Data

In [None]:
#basic packages
import xarray as xr
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import Rectangle
import matplotlib as mpl
import gsw
import seaborn as sns
import glob

#clustering packages
import itertools
from scipy import linalg, interpolate
from scipy.interpolate import griddata
from scipy.spatial import ConvexHull
from sklearn import mixture
from sklearn.neighbors import NearestCentroid
import statsmodels.api as sm

#map packages
from shapely.geometry import mapping
from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter
import cartopy.feature as cfeature
import cartopy.crs as ccrs

#random seed constant intended for reproducibility
#NOTE(review): SEED is only assigned here; no cell visible in this notebook passes it to a random generator -- confirm it is still needed
SEED = 7

In [None]:
#open cali mask
#shapefile geometry used at the end of this cell to clip the gridded data
cali = gpd.read_file('../data/California_Mask_Erased/California_Mask_Erased.shp')

#read in temperature data
mur = xr.open_dataset('../data/MUR-JPL-L4-GLOB-v4.1_201507-202309_Monthly.nc')
mur.close()
#rename coordinates to the latitude/longitude names used for the SMAP data below
mur = mur.rename({'lat':'latitude', 'lon':'longitude'})

#replace mur values
#values above 100 are shifted by -273.15 (presumably Kelvin -> degrees Celsius); all other values are kept as they are
mur['analysed_sst'] = xr.where(mur['analysed_sst'] > 100, mur['analysed_sst'] - 273.15, mur['analysed_sst'])

#read in salinity data
smap = xr.open_dataset('../data/SMAP_L3_SSS_201507-202309_Monthly_V5.0.nc')
smap.close()

#filter out salinity
#keep only cells where smap_sss is at least 30; the mask is applied to the whole dataset
smap = smap.where(smap.smap_sss >= 30, drop = True)

#resample to monthly
#average every time step sharing a calendar date; the grouped result is indexed by 'date' instead of 'time'
#(the trailing comment keeps the earlier resample-based alternative)
mur = mur.groupby(mur.time.dt.date).mean()#resample(time = '1MS').mean()
smap = smap.groupby(smap.time.dt.date).mean()#resample(time = '1MS').mean()

#reset SMAP date
#round trip through a dataframe to rewrite every date as the first day of its month (YYYY-MM-01)
smap_df = smap.to_dataframe().reset_index()
smap_df['date'] = pd.to_datetime(smap_df['date'].astype(str).str[:4] + '-' + smap_df['date'].astype(str).str[5:7] + '-01')
smap = smap_df.set_index(['date', 'latitude', 'longitude']).to_xarray()

#reset MUR date
#convert the date coordinate to pandas datetimes
#NOTE(review): unlike SMAP, the MUR dates are not snapped to the first of the month; interp_like below presumably
#interpolates along 'date' as well as latitude/longitude -- confirm the two date axes line up
mur['date'] = pd.to_datetime(mur['date'])

#coerce mur data onto the smap grid (coordinates are taken from smap.smap_sss)
mur = mur.interp_like(smap.smap_sss)#, kwargs={"fill_value": np.nan})

#join data together
#merge both datasets, then round trip through a dataframe
comb = xr.merge([smap, mur]).to_dataframe().to_xarray()

#clip data to shapefile
#NOTE(review): the .rio accessor comes from rioxarray, which is not imported explicitly in this notebook's import cell --
#confirm the accessor is registered before this cell runs on a fresh kernel
comb.rio.set_spatial_dims(x_dim = "longitude", y_dim = "latitude", inplace = True)
comb.rio.write_crs("epsg:4326", inplace = True)

#clip the data
#keep only grid cells inside the mask geometries
comb = comb.rio.clip(cali.geometry.apply(mapping), cali.crs, drop = True)

#rename date
#later cells group on comb.time, so restore the 'time' name
comb = comb.rename({'date':'time'})
comb

In [None]:
###Add density calculations
#flatten to a dataframe, compute density with the gsw package, then rebuild the xarray dataset
#NOTE(review): gsw.sigma0 is documented for Absolute Salinity / Conservative Temperature; the satellite
#SSS and SST columns are passed to it directly here -- confirm this approximation is intended
comb_df = comb.to_dataframe()
sss_values = comb_df['smap_sss']
sst_values = comb_df['analysed_sst']
comb_df['DENSITY_MEAN'] = gsw.sigma0(sss_values, sst_values)
comb = comb_df.to_xarray()

#extract the minimum and maximum temperature/salinity values
mint, maxt = sst_values.min(), sst_values.max()
mins, maxs = sss_values.min(), sss_values.max()

#evenly spaced axes spanning the observed range plus padding (1 for temperature, 0.25 for salinity)
N_GRID = 156
tempL = np.linspace(mint - 1, maxt + 1, N_GRID)
salL = np.linspace(mins - 0.25, maxs + 0.25, N_GRID)

#build a meshgrid and evaluate density at every (salinity, temperature) node
Tg, Sg = np.meshgrid(tempL, salL)
sigma_theta = gsw.sigma0(Sg, Tg)
cnt = np.linspace(sigma_theta.min(), sigma_theta.max(), N_GRID)

comb

In [None]:
###Open centroids from in-situ data (script 1)
#the first CSV column is used as the index
CENTROID_CSV = '../data/saildrone_GMMcluster_centroids.csv'
centroids = pd.read_csv(CENTROID_CSV, index_col = [0])
centroids

## 2015-2023 July thru September Mean

In [None]:
#calculate the climatology for the three months
#step 1: mean per calendar month; step 2: mean across those monthly means
monthly_mean = comb.groupby(comb.time.dt.month).mean()
overall_mean = monthly_mean.mean('month')

#flatten to one row per grid cell and drop rows with any missing value;
#the second reset_index keeps the pre-dropna row number as an 'index' column
comb_clim = (
    overall_mean
    .to_dataframe()
    .reset_index()
    .dropna()
    .reset_index()
)

###Add density calculations
#density is recomputed with the gsw package from the averaged salinity and temperature
comb_clim['DENSITY_MEAN'] = gsw.sigma0(comb_clim['smap_sss'], comb_clim['analysed_sst'])

#the T-S density grid (Tg, Sg, sigma_theta) built alongside comb_df is not rebuilt here;
#a fixed-range alternative (temperature 9-24, salinity 30.5-35) was previously sketched in this cell

comb_clim

In [None]:
###Recalculate distances
#normalize data 
def NormalizeData(data, col, ref=None):
    """Min-max scale ``data[col]`` using the value range of a reference table.

    Parameters
    ----------
    data : DataFrame or mapping
        Object indexed by ``col`` that holds the values to scale.
    col : str
        Column to scale; the same key is used to look up the reference range.
    ref : DataFrame or mapping, optional
        Table whose ``col`` minimum and maximum define the 0-1 range.
        When omitted, the notebook-level ``comb_clim`` is used, which is the
        range the two-argument form has always relied on.

    Returns
    -------
    ``(data[col] - min) / (max - min)``. Values outside the reference range
    map outside [0, 1].
    """
    #fall back to the global climatology table so existing two-argument calls are unchanged
    reference = comb_clim[col] if ref is None else ref[col]
    lo = np.min(reference)
    hi = np.max(reference)
    return (data[col] - lo) / (hi - lo)

#scale satellite salinity/temperature to 0-1 over the climatology range
comb_clim['smap_sss_NORM'] = NormalizeData(comb_clim, 'smap_sss')
comb_clim['analysed_sst_NORM'] = NormalizeData(comb_clim, 'analysed_sst')

#scale the in-situ centroids with the SAME satellite min/max so both share one normalised space
for sat_col, cen_col in [('smap_sss', 'SAL_CTD_MEAN'), ('analysed_sst', 'TEMP_CTD_MEAN')]:
    sat_min = np.min(comb_clim[sat_col])
    sat_max = np.max(comb_clim[sat_col])
    centroids[cen_col + '_NORM'] = (centroids[cen_col] - sat_min) / (sat_max - sat_min)

#one label per centroid row, numbered by row position (Cluster_0, Cluster_1, ...)
cluster_names = ['Cluster_' + str(k) for k in range(len(centroids))]

#Euclidean distance from every satellite point to every centroid in normalised
#(salinity, temperature) space: rows are satellite points, columns are centroids
sat_pts = comb_clim[['smap_sss_NORM', 'analysed_sst_NORM']].to_numpy(dtype = float)
cen_pts = centroids[['SAL_CTD_MEAN_NORM', 'TEMP_CTD_MEAN_NORM']].to_numpy(dtype = float)
dist_matrix = np.sqrt(((sat_pts[:, None, :] - cen_pts[None, :, :]) ** 2).sum(axis = 2))

#points whose distances are all NaN cannot be labelled and are left out
has_dist = ~np.isnan(dist_matrix).all(axis = 1)
dist_valid = dist_matrix[has_dist]

#select the smallest one from each point (ie the nearest centroid);
#NaN distances are ignored and ties go to the lowest-numbered cluster
nearest = np.nanargmin(dist_valid, axis = 1)
distances = pd.DataFrame(
    {'GMM_Labels_new': np.array(cluster_names, dtype = object)[nearest],
     'distance': dist_valid[np.arange(len(nearest)), nearest]},
    index = comb_clim.index[has_dist]).rename_axis('index')

#join with the satellite data
sat_reclass = comb_clim.join(distances).dropna(subset = ['GMM_Labels_new'])

#add in the missing cluster(s): the plotting cell pairs a fixed colour list with the sorted unique
#labels, so every cluster needs at least one row; placeholders carry only the label (all else NaN).
#Previously 'Cluster_2' was always appended; now only labels that are really absent are added.
present = set(sat_reclass['GMM_Labels_new'])
missing = [name for name in cluster_names if name not in present]
if missing:
    sat_reclass = pd.concat([sat_reclass, pd.DataFrame({'GMM_Labels_new': missing})])
sat_reclass

In [None]:
###TS plot with new clusters
#create a figure
fig, ax = plt.subplots(figsize = (5, 3))

#density contours in the background, labelled at hand-picked (salinity, temperature) positions
positions = [(31.25, 22), (33, 23.7), (34, 24), (35, 23.5), (35.5, 18), (35.5, 12)]
cs = ax.contour(Sg, Tg, sigma_theta, colors = 'darkgrey', zorder = 1, alpha = 0.8)
fig.draw_without_rendering()
cl = ax.clabel(cs, fontsize = 10, inline = True, fmt = '%.0f', manual = positions)

#attach one colour per cluster label (labels in sorted order)
cluster_palette = ["navy", "turquoise", "cornflowerblue", "darkorange", "purple", "slategrey"]
cluster_labels = sat_reclass['GMM_Labels_new'].sort_values().unique()
colors = pd.DataFrame({'GMM_Labels_new': cluster_labels, 'color_new': cluster_palette})
sat_reclass_col = sat_reclass.merge(colors, on = 'GMM_Labels_new')

#satellite points coloured by their assigned cluster
ax.scatter(x = sat_reclass_col['smap_sss'], y = sat_reclass_col['analysed_sst'],
           c = sat_reclass_col['color_new'], s = 10, alpha = 0.8)

#in-situ centroids drawn as yellow stars on top
ax.scatter(x = centroids['SAL_CTD_MEAN'], y = centroids['TEMP_CTD_MEAN'],
           c = 'yellow', s = 200, edgecolors = 'black', marker = '*', zorder = 2)

#give every satellite point the coordinates of its assigned centroid
centroid_cols = ['GMM_Labels', 'TEMP_CTD_MEAN', 'SAL_CTD_MEAN']
centroids_str = centroids.reset_index()[centroid_cols].rename(
    columns = {'GMM_Labels':'GMM_Labels_new', 'TEMP_CTD_MEAN':'TEMP_centroid', 'SAL_CTD_MEAN':'SAL_centroid'})
centroids_str['GMM_Labels_new'] = centroids_str['GMM_Labels_new'].map(lambda lab: f"Cluster_{lab}")
sat_reclass_col = sat_reclass_col.merge(centroids_str)

#faint connector from each point to its centroid
connector_cols = ['smap_sss', 'analysed_sst', 'SAL_centroid', 'TEMP_centroid', 'color_new']
for sss, sst, sal_c, temp_c, shade in zip(*(sat_reclass_col[name] for name in connector_cols)):
    ax.plot([sss, sal_c], [sst, temp_c], c = shade, alpha = 0.15, zorder = 1)

#axis labels, limits and export
ax.set_ylabel('MUR SST [°C]')
ax.set_xlabel('SMAP SSS [PSU]')
ax.set_ylim(10, 24)
ax.set_xlim(30.5, 36.1)
fig.subplots_adjust(hspace = 0.35, bottom = 0.02)
fig.savefig('../figures/Figure5_satellite_TS_with_GMMclusters_reclassified_Climatology_2015-2023.png', bbox_inches = 'tight')
plt.show()


In [None]:
###Map with cluster colors
#latitude/longitude limits of the clipped grid (longitude is stored as [max, min])
lat_vals, lon_vals = comb['latitude'], comb['longitude']
latr = [min(lat_vals), max(lat_vals)]
lonr = [max(lon_vals), min(lon_vals)]

#pad the limits by a margin on every side
margin = 0.5
region = np.array([
    [latr[0] - margin, latr[1] + margin],
    [lonr[0] + margin, lonr[1] - margin],
])
(south, north), (east, west) = region

#state/province outlines
states_provinces = cfeature.NaturalEarthFeature(
    category = 'cultural',
    name = 'admin_1_states_provinces_lines',
    scale = '50m',
    facecolor = 'none')

#figure with a PlateCarree map axis and background features
fig = plt.figure(figsize = (8, 5), dpi = 72)
ax = fig.add_subplot(projection = ccrs.PlateCarree())
ax.coastlines(resolution = '10m', linewidth = 1, color = 'black')
ax.add_feature(cfeature.LAND, color = 'grey', alpha = 0.3)
ax.add_feature(states_provinces, linewidth = 0.5)
ax.add_feature(cfeature.BORDERS)
ax.set_extent([east, west, south, north], crs = ccrs.PlateCarree())

#ticks every 4 degrees of longitude and 5 degrees of latitude, rounded to whole degrees
lon_ticks = np.round(np.arange(west, east + 1, 4)[::-1], 0)
lat_ticks = np.round(np.arange(south, north + 1, 5), 0)
ax.set_xticks(lon_ticks, crs = ccrs.PlateCarree())
ax.set_yticks(lat_ticks, crs = ccrs.PlateCarree())
ax.xaxis.set_major_formatter(LongitudeFormatter(zero_direction_label = True))
ax.yaxis.set_major_formatter(LatitudeFormatter())
ax.gridlines(linestyle = '--', linewidth = 0.5)

#plot the satellite grid cells coloured by assigned cluster
#(cmap lists the same palette but is not passed to the scatter call, which uses the per-row colour column)
cmap = mpl.colors.ListedColormap(["navy", "turquoise", "cornflowerblue", "darkorange", "purple", "slategrey"]) #west coast
ax.scatter(x = sat_reclass_col['longitude'], y = sat_reclass_col['latitude'],
           c = sat_reclass_col['color_new'], alpha = 0.9, s = 10)

fig.savefig('../figures/Figure5_satellite_map_with_GMMclusters_reclassified_Climatology_2015-2023.png', bbox_inches = 'tight', dpi = 150)
plt.show()

## 5-Day Periods

In [None]:
#use custom function
#Saildrone_clustering is presumably a project-local module -- verify it is importable from the notebook's working directory
#NOTE(review): this import sits in a late cell rather than in the import cell at the top of the notebook
import Saildrone_clustering as scluster

#27-31 July 2015: the five-day window at the end of July
#the two arguments are the start and end date strings; what saildrone_classify does with them is not visible in this notebook
scluster.saildrone_classify('2015-07-27', '2015-07-31')

In [None]:
#use custom function
#NOTE(review): the same import already appears in the previous cell; repeating it lets this cell run on its own
import Saildrone_clustering as scluster

#27-31 July 2021: the same five-day window as the 2015 call, six years later
scluster.saildrone_classify('2021-07-27', '2021-07-31') 