# Script 2 - Clusters on Collocated Saildrone Points

In [None]:
#basic packages
import xarray as xr
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import Rectangle
import matplotlib as mpl
import gsw
import seaborn as sns
import glob

#clustering packages
import itertools
from scipy import linalg, interpolate
from scipy.interpolate import griddata
from scipy.spatial import ConvexHull
from sklearn import mixture
from sklearn.neighbors import NearestCentroid
import statsmodels.api as sm

#map packages
from shapely.geometry import mapping
from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter
import cartopy.feature as cfeature
import cartopy.crs as ccrs

#set random seed for reproducibility
#NOTE(review): SEED is only defined here; nothing in the visible part of this
#script passes it to a random-number generator - confirm it is still needed
SEED = 7

In [None]:
###Open collocated data
#read the file inside a context manager: the values are copied into a dataframe
#while the file is open and the handle is released as soon as the block exits
#(previously the dataset was closed first and only read afterwards)
with xr.open_dataset('../data/saildrone_westcoast_collocated_combined.nc') as collocated_file:
    saildrone_raw_df = collocated_file.to_dataframe()

#resample per day: daily mean of every numeric variable within each relativeID group
saildrone_col = (
    saildrone_raw_df
    .groupby('relativeID')
    .resample('1D')
    .mean(numeric_only = True)
    .drop('relativeID', axis = 1)
    .reset_index('relativeID')
    .to_xarray()
)

#extract out the year/time: keep 1 July - 30 September of 2018 and of 2019
in_2018 = (saildrone_col.time >= pd.to_datetime('2018-07-01')) & (saildrone_col.time <= pd.to_datetime('2018-09-30'))
in_2019 = (saildrone_col.time >= pd.to_datetime('2019-07-01')) & (saildrone_col.time <= pd.to_datetime('2019-09-30'))
saildrone_col = saildrone_col.where(in_2018 | in_2019, drop = True)

#filter out salinity: keep only records whose SAL_CTD_MEAN is at least 30
saildrone_col = saildrone_col.where(saildrone_col.SAL_CTD_MEAN >= 30, drop = True)

#calculate collocation error (CTD variable minus the matching satellite variable)
saildrone_col['sal_diff'] = saildrone_col['SAL_CTD_MEAN'] - saildrone_col['sat_smap_sss']
saildrone_col['temp_diff'] = saildrone_col['TEMP_CTD_MEAN'] - saildrone_col['analysed_sst']
saildrone_col

In [None]:
###Add density calculations
#round-trip through a dataframe so the gsw sigma0 values can be attached as a new column
saildrone_df = saildrone_col.to_dataframe()
satellite_sss = saildrone_col.sat_smap_sss.values
satellite_sst = saildrone_col.analysed_sst.values
saildrone_df['DENSITY_MEAN'] = gsw.sigma0(satellite_sss, satellite_sst)
saildrone_col = saildrone_df.to_xarray()

#extract out the minimum and maximum temp/salinity values
mint, maxt = np.min(saildrone_df['analysed_sst']), np.max(saildrone_df['analysed_sst'])
mins, maxs = np.min(saildrone_df['sat_smap_sss']), np.max(saildrone_df['sat_smap_sss'])

#evenly spaced temperature/salinity axes over fixed ranges
#(50 samples each, which is the np.linspace default used before)
tempL = np.linspace(9, 24, num = 50)
salL = np.linspace(30.5, 35, num = 50)

#create a meshgrid and evaluate sigma0 on it (used for the density contours)
Tg, Sg = np.meshgrid(tempL, salL)
sigma_theta = gsw.sigma0(Sg, Tg)
cnt = np.linspace(sigma_theta.min(), sigma_theta.max(), num = 156)

saildrone_col

In [None]:
###Open centroids from in-situ data (script 1)
#the first csv column is used as the index of the centroid table
centroid_csv = '../data/saildrone_GMMcluster_centroids.csv'
centroids = pd.read_csv(centroid_csv, index_col = [0])
centroids

### Plot of Biases

In [None]:
###Joint plots of collocation error
#add categorical distance to land feature
saildrone_df = saildrone_col.to_dataframe().reset_index()

#(label, colour, figure letter) of every distance class, nearest to land first.
#Spelling the mapping out keeps colour and figure letter tied to the class itself;
#zipping against .unique() made them depend on which class happened to appear first.
#The colours are presumably meant to match the hue colours of the scatter plot below.
dist_land_classes = [('0-50 km', 'tab:blue', 'A'),
                     ('51-100 km', 'tab:orange', 'B'),
                     ('>101 km', 'tab:green', 'C')]
dist_land_order = [label for label, _, _ in dist_land_classes]

#vectorised binning: the first matching condition wins; everything that is not
#<= 100 (including missing distances) falls into the last class, as the row loop did
dist_land = np.select([saildrone_df['dist_land'] <= 50, saildrone_df['dist_land'] <= 100],
                      dist_land_order[:2], default = dist_land_order[2])
saildrone_df['dist_land_cat'] = dist_land

#scatter plot with histograms of differences
with sns.plotting_context("notebook", font_scale=1.5):
    j = sns.jointplot(data=saildrone_df, x="sal_diff", y="temp_diff", hue="dist_land_cat", alpha = 0.5, hue_order = dist_land_order)
    j.set_axis_labels('Difference in Salinity (PSU)', 'Difference in Temperature (°C)')
    #reference lines at zero difference
    plt.axvline(0, alpha = 0.3, c = 'grey', linestyle = 'dashed', zorder = 0)
    plt.axhline(0, alpha = 0.3, c = 'grey', linestyle = 'dashed', zorder = 0)
    j.ax_joint.legend_.set_title('Distance from Land', prop={'size':'14'})
    sns.move_legend(j.ax_joint, "lower left", frameon=True, fontsize = 12)
    plt.savefig('../figures/Figure4D_Diff_Temp_Diff_Sal_Scatter.jpg', bbox_inches='tight', dpi = 300)
    plt.show()

#density plot for each distance
with sns.plotting_context("notebook", font_scale=1.5):
    for dist, color, fig_letter in dist_land_classes:
        #select out the data
        saildrone_df_temp = saildrone_df[saildrone_df['dist_land_cat'] == dist]
        #a class without any point produces no figure (as before, when it was absent from .unique())
        if saildrone_df_temp.empty:
            continue
        #create a custom colormap running from white to the class colour
        custom_cmap = LinearSegmentedColormap.from_list('', ['white', color])
        #plot the data
        j = sns.jointplot(data=saildrone_df_temp, x="sal_diff", y="temp_diff", kind="hex", cmap = custom_cmap, marginal_kws={'color': color})
        #change the axes labels
        j.set_axis_labels('Difference in Salinity (PSU)', 'Difference in Temperature (°C)')
        #add a title for the distance
        j.fig.suptitle(dist, y = 0.8, x = 0.35)
        #add lines at 0
        plt.axvline(0, alpha = 0.3, c = 'grey', linestyle = 'dashed', zorder = 5)
        plt.axhline(0, alpha = 0.3, c = 'grey', linestyle = 'dashed', zorder = 5)
        #same axis limits for every class so the panels can be compared
        plt.xlim(-2, 2.5)
        plt.ylim(-4, 2)
        plt.savefig(f'../figures/Figure4{fig_letter}_Diff_Temp_Diff_Sal_Hex_{dist}.jpg', bbox_inches='tight', dpi = 300)
        plt.show()


## Reclassify Collocated Data

In [None]:
###Recalculate distances
#normalize data 
def NormalizeData(data, col):
    """Min-max scale one column of ``data``.

    Parameters
    ----------
    data : table indexable by column name (e.g. a pandas DataFrame)
    col : name of the column to scale

    Returns
    -------
    ``(data[col] - min) / (max - min)`` where min and max are taken from
    ``data[col]`` itself. The range now comes from the ``data`` argument
    instead of the module-level ``saildrone_df``, so the function no longer
    depends on a global; for calls that pass ``saildrone_df`` the result is
    unchanged.
    """
    values = data[col]
    lowest = np.min(values)
    highest = np.max(values)
    return (values - lowest) / (highest - lowest)

saildrone_df = saildrone_col.to_dataframe().reset_index()
saildrone_df['sat_smap_sss_NORM'] = NormalizeData(saildrone_df, 'sat_smap_sss')
saildrone_df['analysed_sst_NORM'] = NormalizeData(saildrone_df, 'analysed_sst')

#scale the centroids with the same satellite min/max so that points and
#centroids live in one normalized space
for raw_col, centroid_col in [('sat_smap_sss', 'SAL_CTD_MEAN'), ('analysed_sst', 'TEMP_CTD_MEAN')]:
    col_min = np.min(saildrone_df[raw_col])
    col_max = np.max(saildrone_df[raw_col])
    centroids[centroid_col + '_NORM'] = (centroids[centroid_col] - col_min) / (col_max - col_min)

#Euclidean distance from every point to every centroid in one broadcasted step
#(rows = points, columns = centroids). This replaces the per-row loops and works
#for any number of centroid rows; centroid row k is labelled 'Cluster_k' as before.
point_xy = saildrone_df[['sat_smap_sss_NORM', 'analysed_sst_NORM']].to_numpy(dtype = float)
centroid_xy = centroids[['SAL_CTD_MEAN_NORM', 'TEMP_CTD_MEAN_NORM']].to_numpy(dtype = float)
cluster_names = ['Cluster_' + str(k) for k in range(len(centroids))]
dist_wide = pd.DataFrame(
    np.sqrt(((point_xy[:, np.newaxis, :] - centroid_xy[np.newaxis, :, :]) ** 2).sum(axis = 2)),
    index = saildrone_df.index, columns = cluster_names)

#points with no valid distance at all (missing salinity or temperature) get no label
dist_wide = dist_wide.dropna(how = 'all')

#select the smallest one from each point (ie the nearest centroid);
#idxmin returns the first minimum, so ties resolve to the lowest cluster number
distances = pd.DataFrame({'GMM_Labels_new': dist_wide.idxmin(axis = 1),
                          'distance': dist_wide.min(axis = 1)})
distances.index.name = 'index'

#join with the saildrone data (aligned on the row index) and drop unlabelled points
saildrone_reclass = saildrone_df.join(distances)
saildrone_reclass = saildrone_reclass.dropna(subset = 'GMM_Labels_new')
saildrone_reclass

In [None]:
###TS plot with new clusters (in normalized space)
#create a figure
fig, ax = plt.subplots(figsize=(5,3))

#add colors to dataframe: fixed cluster -> colour table. Building the label column
#from the labels present in the data raised an error (or shifted the colours)
#whenever one of the six clusters had no points.
colors = pd.DataFrame({'GMM_Labels_new': ['Cluster_' + str(i) for i in range(6)], 'color_new': ["navy", "turquoise", "cornflowerblue", "darkorange", "purple", "slategrey"]})
saildrone_reclass_col = saildrone_reclass.merge(colors, on = 'GMM_Labels_new')

#plot points colored by cluster
plt.scatter(y = saildrone_reclass_col['analysed_sst_NORM'], x = saildrone_reclass_col['sat_smap_sss_NORM'], c = saildrone_reclass_col['color_new'], s = 10, alpha = 0.8)

#add centroids as yellow stars drawn above the points
plt.scatter(y = centroids['TEMP_CTD_MEAN_NORM'], x = centroids['SAL_CTD_MEAN_NORM'], c = 'yellow', s = 200, edgecolors='black', marker = '*', zorder = 2) ###star centroid

#join the centroids and data: attach the raw centroid coordinates of the assigned
#cluster to every point, using the same 'Cluster_<label>' naming as the points
centroids_str = centroids.reset_index()[['GMM_Labels', 'TEMP_CTD_MEAN', 'SAL_CTD_MEAN']].rename({'GMM_Labels':'GMM_Labels_new', 'TEMP_CTD_MEAN':'TEMP_centroid', 'SAL_CTD_MEAN':'SAL_centroid'}, axis = 1)
centroids_str['GMM_Labels_new'] = centroids_str['GMM_Labels_new'].apply(lambda x: f'Cluster_{x}')
saildrone_reclass_col = saildrone_reclass_col.merge(centroids_str)

#format axes (both axes are min-max scaled, hence the 0-1 limits), title and save
plt.ylabel('MUR SST [°C]')
plt.xlabel('SMAP SSS [PSU]')
plt.xlim(-0.05, 1.05)
plt.ylim(-0.05, 1.05)
plt.subplots_adjust(hspace=0.35, bottom=0.02)
plt.title('Remote sensing collocated (Normalized Space)', fontdict = {'fontsize' : 12})
plt.gca().set_aspect('equal')
plt.savefig('../figures/FigureA1A_saildrone_TS_with_GMMclusters_collocated_reclassified_normalized.png', bbox_inches = 'tight', dpi = 300)
plt.show()


In [None]:
###TS plot with new clusters
#create a figure
fig, ax = plt.subplots(figsize=(5,3))

#add density lines; the figure is laid out once before clabel so that the
#manually chosen label positions are resolved against the final axes
positions = [(31.25, 22), (32.5, 23), (34, 23), (34.75, 20), (34.75, 17), (34.75, 12)]
cs = ax.contour(Sg, Tg, sigma_theta, colors='darkgrey', zorder=1, alpha = 0.8)
fig.draw_without_rendering()
cl=plt.clabel(cs,fontsize=10,inline=True,fmt='%.0f', manual = positions)

#add colors to dataframe: fixed cluster -> colour table. Building the label column
#from the labels present in the data raised an error (or shifted the colours)
#whenever one of the six clusters had no points.
colors = pd.DataFrame({'GMM_Labels_new': ['Cluster_' + str(i) for i in range(6)], 'color_new': ["navy", "turquoise", "cornflowerblue", "darkorange", "purple", "slategrey"]})
saildrone_reclass_col = saildrone_reclass.merge(colors, on = 'GMM_Labels_new')

#plot points colored by cluster
plt.scatter(y = saildrone_reclass_col['analysed_sst'], x = saildrone_reclass_col['sat_smap_sss'], c = saildrone_reclass_col['color_new'], s = 10, alpha = 0.8)

#add centroids as yellow stars drawn above the points
plt.scatter(y = centroids['TEMP_CTD_MEAN'], x = centroids['SAL_CTD_MEAN'], c = 'yellow', s = 200, edgecolors='black', marker = '*', zorder = 2) ###star centroid

#join the centroids and data: attach the raw centroid coordinates of the assigned
#cluster to every point, using the same 'Cluster_<label>' naming as the points
centroids_str = centroids.reset_index()[['GMM_Labels', 'TEMP_CTD_MEAN', 'SAL_CTD_MEAN']].rename({'GMM_Labels':'GMM_Labels_new', 'TEMP_CTD_MEAN':'TEMP_centroid', 'SAL_CTD_MEAN':'SAL_centroid'}, axis = 1)
centroids_str['GMM_Labels_new'] = centroids_str['GMM_Labels_new'].apply(lambda x: f'Cluster_{x}')
saildrone_reclass_col = saildrone_reclass_col.merge(centroids_str)

#format axes, title and save
plt.ylabel('MUR SST [°C]')
plt.xlabel('SMAP SSS [PSU]')
plt.xlim(30.5,35)
plt.ylim(9,24)
plt.xticks([*range(31, 36, 1)])
plt.subplots_adjust(hspace=0.35, bottom=0.02)
plt.title('Remote sensing collocated', fontdict = {'fontsize' : 12})
plt.savefig('../figures/Figure2E_saildrone_TS_with_GMMclusters_collocated_reclassified.png', bbox_inches = 'tight', dpi = 300)
plt.show()


In [None]:
###Map with cluster colors
#define latitude and longitude boundaries
#(.min()/.max() reduce the whole array at once and skip missing values; the builtin
#min/max compared element by element and gave order-dependent results with NaN)
latr = [float(saildrone_col['lat'].min()), float(saildrone_col['lat'].max())]
lonr = [float(saildrone_col['lon'].min()), float(saildrone_col['lon'].max())]

# Select a region of our data, giving it a margin
margin = 0.75
region = np.array([[latr[0]-margin,latr[1]+margin],[lonr[0]-margin,lonr[1]+margin]])

#add state outlines
states_provinces = cfeature.NaturalEarthFeature(
        category='cultural',
        name='admin_1_states_provinces_lines',
        scale='50m',
        facecolor='none')

# Create and set the figure context
fig = plt.figure(figsize=(8,5), dpi = 72)
ax = plt.axes(projection=ccrs.PlateCarree())
ax.coastlines(resolution='10m',linewidth=1,color='black')
ax.add_feature(cfeature.LAND, color='grey', alpha=0.3)
ax.add_feature(states_provinces, linewidth = 0.5)
ax.add_feature(cfeature.BORDERS)
#extent is given as [lon_min, lon_max, lat_min, lat_max]
ax.set_extent([region[1,0],region[1,1],region[0,0],region[0,1]],crs=ccrs.PlateCarree())
gl = ax.gridlines(linestyle = 'dashed', linewidth = 0.5, alpha = 0.8, zorder = 10, draw_labels=True, y_inline = False, x_inline = False)
gl.top_labels = False
gl.right_labels = False

ax.xaxis.set_major_formatter(LongitudeFormatter(zero_direction_label=True))
ax.yaxis.set_major_formatter(LatitudeFormatter())

# Plot track data, colored by the reclassified cluster (color_new column)
#NOTE(review): cmap is not passed to the scatter call below - confirm it is still needed
cmap = (mpl.colors.ListedColormap(["navy", "turquoise", "cornflowerblue", "darkorange", "purple", "slategrey"])) #west coast
plt.scatter(x = saildrone_reclass_col['lon'], y = saildrone_reclass_col['lat'], c = saildrone_reclass_col['color_new'], alpha = 0.9, s = 10)

plt.title('Remote sensing collocated', fontdict = {'fontsize' : 12}, x = 0.5)
plt.savefig('../figures/Figure4B_saildrone_map_with_GMMclusters_collocated_reclassified.png', bbox_inches = 'tight', dpi = 150)
plt.show()

## Correctly Labelled Points Bias

Reclassify both the *in-situ* measurements at the collocated points and the collocated satellite values, using the cluster centroids derived from the raw in-situ data (script 1).

In [None]:
#read in data inside a context manager: the values are copied into a dataframe
#while the file is open and the handle is released as soon as the block exits
#(previously the dataset was closed first and only read afterwards)
with xr.open_dataset('../data/saildrone_westcoast_collocated_combined.nc') as collocated_file:
    saildrone_raw_df = collocated_file.to_dataframe()

#resample per day: daily mean of every numeric variable within each relativeID group
saildrone_col = (
    saildrone_raw_df
    .groupby('relativeID')
    .resample('1D')
    .mean(numeric_only = True)
    .drop('relativeID', axis = 1)
    .reset_index('relativeID')
    .to_xarray()
)

#extract out the year/time: keep 1 July - 30 September of 2018 and of 2019
in_2018 = (saildrone_col.time >= pd.to_datetime('2018-07-01')) & (saildrone_col.time <= pd.to_datetime('2018-09-30'))
in_2019 = (saildrone_col.time >= pd.to_datetime('2019-07-01')) & (saildrone_col.time <= pd.to_datetime('2019-09-30'))
saildrone_col = saildrone_col.where(in_2018 | in_2019, drop = True)

#filter out salinity: keep only records whose SAL_CTD_MEAN is at least 31
#NOTE(review): the first load of this file in this script filters at >= 30 - confirm the difference is intended
saildrone_col = saildrone_col.where(saildrone_col.SAL_CTD_MEAN >= 31, drop = True)

#calculate collocation error (CTD variable minus the matching satellite variable)
saildrone_col['sal_diff'] = saildrone_col['SAL_CTD_MEAN'] - saildrone_col['sat_smap_sss']
saildrone_col['temp_diff'] = saildrone_col['TEMP_CTD_MEAN'] - saildrone_col['analysed_sst']
saildrone_col

In [None]:
###Classify Satellite points
#normalize data 
def NormalizeData(data, col):
    """Min-max scale one column of ``data``.

    Parameters
    ----------
    data : table indexable by column name (e.g. a pandas DataFrame)
    col : name of the column to scale

    Returns
    -------
    ``(data[col] - min) / (max - min)`` where min and max are taken from
    ``data[col]`` itself. The range now comes from the ``data`` argument
    instead of the module-level ``saildrone_df``, so the function no longer
    depends on a global; for calls that pass ``saildrone_df`` the result is
    unchanged.
    """
    values = data[col]
    lowest = np.min(values)
    highest = np.max(values)
    return (values - lowest) / (highest - lowest)

#keep only rows where both the CTD and the satellite temperature/salinity exist,
#so the satellite and the in-situ classification cover the same points
saildrone_df = saildrone_col.to_dataframe().reset_index().dropna(subset = ['TEMP_CTD_MEAN', 'SAL_CTD_MEAN', 'sat_smap_sss', 'analysed_sst'], axis = 0).reset_index(drop = True)
saildrone_df['sat_smap_sss_NORM'] = NormalizeData(saildrone_df, 'sat_smap_sss')
saildrone_df['analysed_sst_NORM'] = NormalizeData(saildrone_df, 'analysed_sst')

#scale the centroids with the same satellite min/max so that points and
#centroids live in one normalized space
for raw_col, centroid_col in [('sat_smap_sss', 'SAL_CTD_MEAN'), ('analysed_sst', 'TEMP_CTD_MEAN')]:
    col_min = np.min(saildrone_df[raw_col])
    col_max = np.max(saildrone_df[raw_col])
    centroids[centroid_col + '_NORM'] = (centroids[centroid_col] - col_min) / (col_max - col_min)

#Euclidean distance from every point to every centroid in one broadcasted step
#(rows = points, columns = centroids). This replaces the per-row loops and works
#for any number of centroid rows; centroid row k is labelled 'Cluster_k' as before.
point_xy = saildrone_df[['sat_smap_sss_NORM', 'analysed_sst_NORM']].to_numpy(dtype = float)
centroid_xy = centroids[['SAL_CTD_MEAN_NORM', 'TEMP_CTD_MEAN_NORM']].to_numpy(dtype = float)
cluster_names = ['Cluster_' + str(k) for k in range(len(centroids))]
dist_wide = pd.DataFrame(
    np.sqrt(((point_xy[:, np.newaxis, :] - centroid_xy[np.newaxis, :, :]) ** 2).sum(axis = 2)),
    index = saildrone_df.index, columns = cluster_names)

#points with no valid distance at all get no label
dist_wide = dist_wide.dropna(how = 'all')

#select the smallest one from each point (ie the nearest centroid);
#idxmin returns the first minimum, so ties resolve to the lowest cluster number
distances = pd.DataFrame({'GMM_Labels_new': dist_wide.idxmin(axis = 1),
                          'distance': dist_wide.min(axis = 1)})
distances.index.name = 'index'

#join with the saildrone data and mark the label as the satellite-based one
saildrone_reclass_sat = saildrone_df.join(distances)
saildrone_reclass_sat = saildrone_reclass_sat.dropna(subset = 'GMM_Labels_new').rename({'GMM_Labels_new':'GMM_Labels_new_sat'}, axis = 1)

#add colors to dataframe
colors = pd.DataFrame({'GMM_Labels_new_sat': ['Cluster_' + str(i) for i in range(6)], 'color': ["navy", "turquoise", "cornflowerblue", "darkorange", "purple", "slategrey"]})
saildrone_reclass_sat = saildrone_reclass_sat.merge(colors)
saildrone_reclass_sat

In [None]:
###Classify collocated points
#normalize data 
def NormalizeData(data, col):
    """Min-max scale one column of ``data``.

    Parameters
    ----------
    data : table indexable by column name (e.g. a pandas DataFrame)
    col : name of the column to scale

    Returns
    -------
    ``(data[col] - min) / (max - min)`` where min and max are taken from
    ``data[col]`` itself. The range now comes from the ``data`` argument
    instead of the module-level ``saildrone_df``, so the function no longer
    depends on a global; for calls that pass ``saildrone_df`` the result is
    unchanged.
    """
    values = data[col]
    lowest = np.min(values)
    highest = np.max(values)
    return (values - lowest) / (highest - lowest)

#keep only rows where both the CTD and the satellite temperature/salinity exist,
#so the satellite and the in-situ classification cover the same points
saildrone_df = saildrone_col.to_dataframe().reset_index().dropna(subset = ['TEMP_CTD_MEAN', 'SAL_CTD_MEAN', 'sat_smap_sss', 'analysed_sst'], axis = 0).reset_index(drop = True)
saildrone_df['SAL_CTD_MEAN_NORM'] = NormalizeData(saildrone_df, 'SAL_CTD_MEAN')
saildrone_df['TEMP_CTD_MEAN_NORM'] = NormalizeData(saildrone_df, 'TEMP_CTD_MEAN')

#scale the centroids with the min/max of the SAME CTD columns used for the points.
#NOTE(review): this previously used the satellite columns (sat_smap_sss / analysed_sst)
#while the points were scaled with the CTD columns, so distances mixed two different
#normalized spaces. The fix changes the resulting labels - confirm against script 1.
for centroid_col in ['SAL_CTD_MEAN', 'TEMP_CTD_MEAN']:
    col_min = np.min(saildrone_df[centroid_col])
    col_max = np.max(saildrone_df[centroid_col])
    centroids[centroid_col + '_NORM'] = (centroids[centroid_col] - col_min) / (col_max - col_min)

#Euclidean distance from every point to every centroid in one broadcasted step
#(rows = points, columns = centroids). This replaces the per-row loops and works
#for any number of centroid rows; centroid row k is labelled 'Cluster_k' as before.
point_xy = saildrone_df[['SAL_CTD_MEAN_NORM', 'TEMP_CTD_MEAN_NORM']].to_numpy(dtype = float)
centroid_xy = centroids[['SAL_CTD_MEAN_NORM', 'TEMP_CTD_MEAN_NORM']].to_numpy(dtype = float)
cluster_names = ['Cluster_' + str(k) for k in range(len(centroids))]
dist_wide = pd.DataFrame(
    np.sqrt(((point_xy[:, np.newaxis, :] - centroid_xy[np.newaxis, :, :]) ** 2).sum(axis = 2)),
    index = saildrone_df.index, columns = cluster_names)

#points with no valid distance at all get no label
dist_wide = dist_wide.dropna(how = 'all')

#select the smallest one from each point (ie the nearest centroid);
#idxmin returns the first minimum, so ties resolve to the lowest cluster number
distances = pd.DataFrame({'GMM_Labels_new': dist_wide.idxmin(axis = 1),
                          'distance': dist_wide.min(axis = 1)})
distances.index.name = 'index'

#join with the saildrone data and mark the label as the CTD-based one
saildrone_reclass_col = saildrone_df.join(distances)
saildrone_reclass_col = saildrone_reclass_col.dropna(subset = 'GMM_Labels_new').rename({'GMM_Labels_new':'GMM_Labels_new_col'}, axis = 1)

#add colors to dataframe
colors = pd.DataFrame({'GMM_Labels_new_col': ['Cluster_' + str(i) for i in range(6)], 'color': ["navy", "turquoise", "cornflowerblue", "darkorange", "purple", "slategrey"]})
saildrone_reclass_col = saildrone_reclass_col.merge(colors)
saildrone_reclass_col

In [None]:
# # join the data of the points classified by the saildrone clusters and colocated clusters
# joined = saildrone_reclass.join(saildrone_reclass_col[['GMM_Labels_new_col', 'time', 'analysed_sst', 'sat_smap_sss']], rsuffix = '_col')
# joined = saildrone_reclass.merge(saildrone_reclass_col[['time', 'lat', 'lon', 'GMM_Labels_new_col']], on = ['time', 'lat', 'lon'], how = 'left')

# # convert to geopandas dataframe
# saildrone_reclass_gpd = gpd.GeoDataFrame(saildrone_reclass, geometry=gpd.points_from_xy(saildrone_reclass.lon, saildrone_reclass.lat), crs="EPSG:4326")
# saildrone_reclass_gpd = saildrone_reclass_gpd.to_crs('3857')
# saildrone_reclass_col_gpd = gpd.GeoDataFrame(saildrone_reclass_col, geometry=gpd.points_from_xy(saildrone_reclass_col.lon, saildrone_reclass_col.lat), crs="EPSG:4326")
# saildrone_reclass_col_gpd = saildrone_reclass_col_gpd.to_crs('3857')

# joined = pd.DataFrame()

# #loop through all the times
# for date in saildrone_reclass.time.unique():
#     saildrone_temp = saildrone_reclass_gpd[saildrone_reclass_gpd['time'] == date]
#     saildrone_col_temp = saildrone_reclass_col_gpd[saildrone_reclass_col_gpd['time'] == date]

#     temp = gpd.sjoin_nearest(saildrone_col_temp[['time', 'geometry', 'GMM_Labels_new_col', 'analysed_sst', 'sat_smap_sss']], saildrone_temp)#, on = ['geometry'], direction='nearest')
#     joined = pd.concat([joined, temp])

#attach the satellite-based label to every CTD-classified point (same time and relativeID);
#points without a satellite match keep a missing label and therefore count as incorrect
sat_labels = saildrone_reclass_sat[['time', 'relativeID', 'GMM_Labels_new_sat']]
joined = saildrone_reclass_col.merge(sat_labels, how = 'left', on = ['time', 'relativeID'])

#flag whether both classifications agree, and pick the plotting colour from the same flag
labels_agree = joined['GMM_Labels_new_sat'] == joined['GMM_Labels_new_col']
joined['correct'] = np.where(labels_agree, 'correct', 'incorrect')
joined['color'] = np.where(labels_agree, 'forestgreen', 'firebrick')
joined

In [None]:
#plot proportion of correct points as one horizontal bar filled to 100 %
fig, ax = plt.subplots(figsize = (8, 1))

sns.histplot(
    data=joined,
    y=0, hue="correct",
    multiple="fill", stat="proportion",
    discrete=True, shrink=.9,
    legend=False,
)

#recolour the two bar segments
#NOTE(review): this relies on the order in which seaborn creates the patches - verify
#that patch 0 really is the 'correct' segment for the data at hand
ax.patches[0].set_facecolor('forestgreen')
ax.patches[1].set_facecolor('firebrick')

#second y axis, only used to carry the right-hand label
ax2 = ax.twinx()

ax.tick_params(left=False)
ax.set_ylabel('Correct', rotation = 359)
ax.set_yticklabels([])  # remove the tick labels
ax.yaxis.set_label_coords(-.06, .5)

ax2.tick_params(right=False)
ax2.set_ylabel('Incorrect', rotation = 359)
ax2.set_yticklabels([])  # remove the tick labels
ax2.yaxis.set_label_coords(1.06, .5)

#top axis showing counts: six ticks at 0, 20, ..., 100 % of the bar, labelled with the
#matching number of points. The tick positions are now fixed explicitly; before, the
#labels were attached to whatever ticks matplotlib picked, and the float-step
#np.arange could yield a different number of labels (or fail for an empty table).
ax3 = ax.twiny()
xticks = [ '%.0f' % elem for elem in np.linspace(0, len(joined['correct']), 6) ]
ax3.set_xlim(0, 1)
ax3.set_xticks(np.linspace(0, 1, 6))
ax3.set_xticklabels(xticks)
ax3.set_xlabel('Count')

plt.show()

In [None]:
###TS plot with clusters
#extract out the minimum and maximum temp/salinity values
mint, maxt = np.min(joined['analysed_sst']), np.max(joined['analysed_sst'])
mins, maxs = np.min(joined['sat_smap_sss']), np.max(joined['sat_smap_sss'])

#evenly spaced temperature/salinity axes over fixed ranges
#(50 samples each, which is the np.linspace default used before)
tempL = np.linspace(9, 24, num = 50)
salL = np.linspace(30.5, 36, num = 50)

#create a meshgrid and evaluate sigma0 on it for the density contours
Tg, Sg = np.meshgrid(tempL, salL)
sigma_theta = gsw.sigma0(Sg, Tg)
cnt = np.linspace(sigma_theta.min(), sigma_theta.max(), num = 156)

#create a figure
fig, ax = plt.subplots(figsize=(5,3))

#add density lines; the figure is laid out once before clabel so that the
#manually chosen label positions are resolved against the final axes
positions = [(31.25, 22), (32.5, 23), (34, 23), (35, 23), (35.5, 20), (35.5, 15), (35.5, 12)]
cs = ax.contour(Sg, Tg, sigma_theta, colors='darkgrey', zorder=1, alpha = 0.8)
fig.draw_without_rendering()
cl = plt.clabel(cs, fontsize=10, inline=True, fmt='%.0f', manual = positions)

#plot points colored by whether the satellite label matches: correct first, then incorrect
for status in ('correct', 'incorrect'):
    status_points = joined.loc[joined['correct'] == status]
    plt.scatter(x = status_points['sat_smap_sss'], y = status_points['analysed_sst'], c = status_points['color'], label = status, s = 10, alpha = 0.8)

#format axes, add the legend and save
plt.ylabel('Temperature [°C]')
plt.xlabel('Salinity [PSU]')
plt.subplots_adjust(hspace=0.35, bottom=0.02)
plt.xlim(30.5,35)
plt.ylim(9,24)
plt.xticks([*range(31, 37, 1)])
plt.legend()
plt.savefig('../figures/Figure3B_saildrone_TS_with_correct_incorrect.png', bbox_inches = 'tight', dpi = 300)
plt.show()


In [None]:
###Map with cluster colors
#define latitude and longitude boundaries
#(.min()/.max() reduce the whole column at once and skip missing values; the builtin
#min/max compared element by element and gave order-dependent results with NaN)
latr = [saildrone_df['lat'].min(), saildrone_df['lat'].max()]
lonr = [saildrone_df['lon'].min(), saildrone_df['lon'].max()]

# Select a region of our data, giving it a margin
margin = 0.75
region = np.array([[latr[0]-margin,latr[1]+margin],[lonr[0]-margin,lonr[1]+margin]])

#add state outlines
states_provinces = cfeature.NaturalEarthFeature(
        category='cultural',
        name='admin_1_states_provinces_lines',
        scale='50m',
        facecolor='none')

# Create and set the figure context
fig = plt.figure(figsize=(8,5), dpi = 72)
ax = plt.axes(projection=ccrs.PlateCarree())
ax.coastlines(resolution='10m',linewidth=1,color='black')
ax.add_feature(cfeature.LAND, color='grey', alpha=0.3)
ax.add_feature(states_provinces, linewidth = 0.5)
ax.add_feature(cfeature.BORDERS)
#extent is given as [lon_min, lon_max, lat_min, lat_max]
ax.set_extent([region[1,0],region[1,1],region[0,0],region[0,1]],crs=ccrs.PlateCarree())
gl = ax.gridlines(linestyle = 'dashed', linewidth = 0.5, alpha = 0.8, zorder = 10, draw_labels=True, y_inline = False, x_inline = False)
gl.top_labels = False
gl.right_labels = False

ax.xaxis.set_major_formatter(LongitudeFormatter(zero_direction_label=True))
ax.yaxis.set_major_formatter(LatitudeFormatter())

#plot the points, correct ones first and incorrect ones on top, one legend entry each
for status in ('correct', 'incorrect'):
    status_points = joined.loc[joined['correct'] == status]
    plt.scatter(x = status_points['lon'], y = status_points['lat'], c = status_points['color'], label = status, alpha = 0.9, s = 10)

plt.legend()
plt.savefig('../figures/Figure3A_saildrone_map_with_GMMclusters_correct_incorrect.png', bbox_inches = 'tight', dpi = 150)
plt.show()