### K Means Plot Making

In [1]:
%matplotlib inline
##so-called "math" related imports
from netCDF4 import Dataset as ncread
import numpy as np
from scipy.io import loadmat
import pandas as pd
import h5py
import math
from random import seed
from random import randint
from random import sample
from scipy.ndimage import gaussian_filter
from sklearn.metrics import brier_score_loss
import xarray as xr

import pickle

#plotting related imports
import matplotlib.pyplot as plt
from matplotlib import rcParams #For changing text properties
import cmocean #A package with beautiful colormaps
from cartopy import crs as ccrs #Useful for plotting maps
import cartopy.util #Requires separate import
from cartopy.util import add_cyclic_point
import cartopy.feature as cf
from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter
import matplotlib.path as mpath
import matplotlib.colors as mcolors

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
#import statements for the clustering-related portions
from sklearn.cluster import KMeans as km

#### KMEANS CLUSTERING

In [3]:
##for composites
infile = open("comp_pos.p",'rb')
comp_pos = pickle.load(infile)
infile.close()
comp_pos = np.array(comp_pos)

infile = open("compFpos.p",'rb')
compFpos = pickle.load(infile)
infile.close()
compFpos = np.array(compFpos)

infile = open("comp_neg.p",'rb')
comp_neg = pickle.load(infile)
infile.close()
comp_neg = np.array(comp_neg)

infile = open("compFneg.p",'rb')
compFneg = pickle.load(infile)
infile.close()
compFneg = np.array(compFneg)

In [4]:
##DEFINITION STATEMENTS
from sklearn.decomposition import PCA
def kmeans_elbow(dataset):
    inertia = []
    clusters = np.arange(1,9,1) #testing number of clusters 
    for n in clusters:
        #provides labels of clusters
        h_km = km(n_clusters = n).fit(dataset)
        #Getting unique labels
        h_labels = np.unique(h_km)
        h_inertia = h_km.inertia_
        inertia.append(h_inertia)

    return inertia, h_km;

def kmeans(dataset, h_km, clusters):
    pca = PCA(clusters)
    pcatrans = pca.fit_transform(dataset)
    #plotting the results:
    h_km = km(n_clusters = clusters, random_state=0)
    label = h_km.fit_predict(dataset)
    #Getting unique labels
    h_labels = np.unique(label)

    #initialize separation of clusters
    C1 = []
    C2 = []
    C3 = []
    
    
    for k in range(len(label)):
        if label[k] == 0:
            C1.append("k")
        if label[k] == 1:
            #print(k)
            C2.append("k")
        if label[k] == 2:
            #print(k)
            C3.append("k")
    
    c1 = np.empty((len(C1),777))
    c2 = np.empty((len(C2),777))
    c3 = np.empty((len(C3),777))
    
    #set the initial index for each empty array to 0 so it can increas with each aded value
    index1 = 0
    index2 = 0
    index3 = 0
    index4 = 0
    index5 = 0
    
    #loop, create clustered arrays, yay
    for k in range(len(label)):
        if label[k] == 0:
            c1[index1,:] = dataset[k,:]
            index1 = index1 + 1
            
        if label[k] == 1:
            c2[index2,:] = dataset[k,:]
            index2 = index2 + 1
            
        if label[k] == 2:
            c3[index3,:] = dataset[k,:]
            index3 = index3 + 1
            
    C1 = c1.reshape(len(c1),21,37)
    C2 = c2.reshape(len(c2),21,37)
    C3 = c3.reshape(len(c3),21,37)
    
    return C1, C2, C3;          

In [5]:
#lat
lats = np.arange(80, 29.5, -2.5)
#lon
lons = np.arange(-100, -9.5, 2.5)

In [6]:
##Start with Correct Negative Clusters

In [7]:
inertia, h_km = kmeans_elbow(comp_pos)

C1, C2, C3 = kmeans(comp_pos, h_km, 3)

perC1 = (len(C1)/len(comp_pos)) *100
perC2 = (len(C2)/len(comp_pos)) *100
perC3 = (len(C3)/len(comp_pos)) *100


percentage_dict = {"C1": perC2, "C2": perC1, "C3": perC3}
gph_dict = {"C1": C2, "C2": C1, "C3": C3}

#######################################
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(30, 10), subplot_kw={'projection': ccrs.PlateCarree()})
fig.suptitle("KMeans Clusters of GPH Anoms at 500hPa for 10% Most Confident & Correct Positive Temp Anom Predictions",
             fontsize=26)
color = 'bwr' 
colorbarMin = -100
colorbarMax = 100
colorspace = 0.05
level = np.arange(colorbarMin,colorbarMax+colorspace,colorspace)

# Define the projections for each panel
sub = [ccrs.PlateCarree(central_longitude=180),ccrs.PlateCarree(central_longitude=180),ccrs.PlateCarree(central_longitude=180)]
sub_titles = ['Cluster 1', 'Cluster 2', 'Cluster 3']
dict_keys = ['C1', 'C2', 'C3']

# Iterate over the axes to create the three panels with contour plots
for ax, proj, title, keys in zip(axes, sub, sub_titles, dict_keys):
    ax.set_title(title, fontsize=24)

    gph = gph_dict[keys]  
    #Add in the coordinate system:
    long = np.arange(-100, -10, 10)
    latg = np.arange(80, 30 ,-5)
    ax.set_xticks(long, crs=ccrs.PlateCarree());
    ax.set_yticks(latg, crs=ccrs.PlateCarree());
    ax.set_xticklabels(long,fontsize=18)
    ax.set_yticklabels(latg,fontsize=18)
    ax.set_ylabel('Latitude ($^o$N)',fontsize=18);
    ax.set_xlabel('Longitude ($^o$W)',fontsize=18);

    #Add in the continents
    #define the coastlines, the color (#000000) and the resolution (110m) 
    feature1 = cf.NaturalEarthFeature(
        name='coastline', category='physical',
        scale='110m',
        edgecolor='#000000', facecolor='none')
    
    ax.add_feature(feature1)
    
    h = ax.contourf(lons, lats, np.nanmean(gph, axis=0), level, transform=ccrs.PlateCarree(), cmap=color,extend='both')
    #plt.contourf(lons, lats, LRP_heatmaps_mean, level, cmap=color,extend='both')
    cbar = plt.colorbar(h, orientation='horizontal', shrink=1,fraction=0.09,pad=0.09,aspect=35)
    cbar.set_label( "GPH Anomaly", fontsize=20) 
     
    cbar.ax.tick_params(labelsize=14) 
    
    # Add percentage label to the subplot
    percentage = np.round(percentage_dict[keys],2) # Retrieve the percentage for the specific projection
    ax.text(0.05, 0.95, f'{percentage}%', transform=ax.transAxes, 
            fontsize=22, verticalalignment='top', horizontalalignment='left', 
            bbox=dict(facecolor='white', alpha=0.6, edgecolor='none', boxstyle='round,pad=0.5'))

fig.tight_layout()
fig.subplots_adjust(top=2)
#plt.savefig("KMeans_CPos.png", bbox_inches='tight')
# Show the plot
plt.show()

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [8]:
inertia, h_km = kmeans_elbow(comp_neg)

C1, C2, C3 = kmeans(comp_neg, h_km,3)

perC1 = (len(C1)/len(comp_neg)) *100
perC2 = (len(C2)/len(comp_neg)) *100
perC3 = (len(C3)/len(comp_neg)) *100


percentage_dict = {"C1": perC1, "C2": perC2, "C3": perC3}
gph_dict = {"C1": C1, "C2": C2, "C3": C3}

#######################################
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(30, 10), subplot_kw={'projection': ccrs.PlateCarree()})
fig.suptitle("KMeans Clusters of GPH Anoms at 500hPa for 10% Most Confident & Correct Negative Temp Anom Predictions",
             fontsize=26)
color = 'bwr' 
colorbarMin = -100
colorbarMax = 100
colorspace = 0.05
level = np.arange(colorbarMin,colorbarMax+colorspace,colorspace)

# Define the projections for each panel
sub = [ccrs.PlateCarree(central_longitude=180),ccrs.PlateCarree(central_longitude=180),ccrs.PlateCarree(central_longitude=180)]
sub_titles = ['Cluster 1', 'Cluster 2', 'Cluster 3']
dict_keys = ['C1', 'C2', 'C3']

# Iterate over the axes to create the three panels with contour plots
for ax, proj, title, keys in zip(axes, sub, sub_titles, dict_keys):
    ax.set_title(title, fontsize=24)

    gph = gph_dict[keys]  
    #Add in the coordinate system:
    long = np.arange(-100, -10, 10)
    latg = np.arange(80, 30 ,-5)
    ax.set_xticks(long, crs=ccrs.PlateCarree());
    ax.set_yticks(latg, crs=ccrs.PlateCarree());
    ax.set_xticklabels(long,fontsize=18)
    ax.set_yticklabels(latg,fontsize=18)
    ax.set_ylabel('Latitude ($^o$N)',fontsize=18);
    ax.set_xlabel('Longitude ($^o$W)',fontsize=18);

    #Add in the continents
    #define the coastlines, the color (#000000) and the resolution (110m) 
    feature1 = cf.NaturalEarthFeature(
        name='coastline', category='physical',
        scale='110m',
        edgecolor='#000000', facecolor='none')
    
    ax.add_feature(feature1)
    
    h = ax.contourf(lons, lats, np.nanmean(gph, axis=0), level, transform=ccrs.PlateCarree(), cmap=color,extend='both')
    #plt.contourf(lons, lats, LRP_heatmaps_mean, level, cmap=color,extend='both')
    cbar = plt.colorbar(h, orientation='horizontal', shrink=1,fraction=0.09,pad=0.09,aspect=35)
    cbar.set_label( "GPH Anomaly", fontsize=20) 
     
    cbar.ax.tick_params(labelsize=14) 
    
    # Add percentage label to the subplot
    percentage = np.round(percentage_dict[keys],2) # Retrieve the percentage for the specific projection
    ax.text(0.05, 0.95, f'{percentage}%', transform=ax.transAxes, 
            fontsize=22, verticalalignment='top', horizontalalignment='left', 
            bbox=dict(facecolor='white', alpha=0.6, edgecolor='none', boxstyle='round,pad=0.5'))

fig.tight_layout()
fig.subplots_adjust(top=2)
plt.savefig("KMeans_CNeg.png", bbox_inches='tight')
# Show the plot
plt.show()

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values