In [1]:
import math
from math import sqrt, log, floor
from sklearn.metrics import mean_squared_error
from statistics import mean
from fastdtw import fastdtw
from scipy import stats
from scipy.spatial.distance import pdist
 
#Euclidean
def euclidean(x, y):
    r=np.linalg.norm(x-y)
    if math.isnan(r):
        r=1
    #print(r)
    return r

#Fast Dynamic time warping
def fast_DTW(x, y):
    r, _ = fastdtw(x, y, dist=euclidean)
    if math.isnan(r):
        r=1
    #print(r)
    return r

#Spearman
def scorr(x, y):
    r = stats.spearmanr(x, y)[0]
    if math.isnan(r):
        r=0
    #print(r)
    return 1 - r

#RMSE
def rmse(x, y):
    r=sqrt(mean_squared_error(x,y))
    if math.isnan(r):
        r=1
    #print(r)
    return r

def lcs(a, b):  
    lengths = [[0 for j in range(len(b)+1)] for i in range(len(a)+1)]
    # row 0 and column 0 are initialized to 0 already
    for i, x in enumerate(a):
        for j, y in enumerate(b):
            if x == y:
                lengths[i+1][j+1] = lengths[i][j] + 1
            else:
                lengths[i+1][j+1] = max(lengths[i+1][j], lengths[i][j+1])
    x, y = len(a), len(b)
    result = lengths[x][y]
    return result

def discretise(x):
    return int(x * 10)

def multidim_lcs(a, b):
    a = a.applymap(discretise)
    b = b.applymap(discretise)
    rows, dims = a.shape
    lcss = [lcs(a[i+2], b[i+2]) for i in range(dims)]
    return 1 - sum(lcss) / (rows * dims)

#Correlation
def corr(x, y):
    r=np.dot(x-mean(x),y-mean(y))/((np.linalg.norm(x-mean(x)))*(np.linalg.norm(y-mean(y))))
    if math.isnan(r):
        r=0
    #print(r)
    return 1 - r

In [2]:
from sklearn.preprocessing import LabelEncoder
#from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
import scipy.cluster.hierarchy as hac
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_score, calinski_harabasz_score

DIAMETER_METHODS = ['mean_cluster', 'farthest']
CLUSTER_DISTANCE_METHODS = ['nearest', 'farthest']

def inter_cluster_distances(labels, distances, method='nearest'):
    """Calculates the distances between the two nearest points of each cluster.
    :param labels: a list containing cluster labels for each of the n elements
    :param distances: an n x n numpy.array containing the pairwise distances between elements
    :param method: `nearest` for the distances between the two nearest points in each cluster, or `farthest`
    """
    if method not in CLUSTER_DISTANCE_METHODS:
        raise ValueError(
            'method must be one of {}'.format(CLUSTER_DISTANCE_METHODS))

    if method == 'nearest':
        return __cluster_distances_by_points(labels, distances)
    elif method == 'farthest':
        return __cluster_distances_by_points(labels, distances, farthest=True)


def __cluster_distances_by_points(labels, distances, farthest=False):
    n_unique_labels = len(np.unique(labels))
    cluster_distances = np.full((n_unique_labels, n_unique_labels),
                                float('inf') if not farthest else 0)

    np.fill_diagonal(cluster_distances, 0)

    for i in np.arange(0, len(labels) - 1):
        for ii in np.arange(i, len(labels)):
            if labels[i] != labels[ii] and (
                (not farthest and
                 distances[i, ii] < cluster_distances[labels[i], labels[ii]])
                    or
                (farthest and
                 distances[i, ii] > cluster_distances[labels[i], labels[ii]])):
                cluster_distances[labels[i], labels[ii]] = cluster_distances[
                    labels[ii], labels[i]] = distances[i, ii]
    return cluster_distances


def diameter(labels, distances, method='farthest'):
    """Calculates cluster diameters
    :param labels: a list containing cluster labels for each of the n elements
    :param distances: an n x n numpy.array containing the pairwise distances between elements
    :param method: either `mean_cluster` for the mean distance between all elements in each cluster, or `farthest` for the distance between the two points furthest from each other
    """
    if method not in DIAMETER_METHODS:
        raise ValueError('method must be one of {}'.format(DIAMETER_METHODS))

    n_clusters = len(np.unique(labels))
    diameters = np.zeros(n_clusters)

    if method == 'mean_cluster':
        for i in range(0, len(labels) - 1):
            for ii in range(i + 1, len(labels)):
                if labels[i] == labels[ii]:
                    diameters[labels[i]] += distances[i, ii]

        for i in range(len(diameters)):
            diameters[i] /= sum(labels == i)

    elif method == 'farthest':
        for i in range(0, len(labels) - 1):
            for ii in range(i + 1, len(labels)):
                if labels[i] == labels[ii] and distances[i, ii] > diameters[
                        labels[i]]:
                    diameters[labels[i]] = distances[i, ii]
    return diameters

def dunn(labels, distances, diameter_method='farthest',
         cdist_method='nearest'):
    """
    Dunn index for cluster validation (larger is better).
    
    .. math:: D = \\min_{i = 1 \\ldots n_c; j = i + 1\ldots n_c} \\left\\lbrace \\frac{d \\left( c_i,c_j \\right)}{\\max_{k = 1 \\ldots n_c} \\left(diam \\left(c_k \\right) \\right)} \\right\\rbrace
    
    where :math:`d(c_i,c_j)` represents the distance between
    clusters :math:`c_i` and :math:`c_j`, and :math:`diam(c_k)` is the diameter of cluster :math:`c_k`.
    Inter-cluster distance can be defined in many ways, such as the distance between cluster centroids or between their closest elements. Cluster diameter can be defined as the mean distance between all elements in the cluster, between all elements to the cluster centroid, or as the distance between the two furthest elements.
    The higher the value of the resulting Dunn index, the better the clustering
    result is considered, since higher values indicate that clusters are
    compact (small :math:`diam(c_k)`) and far apart (large :math:`d \\left( c_i,c_j \\right)`).
    :param labels: a list containing cluster labels for each of the n elements
    :param distances: an n x n numpy.array containing the pairwise distances between elements
    :param diameter_method: see :py:function:`diameter` `method` parameter
    :param cdist_method: see :py:function:`diameter` `method` parameter
    
    .. [Kovacs2005] Kovács, F., Legány, C., & Babos, A. (2005). Cluster validity measurement techniques. 6th International Symposium of Hungarian Researchers on Computational Intelligence.
    """

    labels = LabelEncoder().fit(labels).transform(labels)
    
    

    ic_distances = inter_cluster_distances(labels, distances, cdist_method)
    #print("IC",ic_distances)
    if len(ic_distances[ic_distances.nonzero()])==0:
        min_distance = 0
    else:
        min_distance = min(ic_distances[ic_distances.nonzero()])
    max_diameter = max(diameter(labels, distances, diameter_method))
    
    

    return min_distance / max_diameter

# Distancias

In [3]:
import pandas as pd
import numpy as np
features = pd.read_csv('FB2_2009al2013.csv', sep=',')

In [4]:
features = features.iloc[:, 1:]
listadistrito = features.Distrito.values
features = features.drop('Distrito', axis=1)

In [5]:
features.shape[0]

200

In [6]:
lista_excluded = features.columns.tolist()
k=7
n = features.shape[0]

In [7]:
for mm in list(range(10)):
    KM_EUCLIDEAN=[]
    KM_CORRELATION=[]
    KM_SPEARMAN=[]
    KM_DTW=[]
    for i in list(range(len(features.columns))):
        features2 = features.drop(features.columns[i], axis=1)
        n = features2.shape[0]
        #Euclidean
        f_euclidean_dist = np.zeros((n,n))
        for i in range(0,n):
            #print("i",i)
            for j in range(1,n):
                #print("j",j)
                f_euclidean_dist[i,j] = euclidean(features2.iloc[i].values.flatten(), features2.iloc[j].values.flatten())
        #Corr
        f_corr_dist = np.zeros((n,n))
        for i in range(0,n):
            #print("i",i)
            for j in range(0,n):
               # print("j",j)
                f_corr_dist[i,j] = corr(features2.iloc[i].values.flatten(), features2.iloc[j].values.flatten())

        #scorr
        f_scorr_dist = np.zeros((n,n))
        for i in range(0,n):
            #print("i",i)
            for j in range(0,n):
               # print("j",j)
                f_scorr_dist[i,j] = scorr(features2.iloc[i].values.flatten(), features2.iloc[j].values.flatten())
        #DTW
        f_dtw_dist = np.zeros((n,n))
        for i in range(0,n):
            #print("i",i)
            for j in range(0,n):
                # print("j",j)
                f_dtw_dist[i,j] = fast_DTW(features2.iloc[i].values.flatten(), features2.iloc[j].values.flatten())
        print("--------------K-MEANS------------------")
        #Experimentos K-Means
        km_euc = KMeans(n_clusters=k).fit_predict(f_euclidean_dist)
        print("KM + euclidian distance: ")
        sil = silhouette_score(f_euclidean_dist, km_euc)
        print("SC: ", sil)
        KM_EUCLIDEAN.append(sil)

        km_corr = KMeans(n_clusters=k).fit_predict(f_corr_dist)
        print("KM + corr distance: ")
        sil = silhouette_score(f_corr_dist, km_corr)
        print("SC: ", sil)
        KM_CORRELATION.append(sil)

        km_scorr = KMeans(n_clusters=k).fit_predict(f_scorr_dist)
        print("KM + scorr distance: ")
        sil = silhouette_score(f_scorr_dist, km_scorr)
        print("SC: ", sil)
        KM_SPEARMAN.append(sil)

        km_dtw = KMeans(n_clusters=k).fit_predict(f_dtw_dist)
        print("KM + dtw distance: ")
        sil = silhouette_score(f_dtw_dist, km_dtw)
        print("SC: ", sil)
        KM_DTW.append(sil)
        
    print("Máximo de Euclidean ", np.argmax(KM_EUCLIDEAN))
    print("Máximo de Pearson ", np.argmax(KM_CORRELATION))
    print("Máximo de Spearman ", np.argmax(KM_SPEARMAN))
    print("Máximo de DWT ", np.argmax(KM_DTW))  
    print("ELIMINANDO ", lista_excluded[np.argmax(KM_EUCLIDEAN)])
    del lista_excluded[np.argmax(KM_EUCLIDEAN)]
    max_KM_EUCLIDEAN = max(KM_EUCLIDEAN)
    features = features.drop(features.columns[np.argmax(KM_EUCLIDEAN)], axis = 1)

--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8315934530369973
KM + corr distance: 
SC:  0.6623593004545724
KM + scorr distance: 
SC:  0.65336756533146
KM + dtw distance: 
SC:  0.8272416549959626
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.40523387963873275
KM + corr distance: 
SC:  0.5840397304304501
KM + scorr distance: 
SC:  0.659379113271878
KM + dtw distance: 
SC:  0.39753091999404605
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8319197305959701
KM + corr distance: 
SC:  0.6367033712717791
KM + scorr distance: 
SC:  0.6324428581336405
KM + dtw distance: 
SC:  0.8276476016434549
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.831920773757083
KM + corr distance: 
SC:  0.6364885981881592
KM + scorr distance: 
SC:  0.6217880645952418
KM + dtw distance: 
SC:  0.8276397133250185
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8319207960493507
KM + corr di

--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.4393823927401467
KM + corr distance: 
SC:  0.5829804475851992
KM + scorr distance: 
SC:  0.6775862925973795
KM + dtw distance: 
SC:  0.39652083329744975
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8319197305959741
KM + corr distance: 
SC:  0.6327265405903139
KM + scorr distance: 
SC:  0.6367136921730845
KM + dtw distance: 
SC:  0.8276476016434549
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8319207737570801
KM + corr distance: 
SC:  0.6359902608066925
KM + scorr distance: 
SC:  0.6412224754230555
KM + dtw distance: 
SC:  0.8276397133250185
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8319207960493503
KM + corr distance: 
SC:  0.6364627272898528
KM + scorr distance: 
SC:  0.6383303727746482
KM + dtw distance: 
SC:  0.8276155626231142
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8319202415268129
KM + corr

KM + dtw distance: 
SC:  0.8276187008008784
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8319203952300223
KM + corr distance: 
SC:  0.6345662115295417
KM + scorr distance: 
SC:  0.6758983860569839
KM + dtw distance: 
SC:  0.8276864277595373
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8319207959944278
KM + corr distance: 
SC:  0.6349701326324443
KM + scorr distance: 
SC:  0.5993649273649325
KM + dtw distance: 
SC:  0.8276313237931961
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8319197989670214
KM + corr distance: 
SC:  0.6361846462026273
KM + scorr distance: 
SC:  0.6896837367964429
KM + dtw distance: 
SC:  0.8276317789839073
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.831920796042152
KM + corr distance: 
SC:  0.6349691789899108
KM + scorr distance: 
SC:  0.6029083906369553
KM + dtw distance: 
SC:  0.8276159701183095
--------------K-MEANS------------------
KM + euclidian di

--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8319197303765818
KM + corr distance: 
SC:  0.6357982486604159
KM + scorr distance: 
SC:  0.5569387507694511
KM + dtw distance: 
SC:  0.8276518658177303
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8319207735380755
KM + corr distance: 
SC:  0.6357512807262957
KM + scorr distance: 
SC:  0.5522923583304487
KM + dtw distance: 
SC:  0.8276348513164226
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8319202413076647
KM + corr distance: 
SC:  0.636053188319268
KM + scorr distance: 
SC:  0.583628245037105
KM + dtw distance: 
SC:  0.8276542426268259
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8319207953519792
KM + corr distance: 
SC:  0.631955270425301
KM + scorr distance: 
SC:  0.4799184143623388
KM + dtw distance: 
SC:  0.827616680278237
--------------K-MEANS------------------
KM + euclidian distance: 
SC:  0.8319203950162142
KM + corr dist

In [8]:
import csv

with open('SeleccionadosPorEliminacionKMEANS', 'w') as f:
      
    # using csv.writer method from CSV package
    write = csv.writer(f)
      
    write.writerow(lista_excluded)

In [10]:
features

Unnamed: 0,Mean,Var,ACF1,Curvature,Peak,Entropy,Fspots,Cpoints
0,6.904494,5.239362e+01,8.135193e-01,-0.863262,1.000000e+00,5.978370e-01,172.0,6.0
1,44.781615,4.841265e+03,9.371226e-01,0.391511,1.000000e-07,1.698908e-01,49.0,10.0
2,44.819130,1.042302e+04,9.285754e-01,-0.361289,1.000000e-07,1.317796e-01,58.0,2.0
3,18.306891,8.273701e+02,7.051710e-01,-0.677443,1.000000e+00,5.381565e-01,120.0,3.0
4,59.676847,1.191727e+04,9.103049e-01,-0.154082,1.000000e-07,1.524750e-01,84.0,4.0
...,...,...,...,...,...,...,...,...
195,33.333333,1.000000e-07,1.000000e-08,-0.016121,1.000000e-07,1.000000e-08,193.0,0.0
196,27.129680,1.000000e-07,1.000000e-08,-0.016121,1.000000e-07,1.000000e-08,193.0,0.0
197,23.540490,1.000000e-07,1.000000e-08,-0.144232,1.000000e-07,1.000000e-08,191.0,1.0
198,56.497175,1.000000e-07,1.000000e-08,-0.016121,1.000000e-07,1.000000e-08,193.0,0.0
