# SELECCION DE ATRIBUTOS WRAPPER - HAC

In [1]:
import math
from math import sqrt, log, floor
from sklearn.metrics import mean_squared_error
from statistics import mean
from fastdtw import fastdtw
from scipy import stats
from scipy.spatial.distance import pdist
 
#Euclidean
def euclidean(x, y):
    r=np.linalg.norm(x-y)
    if math.isnan(r):
        r=1
    #print(r)
    return r

#Fast Dynamic time warping
def fast_DTW(x, y):
    r, _ = fastdtw(x, y, dist=euclidean)
    if math.isnan(r):
        r=1
    #print(r)
    return r

#Spearman
def scorr(x, y):
    r = stats.spearmanr(x, y)[0]
    if math.isnan(r):
        r=0
    #print(r)
    return 1 - r

#RMSE
def rmse(x, y):
    r=sqrt(mean_squared_error(x,y))
    if math.isnan(r):
        r=1
    #print(r)
    return r

def lcs(a, b):  
    lengths = [[0 for j in range(len(b)+1)] for i in range(len(a)+1)]
    # row 0 and column 0 are initialized to 0 already
    for i, x in enumerate(a):
        for j, y in enumerate(b):
            if x == y:
                lengths[i+1][j+1] = lengths[i][j] + 1
            else:
                lengths[i+1][j+1] = max(lengths[i+1][j], lengths[i][j+1])
    x, y = len(a), len(b)
    result = lengths[x][y]
    return result

def discretise(x):
    return int(x * 10)

def multidim_lcs(a, b):
    a = a.applymap(discretise)
    b = b.applymap(discretise)
    rows, dims = a.shape
    lcss = [lcs(a[i+2], b[i+2]) for i in range(dims)]
    return 1 - sum(lcss) / (rows * dims)

#Correlation
def corr(x, y):
    r=np.dot(x-mean(x),y-mean(y))/((np.linalg.norm(x-mean(x)))*(np.linalg.norm(y-mean(y))))
    if math.isnan(r):
        r=0
    #print(r)
    return 1 - r

In [2]:
from sklearn.preprocessing import LabelEncoder
#from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
import scipy.cluster.hierarchy as hac
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_score, calinski_harabasz_score

DIAMETER_METHODS = ['mean_cluster', 'farthest']
CLUSTER_DISTANCE_METHODS = ['nearest', 'farthest']

def inter_cluster_distances(labels, distances, method='nearest'):
    """Calculates the distances between the two nearest points of each cluster.
    :param labels: a list containing cluster labels for each of the n elements
    :param distances: an n x n numpy.array containing the pairwise distances between elements
    :param method: `nearest` for the distances between the two nearest points in each cluster, or `farthest`
    """
    if method not in CLUSTER_DISTANCE_METHODS:
        raise ValueError(
            'method must be one of {}'.format(CLUSTER_DISTANCE_METHODS))

    if method == 'nearest':
        return __cluster_distances_by_points(labels, distances)
    elif method == 'farthest':
        return __cluster_distances_by_points(labels, distances, farthest=True)


def __cluster_distances_by_points(labels, distances, farthest=False):
    n_unique_labels = len(np.unique(labels))
    cluster_distances = np.full((n_unique_labels, n_unique_labels),
                                float('inf') if not farthest else 0)

    np.fill_diagonal(cluster_distances, 0)

    for i in np.arange(0, len(labels) - 1):
        for ii in np.arange(i, len(labels)):
            if labels[i] != labels[ii] and (
                (not farthest and
                 distances[i, ii] < cluster_distances[labels[i], labels[ii]])
                    or
                (farthest and
                 distances[i, ii] > cluster_distances[labels[i], labels[ii]])):
                cluster_distances[labels[i], labels[ii]] = cluster_distances[
                    labels[ii], labels[i]] = distances[i, ii]
    return cluster_distances


def diameter(labels, distances, method='farthest'):
    """Calculates cluster diameters
    :param labels: a list containing cluster labels for each of the n elements
    :param distances: an n x n numpy.array containing the pairwise distances between elements
    :param method: either `mean_cluster` for the mean distance between all elements in each cluster, or `farthest` for the distance between the two points furthest from each other
    """
    if method not in DIAMETER_METHODS:
        raise ValueError('method must be one of {}'.format(DIAMETER_METHODS))

    n_clusters = len(np.unique(labels))
    diameters = np.zeros(n_clusters)

    if method == 'mean_cluster':
        for i in range(0, len(labels) - 1):
            for ii in range(i + 1, len(labels)):
                if labels[i] == labels[ii]:
                    diameters[labels[i]] += distances[i, ii]

        for i in range(len(diameters)):
            diameters[i] /= sum(labels == i)

    elif method == 'farthest':
        for i in range(0, len(labels) - 1):
            for ii in range(i + 1, len(labels)):
                if labels[i] == labels[ii] and distances[i, ii] > diameters[
                        labels[i]]:
                    diameters[labels[i]] = distances[i, ii]
    return diameters

def dunn(labels, distances, diameter_method='farthest',
         cdist_method='nearest'):
    """
    Dunn index for cluster validation (larger is better).
    
    .. math:: D = \\min_{i = 1 \\ldots n_c; j = i + 1\ldots n_c} \\left\\lbrace \\frac{d \\left( c_i,c_j \\right)}{\\max_{k = 1 \\ldots n_c} \\left(diam \\left(c_k \\right) \\right)} \\right\\rbrace
    
    where :math:`d(c_i,c_j)` represents the distance between
    clusters :math:`c_i` and :math:`c_j`, and :math:`diam(c_k)` is the diameter of cluster :math:`c_k`.
    Inter-cluster distance can be defined in many ways, such as the distance between cluster centroids or between their closest elements. Cluster diameter can be defined as the mean distance between all elements in the cluster, between all elements to the cluster centroid, or as the distance between the two furthest elements.
    The higher the value of the resulting Dunn index, the better the clustering
    result is considered, since higher values indicate that clusters are
    compact (small :math:`diam(c_k)`) and far apart (large :math:`d \\left( c_i,c_j \\right)`).
    :param labels: a list containing cluster labels for each of the n elements
    :param distances: an n x n numpy.array containing the pairwise distances between elements
    :param diameter_method: see :py:function:`diameter` `method` parameter
    :param cdist_method: see :py:function:`diameter` `method` parameter
    
    .. [Kovacs2005] Kovács, F., Legány, C., & Babos, A. (2005). Cluster validity measurement techniques. 6th International Symposium of Hungarian Researchers on Computational Intelligence.
    """

    labels = LabelEncoder().fit(labels).transform(labels)
    
    

    ic_distances = inter_cluster_distances(labels, distances, cdist_method)
    #print("IC",ic_distances)
    if len(ic_distances[ic_distances.nonzero()])==0:
        min_distance = 0
    else:
        min_distance = min(ic_distances[ic_distances.nonzero()])
    max_diameter = max(diameter(labels, distances, diameter_method))
    
    

    return min_distance / max_diameter

# Distancias

In [3]:
import pandas as pd
import numpy as np
import os
my_path = os.path.abspath('')
my_path = my_path.split('\\')
my_path_py = "\\".join(my_path[:-1])

features = pd.read_csv(my_path_py+'\\2_FeatureBased\\FB2_2009al2013.csv', sep=',')

In [4]:
features = features.iloc[:, 1:]
listadistrito = features.Distrito.values
features = features.drop('Distrito', axis=1)

In [5]:
lista_excluded = features.columns.tolist()
#lista_excluded.insert(0, 'None')
#lista_excluded

In [6]:
k = 5
#KM_EUCLIDEAN.insert(0, max_KM_EUCLIDEAN)
#KM_CORRELATION.insert(0, max_KM_CORRELATION)
#KM_SPEARMAN.insert(0, max_KM_SPEARMAN)
#KM_DTW.insert(0, max_KM_DTW)
for mm in list(range(10)):
    #Experimentos HAC
    HAC_EUCLIDEAN=[]
    HAC_CORRELATION=[]
    HAC_SPEARMAN=[]
    HAC_DTW=[]
    
    for i in list(range(len(features.columns))):
        features2 = features.drop(features.columns[i], axis=1)
        #Euclidean
        n = features.shape[0]
        f_euclidean_dist = np.zeros((n,n))
        for i in range(0,n):
            #print("i",i)
            for j in range(1,n):
                #print("j",j)
                f_euclidean_dist[i,j] = euclidean(features2.iloc[i].values.flatten(), features2.iloc[j].values.flatten())
        #Corr
        f_corr_dist = np.zeros((n,n))
        for i in range(0,n):
            #print("i",i)
            for j in range(0,n):
               # print("j",j)
                f_corr_dist[i,j] = corr(features2.iloc[i].values.flatten(), features2.iloc[j].values.flatten())

        #scorr
        f_scorr_dist = np.zeros((n,n))
        for i in range(0,n):
            #print("i",i)
            for j in range(0,n):
               # print("j",j)
                f_scorr_dist[i,j] = scorr(features2.iloc[i].values.flatten(), features2.iloc[j].values.flatten())
        #DTW
        f_dtw_dist = np.zeros((n,n))
        for i in range(0,n):
            #print("i",i)
            for j in range(0,n):
                # print("j",j)
                f_dtw_dist[i,j] = fast_DTW(features2.iloc[i].values.flatten(), features2.iloc[j].values.flatten())
                
        HAC_euc = AgglomerativeClustering(n_clusters=k).fit_predict(f_euclidean_dist)
        print("HAC + euclidian distance: ")
        sil = silhouette_score(f_euclidean_dist, HAC_euc)
        print("SC: ", sil)
        HAC_EUCLIDEAN.append(sil)

        HAC_corr = AgglomerativeClustering(n_clusters=k).fit_predict(f_corr_dist)
        print("HAC + corr distance: ")
        sil = silhouette_score(f_corr_dist, HAC_corr)
        print("SC: ", sil)
        HAC_CORRELATION.append(sil)

        HAC_scorr = AgglomerativeClustering(n_clusters=k).fit_predict(f_scorr_dist)
        print("HAC + scorr distance: ")
        sil = silhouette_score(f_scorr_dist, HAC_scorr)
        print("SC: ", sil)
        HAC_SPEARMAN.append(sil)

        HAC_dtw = AgglomerativeClustering(n_clusters=k).fit_predict(f_dtw_dist)
        print("HAC + dtw distance: ")
        sil = silhouette_score(f_dtw_dist, HAC_dtw)
        print("SC: ", sil)
        HAC_DTW.append(sil)

    print("Máximo de Euclidean ---> ", np.argmax(HAC_EUCLIDEAN))
    print("Máximo de Pearson ", np.argmax(HAC_CORRELATION))
    print("Máximo de Spearman ", np.argmax(HAC_SPEARMAN))
    print("Máximo de DWT ", np.argmax(HAC_DTW))  
    print("ELIMINANDO ", lista_excluded[np.argmax(HAC_EUCLIDEAN)])
    del lista_excluded[np.argmax(HAC_EUCLIDEAN)]
    max_HAC_EUCLIDEAN = max(HAC_EUCLIDEAN)
    features = features.drop(features.columns[np.argmax(HAC_EUCLIDEAN)], axis = 1)

HAC + euclidian distance: 
SC:  0.8366531567484077
HAC + corr distance: 
SC:  0.6521099197742336
HAC + scorr distance: 
SC:  0.6322153062170762
HAC + dtw distance: 
SC:  0.8340257672771032


  return linkage(y, method='ward', metric='euclidean')
  return linkage(y, method='ward', metric='euclidean')


HAC + euclidian distance: 
SC:  0.5627955917208324
HAC + corr distance: 
SC:  0.5422219573183724
HAC + scorr distance: 
SC:  0.64063466348969
HAC + dtw distance: 
SC:  0.5321607742585003


  return linkage(y, method='ward', metric='euclidean')
  return linkage(y, method='ward', metric='euclidean')


HAC + euclidian distance: 
SC:  0.836811270049642
HAC + corr distance: 
SC:  0.6300294199540093
HAC + scorr distance: 
SC:  0.6074577123461969
HAC + dtw distance: 
SC:  0.8344765254093334


  return linkage(y, method='ward', metric='euclidean')
  return linkage(y, method='ward', metric='euclidean')


HAC + euclidian distance: 
SC:  0.8368118950426964
HAC + corr distance: 
SC:  0.6323358483495387
HAC + scorr distance: 
SC:  0.6502890689629701
HAC + dtw distance: 
SC:  0.8344758448575843


  return linkage(y, method='ward', metric='euclidean')
  return linkage(y, method='ward', metric='euclidean')


HAC + euclidian distance: 
SC:  0.8368119076263152
HAC + corr distance: 
SC:  0.6300707142176256
HAC + scorr distance: 
SC:  0.5777531474806724
HAC + dtw distance: 
SC:  0.8344616977394743


  return linkage(y, method='ward', metric='euclidean')
  return linkage(y, method='ward', metric='euclidean')


KeyboardInterrupt: 

In [None]:
import csv

with open('SeleccionadosPorEliminacionKHAC', 'w') as f:
      
    # using csv.writer method from CSV package
    write = csv.writer(f)
      
    write.writerow(lista_excluded)

In [None]:
features