# Proposito

## 0.1 - Cargamos la matriz de distancias entre features

In [1]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
from timeit import default_timer as timer

# Cargamos la matriz de distancias entre cada par de atributos del dataSet (a partir de los datos normalizados)
start = timer()
manhattan_dist = pd.read_parquet("../allen-molecular/data_mtg/48278/continuous/exon_data_48278_manhattan_attributes_float32.gzip")
end = timer()
print(str(end- start) + "segundos para cargar")
print(manhattan_dist.shape)

112.99707464699999segundos para cargar
(48278, 48278)


## 0.2 - Normalizamos las distancias al rango [0,1]

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
manhattan_dist = scaler.fit_transform(manhattan_dist)
manhattan_dist = manhattan_dist.astype(np.float32)
manhattan_dist.shape

## 0.3 - Almacenamos los datos

In [4]:
manhattan_dist = pd.DataFrame(manhattan_dist)
manhattan_dist.columns = manhattan_dist.columns.astype(str)
manhattan_dist.to_parquet("exon_data_48278_manhattan_attributes_normalized_float32.gzip", compression="gzip")

### --------------------------------------------------------

## 1 - Cargamos los datos normalizados

In [2]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
from timeit import default_timer as timer

# Cargamos la matriz de distancias entre cada par de atributos del dataSet (a partir de los datos normalizados)
start = timer()
manhattan_dist = pd.read_parquet("../../allen-molecular/data_mtg/48278/continuous/exon_data_48278_manhattan_attributes_normalized_float32.gzip")
end = timer()
print(str(end- start) + "segundos para cargar")
print(manhattan_dist.shape)

91.274559364segundos para cargar
(48278, 48278)


### 1.1 - Generamos la matriz simetrica (no son simetricos por pequeños decimales del final)

In [3]:
upper_tri = np.triu(manhattan_dist)
upper_tril = np.transpose(upper_tri)
manhattan_dist = upper_tri + upper_tril
np.fill_diagonal(manhattan_dist, 0)
upper_tri=0
upper_tril=0

### 1.2 - Comprobamos que la matriz es simetrica

In [4]:
def check_symmetric(a, rtol=1e-05, atol=1e-08):
    return np.allclose(a, a.T, rtol=rtol, atol=atol)

check_symmetric(manhattan_dist)

True

## 2 - Transformamos la matriz de distancias en una matriz de afinidad

In [5]:
#manhattan_dist = 1 - manhattan_dist # Si lo comento es para probar distancia versus afinidad
np.fill_diagonal(manhattan_dist, 0) # Mantenemos 0s en la diagonal por scipy

## 3 - Generamos el objeto distancia para Scipy

In [6]:
from scipy.spatial import distance

manhattan_dist = distance.squareform(manhattan_dist)

## 4 - Aprendemos el modelo de clustering con k clusters

In [7]:
import scipy.cluster.hierarchy as hclust

n_feature_clusters = 100

start = timer()
hclust_result = hclust.fcluster(hclust.centroid(manhattan_dist), n_feature_clusters, criterion="maxclust")
end = timer()

print(str(end - start) + "segundos para generar")

113.21261928500002segundos para generar


In [10]:
from joblib import dump

dump(hclust_result, "hclust_result_manhattan_100_distance_2.joblib")

['hclust_result_manhattan_100_distance_2.joblib']

### --------------------------------------------------------

## 5 - Cargamos el modelo de clustering

In [11]:
from joblib import load

hclust_result = load("hclust_result_manhattan_100_distance_2.joblib")

In [12]:
import numpy as np
unique, counts = np.unique(hclust_result, return_counts=True)
print(np.asarray((unique, counts)).T)

[[    1 48179]
 [    2     1]
 [    3     1]
 [    4     1]
 [    5     1]
 [    6     1]
 [    7     1]
 [    8     1]
 [    9     1]
 [   10     1]
 [   11     1]
 [   12     1]
 [   13     1]
 [   14     1]
 [   15     1]
 [   16     1]
 [   17     1]
 [   18     1]
 [   19     1]
 [   20     1]
 [   21     1]
 [   22     1]
 [   23     1]
 [   24     1]
 [   25     1]
 [   26     1]
 [   27     1]
 [   28     1]
 [   29     1]
 [   30     1]
 [   31     1]
 [   32     1]
 [   33     1]
 [   34     1]
 [   35     1]
 [   36     1]
 [   37     1]
 [   38     1]
 [   39     1]
 [   40     1]
 [   41     1]
 [   42     1]
 [   43     1]
 [   44     1]
 [   45     1]
 [   46     1]
 [   47     1]
 [   48     1]
 [   49     1]
 [   50     1]
 [   51     1]
 [   52     1]
 [   53     1]
 [   54     1]
 [   55     1]
 [   56     1]
 [   57     1]
 [   58     1]
 [   59     1]
 [   60     1]
 [   61     1]
 [   62     1]
 [   63     1]
 [   64     1]
 [   65     1]
 [   66     1]
 [   67   

## 6 - Cargamos los datos originales

In [13]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
from timeit import default_timer as timer

# Cargamos la matriz de distancias entre cada par de atributos del dataSet (a partir de los datos normalizados)
start = timer()
data = pd.read_parquet("../../allen-molecular/data_mtg/48278/continuous/exon_data_48278.gzip")
end = timer()
print(str(end- start) + "segundos para cargar")
print(data.shape)

7.472053420999998segundos para cargar
(15928, 48278)


## 7 - Calculamos las distancias al centroide de los clusters y para cada uno escoger el atributo mas cercano

In [14]:
import sys
from scipy.spatial import distance

# Organizamos los elementos de cada cluster en listas

indices = []
for i in range(1, n_feature_clusters + 1):
    indices_i = []
    for j in range(0, len(hclust_result)):
        if hclust_result[j] == i:
            indices_i.append(j)
    indices.append(indices_i)

In [15]:
import sys
from scipy.spatial import distance

# Calculamos el centroide cada cluster
selected_columns = []
for k in range(0, len(indices)):
    indices_k_data = data.drop([ data.columns[i] for i in range(0, len(data.columns)) if i not in indices[k]], axis = 1)
    transposed_indices_k_data = indices_k_data.transpose()
    mean_value = transposed_indices_k_data.mean()
    # Calculamos la distancia de cada elemento
    min_dist = sys.float_info.max
    index_of_closest_element = -1
    for j in range(0, len(transposed_indices_k_data)):
        distance_to_mean = distance.cityblock(transposed_indices_k_data.iloc[j], mean_value)
        if distance_to_mean < min_dist:
            min_dist = distance_to_mean
            index_of_closest_element = j

    column_of_closest_element = indices_k_data.columns[index_of_closest_element]
    selected_columns.append(column_of_closest_element)

In [16]:
selected_columns

['X3324',
 'X26038',
 'X9456',
 'X334',
 'X6138',
 'X23253',
 'X94081',
 'X54843',
 'X3274',
 'X221692',
 'X6000',
 'X254531',
 'X2908',
 'X100130155',
 'X114088',
 'X23230',
 'X116966',
 'X64062',
 'X9645',
 'X6262',
 'X55814',
 'X23613',
 'X10147',
 'X54904',
 'X4898',
 'X5411',
 'X55704',
 'X440270',
 'X4897',
 'X23077',
 'X3535',
 'X9284',
 'X5310',
 'X4297',
 'X5297',
 'X8499',
 'X55904',
 'X57035',
 'X10905',
 'X28978',
 'X6134',
 'X54460',
 'X80205',
 'X56853',
 'X577',
 'X9324',
 'X5789',
 'X54212',
 'X4331',
 'X64848',
 'X6432',
 'X5144',
 'X140890',
 'X23262',
 'X9475',
 'X8997',
 'X55082',
 'X9699',
 'X10439',
 'X9987',
 'X4673',
 'X23112',
 'X57468',
 'X9515',
 'X3778',
 'X55294',
 'X23518',
 'X5101',
 'X6894',
 'X285175',
 'X286411',
 'X54715',
 'X10915',
 'X58517',
 'X55112',
 'X1655',
 'X8925',
 'X780813',
 'X347746',
 'X54737',
 'X9295',
 'X7345',
 'X151613',
 'X100128906',
 'X490',
 'X9406',
 'X4133',
 'X2823',
 'X9611',
 'X6430',
 'X1759',
 'X7267',
 'X1479',
 'X2891'

## 8 - Filtramos los datos originales con las columnas seleccionadas

In [17]:
data[selected_columns].to_parquet("exon_data_48278_hclust100_2.gzip", compression="gzip")

In [18]:
data[selected_columns].to_csv("exon_data_48278_hclust100_2.csv", index=False)