# Feature clustering

El objetivo es aplicar varios algoritmos de clustering sobre la transpuesta de los datos y quedarnos con el que nos de un numero de clusters adecuado.

## 1 - Cargamos la matriz de distancias entre features

In [1]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
from timeit import default_timer as timer

# Cargamos la matriz de distancias entre cada par de atributos del dataSet (a partir de los datos normalizados)
start = timer()
manhattan_dist = pd.read_parquet("../../allen-molecular/data_mtg/48278/continuous/exon_data_48278_manhattan_attributes.gzip")
end = timer()
print(str(end- start) + "segundos para cargar")
print(manhattan_dist.shape)

40.736904445segundos para cargar
(48278, 48278)


## 2 - Normalizamos

In [2]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
manhattan_dist = scaler.fit_transform(manhattan_dist)
#manhattan_dist = manhattan_dist.astype(np.float32)
manhattan_dist.shape

(48278, 48278)

## 3 - Almacenamos los datos

In [3]:
manhattan_dist = pd.DataFrame(manhattan_dist)
manhattan_dist.columns = manhattan_dist.columns.astype(str)
manhattan_dist.to_parquet("exon_data_48278_manhattan_attributes_normalized.gzip", compression="gzip")

# -------------------------------------------------------------

## Cargamos los datos normalizados

In [1]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
from timeit import default_timer as timer

# Cargamos la matriz de distancias entre cada par de atributos del dataSet (a partir de los datos normalizados)
start = timer()
manhattan_dist = pd.read_parquet("../../allen-molecular/data_mtg/48278/continuous/exon_data_48278_manhattan_attributes_normalized_float32.gzip")
end = timer()
print(str(end- start) + "segundos para cargar")
print(manhattan_dist.shape)

110.63944766399982segundos para cargar
(48278, 48278)


In [42]:
manhattan_dist_diag = np.diag(manhattan_dist)
manhattan_dist_upper = np.triu(manhattan_dist)
new_manhattan_dist = np.zeros((48278, 48278))
new_manhattan_dist = new_manhattan_dist + manhattan_dist_diag + manhattan_dist_upper + np.transpose(manhattan_dist_upper)

In [45]:
print(manhattan_dist.iloc[2,0])
print(new_manhattan_dist[2,0])
print(manhattan_dist.iloc[0,2])
print(new_manhattan_dist[0,2])

0.0016881251
0.0016884792130440474
0.0016884792
0.0016884792130440474


## 2 - Llamamos a los diferentes metodos de clustering

### 2.1 - Affinity propagation

In [2]:
from sklearn.cluster import AffinityPropagation
from joblib import dump

start = timer()
ap_result = AffinityPropagation(affinity = "precomputed", max_iter = 1500).fit(manhattan_dist)
end = timer()
print(str(end- start) + "segundos para aprender")

dump(ap_result, "ap_result.joblib")

76674.238733777segundos para aprender


['ap_result.joblib']

In [8]:
myset = set(ap_result.labels_)
print(myset)

{0, 1}


### 2.2 - DBSCAN

In [46]:
from sklearn.cluster import DBSCAN
from joblib import dump

start = timer()
dbscan_result = DBSCAN(metric = "precomputed", eps = 0.05, n_jobs = -1, min_samples = 5).fit(new_manhattan_dist)
end = timer()
print(str(end- start) + "segundos para aprender")

dump(dbscan_result, "dbscan_result_0_05.joblib")

64.27273856100055segundos para aprender


['dbscan_result_0_05.joblib']

In [47]:
from collections import Counter
c = Counter(list(dbscan_result.labels_))
print(Counter(el for el in c.elements() if c[el] >= 1))

Counter({0: 41532, -1: 6746})


### 2.3 - OPTICS (no se puede ejecutar, todavia en fase beta)

In [None]:
from sklearn.cluster import OPTICS
from joblib import dump

start = timer()
optics_result = OPTICS(metric = "precomputed", n_jobs = -1).fit(manhattan_dist)
end = timer()
print(str(end- start) + "segundos para aprender")

dump(optics_result, "optics_result.joblib")

In [None]:
import sklearn.cluster.