In [1]:
import sklearn as sk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.manifold import TSNE
from sklearn import metrics
%matplotlib inline  

# Lee dataset

In [2]:
# Parse 'dataset.txt': every non-empty line is expected to carry a bracketed,
# comma-separated list of numbers ("... [v1, v2, ...]"). Whatever precedes the
# '[' is ignored here. The result is one list of floats per line.
lista_attr = []
with open('dataset.txt', 'r') as dataset:
    for line in dataset:
        # str.strip() returns a new string, so the result has to be kept.
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. an empty line at the end of the file).
            continue
        initia = line.find('[')
        # Locate the closing bracket explicitly instead of assuming the line
        # ends with exactly "]\n"; the last line may have no newline and
        # files written on Windows end in "\r\n". If no ']' is present,
        # rfind returns -1 and the slice drops the final character, which
        # matches the previous fixed-offset behaviour.
        fin = line.rfind(']')
        # float() ignores surrounding whitespace, so splitting on ',' accepts
        # both "a, b" and "a,b".
        lista_list = [float(x) for x in line[initia + 1:fin].split(',')]
        lista_attr.append(lista_list)


# PCA

In [3]:
# Reduce the parsed attribute vectors to their first 50 principal components.
# X holds one row per parsed line; X_new is the projected data.
pca = PCA(n_components=50)
X = np.asarray(lista_attr)
X_new = pca.fit_transform(X)

In [4]:
# Report, in order: the fraction of variance explained by each component,
# the shape of the projected data, and the projected data itself.
for item in (pca.explained_variance_ratio_, X_new.shape, X_new):
    print(item)

[ 0.09677511  0.05886284  0.04549392  0.0336914   0.02658878  0.02501407
  0.02369812  0.02124501  0.01907082  0.01629285  0.01416668  0.01343007
  0.01297935  0.01197357  0.01121607  0.01039287  0.0093623   0.00887111
  0.00817023  0.00741812  0.00722397  0.00699888  0.00679285  0.0063146
  0.00619671  0.00586309  0.0056263   0.00529969  0.00510283  0.00481782
  0.00443305  0.00438449  0.00436361  0.00398347  0.00382859  0.00372668
  0.00355258  0.00351121  0.00331462  0.00327738  0.00317497  0.00308566
  0.00301051  0.00291343  0.00283867  0.00273589  0.00262605  0.0025985
  0.00256995  0.00251465]
(4214, 50)
[[ 107.86096449 -106.96748849  173.17571537 ...,  -17.71059721
   -46.64998284   33.60180537]
 [ -20.16786548   56.97777189 -142.40258096 ...,  -19.42941919
    17.80564715  -10.16072596]
 [ 230.29590346 -136.36689034  110.9021059  ...,   -6.47510932
    -7.06062784   19.67313856]
 ..., 
 [  32.55423293  -24.75614384  -77.32635939 ...,   -8.76670369
     3.55616005   -2.83493723

In [5]:
# Keep an independent copy of the PCA projection. A plain assignment would
# only bind a second name to the same array, so any in-place change to X_new
# would silently change the "backup" as well.
X_backup = X_new.copy()

# KMeans

In [6]:

# Cluster the PCA projection with KMeans for several values of k and print
# two internal validity scores (silhouette, Calinski-Harabasz) for each k.
k1 = 10
k2 = 20
k3 = 30
k4 = 100
klist = [k1, k2, k3, k4]
labels = []     # labels[j]: per-sample cluster labels for klist[j]
centroids = []  # centroids[j]: cluster centres for klist[j]
silavg = []     # silavg[j]: mean silhouette score for klist[j]

# scikit-learn first published this metric under the misspelled name
# `calinski_harabaz_score` and later renamed it (the old name was removed).
# Prefer the corrected name and fall back to the old one on old releases.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None)
if calinski_harabasz is None:
    calinski_harabasz = metrics.calinski_harabaz_score

for k in klist:
    # Fixed seed: KMeans initialisation is random, so without it the labels
    # and scores change on every run of the notebook.
    kmeans = cluster.KMeans(n_clusters=k, random_state=0)
    kmeans.fit(X_new)
    cluster_labels = kmeans.labels_
    labels.append(cluster_labels)
    centroids.append(kmeans.cluster_centers_)
    # Silhouette score (mean over all samples, Euclidean distance)
    silhouette_avg = silhouette_score(X_new, cluster_labels, metric='euclidean')
    silavg.append(silhouette_avg)
    print(silhouette_avg)
    print(calinski_harabasz(X_new, cluster_labels))

0.0620599464588
192.990930275
0.0599385091704
126.288049418
0.0505567007558
97.924527086
0.0465901482874
43.8251679258


In [None]:
# One figure per k: the samples of every cluster on the first two PCA
# components, with the matching centroid drawn as a black cross.
for n, k in enumerate(klist):
    run_labels = labels[n]
    run_centroids = centroids[n]
    for i in range(k):
        # rows of X_new whose label in this run is i
        ds = X_new[run_labels == i]
        plt.plot(ds[:, 0], ds[:, 1], 'o')
        # centroid of cluster i, with a larger and thicker marker
        lines = plt.plot(run_centroids[i, 0], run_centroids[i, 1], 'kx')
        plt.setp(lines, ms=15.0, mew=2.0)
    plt.show()

In [None]:
# Debug output: the number of labels stored for each KMeans run, followed by
# the raw label arrays and the centroid matrices.
for cen in labels:
    print(len(cen))
print(labels)
print(centroids)

# MeanShift

In [7]:
# MeanShift clustering of the PCA projection, using a bandwidth estimated
# from the data.
bandwidth = estimate_bandwidth(X_new)
ms = MeanShift(bandwidth=bandwidth).fit(X_new)
labels_ms, cluster_centers_ms = ms.labels_, ms.cluster_centers_
# Cluster count derived from the largest label value.
n_clusters_ = labels_ms.max() + 1
for reported in (n_clusters_, labels_ms, bandwidth):
    print(reported)

12
[0 0 0 ..., 0 0 0]
369.431065758


In [9]:
# Internal validity scores for the MeanShift partition.
# scikit-learn first published the Calinski-Harabasz metric under the
# misspelled name `calinski_harabaz_score` and later renamed it (the old name
# was removed). Prefer the corrected name, fall back on old releases.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None)
if calinski_harabasz is None:
    calinski_harabasz = metrics.calinski_harabaz_score

silhouette_avg = silhouette_score(X_new, labels_ms)
print(silhouette_avg)
print(calinski_harabasz(X_new, labels_ms))

0.212359256564
14.2873224254


In [None]:
# Scatter plot of the MeanShift clusters on the first two PCA components,
# with each cluster centre drawn as a black cross.
# Iterate over the centres MeanShift actually produced rather than over `k`,
# which is just the last value of the KMeans loop variable and can exceed the
# number of centres (indexing cluster_centers_ms out of bounds).
for i in range(len(cluster_centers_ms)):
    # select only data observations with cluster label == i
    ds = X_new[np.where(labels_ms == i)]
    # plot the data observations
    plt.plot(ds[:, 0], ds[:, 1], 'o')
    # plot the centre of cluster i
    lines = plt.plot(cluster_centers_ms[i, 0], cluster_centers_ms[i, 1], 'kx')
    # make the centre marker bigger and thicker
    plt.setp(lines, ms=15.0)
    plt.setp(lines, mew=2.0)
plt.show()

# DBSCAN

In [10]:
def _count_clusters(cluster_labels):
    """Return the number of distinct labels, not counting the label -1."""
    distinct = set(cluster_labels)
    distinct.discard(-1)
    return len(distinct)


# Three DBSCAN runs with progressively larger eps and smaller min_samples.
db = DBSCAN(eps=100, min_samples=4).fit(X_new)
db1 = DBSCAN(eps=270, min_samples=3).fit(X_new)
db2 = DBSCAN(eps=300, min_samples=2).fit(X_new)
dblabels, dblabels1, dblabels2 = db.labels_, db1.labels_, db2.labels_

# Cluster counts exclude -1, the label DBSCAN gives to noise samples.
n_clusters_ = _count_clusters(dblabels)
n_clusters_1 = _count_clusters(dblabels1)
n_clusters_2 = _count_clusters(dblabels2)

for found in (n_clusters_, n_clusters_1, n_clusters_2):
    print('Estimated number of clusters: %d' % found)

Estimated number of clusters: 3
Estimated number of clusters: 8
Estimated number of clusters: 13


In [11]:
# Internal validity scores for the three DBSCAN runs: first the three
# silhouette scores, then the three Calinski-Harabasz scores.
# NOTE: the label arrays are passed as they are, so samples labelled -1 are
# scored as if they formed one more cluster.
# scikit-learn first published the Calinski-Harabasz metric under the
# misspelled name `calinski_harabaz_score` and later renamed it (the old name
# was removed). Prefer the corrected name, fall back on old releases.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None)
if calinski_harabasz is None:
    calinski_harabasz = metrics.calinski_harabaz_score

print(silhouette_score(X_new, dblabels))
print(silhouette_score(X_new, dblabels1))
print(silhouette_score(X_new, dblabels2))
print(calinski_harabasz(X_new, dblabels))
print(calinski_harabasz(X_new, dblabels1))
print(calinski_harabasz(X_new, dblabels2))

-0.113593679695
0.0951594037197
0.141705862929
54.7463111284
23.787623849
10.5161811362


In [None]:
# Turn off numpy's summarisation of long arrays (this print option is global
# and stays in effect afterwards), then print every label of each DBSCAN run.
np.set_printoptions(threshold=np.inf)
for run_labels in (dblabels, dblabels1, dblabels2):
    print(run_labels)

In [None]:
# Snapshot of the PCA projection used as input for t-SNE. Copy the array:
# a plain assignment would only bind a second name to the same data, so an
# in-place change to X_new would change X_backup too.
X_backup = X_new.copy()

In [None]:
# 2-D t-SNE embedding of the PCA projection.
# t-SNE is stochastic; a fixed seed makes the embedding (and anything derived
# from t_sne afterwards) repeatable across runs.
modelo = TSNE(n_components=2, perplexity=50, random_state=0)
# Global print option: show small floats in fixed-point notation instead of
# scientific notation from here on.
np.set_printoptions(suppress=True)
t_sne = modelo.fit_transform(X_backup)

In [None]:
# Print the embedding returned by TSNE.fit_transform.
print(t_sne)

# T-SNE

In [None]:
# Scatter plot of the t-SNE embedding (column 0 against column 1).
# Both columns go to a single plot call: calling plt.plot once per sample
# creates one artist per point, which is slow for thousands of samples and
# gives the points arbitrary, cycling colours.
plt.plot(t_sne[:, 0], t_sne[:, 1], 'o')
plt.show()

# CSV

In [None]:
# Export one row per sample (Id, x, y) to 'datas.csv', without index or header.
with open('dataset.txt', 'r') as dataset:
    # The id is the integer found between the first '/' and the first '.'
    # located at index 3 or later of each line.
    ids = [
        int(line[line.find('/', 3) + 1:line.find('.', 3)])
        for line in dataset
    ]

# Split the embedding into its two coordinates, element by element.
x = [point[0] for point in t_sne]
y = [point[1] for point in t_sne]

DataSet = list(zip(ids, x, y))
df = pd.DataFrame(data=DataSet, columns=['Id', 'x', 'y'])
df.to_csv('datas.csv', index=False, header=False)