In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets

In [2]:
#### olivetti faces data
faces = datasets.fetch_olivetti_faces()
X = faces.data
y = faces.target

In [3]:
# data scalling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)

In [4]:
# http://stackoverflow.com/questions/16856788/slice-2d-array-into-smaller-2d-arrays
def blocks(A,nrows,ncols):
    h,w=A.shape
    return (A.reshape(h//nrows,nrows,-1,ncols).swapaxes(1,2).reshape(-1,nrows,ncols))

In [5]:
def splitIntoBlocks(X,n,m):
    Xb = []
    Yb = []
    for i in range(X.shape[0]):
        B = blocks(X[i,].reshape(n,n),nrows=m,ncols=m)
        for j in range(B.shape[0]):
            Xb.append(B[j].reshape(m*m))
            Yb.append(j)
    return np.asarray(Xb),np.asarray(Yb)

In [38]:
pict_size = 64
frame_size = 16
grid_size = pict_size//frame_size
n_components = grid_size**2
n_clusters = n_components

In [30]:
Xb, Yb = splitIntoBlocks(X,pict_size,frame_size)

In [31]:
from sklearn.decomposition import PCA
# PCA
pca = PCA(n_components=n_components)
pca.fit(Xb)

PCA(copy=True, n_components=16, whiten=False)

In [32]:
# face PARTS principal components
face = pca.components_
rng = np.arange(1,n_components+1,1)
for i in rng:
    plt.subplot(grid_size, grid_size,i)
    vmax = max(face[i-1].max(), -face[i-1].min())
    plt.imshow(face[i-1].reshape([frame_size, frame_size]), cmap=plt.cm.gray)
plt.show()

In [39]:
# clustering
from sklearn.cluster import KMeans
km = KMeans(n_clusters = n_clusters)
km.fit(Xb)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=16, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [10]:
Yb[0:64]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63])

In [11]:
km.labels_[0:64]

array([ 8, 45, 45, 33,  3,  8, 42, 57, 63, 42, 35,  7, 35,  7,  7, 26, 12,
       39,  4, 19,  4,  6, 12, 51,  3, 45, 45, 15,  3, 45,  8, 42, 39, 52,
       42, 35,  7,  7, 52, 12, 61, 10, 52,  7, 18, 10, 18, 57, 34, 10, 52,
        0,  0, 18, 18, 50, 34, 10, 10, 18,  7, 18, 18, 57])

In [42]:
plt.scatter(Yb,km.labels_,alpha=0.5)
plt.show()

In [41]:
tbl = {}
tbl['alphabet'] = Yb
tbl['labels'] = km.labels_
tbl = pd.DataFrame(tbl)
#tbl

In [36]:
grouped = tbl.groupby(['alphabet'])
tbl1 = grouped.aggregate(lambda x:tuple(x))
tbl1.head()

Unnamed: 0_level_0,labels
alphabet,Unnamed: 1_level_1
0,"(60, 47, 60, 27, 32, 33, 33, 63, 40, 60, 20, 2..."
1,"(47, 2, 47, 60, 32, 47, 28, 32, 28, 12, 15, 15..."
2,"(47, 47, 47, 0, 35, 60, 47, 39, 39, 35, 53, 53..."
3,"(37, 43, 47, 39, 10, 39, 37, 37, 10, 10, 60, 6..."
4,"(32, 32, 32, 27, 32, 17, 7, 11, 23, 7, 54, 7, ..."


In [43]:
# face PARTS cluster centers
face = km.cluster_centers_
rng = np.arange(1,n_clusters+1,1)
for i in rng:
    plt.subplot(grid_size, grid_size,i)
    vmax = max(face[i-1].max(), -face[i-1].min())
    plt.imshow(face[i-1].reshape([frame_size, frame_size]), cmap=plt.cm.gray)
plt.show()

In [167]:
from sklearn.decomposition import FastICA
ica = FastICA(n_components = n_components, algorithm = 'deflation', fun = 'cube')
ica.fit(Xb)

FastICA(algorithm='deflation', fun='cube', fun_args=None, max_iter=200,
    n_components=64, random_state=None, tol=0.0001, w_init=None,
    whiten=True)

In [174]:
from sklearn.decomposition import SparsePCA
# Sparse PCA
ica = SparsePCA(n_components=n_components, alpha = 5)
ica.fit(Xb)

SparsePCA(U_init=None, V_init=None, alpha=5, max_iter=1000, method='lars',
     n_components=64, n_jobs=1, random_state=None, ridge_alpha=0.01,
     tol=1e-08, verbose=False)

In [197]:
from sklearn.decomposition import FactorAnalysis
ica = FactorAnalysis(n_components=n_components)
ica.fit(Xb)

FactorAnalysis(copy=True, iterated_power=3, max_iter=1000, n_components=64,
        noise_variance_init=None, random_state=0, svd_method='randomized',
        tol=0.01)

In [198]:
# face PARTS principal components
face = ica.components_
rng = np.arange(1,n_components+1,1)
for i in rng:
    plt.subplot(8, 8,i)
    vmax = max(face[i-1].max(), -face[i-1].min())
    plt.imshow(face[i-1].reshape([8, 8]), cmap=plt.cm.gray)
plt.show()