#### Unsupervised Feature Selection Algorithm implementation

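Principal Feature Analysis (PFA) selects a subset of the original features without using labels: the data are standardized, PCA is fitted, the PCA loading vectors of the features are clustered with k-means, and the feature closest to each cluster centre is kept. The implementation below is adapted from this [Cross Validated answer](https://stats.stackexchange.com/questions/108743/methods-in-r-or-python-to-perform-feature-selection-in-unsupervised-learning).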

In [1]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

class PFA(object):
    """Principal Feature Analysis (PFA) for unsupervised feature selection.

    The columns of the input are standardized and a PCA is fitted. Each input
    feature is then represented by its row of the transposed component matrix,
    these rows are grouped with k-means, and for every non-empty cluster the
    feature whose row lies closest to the cluster centre is kept.

    Parameters
    ----------
    n_features : int
        Number of k-means clusters; one feature is kept per non-empty cluster.
    q : int or None, default=None
        Number of principal components to compute. A falsy value (``None`` or
        ``0``) means "as many as the data allows", i.e.
        ``min(n_samples, n_columns)``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``PCA`` and ``KMeans`` so that a selection can be
        reproduced. ``None`` keeps the previous, unseeded behaviour.

    Attributes
    ----------
    indices_ : list of int
        Column positions of the selected features, set by :meth:`fit`.
    features_ : ndarray
        The selected columns of the *standardized* input, set by :meth:`fit`.
    n_components_ : int
        Number of principal components actually used by the last fit.
    """

    def __init__(self, n_features, q=None, random_state=None):
        self.q = q
        self.n_features = n_features
        self.random_state = random_state

    def fit(self, X):
        """Select the representative features of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)
            Data to select features from.

        Returns
        -------
        self : PFA
            The fitted selector, with ``indices_``, ``features_`` and
            ``n_components_`` set.
        """
        X_std = StandardScaler().fit_transform(X)

        # Resolve the component count locally instead of overwriting self.q:
        # a value remembered from an earlier fit would be wrong for data with
        # a different shape. PCA cannot extract more components than
        # min(n_samples, n_columns), so that is the default.
        n_components = self.q if self.q else min(X_std.shape)

        pca = PCA(
            n_components=n_components, random_state=self.random_state
        ).fit(X_std)
        # One row per input feature, one column per principal component.
        loadings = pca.components_.T

        kmeans = KMeans(
            n_clusters=self.n_features, random_state=self.random_state
        ).fit(loadings)
        labels = kmeans.predict(loadings)

        # A single batched call gives every feature-to-centre distance.
        gaps = euclidean_distances(loadings, kmeans.cluster_centers_)

        # Track, per cluster, the member closest to that cluster's centre.
        # The strict "<" keeps the lowest column index on ties, and the dict
        # keeps clusters in order of first appearance.
        closest = {}
        for idx, label in enumerate(labels):
            gap = gaps[idx, label]
            if label not in closest or gap < closest[label][1]:
                closest[label] = (idx, gap)

        self.n_components_ = n_components
        self.indices_ = [idx for idx, _ in closest.values()]
        self.features_ = X_std[:, self.indices_]
        return self

In [2]:
# Example: pick 10 representative columns out of 1000 random ones
import numpy as np

X = np.random.random(size=(1000, 1000))

pfa = PFA(n_features=10)
pfa.fit(X)

# Positions of the columns that were kept
column_indices = pfa.indices_

# Matrix restricted to the kept columns (replaces the full random matrix)
X = pfa.features_

In [3]:
# X was rebound to pfa.features_ above: the standardized values of the kept columns
print(X)

[[-0.13023933 -1.30419708  1.14933476 ... -0.66435575  1.60746058
   0.76064346]
 [ 1.17312482  0.52622633 -0.400502   ... -1.03156042 -0.81912935
  -0.92818753]
 [-1.42008414  1.11894752  0.17319657 ... -0.80451841 -0.26180847
   0.73906683]
 ...
 [ 0.74854926  1.66229826  0.68285143 ... -0.24538595 -1.03514538
  -1.37725857]
 [ 1.22003961  0.47226095 -0.71505813 ...  1.17475542  0.25842982
  -0.15810318]
 [ 1.44676221  0.08842886  1.46956314 ... -0.37690115  1.48156863
  -1.22843251]]


In [4]:
# Column positions of the features PFA kept (taken from pfa.indices_)
print(column_indices)

[89, 43, 169, 30, 64, 162, 40, 88, 25, 522]


##### Other useful links

1. An implementation of variable selection for mixed-type data, available for [R](http://varsellcm.r-forge.r-project.org/)