# Features Selection For Unsupervised Learning

Implementation: 
https://stats.stackexchange.com/questions/108743/methods-in-r-or-python-to-perform-feature-selection-in-unsupervised-learning

Original Paper:
http://venom.cs.utsa.edu/dmz/techrep/2007/CS-TR-2007-011.pdf

Note: I run five times of this algorithm for 100 features and finally aggregate all of the features and drop any duplicate!

In [1]:
import pandas as pd
import numpy as np

## Import Data

In [4]:
# Import CSV
df_events_all_accounts_day_7 = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/events_all_accounts_day_7.csv", 
            sep="\t")

In [5]:
# The features in the 
features_list = list(df_events_all_accounts_day_7)

# Remove the 'systemid'
del features_list[0]

In [6]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

# Features selection for unsupervised learning
class PFA(object):
    def __init__(self, n_features, q=None):
        self.q = q
        self.n_features = n_features

    def fit(self, X):
        if not self.q:
            self.q = X.shape[1]

        sc = StandardScaler()
        X = sc.fit_transform(X)

        pca = PCA(n_components=self.q).fit(X)
        A_q = pca.components_.T

        kmeans = KMeans(n_clusters=self.n_features).fit(A_q)
        clusters = kmeans.predict(A_q)
        cluster_centers = kmeans.cluster_centers_

        dists = defaultdict(list)
        for i, c in enumerate(clusters):
            dist = euclidean_distances([A_q[i, :]], [cluster_centers[c, :]])[0][0]
            dists[c].append((i, dist))

        self.indices_ = [sorted(f, key=lambda x: x[1])[0][0] for f in dists.values()]
        self.features_ = X[:, self.indices_]

In [7]:
pfa = PFA(n_features=100)
pfa.fit(df_events_all_accounts_day_7)

# To get the transformed matrix
X = pfa.features_

# To get the column indices of the kept features
column_indices = pfa.indices_

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [8]:
# Printing the important features
for i in column_indices:
    print(features_list[i])

activateclient
timeentriesunbilled
clientimportcsvfailed
adddiscounttoinvoice
activatestaff
activateproject
api
deletebusinessaccountant
migrationhub-index-desktop-nav-move-later-blue-cta
createtax
customdefaultinvoiceterms
stripepaymentsuccessful
identitysignedupwithgoogle
passwordchanged
hoursupdatedapi
contractorinvitationsubscribe
ariasupplementalplandowngraded
bulkimportitemsandservicescomplete
balancesheetviewed
migrationhub-existfeature-know-and-love-start-move-cta
createbankaccount
emailfirstlatereminder
clickthrough:weekly_account_summary_report
clickthrough:weekly_account_summary_settings
ariasupplementalplanupgraded
estimatenotviewedbyclientnotification
adminautobilladdcardattempt
migrationhub-index-desktop-nav-start-move-cta
invoiceoverdue
migrationhub-index-modal-start-move-cta
deleteinvoice
subscriptiondeferredupgradeplancreated
expenseimportfailed
deletestaff
upgradeform-paymenterror
enabledoauth
adminde-activation
onlinepaymentrefund
collaboratorinvitationaccepted
migra